# Data Acquisition

Get the data first


# Tokenization in Spacy

`Tokenization` is the process of splitting text into meaningful segments (tokens).

In [1]:
# Sample sentence for the tokenizer: it contains an abbreviation ("Dr."),
# a digit followed by a currency symbol ("2$") and sentence-final punctuation.
text = "Dr. Strange loves pav bhaji of Mumbai as it costs only 2$ per plate."

In [2]:
import spacy

# spacy.blank("en") creates an English pipeline with no components besides the
# tokenizer; the alternative is a pretrained one: spacy.load("en_core_web_sm").
nlp = spacy.blank("en")

# Calling nlp on a string returns a Doc.
doc = nlp(text)

# Iterating over the Doc yields its tokens one at a time.
# NOTE: `token` stays bound after the loop and later cells reference it.
for token in doc:
    print(token)

Dr.
Strange
loves
pav
bhaji
of
Mumbai
as
it
costs
only
2
$
per
plate



Calling `nlp = spacy.blank("en")` creates a pipeline that contains only the tokenizer.
This is what happens when text is passed through it:
```mermaid
graph LR;
    text --> nlp[nlp]
    tok[Tokenizer] -.included in nlp.-> nlp
    nlp --> doc
```

https://firstlanguage.in

In [3]:
# A Doc can be indexed like a sequence; doc[0] is its first token.
doc[0]

Dr.

In [4]:
# The tokenizer splits off the surrounding quotes, the contraction "'s" and the
# trailing "!" while keeping the abbreviation "N.Y." as a single token.
quoted_doc = nlp("\"Let's go to N.Y.!\"")
for piece in quoted_doc:
    print(piece)

"
Let
's
go
to
N.Y.
!
"


In [6]:
# Show the classes of the pipeline object, the processed document and one token.
tuple(type(obj) for obj in (nlp, doc, token))

(spacy.lang.en.English, spacy.tokens.doc.Doc, spacy.tokens.token.Token)

In [7]:
# Slicing a Doc returns a Span rather than a plain list of tokens.
span = doc[1:4]
type(span)

spacy.tokens.span.Span

In [9]:
# List the public (non-underscore) attributes and methods available on a Token.
public_names = [name for name in dir(token) if not name.startswith("_")]
public_names

['ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'left_edge',
 'lefts',
 'lemma',
 'lemma_',
 'lex',
 'lex_id',
 'like_email',
 'like_num',
 'like_url',
 'lower',
 'lower_',
 'morph',
 'n_lefts',
 'n_rights',
 'nbor',
 'norm',
 'norm_',
 'orth',
 'orth_',
 'pos',
 'pos_',
 'prefix',
 'prefix_',
 'prob',
 'rank',
 'remove_extension',
 'right_edge',
 'rights',
 'sent',
 'sent_start',
 'sentiment',
 'set_extension',
 'set_morph',
 'shape',
 'sh

In [12]:
# like_num flags tokens that look like a number, including spelled-out ones:
# doc[2] is "two" in this sentence.
doc = nlp("Tony gave two $ to Peter.")
doc[2].like_num

True

In [11]:
# doc[3] is the "$" token, so is_currency is True.
doc[3].is_currency

True

In [13]:
# Negative indices work too: doc[-1] is the last token (the final ".").
doc[-1]

.

In [14]:
# Token.i is the token's index within its parent Doc.
# NOTE(review): `token` is not assigned in this cell — it is the variable left
# over from an earlier `for token in doc` loop, so it may not belong to the
# current `doc`. Confirm this is intended.
token.i

14

In [15]:
# is_punct is True for punctuation tokens such as the final ".".
doc[-1].is_punct

True

### Customizing Spacy Tokenizer

In [16]:
# By default "gimme" is kept as a single token.
doc = nlp("gimme double cheese extra large healthy pizza")
list(doc)

[gimme, double, cheese, extra, large, healthy, pizza]

In [17]:
from spacy.symbols import ORTH

# Register a tokenizer exception so that "gimme" is split into two tokens.
# A special case can only split the text, not change it: that is why the
# pieces are "gim" + "me" rather than "give" + "me". Rewriting the word itself
# has to happen in a later step, not in the tokenizer.
gimme_pieces = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", gimme_pieces)

doc = nlp("gimme double cheese extra large healthy pizza")
list(doc)

[gim, me, double, cheese, extra, large, healthy, pizza]

### Issue with blank NLP pipeline
- A blank pipeline contains only the tokenizer and nothing else, so sentence segmentation (`doc.sents`) does not work with it.

In [3]:
import spacy

# A blank pipeline contains only the tokenizer, so nothing sets sentence
# boundaries and reading doc.sents raises ValueError (E030).
nlp = spacy.blank("en")

doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chaat of delhi")

# The error is the point of this cell: catch and display it so the notebook
# still survives "Restart Kernel -> Run All" instead of stopping here.
try:
    for sent in doc.sents:
        print(sent)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [4]:
# The error message suggests adding the 'sentencizer' component to the pipeline.
# First confirm that the blank pipeline currently has no components.
nlp.pipe_names

[]

In [5]:
# Add the 'sentencizer' component so that doc.sents becomes available.
# The guard keeps this cell re-runnable: adding a component whose name is
# already in the pipeline raises an error.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")
nlp.pipe_names

['sentencizer']

In [7]:
# With the sentencizer in the pipeline, doc.sents yields the sentences.
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chaat of delhi")
sentences = list(doc.sents)
for sentence in sentences:
    print(sentence)

Dr. Strange loves pav bhaji of mumbai.
Hulk loves chaat of delhi


In [None]:
# If you don't want to build the pipeline from scratch, load a pretrained one instead: `nlp = spacy.load("en_core_web_sm")`

# Spacy: Language Processing Pipeline

`nlp = spacy.blank("en")` does not provide a full pipeline. Now we can build a language processing pipeline.

- The pipeline comes after the `tokenizer`.
- The pipeline can include any number of language processing steps.

In [13]:
import spacy

# Start again from a blank English pipeline (tokenizer only).
nlp = spacy.blank("en")

# No components have been added yet, so this list is empty.
nlp.pipe_names

[]

In [14]:
# Pretrained pipelines are also available.
# For example, `python -m spacy download en_core_web_sm` downloads a pretrained
# English pipeline, which can then be loaded with `spacy.load('en_core_web_sm')`.
nlp = spacy.load('en_core_web_sm')
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [15]:
# nlp.pipeline pairs each component name with the component object itself.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fe4206cfac0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fe4206cd600>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fe4203e8120>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fe42070f280>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fe4203e2340>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fe420a13ca0>)]

In [17]:
# For every token, show its text, its part-of-speech tag and its lemma.
column_sep = " | "
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day")

for token in doc:
    print(token, column_sep, token.pos_, column_sep, token.lemma_)

Captain  |  PROPN  |  Captain
america  |  PROPN  |  america
ate  |  VERB  |  eat
100  |  NUM  |  100
$  |  NUM  |  $
of  |  ADP  |  of
samosa  |  PROPN  |  samosa
.  |  PUNCT  |  .
Then  |  ADV  |  then
he  |  PRON  |  he
said  |  VERB  |  say
I  |  PRON  |  I
can  |  AUX  |  can
do  |  VERB  |  do
this  |  PRON  |  this
all  |  DET  |  all
day  |  NOUN  |  day


In [20]:
# The 'ner' (Named Entity Recognition) step fills doc.ents with the entities
# it finds; spacy.explain() gives a description of each label.
doc = nlp("Tesla Inc is going to acquire Twitter for $45 billion")
column_sep = " | "
for entity in doc.ents:
    label = entity.label_
    print(entity.text, column_sep, label, column_sep, spacy.explain(label))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter  |  PRODUCT  |  Objects, vehicles, foods, etc. (not services)
$45 billion  |  MONEY  |  Monetary values, including unit


In [21]:
# Render the entities of `doc` with displaCy's "ent" style for a nicer view.
from spacy import displacy

displacy.render(doc, style="ent")

In [23]:
# A French version of the sentence through the pretrained French pipeline.
# Note that `nlp` now refers to the French pipeline.
nlp = spacy.load("fr_core_news_sm")
doc = nlp("Tesla Inc va racheter Twitter pour $40 milliards de dollars")
column_sep = " | "
for ent in doc.ents:
    label = ent.label_
    print(ent.text, column_sep, label, column_sep, spacy.explain(label))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter  |  MISC  |  Miscellaneous entities, e.g. events, nationalities, products or works of art


In [24]:
# Components from a pretrained English pipeline can also be added to an
# empty pipeline. Start with a blank one:
nlp = spacy.blank('en')
doc = nlp("Tesla Inc is going to acquire Twitter for $45 billion")

# This loop prints nothing because the blank pipeline has no component that
# identifies entities.
for ent in doc.ents:
    print(ent.text, " | ", ent.label_, " | ", spacy.explain(ent.label_))

In [27]:
# Load the pretrained pipeline to use it as a source of components.
source_nlp = spacy.load("en_core_web_sm")
nlp = spacy.blank('en')

# Add the 'ner' component taken from source_nlp to the blank pipeline.
nlp.add_pipe("ner", source=source_nlp)

doc = nlp("Tesla Inc is going to acquire Twitter for $45 billion")

In [28]:
# Show each entity found by the sourced 'ner' component with its label and
# the label's description.
column_sep = " | "
for ent in doc.ents:
    label = ent.label_
    print(ent.text, column_sep, label, column_sep, spacy.explain(label))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter  |  PRODUCT  |  Objects, vehicles, foods, etc. (not services)
$45 billion  |  MONEY  |  Monetary values, including unit


# Stemming and Lemmatization