In [3]:
import spacy

# Loading the Model

In [4]:
# Load the "en_core_web_sm" pipeline once; the cells below call `nlp(...)` on raw text.
nlp = spacy.load("en_core_web_sm")

### Building a Doc object

In [5]:
# Run the pipeline on a single sentence; `doc` is iterated token by token below.
doc = nlp("Tesla is looking to buy a U.S. startup for $6 million")

In [7]:
# One line per token: its text followed by its part-of-speech name.
for tok in doc:
    fields = (tok.text, tok.pos_)
    print(*fields)

Tesla PROPN
is AUX
looking VERB
to PART
buy VERB
a DET
U.S. PROPN
startup NOUN
for ADP
$ SYM
6 NUM
million NUM


### Pipeline Object

In [8]:
# List the pipeline as (name, component) pairs.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x272a710c6a8>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x272a710c948>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x272a6cc2908>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x272a6cc2c18>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x272a6e85308>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x272a6e81c88>)]

In [9]:
# Component names only, in the same order as `nlp.pipeline`.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

### Tokenization

|Attribute|Description|Value for `doc2[0]`|
|:------|:------:|:------|
|`.text`|The original word text<!-- .element: style="text-align:left;" -->|`Tesla`|
|`.lemma_`|The base form of the word|`tesla`|
|`.pos_`|The simple part-of-speech tag|`PROPN`/`proper noun`|
|`.tag_`|The detailed part-of-speech tag|`NNP`/`noun, proper singular`|
|`.shape_`|The word shape – capitalization, punctuation, digits|`Xxxxx`|
|`.is_alpha`|Does the token consist only of alphabetic characters?|`True`|
|`.is_stop`|Is the token part of a stop list, i.e. the most common words of the language?|`False`|

In [12]:
# Contraction example: the loop below shows "isn't" split into "is" + "n't".
doc2 = nlp("Tesla isn't looking at any startups ")

In [14]:
# Per token: text, numeric POS id, POS name, dependency label.
for tok in doc2:
    fields = (tok.text, tok.pos, tok.pos_, tok.dep_)
    print(*fields)

Tesla 96 PROPN nsubj
is 87 AUX aux
n't 94 PART neg
looking 100 VERB ROOT
at 85 ADP prep
any 90 DET det
startups 92 NOUN pobj


In [15]:
# Three sentences in one string, used below to demonstrate sentence
# segmentation via `doc3.sents`. The last sentence deliberately starts in
# lower case. ("isthe" was a typo; the recorded output shows "is the".)
doc3 = nlp("This is the first sentence. This is the second sentence. this is the last sentence")

In [18]:
# Iterate over the sentences detected in doc3 and print each one.
for sentence in doc3.sents:
    print(sentence)

This is the first sentence.
This is the second sentence.
this is the last sentence


In [19]:
# Longer passage used for the span-slicing and token-count cells below.
# Adjacent literals are concatenated by Python into a single string.
doc3 = nlp(
    'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    'the phrase "Life is what happens to us while we are making other plans" was written by '
    'cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.'
)

In [20]:
# Slicing a Doc by token index gives a Span (see the type check in the next cell).
# Tokens 16-29 cover the quoted phrase, including its opening and closing
# quotation marks, which are tokens of their own.
life_quote = doc3[16:30]
life_quote

"Life is what happens to us while we are making other plans"

In [21]:
# Show that the slice and the full document are different types.
for obj in (life_quote, doc3):
    print(type(obj))

<class 'spacy.tokens.span.Span'>
<class 'spacy.tokens.doc.Doc'>


### Tokenization info:

-  **Prefix**:	Character(s) at the beginning &#9656; `$ ( “ ¿`
-  **Suffix**:	Character(s) at the end &#9656; `km ) , . ! ”`
-  **Infix**:	Character(s) in between &#9656; `- -- / ...`
-  **Exception**: Special-case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied &#9656; `St. U.S.`


In [23]:
# Number of tokens in doc3 (punctuation such as quotation marks counts as tokens)
len(doc3)

50

### NER

In [24]:
# NOTE(review): "milion" is misspelled in the input; that may be why the entity
# loop below tags only "6" as MONEY — confirm before correcting the text.
doc5 = nlp("Apple to build a factory in Hong Kong for $ 6 milion")

In [27]:
# Print all tokens on one line, each followed by " | ".
for tok in doc5:
    print(tok, end=" | ")

Apple | to | build | a | factory | in | Hong | Kong | for | $ | 6 | milion | 

In [30]:
# For every named entity: the entity text and label, then the label's
# human-readable explanation, then blank lines as a separator.
for ent in doc5.ents:
    label = ent.label_
    print(ent, label)
    description = str(spacy.explain(label))
    print(description)
    print("\n")

Apple ORG
Companies, agencies, institutions, etc.


Hong Kong GPE
Countries, cities, states


6 MONEY
Monetary values, including unit




### Noun Chunks

A noun chunk is a noun together with the words that describe it.

In [35]:
# The leading space in the text is kept exactly as in the original input.
doc9 = nlp(" Autonomous cars shift insurance liability toward manufacturers")

In [36]:
# Print the text of each noun chunk found in doc9.
for noun_chunk in doc9.noun_chunks:
    print(noun_chunk.text)

Autonomous cars shift insurance liability
manufacturers


#### Visualizing tokens

In [37]:
from spacy import displacy

In [48]:
# Sentence rendered by the two displaCy cells that follow.
doc = nlp("Apple is going to buy Google for around $10 billion")

In [49]:
# Draw the dependency parse of `doc` inline in the notebook.
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={"distance": 100},
)

In [50]:
# Render the named-entity view of the same `doc` inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [59]:
# Rebind `doc` to a new sentence for the entity rendering in the next cell.
doc = nlp("Oranges are different to apples")

In [60]:
# Render the named-entity view for the current `doc` inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

## Stemming

In [63]:
# Re-inspect the pipeline components (same call as earlier in the notebook).
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x272a710c6a8>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x272a710c948>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x272a6cc2908>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x272a6cc2c18>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x272a6e85308>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x272a6e81c88>)]

In [64]:
import nltk

In [65]:
from nltk.stem.porter import PorterStemmer

In [66]:
# Sample vocabulary for comparing stemmers: forms of "run" plus two adverbs.
words = [
    "run",
    "ran",
    "runner",
    "runs",
    "running",
    "easily",
    "fairly",
]

In [67]:
# Porter stemmer instance; its `.stem(word)` method is applied to each word below.
p_stemmer = PorterStemmer()

In [68]:
# Show each word next to its Porter stem.
for original in words:
    stem = p_stemmer.stem(original)
    print(f"{original} ------> {stem}")

run ------> run
ran ------> ran
runner ------> runner
runs ------> run
running ------> run
easily ------> easili
fairly ------> fairli


In [69]:
from nltk.stem.snowball import SnowballStemmer

In [71]:
# Snowball stemmer configured for English; used in the loops below.
s_stemmer = SnowballStemmer("english")

In [72]:
# Show each word next to its Snowball stem.
for original in words:
    stem = s_stemmer.stem(original)
    print(f"{original} -------> {stem}")

run -------> run
ran -------> ran
runner -------> runner
runs -------> run
running -------> run
easily -------> easili
fairly -------> fair


In [79]:
# Second sample: look-alike words sharing the "gen" prefix, to see how far
# the stemmer collapses them.
words = [
    "generate",
    "generous",
    "generation",
    "generosity",
    "generic",
    "genome",
    "gene",
    "genius",
]

In [80]:
# Show each word next to its Snowball stem.
for original in words:
    stem = s_stemmer.stem(original)
    print(f"{original} -------> {stem}")

generate -------> generat
generous -------> generous
generation -------> generat
generosity -------> generos
generic -------> generic
genome -------> genom
gene -------> gene
genius -------> genius


### Lemmatization

In [83]:
# Sentence containing several forms of "run" (ran, running, run) to compare lemmas.
doc11 = nlp("I ran a race today because I love running and saw a mouse run away from a cat")

In [84]:
# Per token: text, POS name, numeric lemma id, lemma string, with tabs between columns.
for tok in doc11:
    fields = [tok.text, "\t", tok.pos_, "\t", tok.lemma, "\t", tok.lemma_]
    print(*fields)

I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
today 	 NOUN 	 11042482332948150395 	 today
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
running 	 VERB 	 12767647472892411841 	 run
and 	 CCONJ 	 2283656566040971221 	 and
saw 	 VERB 	 11925638236994514241 	 see
a 	 DET 	 11901859001352538922 	 a
mouse 	 NOUN 	 1384165645700560590 	 mouse
run 	 VERB 	 12767647472892411841 	 run
away 	 ADV 	 15726112966383307120 	 away
from 	 ADP 	 7831658034963690409 	 from
a 	 DET 	 11901859001352538922 	 a
cat 	 NOUN 	 5439657043933447811 	 cat


In [117]:
def show_lemma(text):
    """Print one padded row per token: text, pos, lemma and lemma_.

    ``text`` is any iterable of token-like objects exposing the attributes
    ``text``, ``pos``, ``lemma`` and ``lemma_``. The four columns are padded
    to widths 10, 10, 25 and 10 and separated by single spaces. The function
    only prints; it returns nothing.
    """
    row_template = "{:10} {:<10} {:<25} {:<10}"
    for tok in text:
        print(row_template.format(tok.text, tok.pos, tok.lemma, tok.lemma_))

In [118]:
# Print the aligned text / pos / lemma / lemma_ table for doc11.
show_lemma(doc11)

I          95         4690420944186131903       I         
ran        100        12767647472892411841      run       
a          90         11901859001352538922      a         
race       92         8048469955494714898       race      
today      92         11042482332948150395      today     
because    98         16950148841647037698      because   
I          95         4690420944186131903       I         
love       100        3702023516439754181       love      
running    100        12767647472892411841      run       
and        89         2283656566040971221       and       
saw        100        11925638236994514241      see       
a          90         11901859001352538922      a         
mouse      92         1384165645700560590       mouse     
run        100        12767647472892411841      run       
away       86         15726112966383307120      away      
from       85         7831658034963690409       from      
a          90         11901859001352538922      a       

### Stop Words

In [120]:
# Dump the default stop-word collection. It prints as a set, so the order of
# the words is not meaningful and can differ between runs.
print(nlp.Defaults.stop_words)

{'our', '‘re', 'five', 'say', 'most', 'also', 'should', 'by', 'such', 'mine', 'cannot', 'seeming', 'often', 'yourselves', 'can', 're', 'few', '’ll', 'alone', 'yet', 'n’t', 'of', 'being', 'does', 'an', 'empty', 'latterly', 'sometimes', 'whereby', 'forty', 'however', 'less', 'wherever', 'get', 'nine', 'name', 'sometime', 'all', 'least', 'whatever', 'on', 'somewhere', 'during', 'something', 'serious', 'much', "'ve", 'ever', 'give', 'throughout', 'along', 'yourself', 'yours', 'whom', 'how', 'n‘t', "'ll", 'via', 'they', 'with', 'beyond', 'he', 'only', '‘ll', 'still', 'hereafter', 'seemed', 'side', 'over', 'while', 'would', 'us', 'whereas', 'could', 'therein', 'a', 'after', 'do', 'ours', 'thereafter', 'around', 'nowhere', 'nobody', 'using', 'why', 'beside', 'eight', 'done', 'since', 'was', 'herein', 'namely', 'perhaps', 'had', 'front', 'then', 'them', 'might', 'she', '’re', 'doing', 'about', 'thereupon', 'one', 'hereupon', 'third', "'m", 'unless', 'and', 'no', 'to', 'him', 'someone', 'used',

In [122]:
# Look up a single word in the vocabulary and check its stop-word flag.
nlp.vocab['beyond'].is_stop

True

#### Adding stop words

In [123]:
# Step 1 of 2: register "btw" in the default stop-word collection.
nlp.Defaults.stop_words.add("btw")

In [124]:
# Step 2 of 2: set the flag on the vocabulary entry as well, so that the
# `is_stop` lookup in the next cell reports True.
nlp.vocab['btw'].is_stop = True

In [125]:
# Verify that "btw" is now treated as a stop word.
nlp.vocab['btw'].is_stop

True

#### Removing stop words

In [126]:
# Drop "third" from the default stop-word collection.
# `discard` is used instead of `remove` so the cell stays re-runnable:
# `remove` raises KeyError once the word is already gone, which breaks
# running this cell a second time in the same kernel.
nlp.Defaults.stop_words.discard("third")

In [127]:
# Clear the flag on the vocabulary entry too, so that the `is_stop` lookup
# in the next cell reports False.
nlp.vocab['third'].is_stop = False

In [128]:
# Verify that "third" is no longer treated as a stop word.
nlp.vocab['third'].is_stop

False