In [1]:
import spacy

In [2]:
# `nlp` is the loaded spaCy language pipeline (the small English model).
nlp=spacy.load('en_core_web_sm')

In [3]:
# Calling nlp on a string runs the pipeline and returns a Doc made of tokens.
doc=nlp(u"Tesla is looking at  buying U.S. startup for $6 million")

In [7]:
# "$" and "6" come out as separate tokens. token.pos_ is the coarse
# part-of-speech tag and token.dep_ is the syntactic dependency label
# (e.g. nsubj = nominal subject, dobj = direct object).
for token in doc:
    print(token.text,token.pos_,token.dep_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [9]:
# Components that process the text when nlp is called on it:
# tagger, parser and ner (named-entity recognizer).
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x221adfc2448>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x221ae2af828>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x221ae2af8e8>)]

In [11]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [12]:
# 1. Tokenization: a contraction and a run of extra spaces, to see how they are split.
doc2=nlp(u"This isn't      Cool")

In [14]:
for token in doc2:
    print(token.text,token.pos_,token.dep_)
# "is" and "n't" are separate tokens, and the run of extra spaces becomes its own SPACE token.

This DET nsubj
is AUX ROOT
n't PART neg
      SPACE 
Cool ADJ acomp


In [18]:
# A Doc can be sliced by token index; [3:8] takes tokens 3 through 7.
doc2=nlp(u"Hey there, this is a long text.This isn't Cool")
new_doc=doc2[3:8]
print(new_doc)

this is a long text


In [21]:
print(type(new_doc)) # the slice is a Span taken from the larger document
print(type(doc2)) # the whole document is a Doc

<class 'spacy.tokens.span.Span'>
<class 'spacy.tokens.doc.Doc'>


In [22]:
# Sentence segmentation: doc4.sents yields one Span per sentence.
doc4=nlp(u"this is first sentence. this is second U.S. president. This is third sentence")
for sentence in doc4.sents:
    print(sentence.text)
# The periods inside "U.S." are not treated as sentence boundaries.

this is first sentence.
this is second U.S. president.
This is third sentence


In [25]:
# Tells whether a token starts a sentence; index 5 is the "this" that opens the second sentence.
doc4[5].is_sent_start

True

## Tokenisation
- Breaking the document into pieces (tokens)
- Suffix, prefix, infix = characters at the end, at the beginning, and in between, respectively
- There are also exceptions, i.e. special cases

In [2]:
nlp=spacy.load('en_core_web_sm')

In [36]:
# The apostrophe is escaped because the string is single-quoted;
# the double quotes are part of the text itself.
mystring='"We\'re moving to L.A.!"'
print(mystring)

"We're moving to L.A.!"


In [37]:
doc=nlp(mystring)

In [38]:
for token in doc:
    print(token.text)
# spaCy splits off the quotes, "'re" and "!" as separate tokens but keeps "L.A." together.

"
We
're
moving
to
L.A.
!
"


In [39]:
# Email addresses and links should stay in one piece; spaCy keeps them as single tokens.
# NOTE(review): '\\' is an escaped backslash, so the link text is "http:\this.com",
# not a real URL. "http://this.com" was probably intended; confirm before changing.
doc2=nlp(u'my work-email is rare! email abc@email.com. My link is http:\\this.com! My cab fare of 5km is $40.69')

In [40]:
for token in doc2:
    print(token.text)

my
work
-
email
is
rare
!
email
abc@email.com
.
My
link
is
http:\this.com
!
My
cab
fare
of
5
km
is
$
40.69


In [47]:
# Number of entries currently held in the pipeline's vocabulary.
len(nlp.vocab)

499

In [4]:
doc3=nlp(u"Apple is starting a new factory in HongKong for $6 million")

In [52]:
for entity in doc3.ents:
    print(entity,entity.label_,str(spacy.explain(entity.label_)))
    # doc3.ents holds the named entities that were recognized automatically;
    # spacy.explain() gives a human-readable description of each label.

Apple ORG Companies, agencies, institutions, etc.
HongKong GPE Countries, cities, states
$6 million MONEY Monetary values, including unit


### Token visualisation

In [9]:
import spacy
from spacy import displacy

In [10]:
doc3=nlp(u"Apple is starting a new factory in HongKong for $6 million")

In [12]:
# Draw the dependency parse inline in the notebook; 'distance' controls the spacing between words.
displacy.render(doc3,style='dep',jupyter=True,options={'distance':110})

In [13]:
displacy.render(doc3,style='ent',jupyter=True)
# style='ent' highlights the named entities in the text.

In [None]:
# Serve the visualization from a local web server instead of rendering it inline.
displacy.serve(doc3,style='dep')
# Then open http://127.0.0.1:5000/ in a browser.
# 5000 is the port shown in the "Serving on ..." message; it may differ on your machine.


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



## Stemming
- Stemming cuts a word down to its base form, e.g. "play" from "playing", "plays", "played", etc.
- spaCy does not provide stemming, so we use NLTK instead

In [1]:
import nltk
from nltk.stem.porter import PorterStemmer

In [2]:
pstemer=PorterStemmer()

In [3]:
# NOTE(review): 'runnning' is spelled with three n's; it looks like a typo for 'running'. Confirm.
words=['run','ran','runnning','runs','runner','fairly','easily']

In [6]:
for word in words:
    print(word+"---->",pstemer.stem(word))
# The Porter stemmer only strips suffixes by rule; it has no notion of part of speech.
# 'runner' is left unchanged, and 'fairly'/'easily' turn into the non-words 'fairli'/'easili'.

run----> run
ran----> ran
runnning----> runn
runs----> run
runner----> runner
fairly----> fairli
easily----> easili


In [7]:
#better way
from nltk.stem.snowball import SnowballStemmer

In [8]:
s_stemmer=SnowballStemmer(language='english')

In [10]:
for word in words:
    print(word+"---->",s_stemmer.stem(word))
# Snowball does better on 'fairly' (-> 'fair'), but 'easily' still becomes 'easili'.

run----> run
ran----> ran
runnning----> runn
runs----> run
runner----> runner
fairly----> fair
easily----> easili


### Lemmatisation
- Here we don't cut the word; we reduce it to its natural (dictionary) form, e.g. the lemma of 'mice' is 'mouse', of 'was' is 'be', and of 'meeting' is 'meet'
- It is better than stemming, which is why spaCy has lemmatisation and not stemming

In [11]:
import spacy

In [13]:
nlp=spacy.load('en_core_web_sm')

In [14]:
doc=nlp(u"i am a runner running in a race and i ran easily and runs fairly and run.")

In [17]:
# Columns: token text, part of speech, lemma as an integer hash (token.lemma),
# and lemma as readable text (token.lemma_).
for token in doc:
    print(f' {token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_:{10}}')

 i            PRON   5097672513440128799    i         
 am           AUX    10382539506755952630   be        
 a            DET    11901859001352538922   a         
 runner       NOUN   12640964157389618806   runner    
 running      VERB   12767647472892411841   run       
 in           ADP    3002984154512732771    in        
 a            DET    11901859001352538922   a         
 race         NOUN   8048469955494714898    race      
 and          CCONJ  2283656566040971221    and       
 i            PRON   5097672513440128799    i         
 ran          VERB   12767647472892411841   run       
 easily       ADV    8007658219579238015    easily    
 and          CCONJ  2283656566040971221    and       
 runs         VERB   12767647472892411841   run       
 fairly       ADV    17576046047103189829   fairly    


### Stopwords
- Common words don't give much information, so we remove them.

In [20]:
print(nlp.Defaults.stop_words)
# The default set of stop words that ships with the language model

{'yourselves', 'of', 'three', 'such', 'using', 'cannot', 'could', 'serious', 'any', 'others', 'ours', 'next', 'well', 'really', 'less', 'please', '‘s', 'whoever', '‘ll', 'eleven', 'above', 'sometimes', 'as', 'must', 'becoming', 'a', 'nevertheless', 'amongst', 'n’t', "'re", 'whereas', 'might', '’re', 'somehow', 'somewhere', 'before', 'their', 'no', 'eight', 'back', 'via', 'put', 'thence', 'during', 'may', 'see', 'nothing', 'meanwhile', 'on', "'s", 'though', 'off', 'together', 'thereby', 'he', 'very', 'perhaps', 'so', 'where', 'it', 'me', 'with', 'hereupon', 'whither', 'moreover', 'under', 'least', 'through', 'yet', 'once', 'him', 'made', 'wherever', 'latterly', 'beside', 'make', 'i', 'either', 'than', 'front', 'now', 'enough', 'beforehand', 'here', 'down', 'themselves', 'amount', 'seeming', 'when', 'wherein', 'are', 'within', 'whole', 'except', 'mostly', 'they', 'besides', "n't", 'hereafter', 'ten', 'hence', '’ve', 'thru', 'four', 'hundred', 'nowhere', 'other', 'were', 'because', 'after

In [21]:
print(len(nlp.Defaults.stop_words))

326


In [22]:
# Check whether a word is flagged as a stop word
nlp.vocab['is'].is_stop

True

In [23]:
# To add your own stop word, put it into the default stop-word set
nlp.Defaults.stop_words.add("btw")

In [25]:
nlp.vocab['btw'].is_stop=True

In [26]:
print(len(nlp.Defaults.stop_words))
#length is increased

327


In [27]:
# To remove a stop word, take it out of the default stop-word set
nlp.Defaults.stop_words.remove("btw")


In [28]:
nlp.vocab['btw'].is_stop=False

In [30]:
print(len(nlp.Defaults.stop_words))
print(nlp.vocab['btw'].is_stop)

326
False


### Vocabulary and matching
- Think of it as a more powerful version of regular expressions, where token attributes such as part of speech can be used to identify a pattern

In [31]:
from spacy.matcher import Matcher

In [56]:
matcher=Matcher(nlp.vocab)

In [57]:
# Token patterns that together cover these spellings (case-insensitively):
#   SolarPower   -> a single token whose lowercase form is "solarpower"
#   Solar-power  -> "solar", one punctuation token, "power"
#   Solar power  -> "solar" directly followed by "power"
pattern1=[{'LOWER':'solarpower'}]
pattern2=[{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}]
pattern3=[{'LOWER':'solar'},{'LOWER':'power'}]

In [58]:
# Register all three patterns under one name; None means no on-match callback.
# NOTE: this is the spaCy v2 call signature; spaCy v3 expects matcher.add(name, [pattern1, ...]).
matcher.add("SolarPower",None,pattern1,pattern2,pattern3)

In [59]:
doc=nlp(u"The Solar Power works very fine in solar-power and solarpower")

In [60]:
found_matches=matcher(doc)

In [61]:
print(found_matches)
# Each match is a tuple: (hash ID of the pattern name, start token index, end token index (exclusive))

[(8656102463236116519, 1, 3), (8656102463236116519, 7, 10), (8656102463236116519, 11, 12)]


In [62]:
# Show every match as "<pattern name> <matched text>".
for match_id,start,end in found_matches:
    string_name=nlp.vocab.strings[match_id]  # look the hash ID up to get the name given to matcher.add
    span=doc[start:end]  # the matched tokens
    print(string_name, span)

SolarPower Solar Power
SolarPower solar-power
SolarPower solarpower


In [63]:
pattern1=[{'LOWER':'solarpower'}]
pattern2=[{'LOWER':'solar'},{'IS_PUNCT':True,'OP':'*'},{'LOWER':'power'}]
# 'OP':'*' lets the punctuation token match zero or more times,
# so any number of hyphens (or other punctuation tokens) may sit between the two words.

In [64]:
# Start from a fresh Matcher so the earlier "SolarPower" rules are not carried over.
# pattern3 is reused from the earlier cell; with 'OP':'*' pattern2 also covers the
# zero-punctuation "solar power" case.
matcher=Matcher(nlp.vocab)
matcher.add("Solar_new",None,pattern1,pattern2,pattern3)

In [77]:
doc=nlp(u"The Solar Power works very fine in solar-power and solar---power solarpower")

In [78]:
# Run the matcher and show every match as "<pattern name> <matched text>".
found_matches=matcher(doc)
for match_id,start,end in found_matches:
    string_name=nlp.vocab.strings[match_id]  # hash ID -> pattern name
    span=doc[start:end]  # the matched tokens
    print(string_name, span)

Solar_new Solar Power
Solar_new solar-power
Solar_new solar---power
Solar_new solarpower


In [89]:
#Phrase Matcher
from spacy.matcher import PhraseMatcher

In [90]:
matcher=PhraseMatcher(nlp.vocab)

In [91]:
phrase_list=['vodoo economics','slide_side economics','tricle_down economics']

In [92]:
# Turn each phrase into a Doc for the PhraseMatcher. nlp.make_doc() runs only the
# tokenizer, which is all the matcher needs to compare token texts, so the
# tagger, parser and ner are not run on the patterns.
phrase_pattern=[nlp.make_doc(pattern) for pattern in phrase_list]
print(phrase_pattern)

[vodoo economics, slide_side economics, tricle_down economics]


In [93]:
doc=nlp(u"this is a new documnet vodoo economics. But here we find slide_side economics. so third one is tricle_down economics")

In [94]:
# Register every phrase Doc under one name; None means no on-match callback (spaCy v2 signature).
matcher.add("econ_matcher",None,*phrase_pattern)

In [95]:
# Run the phrase matcher and show every match as "<matcher name> <matched text>".
found_matches=matcher(doc)
for match_id,start,end in found_matches:
    string_name=nlp.vocab.strings[match_id]  # hash ID -> name given to matcher.add
    span=doc[start:end]  # the matched tokens
    print(string_name, span)

econ_matcher vodoo economics
econ_matcher slide_side economics
econ_matcher tricle_down economics
