In [1]:
import spacy

In [2]:
# Loading the model
nlp = spacy.load('en_core_web_sm')

In [5]:
# Applying model to our doc
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [9]:
for token in doc:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [10]:
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x177cb1bdd48>),
 ('parser', <spacy.pipeline.DependencyParser at 0x177cb1c2048>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x177cb1c25e8>)]

In [11]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [14]:
doc2 = nlp(u"Tesla isn't looking for the starups. ")

In [15]:
for token in doc2:
    print(token.text, token.pos_, token.dep_) 

Tesla PROPN nsubj
is VERB aux
n't ADV neg
looking VERB ROOT
for ADP prep
the DET det
starups NOUN pobj
. PUNCT punct


In [17]:
doc2[1].pos_

'VERB'

In [20]:
# Syntentic dependencies
doc2[0].dep_

'nsubj'

In [21]:
doc4 = nlp(u"This is the first sentence. This is the second sentence. This is another sentence. This is the last sentence")

In [23]:
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is the second sentence.
This is another sentence.
This is the last sentence


In [25]:
doc4[6].is_sent_start

True

In [33]:
## Tokenization
mystring = 'We\'re moving to L.A.!'

In [34]:
mystring

"We're moving to L.A.!"

In [35]:
doc = nlp(mystring)

In [36]:
for token in doc:
    print(token)

We
're
moving
to
L.A.
!


In [39]:
doc2 = nlp(u"We're here to help! Send email, support@oursite.com or visit our website at http://oursite.com! ")

In [40]:
for t in doc2:
    print(t)

We
're
here
to
help
!
Send
email
,
support@oursite.com
or
visit
our
website
at
http://oursite.com
!


In [41]:
doc3 = nlp(u"A 5km can ride in NYC cost $10.0")

In [42]:
for t in doc3:
    print(t)

A
5
km
can
ride
in
NYC
cost
$
10.0


In [43]:
doc4= nlp(u"Let's visit U.S. next year")

In [44]:
for t in doc4:
    print(t)

Let
's
visit
U.S.
next
year


In [45]:
doc4.vocab

<spacy.vocab.Vocab at 0x177c97d7b48>

In [50]:
doc8=nlp(u"Apple to build a Hong Kong facility for $6 Million")

In [51]:
for token in doc8:
    print(token.text,end=' | ')


Apple | to | build | a | Hong | Kong | facility | for | $ | 6 | Million | 

In [54]:
for entity in doc8.ents:
    print(entity)
    print(entity.label_)
    print(str(spacy.explain(entity.label_)))
    print('\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 Million
MONEY
Monetary values, including unit




In [55]:
doc9 = nlp(u"Autonomous cars shift insurance liability towards manufacturers!!")

In [56]:
for chunk in doc9.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


In [57]:
from spacy import displacy 

In [58]:
doc = nlp(u"Apple is going to build U.K. factory for $6 million")

In [61]:
displacy.render(doc,style='dep',jupyter=True,options={'distance':70})

In [62]:
doc = nlp(u"Over the last quater Apple sold nearly 20 thousands iPods for the profit of $2 million")

In [64]:
displacy.render(doc,style="ent",jupyter=True)

In [None]:
doc = nlp(u"This is the sentence")
displacy.serve(doc,style='dep')


[93m    Serving on port 5000...[0m
    Using the 'dep' visualizer



127.0.0.1 - - [21/Feb/2021 08:07:33] "GET / HTTP/1.1" 200 3058
127.0.0.1 - - [21/Feb/2021 08:07:33] "GET /favicon.ico HTTP/1.1" 200 3058


## Stemming

In [1]:
import nltk

In [2]:
from nltk.stem.porter import PorterStemmer

In [3]:
p_stemmer = PorterStemmer()

In [12]:
words = ['run','runner','ran','runs','easily','fairly','fairness']

In [13]:
for word in words:
    print(word +" ---> "+p_stemmer.stem(word))

run ---> run
runner ---> runner
ran ---> ran
runs ---> run
easily ---> easili
fairly ---> fairli
fairness ---> fair


In [14]:
from nltk.stem.snowball import SnowballStemmer

In [15]:
s_stemmer = SnowballStemmer(language='english')

In [16]:
for word in words:
    print(word +" ---> "+s_stemmer.stem(word))

run ---> run
runner ---> runner
ran ---> ran
runs ---> run
easily ---> easili
fairly ---> fair
fairness ---> fair


In [17]:
words = ['generous','generation','generously','generate']

In [18]:
for word in words:
    print(word +" ---> "+s_stemmer.stem(word))

generous ---> generous
generation ---> generat
generously ---> generous
generate ---> generat


## Lemmatization

In [19]:
import spacy

In [20]:
nlp = spacy.load('en_core_web_sm')

In [21]:
doc = nlp(u"I am a runner in a race because I love to run since I ran last")

In [22]:
for token in doc:
    print(token.text,'\t',token.pos_,'\t',token.lemma,'\t',token.lemma_)

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 VERB 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 ADP 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 ADP 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
last 	 ADJ 	 10321518907502812892 	 last


### Stop Words

In [24]:
print(nlp.Defaults.stop_words)

{'i', 'with', 'two', 'at', 'did', 'into', 'twelve', 'made', 'whether', 'since', 'moreover', 'hence', 'there', 'indeed', 'has', 'a', 'between', 'other', 'sometime', 'except', 'yourselves', 'otherwise', 'alone', 'also', 'am', 'next', 'same', 'whereafter', 'against', 'serious', 'who', 'and', 'mostly', 'perhaps', 'meanwhile', 'besides', 'others', 'whence', 'it', 'both', 'everyone', 'so', 'through', 'on', 'among', 'eleven', 'he', 'behind', 'how', 'within', 'latter', 'show', 'the', 'him', 'its', 'something', 'thru', 'either', 'because', 'some', 'top', 'several', 'until', 'would', 'becomes', 'most', 'his', 'done', 'name', 'regarding', 'in', 'which', 'further', 'her', 'become', 'less', 'such', 'anyway', 'nothing', 'six', 'anyone', 'about', 'hereupon', 'see', 'else', 'however', 'many', 'another', 'must', 'none', 'go', 'from', 'part', 'eight', 'those', 'was', 'fifty', 'various', 'beforehand', 'anyhow', 'do', 'least', 'via', 'enough', 'unless', 'down', 'throughout', 'what', 'me', 'whither', 'migh

In [25]:
len(nlp.Defaults.stop_words)

305

In [27]:
nlp.vocab['mystery'].is_stop

False

In [29]:
# Add stop words by default
nlp.Defaults.stop_words.add('btw')

In [30]:
nlp.vocab['btw'].is_stop = True

In [31]:
len(nlp.Defaults.stop_words)

306

In [32]:
# Remove the stop words
nlp.Defaults.stop_words.remove('beyond')

In [33]:
nlp.vocab['beyaond'].is_stop = False

In [34]:
nlp.vocab['beyaond'].is_stop 

False

## Vocabulary Matching

In [35]:
from spacy.matcher import Matcher

In [36]:
matcher = Matcher(nlp.vocab)

In [37]:
#SolarPower
pattern1 = [{'LOWER':'solarpower'}]
#Solar-power
pattern2 = [{'LOWER':'solar'},{"IS_PUNCT":True},{"LOWERL":'power'}]
#Solar power
pattern3 = [{'LOWER':'solar'},{"LOWER":'power'}]

In [38]:
matcher.add('SolarPower',None,pattern1,pattern2,pattern3)

In [39]:
doc = nlp(u"Solar power industry continues to grow as solarpower increases. Solar-power is amazing thing")

In [41]:
print(matcher(doc))

[(8656102463236116519, 0, 2), (8656102463236116519, 7, 8), (10088115319505432589, 10, 12)]


In [42]:
# Phrase matching
from spacy.matcher import PhraseMatcher

In [51]:
matcher = PhraseMatcher(nlp.vocab)

In [44]:
with open('C:\\Shubham\\Studies\\Data Science\\NLP\\udemy\\UPDATED_NLP_COURSE\\TextFiles\\reaganomics.txt') as f:
    doc3 = nlp(f.read())

In [48]:
phrase_list = ['voodo economics','supply-side economics','trickle-side economics','trickle-down economics']

In [49]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [50]:
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [52]:
matcher.add('EconMatcher',None,*phrase_patterns)

In [53]:
found_matches = matcher(doc3)

In [54]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2985, 2989)]

In [56]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc3[start-5:end+5]
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 policies are commonly associated with supply-side economics, referred to as trickle
3680293220734633682 EconMatcher 49 53 economics, referred to as trickle-down economics or voodoo economics by political
3680293220734633682 EconMatcher 673 677 attracted a following from the supply-side economics movement, which formed in
3680293220734633682 EconMatcher 2985 2989 became widely known as "trickle-down economics", due to the
