## Spacy Basics

In [2]:
import spacy

In [2]:
#Loading the model 
nlp = spacy.load('en_core_web_sm')

In [4]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [14]:
for token in doc:
    print(token.text, '---',  token.pos, '---', token.pos_, '---',token.dep_)

Tesla --- 95 --- PROPN --- nsubj
is --- 99 --- VERB --- aux
looking --- 99 --- VERB --- ROOT
at --- 84 --- ADP --- prep
buying --- 99 --- VERB --- pcomp
U.S. --- 95 --- PROPN --- compound
startup --- 91 --- NOUN --- dobj
for --- 84 --- ADP --- prep
$ --- 98 --- SYM --- quantmod
6 --- 92 --- NUM --- compound
million --- 92 --- NUM --- pobj


##### Pipelining 

In [15]:
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x11a146fd0>),
 ('parser', <spacy.pipeline.DependencyParser at 0x11a3388f0>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x11a338e90>)]

In [16]:
nlp.pipe_names

['tagger', 'parser', 'ner']

#### Examples 

In [17]:
doc2 = nlp(u"Tesla isn't looking into startup anymore.")

In [18]:
for token in doc2:
    print(token.text, "--- ", token.pos_, "---", token.dep_)

Tesla ---  PROPN --- nsubj
is ---  VERB --- aux
n't ---  ADV --- neg
looking ---  VERB --- ROOT
into ---  ADP --- prep
startup ---  NOUN --- pobj
anymore ---  ADV --- advmod
. ---  PUNCT --- punct


In [20]:
doc2[0].pos_

'PROPN'

In [21]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [22]:
life_quote = doc3[16:30]
print(life_quote)

"Life is what happens to us while we are making other plans"


In [23]:
type(life_quote)

spacy.tokens.span.Span

In [24]:
type(doc3)

spacy.tokens.doc.Doc

In [25]:
doc4 = nlp(u"This is the first sentence. This is another sentence. This is the last sentence")

In [26]:
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence


In [29]:
doc4[6].is_sent_start

True

In [30]:
doc4[6]

This

#### Storage

In [31]:
doc = nlp("I love coffee")
print(doc.vocab.strings["coffee"])  # 3197928453018144401
print(doc.vocab.strings[3197928453018144401])  # 'coffee'

3197928453018144401
coffee


In [32]:
for word in doc:
    lexeme = doc.vocab[word.text]
    print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
            lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)

I 4690420944186131903 X I I True False True en
love 3702023516439754181 xxxx l ove True False False en
coffee 3197928453018144401 xxxx c fee True False False en


### Tokenizatin

In [36]:
mystring = '"We\'re moving to L.A!"'
print(mystring)

"We're moving to L.A!"


In [37]:
doc = nlp(mystring)

In [38]:
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A
!
"


In [39]:
 doc2 = nlp(u"We're hear to help! Send snail-mail, emial support@sdjshdbj.com or visit us at https://www.sjhbsbsc.com")

In [40]:
for t in doc2:
    print(t)

We
're
hear
to
help
!
Send
snail
-
mail
,
emial
support@sdjshdbj.com
or
visit
us
at
https://www.sjhbsbsc.com


In [41]:
doc3 = nlp(u"A 5km NYC cab ride costs $10.30")

In [42]:
for t in doc3:
    print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [43]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year. ")

In [44]:
for t in doc4:
    print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [45]:
len(doc4)

11

In [46]:
doc4.vocab

<spacy.vocab.Vocab at 0x118132290>

In [47]:
len(doc4.vocab)

57852

In [48]:
doc5 = nlp(u"Apple to build a Hong Kong factory for $6 million")

In [49]:
for token in doc5:
    print(token.text, end=" | ")

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [54]:
for entity in doc5.ents:
    print(entity)
    print(entity.label_)
    print(str(spacy.explain(entity.label_)))
    print('\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [57]:
doc6 = nlp(u"Autonoumous cars shift insurance liability towards manufacturers.")

In [58]:
for chunk in doc6.noun_chunks:
    print(chunk)

Autonoumous cars
insurance liability
manufacturers


#### Visulization

In [59]:
from spacy import displacy

In [60]:
doc = nlp(u"Apple is going to build a U.K. factory for $6 million.")

In [63]:
displacy.render(doc, style='dep', jupyter=True, options={'distance':110})

In [65]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. ")

In [66]:
displacy.render(doc, style='ent', jupyter=True)

In [None]:
doc = nlp(u"This is a sentence.")
displacy.serve(doc, style='dep',port=5001)

### Stemming

##### Spacy doesn't provided the Stemming library, for this we need to proceed with nltk 

In [1]:
import nltk

In [2]:
from nltk.stem.porter import PorterStemmer

In [3]:
p_stemmer = PorterStemmer()

In [10]:
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly', 'fairness']

In [6]:
for word in words:
    print(word + '-----> '+p_stemmer.stem(word))

run-----> run
runner-----> runner
ran-----> ran
runs-----> run
easily-----> easili
fairly-----> fairli


In [7]:
# Using the snowball stemmer 
from nltk.stem.snowball import SnowballStemmer

In [8]:
s_stemmer = SnowballStemmer(language='english')

In [11]:
for word in words: 
    print(word + '-----> ' +s_stemmer.stem(word))

run-----> run
runner-----> runner
ran-----> ran
runs-----> run
easily-----> easili
fairly-----> fair
fairness-----> fair


In [12]:
words = ['generous','generation','generously','generate']

In [13]:
for word in words:
    print(word +'----> ' +s_stemmer.stem(word))

generous----> generous
generation----> generat
generously----> generous
generate----> generat


#### Lemmatization

In [1]:
import spacy

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today") 

In [7]:
for token in doc1:
    print(token.text, '\t',token.pos_, '\t', token.lemma, '\t', token.lemma_)

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 VERB 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 ADP 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 ADP 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


#### Stop words 


In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [13]:
print(len(nlp.Defaults.stop_words))

306


In [14]:
nlp.vocab['is'].is_stop

True

In [15]:
nlp.vocab['mystery'].is_stop

False

In [18]:
### Want to create the stopword. 
nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop = True

In [19]:
nlp.vocab['btw'].is_stop

True

In [20]:
### exclude from stop words 
nlp.Defaults.stop_words.remove('btw')
nlp.vocab['btw'].is_stop = False

In [21]:
nlp.vocab['btw'].is_stop

False

### Phrase Matching and Vocabulary 

In [22]:
import spacy 
nlp = spacy.load('en_core_web_sm')
from spacy.matcher import Matcher 

In [23]:
matcher = Matcher(nlp.vocab)

In [24]:
pattern1 = [{'LOWER':'solarpower'}]
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}]
pattern3 = [{'LOWER':'solar'}, {'LOWER':'power'}]

In [25]:
 matcher.add('SolarPower', None,pattern1,pattern3,pattern2)

In [26]:
doc = nlp(u"The Solar Power industry continues to grow a solarpower increases. Solar-power is amazing")

In [27]:
found_matches = matcher(doc)

In [30]:
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [31]:
for match_id, start, end in found_matches:
    start_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id, start_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [32]:
### remove particular pattern 
matcher.remove('SolarPower')

In [33]:
pattern1 = [{'LOWER':'solarpower'}]
pattern2 = [{'LOWER':'solar'},{'IS_PUNC':True, 'OP':'*'},{'LOWER':'power'}]

In [34]:
matcher.add('SolarPower',None,pattern1,pattern2)

In [38]:
doc1 = nlp(u"Soalr--power is solarpower")

In [39]:
found_matches = matcher(doc1)

In [40]:
print(found_matches)

[(8656102463236116519, 4, 5)]


In [41]:
from spacy.matcher import PhraseMatcher
matcher  = PhraseMatcher(nlp.vocab)

In [66]:
with open('./reaganomics.txt', encoding= 'unicode_escape') as file:
    doc3 = nlp(file.read())

In [59]:
phrase_list = ['voodoo. economics', 'supply-side economics', 'trickle-down economics']

In [60]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [61]:
phrase_patterns

[voodoo. economics, supply-side economics, trickle-down economics]

In [62]:
matcher.add('EconMatcher',None,*phrase_patterns)

In [63]:
found_matches = matcher(doc3)

In [65]:
for match_id, start, end in found_matches:
    string_id =nlp.vocab.strings[match_id]
    span = doc3[start:end]
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2984 2988 trickle-down economics
