In [2]:
# SPACY BASICS

In [1]:
# import the spacy module
import spacy
import en_core_web_sm

# load the pipeline from the en_core_web_sm model package imported above
nlp = en_core_web_sm.load()

# run the pipeline over the sentence; the result is kept in `doc` for the cells below
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [2]:
# show the raw text of every token in `doc`, one per line
print('\n'.join(token.text for token in doc))

Tesla
is
looking
at
buying
U.S.
startup
for
$
6
million


In [3]:
# token.pos is the integer ID of the part-of-speech tag (see token.pos_ for the name)
for tok in doc:
    print(f'{tok.text} {tok.pos}')

Tesla 96
is 87
looking 100
at 85
buying 100
U.S. 96
startup 92
for 85
$ 99
6 93
million 93


In [4]:
# token.pos_ is the readable name of the part-of-speech tag
for tok in doc:
    print(' '.join((tok.text, tok.pos_)))

Tesla PROPN
is AUX
looking VERB
at ADP
buying VERB
U.S. PROPN
startup NOUN
for ADP
$ SYM
6 NUM
million NUM


In [5]:
# for each token show its text, part-of-speech name and dependency label
for tok in doc:
    fields = (tok.text, tok.pos_, tok.dep_)
    print(*fields)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [6]:
# list the (name, component) pairs that make up the nlp pipeline
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x111f77b50>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x111e09be0>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x111e09b80>)]

In [7]:
# parse a second sentence, this one containing the contraction "isn't"
doc2 = nlp(u"Tesla isn't looking into startups anymore")

In [8]:
# same breakdown for doc2: text, part-of-speech name and dependency label per token
for tok in doc2:
    print(f'{tok.text} {tok.pos_} {tok.dep_}')

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod


In [9]:
# grab an individual token by index and view its part-of-speech ID (an integer)
doc[1].pos

87

In [10]:
# parse a two-sentence text; the adjacent string literals are joined into a single
# string, with the "\n" at the end of the first literal separating the sentences
doc3 = nlp(u'Sachin Tendulkar is considered one of the best batsman in World Cricket.\n'
          u'He will always be remembered for the wonderful innings he played at Sharjah.')

In [11]:
# slice tokens 5 through 8 out of doc3
# NOTE(review): despite the variable name, this slice is "of the best batsman",
# not the player's name — confirm the intended indices
sachin = doc3[5:9]

# print the text covered by the slice
print(sachin)

of the best batsman


In [12]:
# show each sentence that was detected in doc3 on its own line
print(*doc3.sents, sep='\n')

Sachin Tendulkar is considered one of the best batsman in World Cricket.

He will always be remembered for the wonderful innings he played at Sharjah.


In [13]:
# print every token of doc3 (this includes punctuation and the newline token)
for tok in doc3:
    print(str(tok))

Sachin
Tendulkar
is
considered
one
of
the
best
batsman
in
World
Cricket
.


He
will
always
be
remembered
for
the
wonderful
innings
he
played
at
Sharjah
.


In [14]:
# count the number of tokens in doc3
len(doc3)

28

In [15]:
# for every named entity in doc3: its text, its label, and spaCy's explanation
# of that label, followed by two blank lines
for ent in doc3.ents:
    label = ent.label_
    print(ent, label, str(spacy.explain(label)), '\n', sep='\n')

Sachin Tendulkar
PERSON
People, including fictional


World Cricket
ORG
Companies, agencies, institutions, etc.


Sharjah
ORG
Companies, agencies, institutions, etc.




In [16]:
# list the noun chunks found in doc3, one per line
print('\n'.join(str(chunk) for chunk in doc3.noun_chunks))

Sachin Tendulkar
the best batsman
World Cricket
He
the wonderful innings
he
Sharjah


In [17]:
# import displacy from spacy, to visualize the tokens
from spacy import displacy

# draw the dependency view of doc3 inline in the notebook
displacy.render(
    doc3,
    style='dep',
    jupyter=True,
    options={'distance': 100},
)

In [18]:
# render doc3 again, this time with style='ent' instead of the 'dep' view
displacy.render(doc3, style='ent', jupyter=True)

In [19]:
# STEMMING

In [20]:
# import PorterStemmer from nltk
from nltk.stem.porter import PorterStemmer

# create a Porter stemmer instance, reused by the stemming loop below
p_stemmer = PorterStemmer()


In [21]:
# sample words fed to the stemmers
words = 'run runner ran runs easily fairly'.split()

In [23]:
# show each word next to its Porter stem
for w in words:
    print(f'{w}--->{p_stemmer.stem(w)}')

run--->run
runner--->runner
ran--->ran
runs--->run
easily--->easili
fairly--->fairli


In [24]:
# import SnowballStemmer from nltk
from nltk.stem.snowball import SnowballStemmer

# build an English Snowball stemmer
s_stemmer = SnowballStemmer(language='english')

# show each word next to its Snowball stem
for w in words:
    print(f'{w}--->{s_stemmer.stem(w)}')

run--->run
runner--->runner
ran--->ran
runs--->run
easily--->easili
fairly--->fair


In [25]:
# LEMMATIZATION

In [27]:
# per token: text, part-of-speech name, lemma ID (integer) and lemma text,
# separated by " \t "
for tok in doc3:
    print(tok.text, tok.pos_, tok.lemma, tok.lemma_, sep=' \t ')

Sachin 	 PROPN 	 15988461644514894486 	 Sachin
Tendulkar 	 PROPN 	 629921096243289731 	 Tendulkar
is 	 AUX 	 10382539506755952630 	 be
considered 	 VERB 	 17360727451542821422 	 consider
one 	 NUM 	 17454115351911680600 	 one
of 	 ADP 	 886050111519832510 	 of
the 	 DET 	 7425985699627899538 	 the
best 	 ADJ 	 5711639017775284443 	 good
batsman 	 NOUN 	 10958173730388585239 	 batsman
in 	 ADP 	 3002984154512732771 	 in
World 	 PROPN 	 9796807554284407750 	 World
Cricket 	 PROPN 	 4435642289053210327 	 Cricket
. 	 PUNCT 	 12646065887601541794 	 .

 	 SPACE 	 962983613142996970 	 

He 	 PRON 	 561228191312463089 	 -PRON-
will 	 VERB 	 18307573501153647118 	 will
always 	 ADV 	 17471638809377599778 	 always
be 	 AUX 	 10382539506755952630 	 be
remembered 	 VERB 	 11373740451506967222 	 remember
for 	 ADP 	 16037325823156266367 	 for
the 	 DET 	 7425985699627899538 	 the
wonderful 	 ADJ 	 17835575765003257990 	 wonderful
innings 	 NOUN 	 13105248461348590805 	 innings
he 	 PRON 	 561228191

In [28]:
# STOP WORDS

In [30]:
# look up 'is' in the pipeline's vocabulary and check whether it is flagged as a stop word
nlp.vocab['is'].is_stop

True

In [31]:
# PATTERN MATCHING

In [32]:
# import Matcher from spacy
from spacy.matcher import Matcher

# create a Matcher that uses the vocabulary of the loaded pipeline
matcher = Matcher(nlp.vocab)

# three token patterns, one per spelling to be matched:
#   pattern1 -> 'SolarPower'   (a single token)
#   pattern2 -> 'Solar-power'  (two tokens with one punctuation token between them)
#   pattern3 -> 'Solar power'  (two adjacent tokens)
# the LOWER key is given lowercase values in every pattern
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True},
    {'LOWER': 'power'},
]
pattern3 = [
    {'LOWER': 'solar'},
    {'LOWER': 'power'},
]

In [33]:
# register the 3 patterns under the single name 'SolarPower' on the matcher instance
# NOTE(review): the positional None (on_match callback) is the spaCy v2 call form;
# spaCy v3 expects matcher.add('SolarPower', [pattern1, pattern2, pattern3]) —
# confirm the installed spaCy version before re-running
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [34]:
# example text containing all three spellings: 'Solar Power', 'solarpower' and 'Solar-power'
doc4 = nlp(u"The Solar Power industry continues to grow as solarpower increases. Solar-power is amazing")

In [35]:
# run the matcher over doc4; each hit comes back as a (match id, start, stop) tuple
found_matches = matcher(doc4)

print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [36]:
# the output shows tuples with information 1) match id 2) start 3) stop

In [37]:
# drop everything previously registered under the name 'SolarPower'
matcher.remove('SolarPower')

# create new patterns; 'OP': '*' lets the punctuation token occur zero or more times,
# so pattern2 covers 'Solar power', 'Solar-power' and 'Solar--power'
pattern1 = [{'LOWER':'solarpower'}]
pattern2 = [{'LOWER':'solar'}, {'IS_PUNCT':True, 'OP':'*'},{'LOWER':'power'}]

# add the 2 patterns under the name 'SolarPower' to matcher instance
# NOTE(review): positional None is the spaCy v2 call form; spaCy v3 expects
# matcher.add('SolarPower', [pattern1, pattern2]) — confirm the installed version
matcher.add('SolarPower', None, pattern1, pattern2)

# example string
doc5 = nlp(u"Solar--power is solarpower yay!")

# find the matches between the 2 patterns and the string
found_matches = matcher(doc5)

print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]
