In [17]:
import spacy
# loading the model and calling it nlp
nlp = spacy.load('en_core_web_sm')
#could use en_core_web_lg

In [3]:
#Here we are creating a document library
# 'U' is for a unicode string
doc = nlp(u'Tesla is looking at buying U.S. startup for $6million')

In [7]:
for token in doc:
    print(token.text,token.pos_,token.dep_)
#Spacy here knows a lot of information from the beginning
#dep stands for syntatic dependency

Tesla PROPN nsubj
is VERB aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM nmod
6million NUM pobj


In [9]:
#Now talking about the pipeline object
# this is a processing pipeline that tags, parsers, and name entity recognizer
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x115f3a190>),
 ('parser', <spacy.pipeline.DependencyParser at 0x1163ac1d0>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x1163ac770>)]

In [10]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [12]:
# First lets talk about tokenization
doc2 = nlp(u"Tesla isn't looking into startups anymore.")
for token in doc2:
    print(token.text,token.pos_,token.dep_)

Tesla PROPN nsubj
is VERB aux
n't ADV neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [13]:
doc2[0]

Tesla

In [14]:
doc2[0].pos_

'PROPN'

In [15]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [16]:
life_quote = doc3[16:30]

In [17]:
print(life_quote)

"Life is what happens to us while we are making other plans"


In [18]:
type(life_quote)

spacy.tokens.span.Span

In [20]:
type(doc3)

spacy.tokens.doc.Doc

In [21]:
doc4 = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [22]:
type(doc4)

spacy.tokens.doc.Doc

In [24]:
# Spacy automatically understands and separates out sentences. These can be referenced with the .sents object
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [26]:
doc4[6].is_sent_start

True

In [29]:
doc4[7].is_sent_start
# won't return anything. Returns none

Tokenization Part 1

In [30]:
mystring = '"We\'re moving to L.A."'

In [31]:
print(mystring)

"We're moving to L.A."


In [32]:
doc = nlp(mystring)

In [33]:
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
"


In [34]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")

In [35]:
for t in doc2:
    print(t)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [36]:
doc3 = nlp(u"A 5km NYC cab ride costs $10.30")

In [37]:
for t in doc3:
    print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [38]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

In [39]:
for t in doc4:
    print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [40]:
len(doc4) #this is how many tokens were created

11

In [42]:
# Can count vocab entry
doc4.vocab
print(len(doc4.vocab))

57852


In [43]:
doc5 = nlp(u"It is better to give than to receive.")

In [44]:
doc5[0]

It

In [46]:
doc5[2:5]
# Tokens cannot be reassigned. Once a doc is created it will be fixed
doc5[0]="test"

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [54]:
doc8 = nlp(u"Apple to build a Hong Kong factory for $6 million")

In [55]:
for token in doc8:
    print(token.text, end='|')

Apple|to|build|a|Hong|Kong|factory|for|$|6|million|

In [56]:
for entity in doc8.ents:
    print(entity)
    print(entity.label_)
    print(str(spacy.explain(entity.label_)))
    print('\n')
# Spacy is able to figure out "named entities" or that there are more context for each of the particular words

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [57]:
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

In [58]:
for chunk in doc9.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


# This section goes over displacy

In [59]:
# Reference: https://spacy.io/usage/visualizers
from spacy import displacy

In [60]:
doc = nlp(u"Apple is going to build a U.K. factory for $6 million")

In [67]:
displacy.render(doc, style='dep', jupyter=True, options={'distance':110})

In [63]:
doc = nlp(u"Over the last quarter Apple sold nearlly 20 thousand iPods for a profit of $6 million.")

In [66]:
displacy.render(doc, style='ent', jupyter=True)

In [68]:
doc = nlp(u"This is a sentence.")

In [None]:
displacy.serve(doc,style='dep')
# can view by going to: http://localhost:5000/

# Stemming

In [1]:
import nltk
from nltk.stem.porter import PorterStemmer

In [2]:
p_stemmer = PorterStemmer()

In [9]:
words = ['run','runner','ran','runs','easily','fairly','fairness']

In [10]:
for word in words:
    print(word + '---->' + p_stemmer.stem(word))

run---->run
runner---->runner
ran---->ran
runs---->run
easily---->easili
fairly---->fairli
fairness---->fair


In [12]:
# Snowball stemmer
from nltk.stem.snowball import SnowballStemmer
s_stemmer = SnowballStemmer(language = 'english')

In [13]:
for word in words:
    print(word + '---->' + s_stemmer.stem(word))

run---->run
runner---->runner
ran---->ran
runs---->run
easily---->easili
fairly---->fair
fairness---->fair


In [14]:
words = ['generous','generation','generously','generate']

In [15]:
for word in words:
    print(word + '---->' + s_stemmer.stem(word))

generous---->generous
generation---->generat
generously---->generous
generate---->generat


# Working with Lemmatization

In [18]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

In [19]:
for token in doc1:
    print(token.text,'\t',token.pos_,'\t',token.lemma,'\t',token.lemma_)

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 VERB 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 ADP 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 ADP 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [20]:
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [21]:
doc2 = nlp(u"I saw ten mice today!")

In [22]:
show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
ten          NUM    7970704286052693043    ten
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


# Stop Words

In [23]:
print(nlp.Defaults.stop_words)

{'fifty', 'above', 'whereas', 'becomes', 'forty', 'beyond', 'by', 'must', 'whom', 'nothing', 'am', 'may', 'anyhow', 'either', 'further', 'unless', 'none', 'out', 'throughout', 'after', 'elsewhere', 'hereupon', 'nevertheless', 'amount', 'somewhere', 'ten', 'make', 'thereby', 'therefore', 'will', 'empty', 'yourself', 'move', 'amongst', 'still', 'quite', 'such', 'been', 'next', 'at', 'could', 'my', 'ever', 'herself', 'six', 'becoming', 'us', 'whereby', 'as', 'afterwards', 'since', 'doing', 'due', 'only', 'once', 'get', 'someone', 'or', 'there', 'you', 'himself', 'twelve', 'most', 'although', 'more', 'also', 'how', 'against', 'very', 'part', 'regarding', 'first', 'third', 'whence', 'across', 'off', 'much', 'have', 'mine', 'latter', 'again', 'less', 'around', 'has', 'many', 'on', 'eleven', 'one', 'now', 'they', 'every', 'he', 'keep', 'please', 'beside', 'no', 'seems', 'hundred', 'a', 'we', 'thus', 'can', 'had', 'within', 'thereafter', 'down', 'from', 'her', 'somehow', 'these', 'me', 'howeve

In [24]:
len(nlp.Defaults.stop_words)

305

In [28]:
nlp.vocab['mystery'].is_stop

False

In [30]:
# You can add stop words
''' As an example lets say you are working with text messages. 
If you see a lot of "btw" or "lol" you can add these into the
stop words set.
'''
nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop = True

In [31]:
len(nlp.Defaults.stop_words)
# This is now 306 and shows are new word is added

306

In [32]:
nlp.Defaults.stop_words.remove('beyond')
nlp.vocab['beyond'].is_stop = False

In [33]:
nlp.vocab['beyond'].is_stop

False

# Phrase Matching and Vocabulary - Part One

In [34]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [40]:
# Need to create patterns we want to match on.
# Need to provide a set of dictionaries in a list for spacey
# We want to detect solarpower in the following formats (pattern):
# SolarPower (pattern1)
# Solar-power (pattern2)
# Solar power (pattern3)
pattern1 = [{'LOWER': 'solarpower'}] # this pattern is if you transform it 
# all to lower case does it match the pattern
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}] 
''' this pattern is saying:
is this lowercase solar then some punctuation then lowercase power'''
pattern3 = [{'LOWER': 'solar'}, {'LOWER': 'power'}] 
'''Now this pattern 
is searching for solar and then power right after when converted to lowercase'''


'Now this pattern \nis searching for solar and then power right after when converted to lowercase'

In [60]:
# 2nd parameter is a callback parameter. Can do shift tab to see the parameters
matcher.add('SolarPower',None,pattern1,pattern2,pattern3)

In [69]:
print(len(matcher))

1


In [75]:
doc = ''
doc = nlp(u"The Solar Power industry continues to grow as solarpower increases. Solar-power is amazing.")

In [76]:
found_matches = []
found_matches = matcher(doc)
print(found_matches)

[]


In [77]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

In [78]:
# How to remove a pattern:
matcher.remove('SolarPower')

KeyError: 8656102463236116519

In [79]:
# pattern1 can grab solarpower or SolarPower
pattern1 = [{'LOWER': 'solarpower'}]
# pattern2 can get solar (then any number of punctuation) and then power
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'OP': '*'}, {'LOWER': 'power'}]
# The OP with the * will allow the pattern to match 0 or more times! 

In [80]:
matcher.add('SolarPower',None,pattern1,pattern2)

In [81]:
doc2 = nlp(u"Solar--power is solarpower yayaya!")

In [82]:
found_matches = matcher(doc2)
print(found_matches)

[(4294967296, 0, 2), (8656102463236116519, 4, 5)]


# Phrase Matching and Vocabulary - Part Two

In [84]:
from spacy.matcher import PhraseMatcher

In [85]:
matcher = PhraseMatcher(nlp.vocab)

In [91]:
with open('../TextFiles/reaganomics.txt', encoding='unicode_escape') as f:
    doc3 = nlp(f.read())

In [92]:
phrase_list = ['voodoo economics','supply-side economics','trickle-down economics', 'free-market economics']

In [94]:
phrase_patterns = [nlp(text) for text in phrase_list]
print(phrase_patterns)

[voodoo economics, supply-side economics, trickle-down economics, free-market economics]


In [95]:
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [96]:
matcher.add('EconMatcher',None,*phrase_patterns)

In [97]:
found_matches = matcher(doc3)

In [98]:
print(found_matches)

[(3680293220734633682, 41, 45), (3680293220734633682, 49, 53), (3680293220734633682, 54, 56), (3680293220734633682, 61, 65), (3680293220734633682, 673, 677), (3680293220734633682, 2984, 2988)]


In [100]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc3[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2984 2988 trickle-down economics


In [101]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc3[start-5:end+5]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 policies are commonly associated with supply-side economics, referred to as trickle
3680293220734633682 EconMatcher 49 53 economics, referred to as trickle-down economics or voodoo economics by political
3680293220734633682 EconMatcher 54 56 trickle-down economics or voodoo economics by political opponents, and
3680293220734633682 EconMatcher 61 65 by political opponents, and free-market economics by political advocates.


3680293220734633682 EconMatcher 673 677 attracted a following from the supply-side economics movement, which formed in
3680293220734633682 EconMatcher 2984 2988 became widely known as "trickle-down economics", due to the
