In [47]:
import spacy
nlp = spacy.load('en_core_web_sm') # load the 'en_core_web_sm' model and bind the result to the name `nlp`

In [None]:
# Lecture 16 Spacy Basics

In [13]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [20]:
# `pos` (no trailing underscore) prints as an integer ID for each token.
for token in doc:
    print(token.text,token.pos)

Tesla 96
is 87
looking 100
at 85
buying 100
U.S. 96
startup 92
for 85
$ 99
6 93
million 93


In [22]:
# `pos_` (trailing underscore) prints the same tag as a readable string.
for token in doc:
    print(token.text,token.pos_)

Tesla PROPN
is AUX
looking VERB
at ADP
buying VERB
U.S. PROPN
startup NOUN
for ADP
$ SYM
6 NUM
million NUM


In [23]:
# Per token: its text, its part-of-speech tag and its dependency label (`dep_`).
for token in doc:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [24]:
nlp.pipeline 

[('tagger', <spacy.pipeline.pipes.Tagger at 0x2e72a486908>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x2e72a46eb28>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x2e72a48b108>)]

In [25]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [28]:
# Tokenization: splitting the text into tokens

doc2= nlp(u"Tesla    isn't looking into startups anymore.")

In [29]:
for token in doc2:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
    SPACE 
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [30]:
doc2[0]

Tesla

In [31]:
doc2[0].pos_ #proper noun

'PROPN'

In [32]:
doc2[0].dep_

'nsubj'

In [33]:
doc3 = nlp(u'You have been logged out of ACORN due to inactivity. Log back in with the same account, or close this browser completely and restart it to log in with a different account.')

In [37]:
quote = doc3[2:12]

In [38]:
quote

been logged out of ACORN due to inactivity. Log

In [39]:
type(quote) #a span of a larger document

spacy.tokens.span.Span

In [40]:
type(doc3)

spacy.tokens.doc.Doc

In [42]:
doc4 = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [43]:
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [44]:
doc4[6].is_sent_start

True

In [46]:
doc4[8].is_sent_start # returns None (so nothing is displayed) because token 8 is not the start of a sentence

In [None]:
#Lecture17 Tokenization

In [None]:
# Tokenizer order: split on whitespace -> prefix -> exception -> suffix -> exception -> done

In [None]:
# tokens are the basic building blocks of a doc object. 
# Prefix: Characters at the beginning 
# Suffix: Characters at the end
# Infix: Characters in between
# Exception: Special-case rule to split a string into several tokens, or to prevent a token from being split, when punctuation rules are applied.

In [49]:
mystring = '"We\'re moving to L.A.!"'

In [50]:
print(mystring)

"We're moving to L.A.!"


In [51]:
doc = nlp(mystring)

In [52]:
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
!
"


In [57]:
mail = 'cassielchiu@outlook.com is my email! www.cassiel.ca is the website.'

In [58]:
doc2 = nlp(mail)

In [59]:
for token in doc2:
    print(token.text)

cassielchiu@outlook.com
is
my
email
!
www.cassiel.ca
is
the
website
.


In [60]:
doc3 = nlp(u'A 5km NYC cab ride costs $10.30.')

In [61]:
for t in doc3:
    print(t.text)

A
5
km
NYC
cab
ride
costs
$
10.30
.


In [63]:
doc4 = nlp(u"Let's visit St.Louis in the U.S. next year.")

In [66]:
for t in doc4:
    print(t.text)

Let
's
visit
St
.
Louis
in
the
U.S.
next
year
.


In [67]:
len(doc4)

12

In [68]:
len(doc4.vocab)

505

In [81]:
doc5 = nlp(u"It is better to give than receive")

In [82]:
doc5[0]

It

In [83]:
doc5[2:6]

better to give than

In [84]:
# Doc objects do not support item assignment: a token cannot be replaced in place.
# The TypeError is the point of this cell, so it is caught and printed; that way
# "Restart & Run All" does not stop here.
try:
    doc5[0] = 'test'
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [90]:
doc8 = nlp(u'Apple to build a Hong Kong faactory for $6 million')

In [88]:
for token in doc8:
    print(token.text, end=' | ')

Apple | to | build | a | Hong | Kong | faactory | for | $ | 6 | million | 

In [92]:
# For every named entity: its text, its label and the label's explanation,
# each on its own line, followed by two blank lines.
for entity in doc8.ents:
    print(entity, entity.label_, str(spacy.explain(entity.label_)), '\n', sep='\n')

Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [93]:
doc9 = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')

In [94]:
# Print every noun chunk of the sentence, one per line.
for chunk in doc9.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


In [None]:
# Tokenization Part 2 (spaCy Built-in Visualizer)

In [95]:
from spacy import displacy

In [96]:
doc = nlp(u'Apple is going to build a UK factory for $6 million')

In [102]:
# Draw the dependency parse inline in the notebook.
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={'distance': 80},
)

In [104]:
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.")

In [106]:
# Highlight the named entities inline in the notebook. No `options` are passed:
# 'distance' is a setting of the 'dep' visualizer and has no effect on 'ent'.
displacy.render(doc, style='ent', jupyter=True)

In [None]:
doc = nlp(u"This is a sentence")
#displacy.serve(doc,style='dep')

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [26/Apr/2020 20:59:57] "GET / HTTP/1.1" 200 3393
127.0.0.1 - - [26/Apr/2020 20:59:58] "GET /favicon.ico HTTP/1.1" 200 3393
127.0.0.1 - - [26/Apr/2020 20:59:58] "GET / HTTP/1.1" 200 3393


In [None]:
#Lecture 19 Stemming

In [None]:
# spaCy does not include a stemmer, only lemmatization,
# so we do the stemming with NLTK

In [None]:
# Porter's algorithm:
# goes from a word to its stem
# Snowball is the name of another stemmer (used further below)

In [1]:
import nltk
from nltk.stem.porter import PorterStemmer

In [5]:
p_stemmer = PorterStemmer()

In [3]:
words =['run','runner','ran','runs','easily','fairly']

In [6]:
# Show each word next to its Porter stem.
for word in words:
    print(f'{word}----->{p_stemmer.stem(word)}')

run----->run
runner----->runner
ran----->ran
runs----->run
easily----->easili
fairly----->fairli


In [7]:
#snowball 
from nltk.stem.snowball import SnowballStemmer

In [8]:
s_stemmer = SnowballStemmer(language='english')

In [9]:
# Show each word next to its Snowball stem.
for word in words:
    print(f'{word}------>{s_stemmer.stem(word)}')

run------>run
runner------>runner
ran------>ran
runs------>run
easily------>easili
fairly------>fair


In [10]:
words= ['generous','generaation','generously','generate']

In [11]:
# Same Snowball comparison for the second word list.
for word in words:
    print(f'{word}------>{s_stemmer.stem(word)}')

generous------>generous
generaation------>generaat
generously------>generous
generate------>generat


In [None]:
#Lecture 20 Lemmatization

In [None]:
# Lemmatization looks at the full context of a word (see 'meeting' as a verb and as a noun below).

In [12]:
import spacy

In [22]:
nlp = spacy.load('en_core_web_sm')

In [14]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today.")

In [17]:
# Columns: token text, part-of-speech tag, `lemma` (an integer ID) and `lemma_` (the lemma as a string).
for token in doc1:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today
. 	 PUNCT 	 12646065887601541794 	 .


In [19]:
def show_lemmas(text):
    """Print one aligned row per token in ``text``.

    Each row holds the token's text (padded to 12 characters), its ``pos_``
    tag (padded to 6), its integer ``lemma`` (left-aligned in 22 characters)
    and finally its ``lemma_`` string.

    Args:
        text: An iterable of token-like objects exposing ``text``, ``pos_``,
            ``lemma`` and ``lemma_``; in this notebook, a spaCy ``Doc``.
    """
    row_template = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_template.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))

In [20]:
doc2 = nlp(u"I saw eighteen mice today!")

show_lemmas(doc2)

I            PRON   561228191312463089     -PRON-
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


In [21]:
doc3 = nlp(u"I am meeting him tomorrow at the meeting.")

show_lemmas(doc3)

I            PRON   561228191312463089     -PRON-
am           AUX    10382539506755952630   be
meeting      VERB   6880656908171229526    meet
him          PRON   561228191312463089     -PRON-
tomorrow     NOUN   3573583789758258062    tomorrow
at           ADP    11667289587015813222   at
the          DET    7425985699627899538    the
meeting      NOUN   14798207169164081740   meeting
.            PUNCT  12646065887601541794   .


In [None]:
#Lecture21 stop words

In [23]:
print(nlp.Defaults.stop_words)

{'forty', 'ever', 'whereas', 'whereby', 'just', 'noone', 'few', '‘re', 'which', 'made', 'mine', 'only', 'hereby', 'indeed', 'keep', 'thence', 'least', 'could', 'own', 'therefore', 'whenever', 'move', 'moreover', 'three', 'herein', 'here', 'anyway', 'without', 'very', 'about', 'herself', 'on', 'thru', 'hereafter', 'wherein', 'five', 'becoming', 'yet', 'top', 'do', 'has', 'take', 'using', 'sometime', 'since', 'be', "'d", '’ve', 'empty', 'before', 'but', 'out', '’d', 're', 'amongst', 'did', 'give', 'back', 'nothing', 'were', 'those', 'whole', 'hers', 'or', 'rather', 'get', 'under', 'mostly', 'anyone', 'eleven', 'such', 'sometimes', 'though', 'than', '’re', 'all', 'afterwards', 'fifty', 'up', 'namely', 'cannot', 'thereafter', "'s", 'throughout', 'same', 'former', '‘ve', 'upon', 'another', 'everything', 'full', 'was', 'she', 'already', 'always', 'unless', 'nobody', 'somewhere', "'m", 'behind', 'name', 'at', 'i', 'us', 'ca', 'last', 'down', 'hence', 'through', 'between', 'became', 'their', '

In [24]:
len(nlp.Defaults.stop_words)


326

In [25]:
nlp.vocab['is'].is_stop

True

In [26]:
nlp.vocab['mystery'].is_stop

False

In [28]:
# Register 'btw' as a stop word. Updating the defaults set alone is not
# guaranteed to reach a lexeme that already exists in the vocab, so the flag
# is also set on the lexeme itself (the 'beyond' cells below do the same).
nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop = True

In [29]:
nlp.vocab['btw'].is_stop

True

In [30]:
len(nlp.Defaults.stop_words)


327

In [31]:
# discard() rather than remove(): re-running this cell must not raise a
# KeyError once 'beyond' is already gone from the set.
nlp.Defaults.stop_words.discard('beyond')

In [32]:
nlp.vocab['beyond'].is_stop = False

In [33]:
nlp.vocab['beyond'].is_stop

False

In [None]:
#Lecture 22 Phrase Matching and Vocab

In [None]:
#rule based matching 
#a tool called "matcher"

In [34]:
from spacy.matcher import Matcher

In [36]:
matcher = Matcher(nlp.vocab)

In [39]:
# Three token patterns for the different spellings of "solar power":
#   pattern1 - one token:    SolarPower
#   pattern2 - three tokens: Solar-power (a punctuation token in the middle)
#   pattern3 - two tokens:   Solar power
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True},
    {'LOWER': 'power'},
]
pattern3 = [
    {'LOWER': 'solar'},
    {'LOWER': 'power'},
]

In [42]:
# Register all three patterns under the match name 'SolarPower'. They are
# passed as one list: that form is accepted from spaCy 2.2.2 on and is the
# only one spaCy 3 accepts, whereas `add(key, None, *patterns)` fails on v3.
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

In [43]:
doc = nlp(u"The Solar Power industry continues to grow as a solarpower increases. Solar-power is amazing.")

In [48]:
found_matches = matcher(doc)

In [50]:
print(found_matches) #start and stop

[(8656102463236116519, 1, 3), (8656102463236116519, 9, 10), (8656102463236116519, 12, 15)]


In [51]:
# Each match is a (match_id, start, end) triple; print it with its name and text.
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # look up the name the patterns were added under
    span = doc[start:end]                    # tokens start..end-1 of the document
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 9 10 solarpower
8656102463236116519 SolarPower 12 15 Solar-power


In [52]:
matcher.remove('SolarPower')

In [54]:
# Second version of the 'SolarPower' patterns:
#   pattern1 - one token: solarpower / SolarPower
#   pattern2 - 'solar', then a punctuation token allowed to occur zero or
#              more times ('OP': '*'), then 'power' (e.g. solar.power)
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'power'},
]

In [55]:
# Re-register 'SolarPower' with the two new patterns. They are passed as one
# list: that form is accepted from spaCy 2.2.2 on and is the only one spaCy 3
# accepts, whereas `add(key, None, *patterns)` fails on v3.
matcher.add('SolarPower', [pattern1, pattern2])

In [56]:
doc2 = nlp(u"Solar--power is solarpower yay!")

In [57]:
found_matches = matcher(doc2)

In [58]:
found_matches

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]

In [59]:
#Lecture 23

In [61]:
from spacy.matcher import PhraseMatcher

In [63]:
matcher = PhraseMatcher(nlp.vocab)

In [74]:
# Read the whole file and run it through the pipeline in one call.
# NOTE(review): no `encoding` is passed, so the platform default is used - confirm it matches the file.
with open('../TextFiles/reaganomics.txt') as f:
    doc3 = nlp(f.read())

In [82]:
# Phrases to search for in the Reaganomics text.
phrase_list = [
    'voodoo economics',
    'supply-side economics',
    'trickle-down economics',
    'free-market economics',
]

In [83]:
# Turn every phrase into a Doc, in the same order as phrase_list.
phrase_patterns = list(map(nlp, phrase_list))

In [84]:
phrase_patterns

[voodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [85]:
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [86]:
# Register every phrase Doc under the match name 'EconMatcher'. The Docs are
# passed as one list: that form is accepted from spaCy 2.2.2 on and is the only
# one spaCy 3 accepts, whereas `add(key, None, *docs)` fails on v3.
matcher.add('EconMatcher', phrase_patterns)

In [87]:
found_matches = matcher(doc3)

In [88]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2987, 2991)]

In [89]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc3[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2987 2991 trickle-down economics


In [90]:
# Print each match together with up to five tokens of context on either side.
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    # Clamp the left edge at 0: for a match within the first five tokens,
    # start-5 would be negative and be read as an offset from the end.
    span = doc3[max(start-5, 0):end+5]       # matched span plus surrounding context
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 policies are commonly associated with supply-side economics, referred to as trickle
3680293220734633682 EconMatcher 49 53 economics, referred to as trickle-down economics or voodoo economics by political
3680293220734633682 EconMatcher 54 56 trickle-down economics or voodoo economics by political opponents, and
3680293220734633682 EconMatcher 61 65 by political opponents, and free-market economics by political advocates.


3680293220734633682 EconMatcher 673 677 attracted a following from the supply-side economics movement, which formed in
3680293220734633682 EconMatcher 2987 2991 became widely known as "trickle-down economics", due to the
