# Spacy basics

## Tokenizers

In [121]:
import spacy

In [122]:
nlp = spacy.load('en_core_web_sm')

In [123]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')
for token in doc:
  print(token.text, token.pos, token.pos_)

Tesla 96 PROPN
is 87 AUX
looking 100 VERB
at 85 ADP
buying 100 VERB
U.S. 96 PROPN
startup 92 NOUN
for 85 ADP
$ 99 SYM
6 93 NUM
million 93 NUM


In [124]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7ff28b472410>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7ff28a87c3d0>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7ff289790980>)]

In [125]:
#name entity recogniser
nlp.pipe_names

['tagger', 'parser', 'ner']

In [126]:
doc2 = nlp(u"Tesla isn't looking into startups anymore" )
#u-unicode string
for token in doc2:
  print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod


In [127]:
spacy.explain('advmod')

'adverbial modifier'

In [128]:
spacy.explain('pobj')

'object of preposition'

In [129]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [130]:
quote = doc3[16: 30]
print(quote)

"Life is what happens to us while we are making other plans"


In [131]:
print(doc3[5].is_sent_start)

None


In [132]:
type(quote)

spacy.tokens.span.Span

In [133]:
doc4 = nlp(u'Sentence 1. Sentence 2. Sentence 3')
for sentence in doc4.sents:
  print(sentence)

Sentence 1.
Sentence 2.
Sentence 3


In [134]:
doc5 = nlp(u'This will be complex. So if any doubts, email us at sara123@gmail.com and visit us at https://sara123.com')
doc5

This will be complex. So if any doubts, email us at sara123@gmail.com and visit us at https://sara123.com

In [135]:
for token in doc5:
  print(token.text)

This
will
be
complex
.
So
if
any
doubts
,
email
us
at
sara123@gmail.com
and
visit
us
at
https://sara123.com


In [136]:
#doc4[2] ='Mark me'

In [137]:
doc6 = nlp(u'Apple to build a Hong Kong factory for $10 million')
for entity in doc6.ents:
  print(entity)
  print(entity.label_)
  print()

Apple
ORG

Hong Kong
GPE

$10 million
MONEY



In [138]:
for chunk in doc6.noun_chunks:
  print(chunk)

Apple
a Hong Kong factory


## Built-in Visulaizers

In [139]:
from spacy import displacy
displacy.render(doc6, style='dep', jupyter=True, options={'distance':100})

In [140]:
displacy.render(doc6, style='ent', jupyter=True)

## Stemming

In [141]:
import nltk
from nltk.stem.porter import PorterStemmer

In [142]:
p = PorterStemmer()
words = ['joyous', 'to', 'wordly', 'earthlyness', 'genorisity', 'ran', 'runs', 'running', 'fairly', 'easiness', 'fairness', 'easily']

for w in words:
  print(w + '---------->' + p.stem(w))

joyous---------->joyou
to---------->to
wordly---------->wordli
earthlyness---------->earthly
genorisity---------->genoris
ran---------->ran
runs---------->run
running---------->run
fairly---------->fairli
easiness---------->easi
fairness---------->fair
easily---------->easili


In [143]:
from nltk.stem.snowball import SnowballStemmer
s = SnowballStemmer(language = 'english')

for w in words:
  print(w + '---------->' + s.stem(w))

joyous---------->joyous
to---------->to
wordly---------->word
earthlyness---------->earthly
genorisity---------->genoris
ran---------->ran
runs---------->run
running---------->run
fairly---------->fair
easiness---------->easi
fairness---------->fair
easily---------->easili


## Lemmatizers

In [144]:
def show_lemmas(words):
  for token in words:
    print(f'{token.text:{12}} {token.lemma:<{22}} {token.lemma_}')

In [145]:
doc7 = nlp(u'I am a runner running a race because I love to run since I ran today.')
show_lemmas(doc7)

I            561228191312463089     -PRON-
am           10382539506755952630   be
a            11901859001352538922   a
runner       12640964157389618806   runner
running      12767647472892411841   run
a            11901859001352538922   a
race         8048469955494714898    race
because      16950148841647037698   because
I            561228191312463089     -PRON-
love         3702023516439754181    love
to           3791531372978436496    to
run          12767647472892411841   run
since        10066841407251338481   since
I            561228191312463089     -PRON-
ran          12767647472892411841   run
today        11042482332948150395   today
.            12646065887601541794   .


## Stop Words

In [146]:
print(nlp.Defaults.stop_words)

{'is', 'nowhere', 'ever', 'whereafter', 'who', 'since', 'make', 'between', 'least', 'somehow', 'eight', 'perhaps', 'that', 'latterly', 'her', 'former', 'many', '’ll', 'several', 'namely', 'besides', 'already', 'seeming', 'off', 'were', 'four', 'not', 'us', 'hundred', 'to', 'ourselves', 'see', 'else', 'hence', '’re', 'whether', 'so', 'throughout', 'next', 'hers', 'nothing', 'formerly', 'give', 'of', 'beforehand', 'an', 'whereas', 'below', 'his', 'those', 'why', 'too', 'whatever', 'all', 'whoever', 'name', 'their', 'across', 'moreover', 'there', '’s', 'unless', 'three', 'does', 'anyway', 'only', 'really', "'ll", "'ve", 'mine', 'hereby', 'which', 'forty', 'over', 'doing', 'none', 'must', 'often', 'against', 'n’t', 'until', '‘d', '’ve', 'less', 'per', 'under', 'put', 'while', 'first', 'am', 'yours', 'becoming', 'beyond', 'both', 'it', 'or', 'always', "'s", 'yourself', 'otherwise', 'than', 'once', '’d', 'this', 'around', 'keep', 'by', 'anywhere', 'how', 'mostly', 'various', 'above', 'before

In [147]:
len(nlp.Defaults.stop_words)

326

In [148]:
nlp.vocab['nowhere'].is_stop

True

In [149]:
nlp.vocab['Harrison Wells'].is_stop

False

In [150]:
nlp.Defaults.stop_words.add('accelerator')

In [151]:
len(nlp.Defaults.stop_words)

327

In [152]:
nlp.vocab['accelerator'].is_stop

True

In [153]:
nlp.Defaults.stop_words.remove('accelerator')

In [154]:
len(nlp.Defaults.stop_words)

326

## Phrase Matching and Vocaulary

In [155]:
from spacy.matcher import Matcher
m = Matcher(nlp.vocab)

In [160]:
pattern1 = [{'LOWER' : 'unitedkingdom'}]
pattern2 = [{'LOWER' : 'united'} , {'LOWER' : 'kingdom'}]
pattern3 = [{'LOWER' : 'united'} , {'IS_PUNCT': True}, {'LOWER': 'kingdom'}]

m.add('UK', None, pattern1, pattern2, pattern3)

doc8 = nlp(u'The United Kingdom, made up of England, Scotland, Wales and Northern Ireland. The unitedkingdom is an island nation in northwestern Europe. The United-Kingdom has England which is home to the Beatles')

match = m(doc8)

print(match)


[(13801868238736630370, 1, 3), (14067982841790495447, 1, 3), (13801868238736630370, 17, 18), (14067982841790495447, 17, 18), (14067982841790495447, 27, 30), (13801868238736630370, 27, 30)]


In [161]:
for match_id, start, end in match:
 string_id = nlp.vocab.strings[match_id] 
 span = doc8[start:end]
 print(match_id, string_id, start, end, span.text)

13801868238736630370 UKop 1 3 United Kingdom
14067982841790495447 UK 1 3 United Kingdom
13801868238736630370 UKop 17 18 unitedkingdom
14067982841790495447 UK 17 18 unitedkingdom
14067982841790495447 UK 27 30 United-Kingdom
13801868238736630370 UKop 27 30 United-Kingdom


In [162]:
pattern1 = [{'LOWER' : 'unitedkingdom'}]
pattern2 = [{'LOWER' : 'united'} , {'IS_PUNCT': True, 'OP' : '*'}, {'LOWER': 'kingdom'}]

m.remove('UK')
m.add('UKop', None, pattern1, pattern2)

doc9 = nlp(u'The United--Kingdom, made up of England, Scotland, Wales and Northern Ireland')

match = m(doc9)

print(match)

[(13801868238736630370, 1, 4)]


In [163]:
m.remove('UKop')

In [165]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LEMMA': 'power'}] 


m.add('SolarPower', None, pattern1, pattern2)
doc10 = nlp(u'Solar-powered energy runs solar-powered cars.')
match = m(doc10)
print(match)

[(8656102463236116519, 0, 3), (8656102463236116519, 5, 8)]


In [168]:
m.remove('SolarPower')

### Phrase Macther

In [166]:
from spacy.matcher import PhraseMatcher
phm = PhraseMatcher(nlp.vocab) 

In [180]:
phm.add('o', [nlp('Barack Obama')])
doc11 = nlp("Barack Obama lifts America one last time in emotional farewell")
matches = phm(doc11)

In [181]:
matches

[(1489474827855109852, 0, 2)]

In [182]:
phm.remove('o')

In [183]:
with open('/content/notes.txt') as f:
  doc12 = nlp(f.read())

phrase_list = ['color map', 'alpha scale', 'in bgr']
phrase_patterns = [nlp(text) for text in phrase_list]

phm.add('opencv', None, *phrase_patterns)

match = phm(doc12)

In [184]:
for match_id, start, end in match:
 string_id = nlp.vocab.strings[match_id] 
 span = doc12[start:end]
 print(match_id, string_id, start, end, span.text)

6285437781414698949 opencv 25 27 alpha scale
6285437781414698949 opencv 51 53 color map
6285437781414698949 opencv 56 58 color map
6285437781414698949 opencv 76 78 in bgr
