# Regular Expression

In [1]:
import re

In [2]:
text = "You have my phone!"

In [4]:
"phone" in text

True

In [6]:
match = re.search("phone", text)

In [7]:
match.span()

(12, 17)

In [60]:
text = "My phone is a new phone"

In [12]:
# NOTE: `match` still holds the result of the earlier re.search("phone", text)
# call, made before `text` was reassigned, so this span refers to the old string.
# Re-run re.search on the new `text` to get a span for the current value.
match.span()

(12, 17)

In [14]:
all_matche = re.findall("phone",text)

In [15]:
all_matche

['phone', 'phone']

In [20]:
for matche in re.finditer("phone",text):
    print(matche.span())

(3, 8)
(18, 23)


In [23]:
re.findall(r"..at","The cat in the hat sat splat")

[' cat', ' hat', ' sat', 'plat']

# spaCy

In [24]:
import spacy

In [25]:
nlp = spacy.load('en_core_web_sm')

In [30]:
# Create a Doc object
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

# Print each token separately
for token in doc:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [31]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2025180af20>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2025180b2e0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2025168be60>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x202519c8340>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x20251990e00>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2025168bed0>)]

In [32]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [43]:
doc2 = nlp(u"Tesla isn't  looking into startups anymore.")

for token in doc2:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
  SPACE dep
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [44]:
# Lemmas (the base form of the word):
print(doc2[4].text)
print(doc2[4].lemma_)

looking
look


# SPANS

In [45]:
doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [52]:
life_quote = doc3[16:30]

In [53]:
print(life_quote)

"Life is what happens to us while we are making other plans"


In [54]:
doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [56]:
for snts in doc4.sents:
    print(snts)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [59]:
doc4[6].is_sent_start

True

# Tokenization

In [61]:
# Create a string that includes opening and closing quotation marks
mystring = '"We\'re moving to L.A.!"'
print(mystring)

"We're moving to L.A.!"


In [62]:
# NOTE(review): the same 'en_core_web_sm' model was already loaded into `nlp`
# earlier in this notebook; this second load looks redundant - confirm and drop.
nlp = spacy.load('en_core_web_sm')

In [63]:
doc = nlp(mystring)

In [65]:
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
!
"


In [66]:
# Create a string that includes opening and closing quotation marks
mystring = '"We\'re moving to L.A.!"'
print(mystring)

"We're moving to L.A.!"


In [67]:
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')

for t in doc3:
    print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [68]:
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")

for t in doc4:
    print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [69]:
len(doc4)

11

In [71]:
len(doc4.vocab)

783

In [72]:
doc5 = nlp(u'It is better to give than to receive.')

# Retrieve the third token:
doc5[2]

better

# Named Entities

In [74]:
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')

for token in doc8:
    print(token.text, end=' | ')

print('\n----')

for ent in doc8.ents:
    print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 
----
Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [77]:
for entity in doc8.ents:
    print(entity)
    print(entity.label_)

Apple
ORG
Hong Kong
GPE
$6 million
MONEY


# Noun Chunks

In [78]:
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

for chunk in doc9.noun_chunks:
    print(chunk.text)

Autonomous cars
insurance liability
manufacturers


In [79]:
doc10 = nlp(u"Red cars do not carry higher insurance rates.")

for chunk in doc10.noun_chunks:
    print(chunk.text)

Red cars
higher insurance rates


In [80]:
doc11 = nlp(u"He was a one-eyed, one-horned, flying, purple people-eater.")

for chunk in doc11.noun_chunks:
    print(chunk.text)

He
a one-eyed, one-horned, flying, purple people-eater


# Stemmer

In [81]:
import nltk

In [82]:
from nltk.stem.porter import PorterStemmer

In [86]:
p_stemmer = PorterStemmer()

In [87]:
words = ['run','runner','ran','runs','easily','fairly']

In [88]:
for word in words:
    print(word + '-------->' +p_stemmer.stem(word))

run-------->run
runner-------->runner
ran-------->ran
runs-------->run
easily-------->easili
fairly-------->fairli


In [89]:
from nltk.stem.snowball import SnowballStemmer

In [90]:
s_stemmer = SnowballStemmer(language = 'english')

In [92]:
for word in words:
    print(word + '-------->' +s_stemmer.stem(word))

run-------->run
runner-------->runner
ran-------->ran
runs-------->run
easily-------->easili
fairly-------->fair


# Lemmatization

In [93]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

In [94]:
# For every token show: surface text, coarse POS tag, the lemma attribute
# `lemma` (printed as an integer) and the readable lemma string `lemma_`.
# Columns are separated by " \t " exactly as before.
for token in doc1:
    columns = (token.text, token.pos_, token.lemma, token.lemma_)
    print(*columns, sep=' \t ')

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [95]:
print(nlp.Defaults.stop_words)

{'latter', 'over', 'back', 'full', 'really', 'made', 'very', 'less', 'no', 'his', 'by', 'side', 'show', 'their', 'of', 'hundred', 'seeming', 'namely', 'there', 'nothing', 'through', 'becomes', 'toward', 'various', 'thru', 'this', 'my', '’re', 'whether', 'noone', "'d", 'rather', 'put', 'yours', 'both', 'elsewhere', 'wherein', 'nevertheless', 'during', 'third', 'sixty', 'being', '‘ve', 'fifty', 'beyond', 'sometimes', 'seems', 'at', 'indeed', 'thereafter', 'themselves', 'should', 'just', 'every', 'using', 'somewhere', 'last', 'before', 'hereafter', 'within', 'bottom', 'go', 'mine', 'under', 'seemed', 'whence', 'whereby', 'itself', 'used', 'cannot', 'hereby', 'towards', 'one', 'few', 'now', 'however', 'another', 'therefore', 'hence', 'only', 'did', 'you', 'nor', 'throughout', 'enough', 'part', 'afterwards', 'thus', 'them', '‘ll', 'does', 'beforehand', 'the', 'whither', 'most', 'yourself', 'why', '’ll', 'such', 'fifteen', 'after', 'without', 'how', 'someone', 'then', 'moreover', 'onto', 'fr

In [96]:
len(nlp.Defaults.stop_words)

326

In [99]:
len(nlp.vocab)

822

In [100]:
# Drop 'beyond' from the default stop-word set. set.discard() is used instead of
# set.remove() so that re-running this cell does not raise KeyError once the
# word has already been removed.
nlp.Defaults.stop_words.discard('beyond')
# Also clear the stop flag on the vocab entry itself, so the `is_stop` check in
# the following cell does not rely on when the lexeme for 'beyond' was created.
nlp.vocab['beyond'].is_stop = False

In [101]:
nlp.vocab['beyond'].is_stop

False

# Part of Speech

In [116]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [103]:
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [104]:
print(doc[4])

jumped


In [105]:
print(doc[4].pos_)

VERB


In [107]:
print(doc[4].tag_)

VBD


In [122]:
for token in doc:
    print(f"{token.text:{10}}  {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

The         DET        DT         determiner
quick       ADJ        JJ         adjective (English), other noun-modifier (Chinese)
brown       ADJ        JJ         adjective (English), other noun-modifier (Chinese)
fox         NOUN       NN         noun, singular or mass
jumped      VERB       VBD        verb, past tense
over        ADP        IN         conjunction, subordinating or preposition
the         DET        DT         determiner
lazy        ADJ        JJ         adjective (English), other noun-modifier (Chinese)
dog         NOUN       NN         noun, singular or mass
's          PART       POS        possessive ending
back        NOUN       NN         noun, singular or mass
.           PUNCT      .          punctuation mark, sentence closer


In [136]:
def show_ents(doc):
    """Print the named entities of ``doc``, one per line.

    Each line has the form ``<text> _ <label> _ <explanation>``, where the
    explanation is ``str(spacy.explain(label))``. If ``doc.ents`` is empty
    (falsy), the single line "no entities found" is printed instead.

    Args:
        doc: object exposing an ``ents`` iterable whose items have ``text``
            and ``label_`` string attributes.

    Returns:
        None. Output goes to stdout.
    """
    entities = doc.ents
    # Guard clause: nothing to list.
    if not entities:
        print("no entities found")
        return
    for entity in entities:
        print(' _ '.join([entity.text, entity.label_, str(spacy.explain(entity.label_))]))

In [137]:
doc = nlp(u"Hi how are you")

In [138]:
show_ents(doc)

no entities found


In [139]:
doc = nlp(u"May I go to Wahington, DC next May to see the Washington Monument?")

In [140]:
show_ents(doc)

Wahington, DC _ GPE _ Countries, cities, states
next May _ DATE _ Absolute or relative dates or periods
the Washington Monument _ ORG _ Companies, agencies, institutions, etc.


In [141]:
# Python joins the two adjacent string literals into one string with nothing in
# between, so the first literal ends with a space to keep the two sentences
# separated ("cleaner. This" rather than "cleaner.This").
doc = nlp(u"Our Campany created a brand new vacuum cleaner. "
          u"This new vacuum-cleaner is the best in show.")

In [142]:
show_ents(doc)

no entities found


In [143]:
from spacy.matcher import PhraseMatcher

In [144]:
matcher = PhraseMatcher(nlp.vocab)