In [1]:
import numpy as np 
import pandas as pd 
import nltk
import spacy
import os



In [2]:
s1="Apple is looking at buying U.K. startup fir 1$ billion !"
s2="my mail adress is sanjeev.tyagi170@gmail.com"
s3="Let's go to N.Y.!"

In [14]:
# !python -m spacy download en_core_web_md
nlp=spacy.load(name='en_core_web_sm')
nlp_l=spacy.load(name='en_core_web_md')

In [4]:
doc_1=nlp(s1)
for i in doc_1:
    print(i)

Apple
is
looking
at
buying
U.K.
startup
fir
1
$
billion
!


In [5]:
doc_2=nlp(s3)
for i in doc_2:
    print(i)

Let
's
go
to
N.Y.
!


In [6]:
bin(529).strip('0').split('1')

['b', '0000', '000', '']

# Stemming

In [7]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [8]:
words=["runs","running","ran","runner","easily","fairly"]
p_stem=PorterStemmer()
s_stem=SnowballStemmer(language='english')

In [9]:
for i in words:
    print(p_stem.stem(i))

run
run
ran
runner
easili
fairli


In [10]:
for i in words:
    print(s_stem.stem(i))

run
run
ran
runner
easili
fair


# Lemmatization

In [11]:
doc1=nlp("The striped bats are hanging on their feet for best")
for token in doc1:
    print(token.text,"---->",token.lemma_)

The ----> the
striped ----> stripe
bats ----> bat
are ----> be
hanging ----> hang
on ----> on
their ----> their
feet ----> foot
for ----> for
best ----> good


In [12]:
doc1="The striped bats are hanging on their feet for best"
for token in doc1.split():
    print(token,"----->",s_stem.stem(token))

The -----> the
striped -----> stripe
bats -----> bat
are -----> are
hanging -----> hang
on -----> on
their -----> their
feet -----> feet
for -----> for
best -----> best


# Stopwords

In [17]:
# nlp_l.Defaults.stop_words
# nlp.vocab()

# Pattern Matching
# Rule based Matching

In [32]:
from spacy.matcher import Matcher
nlp=spacy.load(name='en_core_web_sm')
matcher=Matcher(nlp.vocab)

In [48]:
# list of dictionaries
pattern_1=[{"LOWER": "hello"},{"LOWER": "world"}]
pattern_2=[{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("Hello World",[pattern_1,pattern_2])
doc = nlp("Hello, world! Hello world!")

In [53]:
matches=matcher(doc)
print(matches)

[(15578876784678163569, 0, 3), (8585552006568828647, 0, 3), (8585552006568828647, 4, 6)]


In [54]:
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 3 Hello, world
8585552006568828647 Hello World 0 3 Hello, world
8585552006568828647 Hello World 4 6 Hello world


# Phrase Matching

In [60]:
from spacy.matcher import PhraseMatcher
nlp=spacy.load(name='en_core_web_sm')
phrasematcher=PhraseMatcher(nlp.vocab)
phrase_list=["Barack Obama","Angela Merkel","Washington D.C."]
phrase_patterns=[nlp(text) for text in phrase_list]

In [68]:
phrasematcher.add("TerminologyList",None,*phrase_patterns)
doc_3=nlp("German Chancellor Angela Merkel and us president Barack Obama were seen in Washington D.C.")
matches=phrasematcher(doc_3)
print(matches)

[(3766102292120407359, 2, 4), (3766102292120407359, 7, 9), (3766102292120407359, 12, 14)]


# POS Tagging

In [78]:
s1="Headlines should be clear and specific,"

In [79]:
doc=nlp(s1)

In [74]:
for token in doc:
    print(token.text,"---->",token.pos_,"---->",spacy.explain(token.tag_))

Apple ----> PROPN ----> noun, proper singular
is ----> AUX ----> verb, 3rd person singular present
looking ----> VERB ----> verb, gerund or present participle
at ----> ADP ----> conjunction, subordinating or preposition
buying ----> VERB ----> verb, gerund or present participle
U.K. ----> PROPN ----> noun, proper singular
startup ----> NOUN ----> noun, singular or mass
fir ----> NOUN ----> noun, singular or mass
1 ----> NUM ----> cardinal number
$ ----> SYM ----> symbol, currency
billion ----> NUM ----> cardinal number
! ----> PUNCT ----> punctuation mark, sentence closer


In [75]:
for key,val in doc.count_by(spacy.attrs.POS).items():
    print(key, doc.vocab[key].text,val)

96 PROPN 2
87 AUX 1
100 VERB 2
85 ADP 1
92 NOUN 2
93 NUM 2
99 SYM 1
97 PUNCT 1


In [76]:
from spacy import displacy

In [80]:
displacy.render(docs=doc,style='dep')

# Named Entity Recognition

In [96]:
s1="Apple is good but it costs 2000 ruppee"
s2="Orange is juicy"
s3="Microsoft create softwares"
import spacy
nlp=spacy.load("en_core_web_sm")

In [97]:
doc1=nlp(s1)
for i in doc1.ents:
    print(i.text,i.label_,spacy.explain(i.label_))

Apple ORG Companies, agencies, institutions, etc.
2000 DATE Absolute or relative dates or periods


In [101]:
from spacy.tokens import Span
Span(doc1,0,1,label='ORG')

Apple

In [103]:
displacy.render(docs=doc1,style='ent')

# Sentence Segmentation

In [110]:
s1="this is sentence. my name is sanjeev this is a second sentence."
import spacy
nlp=spacy.load('en_core_web_sm')
doc1=nlp(s1)

In [111]:
for i in doc1.sents:
    print(i.text)

this is sentence.
my name is sanjeev this is a second sentence.


In [112]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [113]:
# nlp.add_pipe()

## We can change behaviour of spacy using nlp.add_pipe()
## Usually what nlp does steo by step ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
## but we can guide spacy to do some operation of our choice