In [24]:
!pip install spacy

!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


**Tokenizing the Text**

In [25]:
%%time
# Word tokenization
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
my_doc = nlp(text)

# Create list of word tokens
token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)

['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'get', 'discouraged', '!', '\n', 'Challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.', 'You', "'ve", 'got', 'this', '!']
CPU times: user 291 ms, sys: 5.97 ms, total: 297 ms
Wall time: 297 ms


In [26]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages that the tokenizing, stop-word and
# lemmatizing cells below rely on.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [27]:
%%time
def Preprocessing_text(text):
    """Lower-case ``text``, strip punctuation and split it into word tokens.

    Every run of non-word characters (anything other than letters, digits
    and underscore) is replaced by a single space before tokenizing, so a
    contraction such as "shouldn't" comes out as two tokens
    ("shouldn", "t").

    Args:
        text: The raw input string.

    Returns:
        list[str]: The tokens produced by ``nltk.word_tokenize``.
    """
    lowered = text.lower()
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    no_punct = re.sub(r'\W+', ' ', lowered)
    return nltk.word_tokenize(no_punct)
Preprocessing_text(text)

CPU times: user 568 µs, sys: 0 ns, total: 568 µs
Wall time: 648 µs


# Sentence tokenizer

In [28]:
%%time
# Create the pipeline 'sentencizer' component
sbd = nlp.create_pipe('sentencizer')

# Add the component to the pipeline
nlp.add_pipe(sbd)

doc = nlp(text)
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)

["When learning data science, you shouldn't get discouraged!", "\nChallenges and setbacks aren't failures, they're just part of the journey.", "You've got this!"]
CPU times: user 68.4 ms, sys: 338 µs, total: 68.7 ms
Wall time: 75.3 ms


In [29]:
%%time
text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""
def Preprocessing_text(text):
    """Lower-case ``text`` and split it into sentences.

    Punctuation is left in place, since the sentence tokenizer needs it to
    find the sentence boundaries.

    Args:
        text: The raw input string.

    Returns:
        list[str]: The sentences produced by ``nltk.sent_tokenize``.
    """
    return nltk.sent_tokenize(text.lower())
Preprocessing_text(text)

print(Preprocessing_text(text))

["when learning data science, you shouldn't get discouraged!", "challenges and setbacks aren't failures, they're just part of the journey.", "you've got this!"]
CPU times: user 1.63 ms, sys: 0 ns, total: 1.63 ms
Wall time: 1.6 ms


# Cleaning Text Data: Removing *Stopwords* 

In [30]:
%%time
print(stopwords.words('english')[:20])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']
CPU times: user 428 µs, sys: 965 µs, total: 1.39 ms
Wall time: 1.58 ms


In [31]:
%%time
#Stop words
#importing stop words from English language.
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

#Printing first ten stop words:
print('First ten stop words: %s' % list(spacy_stopwords)[:20])

Number of stop words: 326
First ten stop words: ['myself', 'too', '‘ll', 'keep', 'they', 'hereafter', 'third', 'namely', 'meanwhile', 'your', 'her', 'seem', 'during', 'none', 'she', 'being', 'own', 'moreover', 'now', 'twenty']
CPU times: user 1.48 ms, sys: 0 ns, total: 1.48 ms
Wall time: 1.63 ms


In [32]:
%%time
from spacy.lang.en.stop_words import STOP_WORDS

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#Implementation of stop words:
filtered_sent=[]

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# filtering stop words
for word in doc:
    
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)

#[word for word in doc if word.is_stop==False]

Filtered Sentence: [learning, data, science, ,, discouraged, !, 
, Challenges, setbacks, failures, ,, journey, ., got, !]
CPU times: user 60.3 ms, sys: 363 µs, total: 60.7 ms
Wall time: 64.1 ms


In [33]:
%%time
text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

def Preprocessing_text(text):
    """Normalize ``text`` into a list of lemmatized, non-stop-word tokens.

    Steps: lower-case, replace every run of non-word characters with a
    single space, tokenize with ``nltk.word_tokenize``, drop NLTK's English
    stop words, and lemmatize what is left with ``WordNetLemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        list[str]: The lemmatized tokens, in their original order.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the lookup set once instead of once per token.
    stop_words = set(stopwords.words('english'))
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    cleaned = re.sub(r'\W+', ' ', text.lower())
    tokens = nltk.word_tokenize(cleaned)
    return [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words]
print(Preprocessing_text(text))

['learning', 'data', 'science', 'get', 'discouraged', 'challenge', 'setback', 'failure', 'part', 'journey', 'got']
CPU times: user 8.8 ms, sys: 51 µs, total: 8.85 ms
Wall time: 8.95 ms


# Lexicon Normalization
Lexicon normalization is another step in the text data cleaning process. In the big picture, normalization converts high-dimensional features into low-dimensional features that are appropriate for any machine learning model. For our purposes here, we’re only going to look at lemmatization, a way of processing words that reduces them to their roots.

In [34]:
%%time
# Implementing lemmatization
text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""
text = re.sub('\W+',' ', text)
lem = nlp(text)
# finding lemma for each word
for word in lem:
    print(word.text,word.lemma_)

When When
learning learning
data data
science science
you you
shouldn shouldn
t t
get get
discouraged discouraged
Challenges Challenges
and and
setbacks setbacks
aren aren
t t
failures failures
they they
re re
just just
part part
of of
the the
journey journey
You You
ve ve
got got
this this
CPU times: user 76.3 ms, sys: 1.85 ms, total: 78.2 ms
Wall time: 76.3 ms


# **POS TAGGING**

In [35]:
%%time

# POS tagging

# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

#  "nlp" Objectis used to create documents with linguistic annotations.
docs = nlp(u"All is well that ends well.")

for word in docs:
    print(word.text,word.pos_)

All DET
is AUX
well ADJ
that DET
ends VERB
well ADV
. PUNCT
CPU times: user 750 ms, sys: 48.9 ms, total: 799 ms
Wall time: 797 ms


In [36]:
from nltk import RegexpParser, pos_tag

# Model data for NLTK's perceptron part-of-speech tagger.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [37]:
%%time
text ="learn php from guru99 and make study easy".split()
print("After Split:",text)
tokens_tag = pos_tag(text)
print("After Token:",tokens_tag)
patterns= """mychunk:{<NN.?>*<VBD.?>*<JJ.?>*<CC>?}"""
chunker = RegexpParser(patterns)
print("After Regex:",chunker)
output = chunker.parse(tokens_tag)
print("After Chunking",output)

After Split: ['learn', 'php', 'from', 'guru99', 'and', 'make', 'study', 'easy']
After Token: [('learn', 'JJ'), ('php', 'NN'), ('from', 'IN'), ('guru99', 'NN'), ('and', 'CC'), ('make', 'VB'), ('study', 'NN'), ('easy', 'JJ')]
After Regex: chunk.RegexpParser with 1 stages:
RegexpChunkParser with 1 rules:
       <ChunkRule: '<NN.?>*<VBD.?>*<JJ.?>*<CC>?'>
After Chunking (S
  (mychunk learn/JJ)
  (mychunk php/NN)
  from/IN
  (mychunk guru99/NN and/CC)
  make/VB
  (mychunk study/NN easy/JJ))
CPU times: user 6.58 ms, sys: 1.09 ms, total: 7.67 ms
Wall time: 7.45 ms


# **Entity Detection**
Entity detection, also called entity recognition, is a more advanced form of language processing that identifies important elements like places, people, organizations, and languages within an input string of text. This is really helpful for extracting information from text, since you can quickly pick out important topics or identify key sections of text.

In [46]:
%%time
from spacy import displacy

nytimes= nlp(u"""New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.

At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday.

The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000.""")

entities=[(i, i.label_, i.label) for i in nytimes.ents]
print(entities)

[(New York City, 'GPE', 384), (Tuesday, 'DATE', 391), (At least 285, 'CARDINAL', 397), (September, 'DATE', 391), (Brooklyn, 'GPE', 384), (Williamsburg, 'GPE', 384), (four, 'CARDINAL', 397), (Bill de Blasio, 'PERSON', 380), (Tuesday, 'DATE', 391), (Orthodox Jews, 'PERSON', 380), (6 months old, 'DATE', 391), (up to $1,000, 'MONEY', 394)]
CPU times: user 98.8 ms, sys: 0 ns, total: 98.8 ms
Wall time: 102 ms


In [39]:
%%time
displacy.render(nytimes, style = "ent",jupyter = True)

CPU times: user 131 ms, sys: 1.36 ms, total: 132 ms
Wall time: 136 ms


In [40]:
 # Extra NLTK data packages; presumably what `nltk.ne_chunk` (called in the
 # next cell) needs at runtime -- confirm against the NLTK docs.
 nltk.download('maxent_ne_chunker')
 nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [41]:
%%time
sentence='New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday.The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000.'
 
for sent in nltk.sent_tokenize(sentence):
   for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
      if hasattr(chunk, 'label'):
         print(chunk.label(), ' '.join(c[0] for c in chunk))

GPE New York City
GPE Brooklyn
PERSON Williamsburg
PERSON Mayor Bill
GPE Blasio
ORGANIZATION Orthodox Jews
CPU times: user 57.1 ms, sys: 2.8 ms, total: 59.9 ms
Wall time: 59 ms


# **Dependency Parsing**
Dependency parsing is a language processing technique that allows us to better determine the meaning of a sentence by analyzing how it’s constructed to determine how the individual words relate to each other.

In [42]:
docp = nlp(" In pursuit of a wall, President Trump ran into one.")

# For each noun chunk print: the chunk text, its root token, the root's
# dependency label, and the head token the root attaches to.
for noun_chunk in docp.noun_chunks:
    root = noun_chunk.root
    print(noun_chunk.text, root.text, root.dep_, root.head.text)

pursuit pursuit pobj In
a wall wall pobj of
President Trump Trump nsubj ran


In [43]:
# Render the dependency view of `docp` inline in the notebook output.
displacy.render(docp, style="dep", jupyter=True)

# **Word Vector Representation**
When we’re looking at words alone, it’s difficult for a machine to understand connections that a human would understand immediately.

In [44]:
%%time
import en_core_web_sm
nlp = en_core_web_sm.load()
mango = nlp(u'mango')
print(mango.vector.shape)
print(mango.vector)

(96,)
[ 0.20538223 -1.6033714   0.27122334  0.4102599   3.2985601   3.4889512
  1.8090308  -2.1398475   2.31565     1.5809067   4.1519527  -1.0185633
 -0.0325011  -2.7471437  -0.4177467  -2.4292274  -0.6153387   2.4422317
  0.8078671  -2.4846377   2.0988142   1.4448209  -0.552992   -1.3411183
 -0.69847786 -0.45548356  3.8267968  -4.0225782   0.81215733  0.3766132
  0.15751392 -1.1428392  -1.3328214   0.7187766   2.1567593  -3.018766
  3.4919028   0.6938907  -1.1943094  -0.10796624  4.7029977   3.551554
 -0.71505725 -4.4580555  -0.26480573  0.6314918  -0.538128   -1.1131921
 -1.1251849   0.5740081  -1.1976193  -3.5157654   0.425157   -1.7545594
 -3.058784    0.01680815  0.97784567  1.7633746   0.4561966   2.5090182
  0.35267782  0.8351371  -1.394351    0.5082075   0.75960976 -3.3654122
  2.3440146  -2.4311178   1.2401564  -1.4498216  -2.3708577   1.274456
  2.6584334   2.505236    0.24999112  0.45838034  0.7396465  -3.0134087
 -1.1449497   2.441533    0.58746856 -0.47240722 -0.99527466 