#  Processing Textual Data (NLP)

# NLTK Package

In [2]:
# DataSource->  www.kaggle.com/snap/amazon-fine-food-reviews 
import pandas as pd
import nltk
import re
food_review = pd.read_csv("Reviews.csv")
food_review.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
# Using the sample function from the pandas data frame, let’s randomly pick
# the text of 1000 reviews and print the top rows 

food_review_text = pd.DataFrame(food_review["Text"])
food_review_text_1k = food_review_text.sample(n= 1000,random_state = 123)
food_review_text_1k.head()


Unnamed: 0,Text
277535,I love these chips! They always make a great h...
253901,"To add to the pile-on, really really hate the ..."
495520,This stuff is the best. I put it on just about...
373115,"Organic India Tulsi tea is, to me, the absolut..."
547017,I have a German Shorthaired Pointer (3 yrs old...


In [4]:
food_review_text_1k.count()

Text    1000
dtype: int64

In [5]:
# Tokenization Using NLTK
food_review_text_1k['tokenized_reviews'] = food_review_text_1k['Text'].apply(nltk.word_tokenize)
food_review_text_1k.head()


Unnamed: 0,Text,tokenized_reviews
277535,I love these chips! They always make a great h...,"[I, love, these, chips, !, They, always, make,..."
253901,"To add to the pile-on, really really hate the ...","[To, add, to, the, pile-on, ,, really, really,..."
495520,This stuff is the best. I put it on just about...,"[This, stuff, is, the, best, ., I, put, it, on..."
373115,"Organic India Tulsi tea is, to me, the absolut...","[Organic, India, Tulsi, tea, is, ,, to, me, ,,..."
547017,I have a German Shorthaired Pointer (3 yrs old...,"[I, have, a, German, Shorthaired, Pointer, (, ..."


In [6]:
# Word Search Using Regex
search_word = set([w for w in food_review_text_1k['tokenized_reviews'].iloc[0] if re.search('^c.i..$', w)])
print(search_word)


{'chips'}


In [7]:
# Word Search Using the Exact Word
#Search for the word "great" in reviews
# The rows of the reviews containing the word will be retrieved. They can be considered a positive review. 
food_review_text_1k[food_review_text_1k['Text'].str.contains('great')]

Unnamed: 0,Text,tokenized_reviews
277535,I love these chips! They always make a great h...,"[I, love, these, chips, !, They, always, make,..."
547017,I have a German Shorthaired Pointer (3 yrs old...,"[I, have, a, German, Shorthaired, Pointer, (, ..."
153491,"Our GreatDane loves these , he's never happy w...","[Our, GreatDane, loves, these, ,, he, 's, neve..."
307887,My parents' dog refused to take her medicine u...,"[My, parents, ', dog, refused, to, take, her, ..."
189614,"Like most of the other reviews state, you can ...","[Like, most, of, the, other, reviews, state, ,..."
362712,We own two dogs who are drastically different ...,"[We, own, two, dogs, who, are, drastically, di..."
564730,A friend's daughter has just gone to college. ...,"[A, friend, 's, daughter, has, just, gone, to,..."
353896,Scents:<br />Cool Impact<br />Arctic Edge - my...,"[Scents, :, <, br, /, >, Cool, Impact, <, br, ..."
87831,"My little diabetic shih-tzu, Lily, is notoriou...","[My, little, diabetic, shih-tzu, ,, Lily, ,, i..."
291616,This tastes great on chicken and shrimp and is...,"[This, tastes, great, on, chicken, and, shrimp..."


In [8]:
# Normalization Using NLTK
# stemming or normalization
#  NLTK provides two functions implementing the stemming algorithm. The first is the Porter Stemming algorithm, 
# and the second is the Lancaster stemmer
print("Before")
words = set(food_review_text_1k['tokenized_reviews'].iloc[0])
print(words)

print("After Porter Stemmer:")
porter = nltk.PorterStemmer()
print([porter.stem(w) for w in words])

print("After Lancaster Stemmer:")
lancaster = nltk.LancasterStemmer()
print([lancaster.stem(w) for w in words])


Before
{'at', 'sustenance', 'make', "'m", 'a', 'stuck', 'healthy', 'chips', 'love', 'the', '!', 'just', 'snack', 'I', '.', 'vending', 'machine', 'work', 'for', 'always', 'when', 'with', 'They', 'these', 'great'}
After Porter Stemmer:
['at', 'susten', 'make', "'m", 'a', 'stuck', 'healthi', 'chip', 'love', 'the', '!', 'just', 'snack', 'I', '.', 'vend', 'machin', 'work', 'for', 'alway', 'when', 'with', 'they', 'these', 'great']
After Lancaster Stemmer:
['at', 'sust', 'mak', "'m", 'a', 'stuck', 'healthy', 'chip', 'lov', 'the', '!', 'just', 'snack', 'i', '.', 'vend', 'machin', 'work', 'for', 'alway', 'when', 'with', 'they', 'thes', 'gre']


In [9]:
# Steeming all tokens in data frame

def stem_sentences(tokenslist):
#     tokens = sentence.split()
    import nltk
    porter = nltk.PorterStemmer()
    stemmed_tokens = [porter.stem(token) for token in tokenslist]
    return ' '.join(stemmed_tokens)


food_review_text_1k['stemming_tokens'] = food_review_text_1k['tokenized_reviews'].apply(stem_sentences)

In [10]:
food_review_text_1k.iloc[0]

Text                 I love these chips! They always make a great h...
tokenized_reviews    [I, love, these, chips, !, They, always, make,...
stemming_tokens      I love these chip ! they alway make a great he...
Name: 277535, dtype: object

In [11]:
# Noun phase chunking

import nltk
from nltk.tokenize import word_tokenize
text = word_tokenize("My English Bulldog Larry had skin allergies the summer we got him at age 3, I'm so glad that now I can buy his food from Amazon")
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)
review_chunking_out = cp.parse(nltk.pos_tag(text))
print(review_chunking_out)



(S
  My/PRP$
  English/JJ
  Bulldog/NNP
  Larry/NNP
  had/VBD
  skin/VBN
  allergies/NNS
  (NP the/DT summer/NN)
  we/PRP
  got/VBD
  him/PRP
  at/IN
  (NP age/NN)
  3/CD
  ,/,
  I/PRP
  'm/VBP
  so/RB
  glad/JJ
  that/IN
  now/RB
  I/PRP
  can/MD
  buy/VB
  his/PRP$
  (NP food/NN)
  from/IN
  Amazon/NNP)


In [12]:
# IOB tag representation of chunking  I (Inside), O (Outside), and B(Begin).

from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

#Print IOB tags
review_chunking_out_IOB = tree2conlltags(review_chunking_out)
pprint(review_chunking_out_IOB)

[('My', 'PRP$', 'O'),
 ('English', 'JJ', 'O'),
 ('Bulldog', 'NNP', 'O'),
 ('Larry', 'NNP', 'O'),
 ('had', 'VBD', 'O'),
 ('skin', 'VBN', 'O'),
 ('allergies', 'NNS', 'O'),
 ('the', 'DT', 'B-NP'),
 ('summer', 'NN', 'I-NP'),
 ('we', 'PRP', 'O'),
 ('got', 'VBD', 'O'),
 ('him', 'PRP', 'O'),
 ('at', 'IN', 'O'),
 ('age', 'NN', 'B-NP'),
 ('3', 'CD', 'O'),
 (',', ',', 'O'),
 ('I', 'PRP', 'O'),
 ("'m", 'VBP', 'O'),
 ('so', 'RB', 'O'),
 ('glad', 'JJ', 'O'),
 ('that', 'IN', 'O'),
 ('now', 'RB', 'O'),
 ('I', 'PRP', 'O'),
 ('can', 'MD', 'O'),
 ('buy', 'VB', 'O'),
 ('his', 'PRP$', 'O'),
 ('food', 'NN', 'B-NP'),
 ('from', 'IN', 'O'),
 ('Amazon', 'NNP', 'O')]


In [17]:
# POS tagging (Parts of speech)
text = "My English Bulldog Larry had skin allergies the summer we got him at age 3, I'm so glad that now I can buy his food from Amazon"
tagged_review_sent = nltk.pos_tag(text)
print(tagged_review_sent)
print("*******************************************************NER***********************************************************")
print(nltk.ne_chunk(tagged_review_sent))

[('M', 'NNP'), ('y', 'PRP'), (' ', 'VBP'), ('E', 'NNP'), ('n', 'FW'), ('g', 'NN'), ('l', 'NN'), ('i', 'NN'), ('s', 'VBP'), ('h', 'NN'), (' ', 'NN'), ('B', 'NNP'), ('u', 'NN'), ('l', 'NN'), ('l', 'NN'), ('d', 'NN'), ('o', 'NN'), ('g', 'NN'), (' ', 'NNP'), ('L', 'NNP'), ('a', 'DT'), ('r', 'NN'), ('r', 'NN'), ('y', 'NN'), (' ', 'NNP'), ('h', 'VBZ'), ('a', 'DT'), ('d', 'NN'), (' ', 'NN'), ('s', 'NN'), ('k', 'NN'), ('i', 'JJ'), ('n', 'VBP'), (' ', 'PDT'), ('a', 'DT'), ('l', 'NN'), ('l', 'NN'), ('e', 'NN'), ('r', 'NN'), ('g', 'NN'), ('i', 'NN'), ('e', 'VBP'), ('s', 'JJ'), (' ', 'NNP'), ('t', 'NN'), ('h', 'NN'), ('e', 'NN'), (' ', 'NNP'), ('s', 'NN'), ('u', 'JJ'), ('m', 'NN'), ('m', 'NN'), ('e', 'NN'), ('r', 'NN'), (' ', 'NNP'), ('w', 'NN'), ('e', 'NN'), (' ', 'NNP'), ('g', 'NN'), ('o', 'NN'), ('t', 'NN'), (' ', 'NNP'), ('h', 'NN'), ('i', 'NN'), ('m', 'VBP'), (' ', 'PDT'), ('a', 'DT'), ('t', 'NN'), (' ', 'VBZ'), ('a', 'DT'), ('g', 'NN'), ('e', 'NN'), (' ', 'VBD'), ('3', 'CD'), (',', ','), (' 

# Spacy 


In [None]:
# POS tagging and NER
import spacy
nlp = spacy.load('en')

In [29]:
#  POS tagging
text = "My English Bulldog Larry had skin allergies the summer we got him at age 3, I'm so glad that now I can buy his food from Amazon"
doc = nlp(text)
for token in doc:
    print(token.text, token.tag_)

My PRP$
English NNP
Bulldog NNP
Larry NNP
had VBD
skin NN
allergies NNS
the DT
summer NN
we PRP
got VBD
him PRP
at IN
age NN
3 CD
, ,
I PRP
'm VBP
so RB
glad JJ
that IN
now RB
I PRP
can MD
buy VB
his PRP$
food NN
from IN
Amazon NNP


In [30]:
# Dependency Parsing
for chunk in doc.noun_chunks:
     print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

My English Bulldog Larry Larry nsubj had
skin allergies allergies dobj had
we we nsubj got
him him dobj got
age age pobj at
I I nsubj 'm
I I nsubj buy
his food food dobj buy
Amazon Amazon pobj from


In [32]:
# Dependency tree
from spacy import displacy

nlp = spacy.load("en")
doc = nlp("My English Bulldog Larry had skin allergies the summer we got him at age 3")
displacy.render(doc, style='dep')

In [34]:
# Chunking
#  Extract VERB and NOUN
text = ("My English Bulldog Larry had skin allergies the summer we got him at age 3, I'm so glad that now I can buy his food from Amazon")
doc = nlp(text)
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

Noun phrases: ['My English Bulldog Larry', 'skin allergies', 'we', 'him', 'age', 'I', 'I', 'his food', 'Amazon']
Verbs: ['have', 'get', 'be', 'can', 'buy']


In [44]:
# Named Entity Recognition
# Load English tokenizer, tagger, parser, NER, and word vectors
nlp = spacy.load("en")
# Process whole documents
text = ("I want to buy a Mobile within range 1500")
doc = nlp(text)
# Find named entities
for entity in doc.ents:
    print(entity.text, entity.label_)

1500 CARDINAL


# CoreNLP 

In [42]:
#  We skip corenlp rather we focus more on Spacy

# TextBlob

In [43]:
# We also skip TextBlob rather we focus more on Spacy

# Natural Language Understanding

In [5]:
# Generating n-grams from a sentence

from textblob import TextBlob
blob = TextBlob("Building an enterprise chatbot that can converse like humans")
blob.ngrams(n=2)

[WordList(['Building', 'an']),
 WordList(['an', 'enterprise']),
 WordList(['enterprise', 'chatbot']),
 WordList(['chatbot', 'that']),
 WordList(['that', 'can']),
 WordList(['can', 'converse']),
 WordList(['converse', 'like']),
 WordList(['like', 'humans'])]

# Natural Language Generation


In [5]:
import pandas as pd 
import markovify

data = pd.read_csv('conversation_data.csv', usecols=['text'],  skiprows = [1])
newData = data['text'].values.tolist()

# print(newData[:5])
text_model = markovify.NewlineText(newData, state_size = 2)

#Generate random text
for i in range(10):
    print(text_model.make_sentence())

Alright then. If you have contact information about AI, also chatbots.
Hmm learn from that
Hi! Nice to hear that song?
My goals are to be able to use getUpdates or switch to webhooks for my bot?
What is that there is always a difficult think to do luandry. Almost everyday 😊😊
nice to talk with 😁
Hi! How are you talking about binding the interface on Qt
russian, I think this tool will be so scary 😵😵😵
Nice! Humans really love him!! 😊😊
What do you think you are about it.
