## Importing required packages

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
from spacy import displacy
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. Reading the json file and setting it as a DataFrame

In [3]:
df = pd.read_json("/content/drive/MyDrive/bits/bits/sem - 3/assignments/nlp/yelp.json", lines=True)

In [4]:
df = df[['text', 'stars']].copy()

## 2. Cleaning up the text 


*   Remove punctuations and special characters
*   Remove stop words
*   Convert text to lower case



In [5]:
def clean(doc):
    stop = set(stopwords.words('english'))  # Getting the set of stop words
    exclude = set(string.punctuation)  # Set of punctuations including special characters
    clean_text = re.sub(r'['+string.punctuation+']+', ' ',doc) # Removing punctuations and special characters
    normalized = " ".join([i for i in clean_text.lower().split() if i not in stop]) # Removing stop words and converting to lower case
    return normalized

In [6]:
df['text'] = df.text.apply(clean)

## 3. Vectorizing and Splitting the data in test and train

In [8]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df.text.values)
Y = df.stars.values

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

### Training a MultinomialNB model 

In [10]:
clf = MultinomialNB()
clf.fit(X_train, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Displaying the confusion Matrix

In [11]:
confusion_matrix(Y_test, clf.predict(X_test), labels=[5,4,3,2,1])

array([[ 9213,  5293,   273,    75,   517],
       [ 3892, 10015,  1356,   212,   416],
       [  543,  3023,  2504,   569,   383],
       [  166,   634,  1444,  1212,   802],
       [  116,   197,   354,   845,  1928]])

## 4. POS tagging on the first 4 rows of text

In [12]:
for i,row in df.iloc[0:4].iterrows():
    sentence = word_tokenize(row['text'])
    print('-'*125)
    print(row['text'])
    print(nltk.pos_tag(sentence))

-----------------------------------------------------------------------------------------------------------------------------
wife took birthday breakfast excellent weather perfect made sitting outside overlooking grounds absolute pleasure waitress excellent food arrived quickly semi busy saturday morning looked like place fills pretty quickly earlier get better favor get bloody mary phenomenal simply best ever pretty sure use ingredients garden blend fresh order amazing everything menu looks excellent white truffle scrambled eggs vegetable skillet tasty delicious came 2 pieces griddled bread amazing absolutely made meal complete best toast ever anyway wait go back
[('wife', 'NN'), ('took', 'VBD'), ('birthday', 'JJ'), ('breakfast', 'NN'), ('excellent', 'NN'), ('weather', 'NN'), ('perfect', 'NN'), ('made', 'VBD'), ('sitting', 'VBG'), ('outside', 'JJ'), ('overlooking', 'VBG'), ('grounds', 'NNS'), ('absolute', 'JJ'), ('pleasure', 'NN'), ('waitress', 'NN'), ('excellent', 'JJ'), ('food', 'N

## 5. Build and display dependency parser tree

In [13]:
nlp = spacy.load('en_core_web_sm')
text = "It turned out that Cersei was having an affair with her twin brother, Jaime Lannister — and that all of Robert's supposed children, including his heir, Joffrey, were actually fathered by Jaime."
for token in nlp(text):
  print(token.text, '->', token.dep_,'->', token.head.text)

It -> nsubj -> turned
turned -> ROOT -> turned
out -> prt -> turned
that -> mark -> having
Cersei -> nsubj -> having
was -> aux -> having
having -> ccomp -> turned
an -> det -> affair
affair -> dobj -> having
with -> prep -> affair
her -> poss -> brother
twin -> amod -> brother
brother -> pobj -> with
, -> punct -> brother
Jaime -> compound -> Lannister
Lannister -> appos -> brother
— -> punct -> having
and -> cc -> having
that -> mark -> fathered
all -> nsubjpass -> fathered
of -> prep -> all
Robert -> poss -> children
's -> case -> Robert
supposed -> amod -> children
children -> pobj -> of
, -> punct -> children
including -> prep -> children
his -> poss -> heir
heir -> pobj -> including
, -> punct -> heir
Joffrey -> appos -> heir
, -> punct -> fathered
were -> auxpass -> fathered
actually -> advmod -> fathered
fathered -> conj -> having
by -> agent -> fathered
Jaime -> pobj -> by
. -> punct -> turned


In [14]:
displacy.render(nlp(text), jupyter=True)