In [0]:
import io
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Important Instructions

### 1. Copy the dataset to your Google Drive
### 2. Mount Google Drive to this notebook
### 3. Find the path to the dataset and paste it in the variable path

# Add path to file

In [0]:
path = '/content/drive/My Drive/IntroToNLP/1429_1.csv'

## Check if file exists

In [0]:
import os
os.path.exists(path)

## Read the dataset

In [0]:
review_data = pd.read_csv(path)

## Quickly take a peek at the dataset

In [0]:
review_data.head()

## What are the columns in the dataframe?

In [0]:
review_data.columns

## Select the review and the rating

In [0]:
data = review_data[['reviews.text','reviews.rating']]

## Take a quick peek at the modified dataframe

In [0]:
data.tail()

## Check for null values

In [0]:
null_text = data[data['reviews.text'].isnull()]

In [0]:
null_text

In [0]:
null_rating = data[data['reviews.rating'].isnull()]

In [0]:
null_rating

## Handle Null values

In [0]:
data = data.dropna()

In [0]:
# Keep only clearly negative (rating < 4) and clearly positive (rating == 5)
# reviews; 4-star reviews are left out.
# drop=True throws the old index away instead of inserting it into the frame
# as an extra 'index' column.
data = data[(data['reviews.rating'] < 4) | (data['reviews.rating'] == 5)].reset_index(drop=True)

## Check the distribution of data

In [0]:
count = data['reviews.rating'].value_counts()

In [0]:
# Bar chart of the number of reviews per rating value.
# x/y are passed by keyword: recent seaborn releases no longer accept the
# data vectors as positional arguments.
x = list(count.index)
y = list(count.values)
sns.barplot(x=x, y=y)

## Percentage of positive review

In [0]:
# Share of 5-star reviews among all reviews kept in `count`, as a percentage.
print("Percentage of positive reviews is {}".format((count[5.0]/count.sum())*100))

# Preprocess text

1. Tokenize the sentence
2. Remove stop words such as 'is', 'be', 'a'
3. Remove punctuation such as ',' and '@'
4. Convert words to lowercase

In [0]:
sample = data['reviews.text'][0]

In [0]:
print(sample)

# Natural Language Toolkit (NLTK)

In [0]:
import nltk
# Download the NLTK resources this notebook relies on for tokenizing and
# lemmatizing.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource
# instead of 'punkt' when tokenizing — confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')

## Tokenizing

In [0]:
from nltk.tokenize import word_tokenize

In [0]:
sample = word_tokenize(sample)

In [0]:
print(sample)

## Removing stop words

In [0]:
from nltk.corpus import stopwords

In [0]:
nltk.download('stopwords')

In [0]:
# Stored as a set; the cells below test tokens for membership in it.
stop_words = set(stopwords.words('english'))
# 'not' is taken out of the stop-word set so it survives filtering —
# presumably because negation matters for sentiment (confirm with the author).
stop_words.remove('not')

## What are the stop words?

In [0]:
print(stop_words)

## Use a for loop

In [0]:
# Collect every token of the sample that is not a stop word.
# The token is lowercased for the comparison only, so capitalised stop words
# such as "The" are filtered as well; this matches the list-comprehension
# version of the same filter in this notebook.
output = []
for word in sample:
  if word.lower() not in stop_words:
    output.append(word)

In [0]:
print(output)

## Use list comprehension

In [0]:
sample = [word for word  in sample if word.lower() not in stop_words]

# Remove punctuation

In [0]:
import string

In [0]:
print(string.punctuation)

## How to use it

In [0]:
',' in string.punctuation

In [0]:
sample = [word for word in sample if word not in string.punctuation]

In [0]:
print(sample)

# Stemming the words

In [0]:
from nltk.stem import PorterStemmer

In [0]:
stemmer = PorterStemmer()

In [0]:
# Run each token of the sample through the Porter stemmer.
output = [stemmer.stem(token) for token in sample]

In [0]:
print(output)

# Lemmatizing the words

In [0]:
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() 

In [0]:
# Run each token of the sample through the WordNet lemmatizer.
output = [lemmatizer.lemmatize(token) for token in sample]

In [0]:
print(output)

# Combine the functions together

In [0]:
def preprocess(sentence):
  """Clean one review and return it as a single space-separated string.

  Steps, in order: tokenize, lowercase, drop stop words, keep only purely
  alphabetic tokens, lemmatize, and discard one-character tokens.

  Relies on the notebook-level names `word_tokenize`, `stop_words` and
  `lemmatizer`.

  Args:
    sentence: Raw review text.

  Returns:
    The surviving tokens joined by single spaces (an empty string when no
    token survives).
  """
  # Lowercase first so the stop-word test below is case-insensitive;
  # otherwise capitalised stop words such as "The" or "It" slip through.
  tokens = [word.lower() for word in word_tokenize(sentence)]
  tokens = [word for word in tokens if word not in stop_words]
  # isalpha() rejects punctuation and numeric tokens alike, so a separate
  # punctuation pass is unnecessary.
  tokens = [word for word in tokens if word.isalpha()]
  tokens = [lemmatizer.lemmatize(word) for word in tokens]
  tokens = [word for word in tokens if len(word) > 1]
  return " ".join(tokens)

In [0]:
def convert_rating(rating):
  """Turn a star rating into a binary label.

  Returns 1 when the rating equals 5.0 and 0 for every other value.
  """
  return 1 if rating == 5.0 else 0

# Preprocessing the text in dataset

In [0]:
X = data['reviews.text'].map(preprocess)
y = data['reviews.rating'].map(convert_rating)

# Visualize the words used in the reviews

In [0]:
from wordcloud import WordCloud

In [0]:
# Concatenate the text of every 5-star review and draw it as a word cloud.
pos_text = ' '.join(data.loc[data['reviews.rating'] == 5.0, 'reviews.text'])
cloud = WordCloud(width=1600, height=800, max_font_size=200).generate(pos_text)
plt.figure(figsize=(12, 10))
plt.imshow(cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [0]:
# Concatenate the text of every review rated below 4 and draw it as a word cloud.
neg_text = ' '.join(data.loc[data['reviews.rating'] < 4, 'reviews.text'])
cloud = WordCloud(width=1600, height=800, max_font_size=200).generate(neg_text)
plt.figure(figsize=(12, 10))
plt.imshow(cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Using Word Vectors and Sequence Models

In [0]:
X = data['reviews.text']

In [0]:
X = X.map(preprocess)

## Tokenize the review text

In [0]:
import tensorflow as tf

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [0]:
tokenizer = Tokenizer()

In [0]:
tokenizer.fit_on_texts(X)

In [0]:
sequence_dict = tokenizer.word_index

In [0]:
sequence_dict

In [0]:
# Reverse lookup of the tokenizer vocabulary: integer id -> word.
word_dict = {num: val for val, num in sequence_dict.items()}

In [0]:
print("The number of words in dictionary is {}".format(len(word_dict)))

## Generate sequences from reviews text

In [0]:
# Each review becomes a list of integer word ids. Reviews differ in length,
# so the result is kept as a plain list of lists: wrapping a ragged list in
# np.array() raises a ValueError on recent NumPy versions. Indexing and
# pad_sequences both work directly on the list.
sequences = tokenizer.texts_to_sequences(X)

## What are these sequences?

In [0]:
sequences[2]

In [0]:
' '.join(word_dict[index] for index in sequences[2])

## Get the input ready

In [0]:
# Bring every review to exactly 10 word ids; longer reviews lose their tail
# (truncating='post').
X = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=10, truncating='post')

## Get train and test set

In [0]:
# train_test_split was used without ever being imported, which raises a
# NameError on a fresh kernel.
from sklearn.model_selection import train_test_split

# Hold out 10% of the reviews for testing. The fixed random_state makes the
# split (and therefore the reported metrics) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, random_state=42)

# Build a sequence model

In [0]:
model = tf.keras.models.Sequential()

In [0]:
model.add(tf.keras.layers.Embedding(len(word_dict)+1, 10, input_length=10))

In [0]:
model.add(tf.keras.layers.LSTM(150, return_sequences=True))
model.add(tf.keras.layers.LSTM(150, return_sequences=False))

In [0]:
model.add(tf.keras.layers.Dense(100, activation='relu'))

In [0]:
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [0]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

In [0]:
# The `lr` and `decay` keyword arguments are deprecated / rejected by current
# Keras optimizers. The same behaviour is expressed with `learning_rate` and
# an inverse-time schedule: lr_t = 0.0001 / (1 + 0.0001 * step).
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.0001, decay_steps=1, decay_rate=0.0001)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

In [0]:
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model

In [0]:
model.fit(X_train, y_train, batch_size=16, epochs=2)

# Test the model

In [0]:
# Sequential.predict_classes() has been removed from tf.keras. For a single
# sigmoid output the equivalent is to threshold the predicted probability
# at 0.5 and cast to integer class labels.
y_pred = (model.predict(X_test) > 0.5).astype("int32")

In [0]:
# accuracy_score was used without ever being imported, which raises a
# NameError on a fresh kernel.
from sklearn.metrics import accuracy_score

# Fraction of test reviews whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)

In [0]:
print("The accuracy of the model is : {}".format(accuracy))

# Analyze the performance

In [0]:
from sklearn.metrics import confusion_matrix

In [0]:
cmatrix = confusion_matrix(y_test, y_pred)

In [0]:
# fmt='d' prints the cell counts as plain integers; the default annotation
# format switches to scientific notation for larger counts.
sns.heatmap(cmatrix, annot=True, fmt='d', annot_kws={"size": 16}, xticklabels=['Neg','Pos'], yticklabels=['Neg', 'Pos'])
# confusion_matrix puts the true labels on the rows and predictions on the columns.
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [0]:
pos_recall = (cmatrix[1][1]/cmatrix[1].sum())

In [0]:
print("The recall for positive reviews is {}".format(pos_recall))

In [0]:
neg_recall = (cmatrix[0][0]/cmatrix[0].sum())

In [0]:
print("The recall for negative reviews is {}".format(neg_recall))

## Calculate the precision

In [0]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [0]:
pos_precision = precision_score(y_test, y_pred)

In [0]:
print("The precision for positive review is {}".format(pos_precision))

In [0]:
neg_precision = precision_score(y_test, y_pred, pos_label=0)

In [0]:
print("The precision for negative review is {}".format(neg_precision))