In [1]:
#!pip install rake-nltk
#!pip install -U spacy
#!python -m spacy download en_core_web_trf

## Importing Libraries

In [2]:
import os
import spacy
import pickle
import keras

import numpy as np
import pandas as pd
import tensorflow as tf


from keras.layers import Embedding
from keras.layers import Bidirectional,GlobalMaxPool1D,Conv1D
from keras.layers import LSTM,Input,Dense,Dropout,Activation
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [3]:
os.environ["CUDA_VISIBLE_DEVICES"] = '-1'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

## Reading in Data

In [4]:
df = pd.DataFrame();
with (open('model_data.pickle','rb')) as openfile:
    df = pickle.load(openfile)
df.head()

Unnamed: 0,business_id,business_stars,review_count,categories,user_id,text,review_stars,restaurants_table_service,wifi,bike_parking,...,alcohol,good_for_meal,dogs_allowed,restaurants_take_out,noise_level,restaurants_attire,restaurants_delivery,good_for_kids,good_for_dancing,music
0,6iYb2HFDywm3zjuRg0shjw,4.0,86,"Gastropubs, Food, Beer Gardens, Restaurants, B...",93z0yh-sUpGZS-cSKu6xxA,Stopped in on a busy Friday night. Despite the...,5,True,free,True,...,beer_and_wine,"{'dessert': False, 'latenight': False, 'lunch'...",False,True,average,casual,,,,True
1,6iYb2HFDywm3zjuRg0shjw,4.0,86,"Gastropubs, Food, Beer Gardens, Restaurants, B...",Q_CZIvnsDHjpls-EPzzG7Q,Went there about 1 PM on a Monday. It wasn't ...,2,True,free,True,...,beer_and_wine,"{'dessert': False, 'latenight': False, 'lunch'...",False,True,average,casual,,,,True
2,6iYb2HFDywm3zjuRg0shjw,4.0,86,"Gastropubs, Food, Beer Gardens, Restaurants, B...",rqxTSFFj5fZNmabY1fmTlw,This was the place the be on Friday Night! If ...,5,True,free,True,...,beer_and_wine,"{'dessert': False, 'latenight': False, 'lunch'...",False,True,average,casual,,,,True
3,6iYb2HFDywm3zjuRg0shjw,4.0,86,"Gastropubs, Food, Beer Gardens, Restaurants, B...",vNPxlt5f50q0e2nVAScW3Q,Went to this place with my family over the wee...,4,True,free,True,...,beer_and_wine,"{'dessert': False, 'latenight': False, 'lunch'...",False,True,average,casual,,,,True
4,6iYb2HFDywm3zjuRg0shjw,4.0,86,"Gastropubs, Food, Beer Gardens, Restaurants, B...",eXRC79iX60xwA1UuGRuWNg,"Stopped on a midweek afternoon, and so glad th...",4,True,free,True,...,beer_and_wine,"{'dessert': False, 'latenight': False, 'lunch'...",False,True,average,casual,,,,True


In [5]:
review_data = df[['review_stars','text']].sample(n=2000000)

In [6]:
review_data.head()

Unnamed: 0,review_stars,text
4224672,5,It's a pretty average Target store. This locat...
1501709,5,"Incredible service, quality and friendly staff..."
313502,1,::: UPDATED::: \nBack in March 07I rated this ...
4639667,5,"Great food & friendly people. I'm vegetarian, ..."
85350,4,"Tucci's. I've now been once for brunch, and it..."


In [7]:
review_data['review_stars'].dtypes

dtype('int64')

## Modifying the ratings to more simplistic rating
Anything less than 3 stars is labels as a bad review while anything 3 or greater is a positive review.

In [8]:
def simplify_ratings(x):
    if x <=2:
        x = 0
    else:
        x = 1
    return x
review_data['review_stars'] = review_data['review_stars'].apply(lambda x: simplify_ratings(x) )

In [9]:
review_data.head()

Unnamed: 0,review_stars,text
4224672,1,It's a pretty average Target store. This locat...
1501709,1,"Incredible service, quality and friendly staff..."
313502,0,::: UPDATED::: \nBack in March 07I rated this ...
4639667,1,"Great food & friendly people. I'm vegetarian, ..."
85350,1,"Tucci's. I've now been once for brunch, and it..."


## Splitting data into two
I've limited it to a million for performance to produce a proof of concept

In [10]:
text = review_data['text'].values
label = review_data['review_stars'].values

In [11]:
text.shape

(2000000,)

In [12]:
label.shape

(2000000,)

In [13]:
text[0]

"It's a pretty average Target store. This location seems quieter with less traffic than normal. Not sure how long they will be in business. They carry most things that you might expect and/or need. Staff is nice. The store has a relaxed environment, low pressure."

In [14]:
nlp = spacy.load("en_core_web_trf")
doc = nlp("This is a sentence.")
print([(w.text, w.pos_) for w in doc])

[('This', 'DET'), ('is', 'AUX'), ('a', 'DET'), ('sentence', 'NOUN'), ('.', 'PUNCT')]


In [15]:
text[0]

"It's a pretty average Target store. This location seems quieter with less traffic than normal. Not sure how long they will be in business. They carry most things that you might expect and/or need. Staff is nice. The store has a relaxed environment, low pressure."

In [16]:
parsed_text = nlp(text[0])
parsed_text

It's a pretty average Target store. This location seems quieter with less traffic than normal. Not sure how long they will be in business. They carry most things that you might expect and/or need. Staff is nice. The store has a relaxed environment, low pressure.

In [17]:
for i,sentance in enumerate(parsed_text.sents):
    print(i,':',sentance)

0 : It's a pretty average Target store.
1 : This location seems quieter with less traffic than normal.
2 : Not sure how long they will be in business.
3 : They carry most things that you might expect and/or need.
4 : Staff is nice.
5 : The store has a relaxed environment, low pressure.


In [18]:
for num, entity in enumerate(nlp(text[10]).ents):
    print ('Entity {}:'.format(num + 1), entity, '-', entity.label_)

Entity 1: Amy - PERSON
Entity 2: French - NORP


In [19]:
token_pos = [token.pos_ for token in nlp(text[10])]
tokens = [token for token in nlp(text[10])]
sd = list(zip(tokens,token_pos))
sd = pd.DataFrame(sd,columns=['token','pos'])
sd.head()

Unnamed: 0,token,pos
0,I,PRON
1,had,VERB
2,a,DET
3,mani,X
4,pedi,X


In [20]:
max_num_words = 1000
max_seq_length = 100
tokenizer = Tokenizer(num_words=max_num_words)

In [21]:
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)
word_index = tokenizer.word_index

In [22]:
len(word_index)

407631

In [23]:
padded_text = pad_sequences(sequences, maxlen=max_seq_length)
padded_text

array([[  0,   0,   0, ...,  87,   4, 785],
       [  0,   0,   0, ...,   6,   4, 898],
       [  1, 257,  11, ...,   3,  98,  19],
       ...,
       [  5,  57,  37, ...,  79, 205,   6],
       [ 71, 176, 236, ..., 980,  14,  72],
       [  0,   0,   0, ..., 227,  11, 463]])

In [24]:
padded_text.shape

(2000000, 100)

In [25]:
label = tf.keras.utils.to_categorical(
    np.asarray(label)
)

In [26]:
label.shape

(2000000, 2)

In [27]:
validation_split = 0.2
indices = np.arange(text.shape[0])
np.random.shuffle(indices)

In [28]:
padded_text = padded_text[indices]
padded_text

array([[  0,   0,   0, ..., 199,  45, 188],
       [  0,   0,   0, ..., 327,   7, 302],
       [  0,   0,   0, ..., 407, 174,  31],
       ...,
       [  0,   0,   0, ..., 143,  35,   2],
       [  0,   0,   0, ..., 841,  49, 400],
       [  0,   0,   0, ..., 169,   4, 349]])

In [29]:
label = label[indices]
label

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]], dtype=float32)

In [30]:
nb_validation_samples = int(validation_split*text.shape[0])
nb_validation_samples

400000

## Splitting Data

In [42]:
x_train = padded_text[:-nb_validation_samples]
y_train = label[:-nb_validation_samples]
x_val = padded_text[-nb_validation_samples:]
y_val = label[-nb_validation_samples:]

In [32]:
X_train, X_test, y_train, y_test  = train_test_split(text,label,test_size=0.3, random_state=42)

## Utilizing Glove for Embedding

In [33]:
glove_dir = './glove/'

In [34]:
embedding_index = {}

f = open(os.path.join(glove_dir,'glove.6B.50d.txt'),encoding="utf8")

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:],dtype='float32')
    embedding_index[word] = coefs
f.close()

print('found word vecs: ',len(embedding_index))

found word vecs:  400000


In [35]:
embedding_dim = 50
embedding_matrix = np.zeros((len(word_index)+1,embedding_dim))
embedding_matrix.shape

(407632, 50)

In [36]:
for word,i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Creating Layers and Model

In [37]:
embedding_layer = Embedding(len(word_index)+1,embedding_dim,weights=[embedding_matrix],input_length=max_seq_length,trainable=False)

In [38]:
inp = Input(shape=(max_seq_length,))
x = embedding_layer(inp)
x = Bidirectional(LSTM(50,return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50,activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(2,activation='sigmoid')(x)
model = Model(inputs=inp,outputs=x)

In [39]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [40]:
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

(1600000, 100)
(1400000, 2)
(400000, 100)
(400000, 2)


## Fitting Model

In [43]:
model.fit(x_train,y_train,validation_data=(x_val,y_val),epochs=20,batch_size=1500, verbose=1,use_multiprocessing=True);

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [44]:
score = model.evaluate(x_val,y_val)
score



[0.1685318797826767, 0.9293375015258789]

In [45]:
score[1]*100

92.93375015258789

In [46]:
y_pred = model.predict(x_val)

test_data = pd.DataFrame()

y_val_min = y_val.argmax(axis=1)
y_pred_min = y_pred.argmax(axis=1)

test_data['sentiment'] = y_val_min
test_data['sentiment'] = test_data['sentiment'].apply(lambda x: str(x))
test_data['sentiment'] = np.where((test_data.sentiment=='0'),'negative',test_data.sentiment)
test_data['sentiment'] = np.where((test_data.sentiment=='1'),'positive',test_data.sentiment)

test_data['pred_sentiment'] = y_pred_min
test_data['pred_sentiment'] = test_data['pred_sentiment'].apply(lambda x: str(x))
test_data['pred_sentiment'] = np.where((test_data.pred_sentiment=='0'),'negative',test_data.pred_sentiment)
test_data['pred_sentiment'] = np.where((test_data.pred_sentiment=='1'),'positive',test_data.pred_sentiment)

labels = ['negative', 'positive']


print(classification_report(test_data['sentiment'],test_data['pred_sentiment'], labels=labels))

confusion_matrix(test_data['sentiment'],test_data['pred_sentiment'], labels=labels)
df_matrix=pd.DataFrame(confusion_matrix(test_data['sentiment'], test_data['pred_sentiment']), 
             columns=["Predicted Negative", "Predicted Positive"], 
             index=["Actual Negative", "Actual Positive"])
df_matrix.style.background_gradient(cmap='Blues')

              precision    recall  f1-score   support

    negative       0.84      0.85      0.85     91552
    positive       0.96      0.95      0.95    308448

    accuracy                           0.93    400000
   macro avg       0.90      0.90      0.90    400000
weighted avg       0.93      0.93      0.93    400000



Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,77864,13688
Actual Positive,14577,293871


## Testing model with different data

In [47]:
reviews = [
    # NEG
    'WORST FOOD I EVER HAD. It was burnt, it was cold. The waiter spilt coffee on my lap and did not apologize',
    'Stacy was a pleasent waitress during our stay. She did everything to make sure we felt like home. The food was amazing.',
    # NEG
    'No one asked for Mary Poppinss return to modern consciousness, but her reappearance unmistakably proves that Hollywood Boomers are desperate to justify their own mediocrity through nostalgic sentiment',
    '"Cast Away" is an exceptionally well-crafted exploration of the survival of the human spirit. Its a movie unafraid to consider the full complexity of life.',
    'Somewhat entertaining especially with a lot of the unintended comedy. At times very tedious and the main concept of the film was completely lost.',
    # NEG
    'You’d better have something in the fridge at home, because the likelihood of your joining the Clean Plate Club here is as good as getting invited to a Christmas party at the White House.',
    'A great movie that shows the progress of human development through Tom Hanks character while he is stranded on the desert island. But...all that is overshadowed by Wilson, who will remain in our hearts for all eternity.',
    # NEG
    'There’s V for Vegan. There’s GF for Gluten Free. There’s DF for Dairy Free. I think they’re missing a few. There should be TF for Taste Free and JF for Joy Free and AAHYWEH for Abandon All Hope, Ye Who Enter Here',
    'Probably one of the best disaster emotional films ever. A classic game of survival that is played absolutely perfectly.',
    # NEG
    'While there are many words I could use to describe Louie Louie, I’m going to say only this: Louie Louie is a bad restaurant',]

In [48]:
#tokenizer.fit_on_texts(reviews)
padded_text = pad_sequences(tokenizer.texts_to_sequences(reviews), maxlen=max_seq_length)

In [49]:
res = model.predict(padded_text)

In [50]:
for index, x in enumerate(res):
    if x[1] > .5:
        print(f'Positive Review:\n\t {reviews[index]}')
    else:
        print(f'Negative Review:\n\t {reviews[index]}')
    print(f'------------------------------------------------------------------------------------------')

Negative Review:
	 WORST FOOD I EVER HAD. It was burnt, it was cold. The waiter spilt coffee on my lap and did not apologize
------------------------------------------------------------------------------------------
Positive Review:
	 Stacy was a pleasent waitress during our stay. She did everything to make sure we felt like home. The food was amazing.
------------------------------------------------------------------------------------------
Negative Review:
	 No one asked for Mary Poppinss return to modern consciousness, but her reappearance unmistakably proves that Hollywood Boomers are desperate to justify their own mediocrity through nostalgic sentiment
------------------------------------------------------------------------------------------
Positive Review:
	 "Cast Away" is an exceptionally well-crafted exploration of the survival of the human spirit. Its a movie unafraid to consider the full complexity of life.
--------------------------------------------------------------------

## Save Model 

In [51]:
model.save('review_model_2mill.h5')

## Save Tokenizer

In [52]:
token = tokenizer.to_json()
with open('tokenizer_2mill.pickle','wb') as handle:
    pickle.dump(token,handle)

## Load Model and Tokenizer

In [53]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer_json = pickle.load(handle)

tk = tf.keras.preprocessing.text.tokenizer_from_json(
    tokenizer_json
)

In [54]:
# Getting Sequences for the test data and adding padding accordingly
X_seq = pad_sequences(tk.texts_to_sequences(text), maxlen = 36, padding = 'post')
display(X_seq)

array([[123, 667, 243, ...,  86,   4, 849],
       [765,  46, 234, ...,   0,   0,   0],
       [ 95,  45,  37, ...,   3,  94,  19],
       ...,
       [ 20,  33,  50, ...,  80, 195,   6],
       [ 31, 229, 475, ..., 947,  14,  72],
       [ 63, 387,  19, ..., 217,  11, 476]])

In [55]:
from keras.models import load_model
loaded_model = load_model('review_model.h5')

In [56]:
score = loaded_model.evaluate(x_val,y_val)
score



[0.5737026929855347, 0.7400575280189514]

## Testing with loaded Model and Tokenizer

In [57]:
reviews = [
    "Slowly, gradually, with great mental resistance but still inexorably, it dawned on me that I had paid $98 for a duck with almost no flavor. It was dry, too.",
    "Every time I have been here there have been issues. The last time I came here the French fries were disgusting. Mushy oily mess. Not crisp whatsoever.",
    "I absolutely loved the way they used to make their cookies with the chocolate filling but from one of the questions posted, it seems the supplier went out of business from Covid. I've tried the regular chocolate chip cookies they have now and they're still great just different.",
]


In [58]:
padded_text = pad_sequences(tk.texts_to_sequences(reviews), maxlen=100)

In [59]:
res = loaded_model.predict(padded_text)

In [60]:
for index, x in enumerate(res):
    if x[1] > .5:
        print(f'Positive Review:\n\t {reviews[index]}')
    else:
        print(f'Negative Review:\n\t {reviews[index]}')
    print(f'------------------------------------------------------------------------------------------')

Positive Review:
	 Slowly, gradually, with great mental resistance but still inexorably, it dawned on me that I had paid $98 for a duck with almost no flavor. It was dry, too.
------------------------------------------------------------------------------------------
Negative Review:
	 Every time I have been here there have been issues. The last time I came here the French fries were disgusting. Mushy oily mess. Not crisp whatsoever.
------------------------------------------------------------------------------------------
Positive Review:
	 I absolutely loved the way they used to make their cookies with the chocolate filling but from one of the questions posted, it seems the supplier went out of business from Covid. I've tried the regular chocolate chip cookies they have now and they're still great just different.
------------------------------------------------------------------------------------------
