In [1]:
#!pip install rake-nltk
#!pip install -U spacy
#!python -m spacy download en_core_web_trf

## Importing Libraries

In [1]:
import os
import spacy
import pickle
import keras

import numpy as np
import pandas as pd
import tensorflow as tf


from keras.layers import Embedding
from keras.layers import Bidirectional,GlobalMaxPool1D,Conv1D
from keras.layers import LSTM,Input,Dense,Dropout,Activation
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = '-1'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

## Reading in Data

In [3]:
df = pd.DataFrame();
with (open('model_data.pickle','rb')) as openfile:
    df = pickle.load(openfile)
df.head()

Unnamed: 0,business_id,business_stars,review_count,categories,user_id,text,review_stars,restaurants_table_service,wifi,bike_parking,...,alcohol,good_for_meal,dogs_allowed,restaurants_take_out,noise_level,restaurants_attire,restaurants_delivery,good_for_kids,good_for_dancing,music
0,6iYb2HFDywm3zjuRg0shjw,4.0,86,"Gastropubs, Food, Beer Gardens, Restaurants, B...",93z0yh-sUpGZS-cSKu6xxA,Stopped in on a busy Friday night. Despite the...,5,True,free,True,...,beer_and_wine,"{'dessert': False, 'latenight': False, 'lunch'...",False,True,average,casual,,,,True
1,6iYb2HFDywm3zjuRg0shjw,4.0,86,"Gastropubs, Food, Beer Gardens, Restaurants, B...",Q_CZIvnsDHjpls-EPzzG7Q,Went there about 1 PM on a Monday. It wasn't ...,2,True,free,True,...,beer_and_wine,"{'dessert': False, 'latenight': False, 'lunch'...",False,True,average,casual,,,,True
2,6iYb2HFDywm3zjuRg0shjw,4.0,86,"Gastropubs, Food, Beer Gardens, Restaurants, B...",rqxTSFFj5fZNmabY1fmTlw,This was the place the be on Friday Night! If ...,5,True,free,True,...,beer_and_wine,"{'dessert': False, 'latenight': False, 'lunch'...",False,True,average,casual,,,,True
3,6iYb2HFDywm3zjuRg0shjw,4.0,86,"Gastropubs, Food, Beer Gardens, Restaurants, B...",vNPxlt5f50q0e2nVAScW3Q,Went to this place with my family over the wee...,4,True,free,True,...,beer_and_wine,"{'dessert': False, 'latenight': False, 'lunch'...",False,True,average,casual,,,,True
4,6iYb2HFDywm3zjuRg0shjw,4.0,86,"Gastropubs, Food, Beer Gardens, Restaurants, B...",eXRC79iX60xwA1UuGRuWNg,"Stopped on a midweek afternoon, and so glad th...",4,True,free,True,...,beer_and_wine,"{'dessert': False, 'latenight': False, 'lunch'...",False,True,average,casual,,,,True


In [4]:
review_data = df[['review_stars','text']].sample(n=10000)

In [5]:
review_data.head()

Unnamed: 0,review_stars,text
1165574,5,Since moving to Vancouver 5 years ago I had a ...
5565016,4,"I have to say, this is a bit of an upscale pla..."
1170934,3,The staff has changed a bit much so the servic...
8612105,2,Non existent customer service. They just don't...
7688485,5,I've been holding off on writing my review for...


In [6]:
review_data['review_stars'].dtypes

dtype('int64')

## Modifying the ratings to more simplistic rating
Anything less than 3 stars is labels as a bad review while anything 3 or greater is a positive review.

In [7]:
def simplify_ratings(x):
    if x <=2:
        x = 0
    else:
        x = 1
    return x
review_data['review_stars'] = review_data['review_stars'].apply(lambda x: simplify_ratings(x) )

In [9]:
review_data.head()

Unnamed: 0,review_stars,text
1165574,1,Since moving to Vancouver 5 years ago I had a ...
5565016,1,"I have to say, this is a bit of an upscale pla..."
1170934,1,The staff has changed a bit much so the servic...
8612105,0,Non existent customer service. They just don't...
7688485,1,I've been holding off on writing my review for...


## Splitting data into two
I've limited it to a million for performance to produce a proof of concept

In [10]:
text = review_data['text'].values
label = review_data['review_stars'].values

In [11]:
text.shape

(10000,)

In [12]:
label.shape

(10000,)

In [13]:
text[0]

"Since moving to Vancouver 5 years ago I had a struggle to find a dentist and a dental office that  I felt completely comfortable and trusting in. After experiencing 4 other dental offices before I met Anita and her team, I am happy to say that Red Tree Dental is by far the best experience I've had in Vancouver. The entire team are very friendly, knowledgeable and helpful. The dental office feels more like a spa than your regular experience - there is art work on the walls (which you can even purchase), wonderful and clean washrooms, comfortable chairs, relaxing music and they even offer you a hot towel to refresh yourself after your appointment!\n\nI first had an appointment at Red Tree Dental in late-2015 for a consultation for wisdom teeth removal, after a previous dentist had told me that it would be a complicated procedure and I'd need to have a oral surgeon remove them (which would could triple the usual amount of having wisdom teeth removed at a dental office). Anita told me the

In [16]:
nlp = spacy.load("en_core_web_trf")
doc = nlp("This is a sentence.")
print([(w.text, w.pos_) for w in doc])

[('This', 'DET'), ('is', 'AUX'), ('a', 'DET'), ('sentence', 'NOUN'), ('.', 'PUNCT')]


In [17]:
text[0]

"Since moving to Vancouver 5 years ago I had a struggle to find a dentist and a dental office that  I felt completely comfortable and trusting in. After experiencing 4 other dental offices before I met Anita and her team, I am happy to say that Red Tree Dental is by far the best experience I've had in Vancouver. The entire team are very friendly, knowledgeable and helpful. The dental office feels more like a spa than your regular experience - there is art work on the walls (which you can even purchase), wonderful and clean washrooms, comfortable chairs, relaxing music and they even offer you a hot towel to refresh yourself after your appointment!\n\nI first had an appointment at Red Tree Dental in late-2015 for a consultation for wisdom teeth removal, after a previous dentist had told me that it would be a complicated procedure and I'd need to have a oral surgeon remove them (which would could triple the usual amount of having wisdom teeth removed at a dental office). Anita told me the

In [18]:
parsed_text = nlp(text[0])
parsed_text

Since moving to Vancouver 5 years ago I had a struggle to find a dentist and a dental office that  I felt completely comfortable and trusting in. After experiencing 4 other dental offices before I met Anita and her team, I am happy to say that Red Tree Dental is by far the best experience I've had in Vancouver. The entire team are very friendly, knowledgeable and helpful. The dental office feels more like a spa than your regular experience - there is art work on the walls (which you can even purchase), wonderful and clean washrooms, comfortable chairs, relaxing music and they even offer you a hot towel to refresh yourself after your appointment!

I first had an appointment at Red Tree Dental in late-2015 for a consultation for wisdom teeth removal, after a previous dentist had told me that it would be a complicated procedure and I'd need to have a oral surgeon remove them (which would could triple the usual amount of having wisdom teeth removed at a dental office). Anita told me there 

In [19]:
for i,sentance in enumerate(parsed_text.sents):
    print(i,':',sentance)

0 : Since moving to Vancouver 5 years ago I had a struggle to find a dentist and a dental office that  I felt completely comfortable and trusting in.
1 : After experiencing 4 other dental offices before I met Anita and her team, I am happy to say that Red Tree Dental is by far the best experience I've had in Vancouver.
2 : The entire team are very friendly, knowledgeable and helpful.
3 : The dental office feels more like a spa than your regular experience - there is art work on the walls (which you can even purchase), wonderful and clean washrooms, comfortable chairs, relaxing music and they even offer you a hot towel to refresh yourself after your appointment!
4 : 

I first had an appointment at Red Tree Dental in late-2015 for a consultation for wisdom teeth removal, after a previous dentist had told me that it would be a complicated procedure and I'd need to have a oral surgeon remove them (which would could triple the usual amount of having wisdom teeth removed at a dental office).

In [20]:
for num, entity in enumerate(nlp(text[10]).ents):
    print ('Entity {}:'.format(num + 1), entity, '-', entity.label_)

Entity 1: Three - CARDINAL
Entity 2: Mojo Pork - WORK_OF_ART
Entity 3: first - ORDINAL
Entity 4: last night - TIME
Entity 5: 8:30 pm - TIME
Entity 6: 4 hrs - TIME
Entity 7: over 4 hrs - TIME
Entity 8: today - DATE


In [21]:
token_pos = [token.pos_ for token in nlp(text[10])]
tokens = [token for token in nlp(text[10])]
sd = list(zip(tokens,token_pos))
sd = pd.DataFrame(sd,columns=['token','pos'])
sd.head()

Unnamed: 0,token,pos
0,Very,ADV
1,inconsistent,ADJ
2,!,PUNCT
3,I,PRON
4,have,AUX


In [22]:
max_num_words = 1000
max_seq_length = 100
tokenizer = Tokenizer(num_words=max_num_words)

In [23]:
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)
word_index = tokenizer.word_index

In [24]:
len(word_index)

28954

In [25]:
padded_text = pad_sequences(sequences, maxlen=max_seq_length)
padded_text

array([[ 61, 100,  74, ..., 102, 139, 139],
       [  0,   0,   0, ...,  37, 131, 111],
       [  0,   0,   0, ...,   1,  11,  45],
       ...,
       [  0,   0,   0, ...,  33,  24, 109],
       [ 45,  91,  22, ...,  19,   5,  58],
       [  0,   0,   0, ..., 270, 545, 224]])

In [26]:
padded_text.shape

(10000, 100)

In [27]:
label = tf.keras.utils.to_categorical(
    np.asarray(label)
)

In [28]:
label.shape

(10000, 2)

In [29]:
validation_split = 0.2
indices = np.arange(text.shape[0])
np.random.shuffle(indices)

In [30]:
padded_text = padded_text[indices]
padded_text

array([[  0,   0,   0, ...,   6,  22,  32],
       [  0,   0,   0, ...,  83,  35,  32],
       [  0,   0,  86, ...,   1, 354, 233],
       ...,
       [ 92,   2, 732, ..., 246,   1, 419],
       [ 59, 407,  31, ...,  51,  27, 910],
       [  0,   0,   0, ..., 445, 141, 150]])

In [31]:
label = label[indices]
label

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [32]:
nb_validation_samples = int(validation_split*text.shape[0])
nb_validation_samples

2000

## Splitting Data

In [33]:
x_train = padded_text[:-nb_validation_samples]
y_train = label[:-nb_validation_samples]
x_val = padded_text[-nb_validation_samples:]
y_val = label[-nb_validation_samples:]

In [33]:
X_train, X_test, y_train,y_test = train_test_split(text,label,test_size=0.3, random_state=42)

## Utilizing Glove for Embedding

In [34]:
glove_dir = './glove/'

In [35]:
embedding_index = {}

f = open(os.path.join(glove_dir,'glove.6B.50d.txt'),encoding="utf8")

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:],dtype='float32')
    embedding_index[word] = coefs
f.close()

print('found word vecs: ',len(embedding_index))

found word vecs:  400000


In [36]:
embedding_dim = 50
embedding_matrix = np.zeros((len(word_index)+1,embedding_dim))
embedding_matrix.shape

(28955, 50)

In [37]:
for word,i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Creating Layers and Model

In [38]:

embedding_layer = Embedding(len(word_index)+1,embedding_dim,weights=[embedding_matrix],input_length=max_seq_length,trainable=False)

In [39]:
inp = Input(shape=(max_seq_length,))
x = embedding_layer(inp)
x = Bidirectional(LSTM(50,return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50,activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(2,activation='sigmoid')(x)
model = Model(inputs=inp,outputs=x)

In [40]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [41]:
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

(8000, 100)
(8000, 2)
(2000, 100)
(2000, 2)


## Fitting Model

In [42]:
model.fit(x_train,y_train,validation_data=(x_val,y_val),epochs=20,batch_size=1000, verbose=1);

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [108]:
Model.summary(self, line_length=None, positions=None, print_fn=None)

NameError: name 'self' is not defined

In [43]:
score = model.evaluate(x_val,y_val)
score



[0.31508174538612366, 0.8644999861717224]

In [44]:
score[1]*100

86.44999861717224

In [45]:
y_pred = model.predict(x_val)

test_data = pd.DataFrame()

y_val_min = y_val.argmax(axis=1)
y_pred_min = y_pred.argmax(axis=1)

test_data['sentiment'] = y_val_min
test_data['sentiment'] = test_data['sentiment'].apply(lambda x: str(x))
test_data['sentiment'] = np.where((test_data.sentiment=='0'),'negative',test_data.sentiment)
test_data['sentiment'] = np.where((test_data.sentiment=='1'),'positive',test_data.sentiment)

test_data['pred_sentiment'] = y_pred_min
test_data['pred_sentiment'] = test_data['pred_sentiment'].apply(lambda x: str(x))
test_data['pred_sentiment'] = np.where((test_data.pred_sentiment=='0'),'negative',test_data.pred_sentiment)
test_data['pred_sentiment'] = np.where((test_data.pred_sentiment=='1'),'positive',test_data.pred_sentiment)

labels = ['negative', 'positive']


print(classification_report(test_data['sentiment'],test_data['pred_sentiment'], labels=labels))

confusion_matrix(test_data['sentiment'],test_data['pred_sentiment'], labels=labels)
df_matrix=pd.DataFrame(confusion_matrix(test_data['sentiment'], test_data['pred_sentiment']), 
             columns=["Predicted Negative", "Predicted Positive"], 
             index=["Actual Negative", "Actual Positive"])
df_matrix.style.background_gradient(cmap='Blues')

              precision    recall  f1-score   support

    negative       0.75      0.57      0.65       440
    positive       0.89      0.95      0.92      1560

    accuracy                           0.86      2000
   macro avg       0.82      0.76      0.78      2000
weighted avg       0.86      0.86      0.86      2000



Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,253,187
Actual Positive,84,1476


## Testing model with different data

In [46]:
reviews = [
    # NEG
    'WORST FOOD I EVER HAD. It was burnt, it was cold. The waiter spilt coffee on my lap and did not apologize',
    'Stacy was a pleasent waitress during our stay. She did everything to make sure we felt like home. The food was amazing.',
    # NEG
    'No one asked for Mary Poppinss return to modern consciousness, but her reappearance unmistakably proves that Hollywood Boomers are desperate to justify their own mediocrity through nostalgic sentiment',
    '"Cast Away" is an exceptionally well-crafted exploration of the survival of the human spirit. Its a movie unafraid to consider the full complexity of life.',
    'Somewhat entertaining especially with a lot of the unintended comedy. At times very tedious and the main concept of the film was completely lost.',
    # NEG
    'You’d better have something in the fridge at home, because the likelihood of your joining the Clean Plate Club here is as good as getting invited to a Christmas party at the White House.',
    'A great movie that shows the progress of human development through Tom Hanks character while he is stranded on the desert island. But...all that is overshadowed by Wilson, who will remain in our hearts for all eternity.',
    # NEG
    'There’s V for Vegan. There’s GF for Gluten Free. There’s DF for Dairy Free. I think they’re missing a few. There should be TF for Taste Free and JF for Joy Free and AAHYWEH for Abandon All Hope, Ye Who Enter Here',
    'Probably one of the best disaster emotional films ever. A classic game of survival that is played absolutely perfectly.',
    # NEG
    'While there are many words I could use to describe Louie Louie, I’m going to say only this: Louie Louie is a bad restaurant',]

In [47]:
#tokenizer.fit_on_texts(reviews)
padded_text = pad_sequences(tokenizer.texts_to_sequences(reviews), maxlen=max_seq_length)

In [48]:
res = model.predict(padded_text)

In [49]:
for index, x in enumerate(res):
    if x[1] > .5:
        print(f'Positive Review:\n\t {reviews[index]}')
    else:
        print(f'Negative Review:\n\t {reviews[index]}')
    print(f'------------------------------------------------------------------------------------------')

Negative Review:
	 WORST FOOD I EVER HAD. It was burnt, it was cold. The waiter spilt coffee on my lap and did not apologize
------------------------------------------------------------------------------------------
Positive Review:
	 Stacy was a pleasent waitress during our stay. She did everything to make sure we felt like home. The food was amazing.
------------------------------------------------------------------------------------------
Negative Review:
	 No one asked for Mary Poppinss return to modern consciousness, but her reappearance unmistakably proves that Hollywood Boomers are desperate to justify their own mediocrity through nostalgic sentiment
------------------------------------------------------------------------------------------
Positive Review:
	 "Cast Away" is an exceptionally well-crafted exploration of the survival of the human spirit. Its a movie unafraid to consider the full complexity of life.
--------------------------------------------------------------------

## Save Model 

In [109]:
model.save('review_model.h5')

## Save Tokenizer

In [114]:
token = tokenizer.to_json()
with open('tokenizer.pickle','wb') as handle:
    pickle.dump(token,handle)

## Load Model and Tokenizer

In [58]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer_json = pickle.load(handle)

tk = tf.keras.preprocessing.text.tokenizer_from_json(
    tokenizer_json
)

In [None]:
# Getting Sequences for the test data and adding padding accordingly
X_seq = pad_sequences(tk.texts_to_sequences(text), maxlen = 36, padding = 'post')
display(X_seq)

In [59]:
from keras.models import load_model
loaded_model = load_model('review_model.h5')

In [None]:
score = loaded_model.evaluate(x_val,y_val)
score

## Testing with loaded Model and Tokenizer

In [60]:
reviews = ["This place is horrible. The food was cold and the meals was unfilling."]

In [61]:
#tk.fit_on_texts(reviews)
padded_text = pad_sequences(tk.texts_to_sequences(reviews), maxlen=100)

In [62]:
padded_text

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  15,  32,   9,
        589,   1,  28,   6, 384,   2,   1, 756,   6]])

In [63]:
res = loaded_model.predict(padded_text)

In [64]:
for index, x in enumerate(res):
    if x[1] > .5:
        print(f'Positive Review:\n\t {reviews[index]}')
    else:
        print(f'Negative Review:\n\t {reviews[index]}')
    print(f'------------------------------------------------------------------------------------------')

Negative Review:
	 This place is horrible. The food was cold and the meals was unfilling.
------------------------------------------------------------------------------------------
