In [1]:
#!pip install rake-nltk
#!pip install -U spacy
#!python -m spacy download en_core_web_trf

## Importing Libraries

In [105]:
import os
import spacy
import pickle
import keras

import numpy as np
import pandas as pd
import tensorflow as tf


from keras.layers import Embedding
from keras.layers import Bidirectional,GlobalMaxPool1D,Conv1D
from keras.layers import LSTM,Input,Dense,Dropout,Activation
from keras.models import Model

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [106]:
os.environ["CUDA_VISIBLE_DEVICES"] = '-1'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

## Reading in Data

In [4]:
df = pd.DataFrame();
with (open('model_data.pickle','rb')) as openfile:
    df = pickle.load(openfile)
df.head()

Unnamed: 0,business_id,business_stars,review_count,categories,user_id,text,review_stars,restaurants_table_service,wifi,bike_parking,...,alcohol,good_for_meal,dogs_allowed,restaurants_take_out,noise_level,restaurants_attire,restaurants_delivery,good_for_kids,good_for_dancing,music
0,6iYb2HFDywm3zjuRg0shjw,4.0,86,"Gastropubs, Food, Beer Gardens, Restaurants, B...",93z0yh-sUpGZS-cSKu6xxA,Stopped in on a busy Friday night. Despite the...,5,True,free,True,...,beer_and_wine,"{'dessert': False, 'latenight': False, 'lunch'...",False,True,average,casual,,,,True
1,6iYb2HFDywm3zjuRg0shjw,4.0,86,"Gastropubs, Food, Beer Gardens, Restaurants, B...",Q_CZIvnsDHjpls-EPzzG7Q,Went there about 1 PM on a Monday. It wasn't ...,2,True,free,True,...,beer_and_wine,"{'dessert': False, 'latenight': False, 'lunch'...",False,True,average,casual,,,,True
2,6iYb2HFDywm3zjuRg0shjw,4.0,86,"Gastropubs, Food, Beer Gardens, Restaurants, B...",rqxTSFFj5fZNmabY1fmTlw,This was the place the be on Friday Night! If ...,5,True,free,True,...,beer_and_wine,"{'dessert': False, 'latenight': False, 'lunch'...",False,True,average,casual,,,,True
3,6iYb2HFDywm3zjuRg0shjw,4.0,86,"Gastropubs, Food, Beer Gardens, Restaurants, B...",vNPxlt5f50q0e2nVAScW3Q,Went to this place with my family over the wee...,4,True,free,True,...,beer_and_wine,"{'dessert': False, 'latenight': False, 'lunch'...",False,True,average,casual,,,,True
4,6iYb2HFDywm3zjuRg0shjw,4.0,86,"Gastropubs, Food, Beer Gardens, Restaurants, B...",eXRC79iX60xwA1UuGRuWNg,"Stopped on a midweek afternoon, and so glad th...",4,True,free,True,...,beer_and_wine,"{'dessert': False, 'latenight': False, 'lunch'...",False,True,average,casual,,,,True


In [5]:
review_data = df[['review_stars','text']].sample(n=10000)

In [6]:
review_data.head()

Unnamed: 0,review_stars,text
4205015,4,"Straight from Japan, honest to god donburi mak..."
5839953,1,"I was a client and got a ""t shirt"" ... my back..."
2274589,3,One thing I know for sure here is that Yelper ...
2100610,4,A newer brewery ( at least to me) in the craft...
4137572,5,"Wide variety of Asian dishes, all excellent, w..."


In [7]:
review_data['review_stars'].dtypes

dtype('int64')

## Modifying the ratings to more simplistic rating
Anything less than 3 stars is labels as a bad review while anything 3 or greater is a positive review.

In [8]:
def simplify_ratings(x):
    if x <=2:
        x = 0
    else:
        x = 1
    return x
review_data['review_stars'] = review_data['review_stars'].apply(lambda x: simplify_ratings(x) )

In [9]:
review_data.head()

Unnamed: 0,review_stars,text
4205015,1,"Straight from Japan, honest to god donburi mak..."
5839953,0,"I was a client and got a ""t shirt"" ... my back..."
2274589,1,One thing I know for sure here is that Yelper ...
2100610,1,A newer brewery ( at least to me) in the craft...
4137572,1,"Wide variety of Asian dishes, all excellent, w..."


## Splitting data into two
I've limited it to a million for performance to produce a proof of concept

In [10]:
text = review_data['text'].values
label = review_data['review_stars'].values

In [11]:
text.shape

(10000,)

In [12]:
label.shape

(10000,)

In [13]:
text[0]

"Straight from Japan, honest to god donburi making you feel like you're in a train station somewhere on the main island after work stuffing your face.\n\nFresh fish, decent variety and hearty portions.\n\nStrictly rice bowls, some pre-packed dinners and assorted onigiri are served here so if you're expecting rolls, you might be disappointed.\n\nThere are like 5 chairs in this joint."

In [14]:
r = Rake()
r.extract_keywords_from_text(text[0])
r.get_ranked_phrases()

['train station somewhere',
 'strictly rice bowls',
 'god donburi making',
 'like 5 chairs',
 'feel like',
 'work stuffing',
 'packed dinners',
 'main island',
 'hearty portions',
 'fresh fish',
 'expecting rolls',
 'decent variety',
 'assorted onigiri',
 'straight',
 'served',
 'pre',
 'might',
 'joint',
 'japan',
 'honest',
 'face',
 'disappointed']

In [15]:
nlp = spacy.load("en_core_web_trf")
doc = nlp("This is a sentence.")
print([(w.text, w.pos_) for w in doc])

[('This', 'DET'), ('is', 'AUX'), ('a', 'DET'), ('sentence', 'NOUN'), ('.', 'PUNCT')]


In [16]:
text[0]

"Straight from Japan, honest to god donburi making you feel like you're in a train station somewhere on the main island after work stuffing your face.\n\nFresh fish, decent variety and hearty portions.\n\nStrictly rice bowls, some pre-packed dinners and assorted onigiri are served here so if you're expecting rolls, you might be disappointed.\n\nThere are like 5 chairs in this joint."

In [17]:
parsed_text = nlp(text[0])
parsed_text

Straight from Japan, honest to god donburi making you feel like you're in a train station somewhere on the main island after work stuffing your face.

Fresh fish, decent variety and hearty portions.

Strictly rice bowls, some pre-packed dinners and assorted onigiri are served here so if you're expecting rolls, you might be disappointed.

There are like 5 chairs in this joint.

In [18]:
for i,sentance in enumerate(parsed_text.sents):
    print(i,':',sentance)

0 : Straight from Japan, honest to god donburi making you feel like you're in a train station somewhere on the main island after work stuffing your face.
1 : 

Fresh fish, decent variety and hearty portions.
2 : 

Strictly rice bowls, some pre-packed dinners and assorted onigiri are served here so if you're expecting rolls, you might be disappointed.
3 : 

There are like 5 chairs in this joint.


In [19]:
for num, entity in enumerate(nlp(text[10]).ents):
    print ('Entity {}:'.format(num + 1), entity, '-', entity.label_)

Entity 1: first - ORDINAL
Entity 2: same day - DATE
Entity 3: the following day - DATE
Entity 4: tomorrow - DATE
Entity 5: tomorrow - DATE
Entity 6: Woolite - PRODUCT
Entity 7: Almost $50 - MONEY


In [20]:
token_pos = [token.pos_ for token in nlp(text[10])]
tokens = [token for token in nlp(text[10])]
sd = list(zip(tokens,token_pos))
sd = pd.DataFrame(sd,columns=['token','pos'])
sd.head()

Unnamed: 0,token,pos
0,It,PRON
1,is,AUX
2,my,PRON
3,first,ADJ
4,time,NOUN


In [21]:
max_num_words = 1000
max_seq_length = 100
tokenizer = Tokenizer(num_words=max_num_words)

In [22]:
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)
word_index = tokenizer.word_index

In [23]:
len(word_index)

28679

In [24]:
padded_text = pad_sequences(sequences, maxlen=max_seq_length)
padded_text

array([[  0,   0,   0, ..., 134,  11,  15],
       [  0,   0,   0, ...,  16,  27, 589],
       [  0,   0,   0, ...,  10,   4, 107],
       ...,
       [121, 161,  33, ...,  57,   5, 150],
       [  0,   0,   0, ...,   1, 340, 184],
       [  0,   0,   0, ...,  10, 167,  28]])

In [25]:
padded_text.shape

(10000, 100)

In [26]:
label = tf.keras.utils.to_categorical(
    np.asarray(label)
)

In [27]:
label.shape

(10000, 2)

In [28]:
validation_split = 0.2
indices = np.arange(text.shape[0])
np.random.shuffle(indices)

In [29]:
padded_text = padded_text[indices]
padded_text

array([[  0,   0,   0, ...,   1,  55, 645],
       [  0,   0,   0, ..., 159, 390,   8],
       [  0,   0,   0, ...,   9, 267, 590],
       ...,
       [ 23,  56, 179, ...,  15,  32, 115],
       [  0,   0,   0, ...,   1,   7, 755],
       [  0,   0,   0, ...,  79,  11, 455]])

In [30]:
label = label[indices]
label

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]], dtype=float32)

In [31]:
nb_validation_samples = int(validation_split*text.shape[0])
nb_validation_samples

2000

## Splitting Data

In [44]:
x_train = padded_text[:-nb_validation_samples]
y_train = label[:-nb_validation_samples]
x_val = padded_text[-nb_validation_samples:]
y_val = label[-nb_validation_samples:]

In [33]:
X_train, X_test, y_train,y_test = train_test_split(text,label,test_size=0.3, random_state=42)

## Utilizing Glove for Embedding

In [34]:
glove_dir = './glove/'

In [35]:
embedding_index = {}

f = open(os.path.join(glove_dir,'glove.6B.50d.txt'),encoding="utf8")

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:],dtype='float32')
    embedding_index[word] = coefs
f.close()

print('found word vecs: ',len(embedding_index))

found word vecs:  400000


In [36]:
embedding_dim = 50
embedding_matrix = np.zeros((len(word_index)+1,embedding_dim))
embedding_matrix.shape

(28680, 50)

In [37]:
for word,i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Creating Layers and Model

In [38]:

embedding_layer = Embedding(len(word_index)+1,embedding_dim,weights=[embedding_matrix],input_length=max_seq_length,trainable=False)

In [40]:
inp = Input(shape=(max_seq_length,))
x = embedding_layer(inp)
x = Bidirectional(LSTM(50,return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50,activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(2,activation='sigmoid')(x)
model = Model(inputs=inp,outputs=x)

In [41]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [45]:
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

(8000, 100)
(8000, 2)
(2000, 100)
(2000, 2)


## Fitting Model

In [46]:
model.fit(x_train,y_train,validation_data=(x_val,y_val),epochs=20,batch_size=1000, verbose=1);

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [108]:
Model.summary(self, line_length=None, positions=None, print_fn=None)

NameError: name 'self' is not defined

In [47]:
score = model.evaluate(x_val,y_val)
score



[0.31406673789024353, 0.862500011920929]

In [48]:
score[1]*100

86.2500011920929

In [97]:
y_pred = model.predict(x_val)

test_data = pd.DataFrame()

y_val_min = y_val.argmax(axis=1)
y_pred_min = y_pred.argmax(axis=1)

test_data['sentiment'] = y_val_min
test_data['sentiment'] = test_data['sentiment'].apply(lambda x: str(x))
test_data['sentiment'] = np.where((test_data.sentiment=='0'),'negative',test_data.sentiment)
test_data['sentiment'] = np.where((test_data.sentiment=='1'),'positive',test_data.sentiment)

test_data['pred_sentiment'] = y_pred_min
test_data['pred_sentiment'] = test_data['pred_sentiment'].apply(lambda x: str(x))
test_data['pred_sentiment'] = np.where((test_data.pred_sentiment=='0'),'negative',test_data.pred_sentiment)
test_data['pred_sentiment'] = np.where((test_data.pred_sentiment=='1'),'positive',test_data.pred_sentiment)

labels = ['negative', 'positive']


print(classification_report(test_data['sentiment'],test_data['pred_sentiment'], labels=labels))

confusion_matrix(test_data['sentiment'],test_data['pred_sentiment'], labels=labels)
df_matrix=pd.DataFrame(confusion_matrix(test_data['sentiment'], test_data['pred_sentiment']), 
             columns=["Predicted Negative", "Predicted Positive"], 
             index=["Actual Negative", "Actual Positive"])
df_matrix.style.background_gradient(cmap='Blues')

              precision    recall  f1-score   support

    negative       0.69      0.67      0.68       439
    positive       0.91      0.92      0.91      1561

    accuracy                           0.86      2000
   macro avg       0.80      0.79      0.80      2000
weighted avg       0.86      0.86      0.86      2000



Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,293,146
Actual Positive,129,1432


In [79]:
reviews = [
    # NEG
    '',
    
    'A great survival-on-an-island movie. Tom Hanks is superb. A sad story, but one that most people will like.',
    
    # NEG
    'No one asked for Mary Poppinss return to modern consciousness, but her reappearance unmistakably proves that Hollywood Boomers are desperate to justify their own mediocrity through nostalgic sentiment',
    
    '"Cast Away" is an exceptionally well-crafted exploration of the survival of the human spirit. Its a movie unafraid to consider the full complexity of life.',
    
    'Somewhat entertaining especially with a lot of the unintended comedy. At times very tedious and the main concept of the film was completely lost.',
    # NEG
    'The film shows shallow -- fake -- empathy with the Appalachian background that begins Vances humble brag about leaving backwoods hollers and winding up at Yale University',
    
    'A great movie that shows the progress of human development through Tom Hanks character while he is stranded on the desert island. But...all that is overshadowed by Wilson, who will remain in our hearts for all eternity.',
    # NEG
    'Trash like Red Sparrow, the Jennifer Lawrence spy movie, represents the garbagey essence of most Hollywood movies',
    
    'Probably one of the best disaster emotional films ever. A classic game of survival that is played absolutely perfectly.',
    # NEG
    'The film disastrously focuses on Udays outrages and does so without any moral perspective. "Rape, torture, disembowelment, killing, drinking, drugs and decadence" is practically the films synopsis',]

In [80]:
tokenizer.fit_on_texts(reviews)
padded_text = pad_sequences(tokenizer.texts_to_sequences(reviews), maxlen=max_seq_length)

In [99]:
res = model.predict(padded_text)

In [100]:
for x in res:
    if x[1] > .5:
        print(f'Positive Review')
    else:
        print(f'Negative Review')

Positive Review
Positive Review
Negative Review
Positive Review
Positive Review
Positive Review
Positive Review
Negative Review
Positive Review
Positive Review


In [109]:
model.save('review_model.h5')

In [110]:
from keras.models import load_model
loaded_model = load_model('review_model.h5')

In [111]:
score = loaded_model.evaluate(x_val,y_val)
score



[0.31406673789024353, 0.862500011920929]

In [112]:
token = tokenizer.to_json()
with open('tokenizer.pickle','wb') as handle:
    pickle.dump(token,handle)

In [113]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer_json = pickle.load(handle)

tk = tf.keras.preprocessing.text.tokenizer_from_json(
    tokenizer_json
)
# Getting Sequences for the test data and adding padding accordingly
X_seq = pad_sequences(tk.texts_to_sequences(text), maxlen = 36, padding = 'post')
display(X_seq)

array([[  4, 838,  20, ..., 135,  11,  15],
       [  4,  12,  55, ...,  16,  27, 589],
       [324,  20, 269, ...,  10,   4, 107],
       ...,
       [805, 206,  58, ...,  57,   5, 150],
       [237,   5,   3, ...,   1, 340, 184],
       [202,   4, 174, ...,  10, 167,  28]])