In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GRU, LSTM, Dropout
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
num_words = 10000
max_review_len = 200

In [4]:
train = pd.read_csv('/content/drive/MyDrive/A6/train.csv',
                   header=None,
                   names=['Label', 'Review'])
test = pd.read_csv('/content/drive/MyDrive/A6/test.csv',
                   header=None,
                   names=['Label', 'Review'])

In [5]:
train

Unnamed: 0,Label,Review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...
...,...,...
559995,2,Ryan was as good as everyone on yelp has claim...
559996,2,Professional \nFriendly\nOn time AND affordabl...
559997,1,Phone calls always go to voicemail and message...
559998,1,Looks like all of the good reviews have gone t...


In [6]:
y_train, y_test = train['Label'] - 1, test['Label'] - 1

In [7]:
reviews = train['Review']
reviews[:5]

0    Unfortunately, the frustration of being Dr. Go...
1    Been going to Dr. Goldberg for over 10 years. ...
2    I don't know what Dr. Goldberg was like before...
3    I'm writing this review to give you a heads up...
4    All the food is great here. But the best thing...
Name: Review, dtype: object

In [8]:
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(reviews)
tokenizer.word_index

{'the': 1,
 'and': 2,
 'i': 3,
 'to': 4,
 'a': 5,
 'was': 6,
 'of': 7,
 'it': 8,
 'for': 9,
 'in': 10,
 'is': 11,
 'n': 12,
 'that': 13,
 'my': 14,
 'we': 15,
 'this': 16,
 'but': 17,
 'with': 18,
 'they': 19,
 'you': 20,
 'on': 21,
 'not': 22,
 'have': 23,
 'had': 24,
 'at': 25,
 'were': 26,
 'so': 27,
 'are': 28,
 'food': 29,
 'be': 30,
 'place': 31,
 'me': 32,
 'there': 33,
 'good': 34,
 'as': 35,
 'out': 36,
 'all': 37,
 'like': 38,
 'if': 39,
 'just': 40,
 'our': 41,
 'very': 42,
 'get': 43,
 'one': 44,
 'here': 45,
 'time': 46,
 'when': 47,
 'up': 48,
 'or': 49,
 'from': 50,
 'great': 51,
 'service': 52,
 'would': 53,
 'back': 54,
 'their': 55,
 'about': 56,
 'no': 57,
 'go': 58,
 'an': 59,
 'what': 60,
 'really': 61,
 'he': 62,
 "it's": 63,
 'she': 64,
 'which': 65,
 'will': 66,
 'some': 67,
 'only': 68,
 'been': 69,
 'us': 70,
 'your': 71,
 'because': 72,
 'more': 73,
 'even': 74,
 'can': 75,
 'them': 76,
 'by': 77,
 "don't": 78,
 'got': 79,
 'other': 80,
 'after': 81,
 'do': 8

In [9]:
sequences = tokenizer.texts_to_sequences(reviews)

In [10]:
index = 42
print(reviews[index])
print(sequences[index])

Some of the worst pizza I've ever had.  We used a coupon from the paper for a 2 topping 8 cut Sicilian. First of all the pizza wasn't even cut through, and the sad attempt at cutting was so uneven that 4 of the slices were about an inch wide, while the others were about 4\" each. The toppings were scarce, they used mini pepperoni and put maybe 8 on the whole pizza. The onions were huge chunks and the mushrooms were straight from a can. The worst part though was the thick doughy crust that tasted more like a fishy sourdough roll. I'm serious... It was so noticeable that it made me wonder if the dough was bad or if they for some weird reason put fish sauce in it. It was gross. \n\nWe also ordered steak and Italian hoagies. The veggies were old and wilted, and there was no dressing on either. The Italian had deli meat that was clearly bottom of the line and not very generous. The \"steak\" (if you an call it that) was greyish instead of brown and looked like it was a processed meat choppe

In [11]:
print(tokenizer.word_index['some'])
print(tokenizer.word_index['of'])
print(tokenizer.word_index['the'])
print(tokenizer.word_index['worst'])

67
7
1
384


In [12]:
x_train = pad_sequences(sequences, maxlen=max_review_len)
x_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,  707,    1, 4339,    7,  173,  939, 1665,   11,    5,
       2161,    7,    1,  137,  112,   24,   18,   27,  189,   80, 3128,
         10, 2636,   34, 1416,  462,  129,    8,  519,   13,  177,  129,
        803,   88, 3812,    1,  446,    8,  392,  841,  115,  360,    7,
       4136, 1438,    4,   43,   59, 1266,  118,   95,   46,    9,   13,
         49, 1687,    4,  418,   18,    8,    3,   

In [13]:
model = Sequential()
model.add(Embedding(num_words, 64, input_length=max_review_len))
model.add(GRU(128))
model.add(Dense(1, activation='sigmoid'))

In [14]:
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

In [15]:
model_save_path = '/content/best_model.h5'
checkpoint_callback = ModelCheckpoint(model_save_path, 
                                      monitor='val_accuracy',
                                      save_best_only=True,
                                      verbose=1)

In [16]:
history = model.fit(x_train, 
                    y_train, 
                    epochs=5,
                    batch_size=128,
                    validation_split=0.1,
                    callbacks=[checkpoint_callback])

Epoch 1/5
Epoch 00001: val_accuracy improved from -inf to 0.95968, saving model to /content/best_model.h5
Epoch 2/5
Epoch 00002: val_accuracy improved from 0.95968 to 0.96548, saving model to /content/best_model.h5
Epoch 3/5
Epoch 00003: val_accuracy did not improve from 0.96548
Epoch 4/5
Epoch 00004: val_accuracy did not improve from 0.96548
Epoch 5/5
Epoch 00005: val_accuracy did not improve from 0.96548


In [17]:
model.load_weights(model_save_path)

In [18]:
test_sequences = tokenizer.texts_to_sequences(test['Review'])
x_test = pad_sequences(test_sequences, maxlen=max_review_len)
x_test[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0, 8158,    4,   80,  336,    3,   23, 1361, 1547,   56,
          1,   52,   49,    1,  256,    3,   23,   69,  287, 1837,   52,
         45,    9,    1,  546,  148,  274,  165,    2, 1221,    4,   14,
        137,   18,  289,   38, 8346, 2311,  257,  511,   28, 1167,    2,
        113,   60,  568,  520, 1426,   16,   11,   44,   31,   13,    3,
         82,   22,  249,   38,    3,  144,  173,  694, 2235,    7,   40,
         72,    7,   14,   80, 2540, 5951,   23,   

In [19]:
scores = model.evaluate(x_test, y_test, verbose=1)
print("The percent of correct answers:", round(scores[1] * 100, 4))

The percent of correct answers: 95.6868
