<a href="https://colab.research.google.com/github/rishipython/ReviewClassifier/blob/main/ReviewClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [144]:
from tensorflow import keras
import numpy as np
import os
from random import randint

In [145]:
# Folder where the arrays and the trained model are written (presumably the
# Google Drive mount inside Colab -- confirm the drive is mounted before saving).
# The value ends with a path separator so file names can be appended directly.
# NOTE: this name shadows Python's built-in dir(); it is kept because later cells use it.
dir = os.sep + os.sep.join(("content", "drive", "My Drive", "ReviewClassifier")) + os.sep

In [146]:
# Data: the IMDB review dataset bundled with Keras
data = keras.datasets.imdb
# Only uses 88000 most used words
(train_data, train_labels), (test_data, test_labels) = data.load_data(num_words=88000)

# Word -> index mapping. Every index is shifted by 3 so that 0-3 are free
# for the special tokens defined right below.
word_index = data.get_word_index()
word_index = {k:(v+3) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3

# Index -> word mapping, used to turn encoded reviews back into text
reverse_word_index = {value: key for key, value in word_index.items()}


In [147]:
# Brings every review to the same length (250), filling the tail with the <PAD> index
train_data = keras.preprocessing.sequence.pad_sequences(
    train_data,
    maxlen=250,
    padding="post",
    value=word_index["<PAD>"],
)
test_data = keras.preprocessing.sequence.pad_sequences(
    test_data,
    maxlen=250,
    padding="post",
    value=word_index["<PAD>"],
)

In [148]:
# Persist the padded arrays next to the model; `dir` already ends with a separator
np.save(f"{dir}train_data.npy", train_data)
np.save(f"{dir}test_data.npy", test_data)

In [149]:
# Function for decoding data into human readable words
def decode_review(text):
    """Turn a sequence of word indices back into a space-separated string.

    Each index is looked up in the notebook-level ``reverse_word_index``;
    an index that has no entry there is rendered as "?".
    """
    words = (reverse_word_index.get(token, "?") for token in text)
    return " ".join(words)

In [150]:
# Model: embedding -> average over the sequence -> small dense head -> sigmoid
model = keras.Sequential([
    # Embedding layer finds word vectors for each word we pass it
    # In our embedding layer, our word vectors are 16-dimensional
    # 88000 is the number of word vectors (same value as num_words used when loading the data)
    # Words will have similar values if they are similar, and very different values if they are very different
    keras.layers.Embedding(88000, 16),
    # Averages the word vectors of a review into a single 16-value vector
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(50, activation="relu"),
    # Dropout with rate 0.2 between the hidden layer and the output
    keras.layers.Dropout(0.2),
    # Single sigmoid unit: output is a value between 0 and 1 (thresholded at 0.5 later on)
    keras.layers.Dense(1, activation="sigmoid")
])

# Model summary
model.summary()

# Compiles model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Splits train data into training and cv sets:
# the first 10000 reviews are held out for validation, the rest are used for fitting
x_val = train_data[:10000]
x_train = train_data[10000:]
y_val = train_labels[:10000]
y_train = train_labels[10000:]

# Fits model
# Note: batch size is how many movie reviews we're doing at once
fitModel = model.fit(x_train, y_train, epochs=40, batch_size=512, validation_data=(x_val, y_val), verbose=1)

# Saves model (`dir` already ends with a path separator)
model.save(dir + "model")

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 16)          1408000   
_________________________________________________________________
global_average_pooling1d_7 ( (None, 16)                0         
_________________________________________________________________
dense_17 (Dense)             (None, 50)                850       
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 51        
Total params: 1,408,901
Trainable params: 1,408,901
Non-trainable params: 0
_________________________________________________________________
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40

In [151]:
# Results: score the model on the test split.
# The print below reads index 0 as the loss and index 1 as the accuracy.
results = model.evaluate(x=test_data, y=test_labels)
print(f"Results: Loss: {results[0]}, Accuracy: {results[1]}")

Results: Loss: 0.38813045620918274, Accuracy: 0.8663600087165833


In [152]:
# Makes function review encode which encodes review
def review_encode(s, vocab=None, num_words=88000):
    """Encode a review as a list of integer word indices.

    Args:
        s: The review, either an iterable of word strings or a single string
            (a string is split on whitespace first, so it is not iterated
            character by character).
        vocab: Word -> index mapping. Defaults to the notebook-level
            ``word_index``.
        num_words: Vocabulary size. Any index at or above this value is
            replaced by the unknown-word index so the result stays inside the
            range of the ``Embedding(88000, ...)`` layer. Pass ``None`` to
            disable the cap.

    Returns:
        A list that starts with 1 (<START>) followed by one index per word;
        words that are not in the vocabulary are encoded as 2 (<UNK>).
    """
    if vocab is None:
        vocab = word_index
    if isinstance(s, str):
        s = s.split()

    encoded = [1]
    for word in s:
        # Look up the lower-cased word: the membership test and the lookup must
        # use the same key, otherwise capitalised words are missed or raise KeyError.
        index = vocab.get(word.lower(), 2)
        if num_words is not None and index >= num_words:
            index = 2
        encoded.append(index)
    return encoded

In [153]:
# Human-readable description for each 0/1 class label
num_to_review = {
    0: 'Negative review',
    1: 'Positive review',
}

In [154]:
def round(num):
  """Threshold a score at 0.5: return 1 when ``num >= 0.5``, otherwise 0.

  NOTE: this shadows Python's built-in round(); the name is kept because
  other cells call it.
  """
  return 1 if num >= 0.5 else 0

In [157]:
# Looks at random and prints review, prediction, and actual value
i = randint(0, len(test_data)-1)
print(f"i: {i}")
test_review = test_data[i]
# predict() works on batches, so give it a batch of one review by adding a
# leading axis. Wrapping the 1-D review in a Python list is not the same thing:
# depending on the Keras version it can be treated as many single-token samples,
# in which case predict[0][0] would only score the first token.
predict = model.predict(np.expand_dims(test_review, axis=0))
print("Review: ", end='')
print(decode_review(test_review))
# predict[0][0]: first (only) review in the batch, single sigmoid output
print(f"Prediction: {num_to_review[round(predict[0][0])]}")
print(f"Actual: {num_to_review[test_labels[i]]}")

i: 8821
Review: <START> sebastian cabot is a rich jerk who wants to buy up all the land because there is oil though none of the locals are aware of the oil with the help of an evil gunfighter in black they kill and terrorize everyone when the son of a murdered man arrives he refuses to back down and stands up to these forces of darkness br br wow as i watched terror in a texas town i felt as if i'd seen this film many times before and would probably see something like it again that's because aside from a few novelties such as sterling hayden using a harpoon on the bad guy it has a plot that is too familiar once again we've got a rich guy who is trying to drive out all the farmers in order to gain control of all the land and to do so he's brought in hired guns to force people to sell or kill them been there done that in just too many films br br i love sterling hayden in films but just couldn't recommend this as anything other than a poor time passer <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 