### Data Preprocessing 
You can use your own way of preprocessing to enhance results. Best results will lead to bonus points.

In [None]:
import pandas as pd
import nltk
import tensorflow as tf
import torch
import torch.nn as nn
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import torch.optim as optim
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
import re
from nltk.corpus import stopwords   # to get collection of stopwords
from keras import models    
from tensorflow.keras.models import load_model   # load saved model


In [None]:
reviews = pd.read_csv("imdb_dataset.csv")

print(reviews)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
47995  First of all, Blythe Danner doesn't look anywh...  negative
47996  I wouldn't be so quick to look at all the good...  negative
47997  Everything about this show is terrible. Its pr...  negative
47998  This movie just was not very funny. There's no...  negative
47999  "The Yoke's on Me" is undoubtedly the most con...  negative

[48000 rows x 2 columns]


In [None]:

english_stops = set(stopwords.words('english'))

In [None]:
def preprocess(text):
    lower = text.lower()
    # Removing Punctuation marks
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    rem_punc = tokenizer.tokenize(lower)
    # Removing Stop Words
    stopwords = nltk.corpus.stopwords.words('english')
    rem_stop_words = [word for word in rem_punc if not word in stopwords]
    # Removing Non-English words 
    english_words = nltk.corpus.words.words()
    english_words = [word for word in rem_stop_words if word in english_words]   
    # Insert Start End tokens
    english_words.insert(0,'<start>')
    english_words.append('<end>')
    sentence = ' '.join(english_words)
    return sentence

In [None]:
def encode_text(text):
    # Tokenization
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(text)
    # Converting to sequences
    sequences = tokenizer.texts_to_sequences(text)
    # Padding Zeros 
    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word[0] = '<pad>'
    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
    
    return padded_sequences, tokenizer

In [None]:
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
nltk.download("words")

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
text = list(map(preprocess,reviews.review[:10]))
encodings, tokenizer = encode_text(text)

In [None]:
def load_dataset():
    df = pd.read_csv('imdb_dataset.csv')
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
   # x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    
    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [o, n, e,  , o, f,  , t, h, e,  , o, t, h, e, ...
1        [a,  , w, o, n, d, e, r, f, u, l,  , l, i, t, ...
2        [i,  , t, h, o, u, g, h, t,  , t, h, i, s,  , ...
3        [b, a, s, i, c, a, l, l, y,  , t, h, e, r, e, ...
4        [p, e, t, t, e, r,  , m, a, t, t, e, i,  , s, ...
                               ...                        
47995    [f, i, r, s, t,  , o, f,  , a, l, l,  ,  , b, ...
47996    [i,  , w, o, u, l, d, n,  , t,  , b, e,  , s, ...
47997    [e, v, e, r, y, t, h, i, n, g,  , a, b, o, u, ...
47998    [t, h, i, s,  , m, o, v, i, e,  , j, u, s, t, ...
47999    [ , t, h, e,  , y, o, k, e,  , s,  , o, n,  , ...
Name: review, Length: 48000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
47995    0
47996    0
47997    0
47998    0
47999    0
Name: sentiment, Length: 48000, dtype: int64


# Data Split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
30758    [i, n,  , t, h, e,  , i, m, m, o, r, t, a, l, ...
7206     [t, h, e,  , w, o, r, d,  , h, o, n, o, r,  , ...
25777    [ , w, i, t, h,  , a, l, l,  , t, h, e,  , m, ...
36185    [t, h, e, r, e,  , s,  , n, o, t, h, i, n, g, ...
13321    [i,  , v, e,  , b, e, e, n,  , a, b, l, e,  , ...
                               ...                        
46693    [t, h, i, s,  , i, s,  , t, h, e,  , o, n, l, ...
45234    [t, y, r, a,  , b, a, n, k, s,  , n, e, e, d, ...
32380    [o, k,  ,  , i, t,  , w, a, s,  , a,  , g, o, ...
2856     [i,  , r, e, n, t, e, d,  , t, h, i, s,  , m, ...
6802     [i, f,  , w, e,  , r, e, a, l, l, y,  , w, a, ...
Name: review, Length: 38400, dtype: object 

42093    [h, u, g, e,  ,  , e, x, h, a, u, s, t, i, v, ...
40077    [d, e, s, p, i, t, e,  , u, n, f, o, r, t, u, ...
44348    [l, e, t,  , m, e,  , s, a, y,  , t, h, a, t, ...
31798    [t, h, e, r, e,  , i, s,  , a,  , v, e, r, s, ...
47518    [w, e,  , f, o, u, n, d,  , t, h, i, s,  , m, ...
 

In [None]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [None]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[ 5  8  1 ...  5 17 10]
 [ 3 10  2 ...  8 12  7]
 [ 1 19  5 ...  2  1  2]
 ...
 [ 6 23  1 ...  0  0  0]
 [ 5  1  9 ...  0  0  0]
 [ 5 16  1 ...  0  0  0]] 

Encoded X Test
 [[10 15 17 ...  0  0  0]
 [12  2  7 ...  0  0  0]
 [11  2  3 ...  0  0  0]
 ...
 [ 5  1 12 ...  1 19  2]
 [ 5  1  3 ... 13 23  1]
 [14 18  1 ...  0  0  0]] 

Maximum review length:  1287


In [None]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer = 'adam', 
              loss = 'binary_crossentropy', 
              metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1287, 32)          896       
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 25,793
Trainable params: 25,793
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [None]:

model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])


Epoch 1/5

Epoch 00001: accuracy improved from -inf to 0.50216, saving model to models/LSTM.h5
Epoch 2/5

Epoch 00002: accuracy improved from 0.50216 to 0.50534, saving model to models/LSTM.h5
Epoch 3/5

Epoch 00003: accuracy did not improve from 0.50534
Epoch 4/5

Epoch 00004: accuracy improved from 0.50534 to 0.50922, saving model to models/LSTM.h5
Epoch 5/5

Epoch 00005: accuracy did not improve from 0.50922


<tensorflow.python.keras.callbacks.History at 0x7f75eaab89d0>

#Testing

In [None]:

y_pred = model.predict_classes(x_test, batch_size = 128)

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))



Correct Prediction: 4807
Wrong Prediction: 4793
Accuracy: 50.072916666666664


In [None]:
loaded_model = load_model('models/LSTM.h5')

In [None]:
review = str(input('Movie Review: '))

Movie Review:  Nothing was typical about this. Everything was beautifully done in this movie, the story, the flow, the scenario, everything. I highly recommend it for mystery lovers, for anyone who wants to watch a good movie!


In [None]:

# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:   Nothing was typical about this Everything was beautifully done in this movie the story the flow the scenario everything I highly recommend it for mystery lovers for anyone who wants to watch a good movie
Filtered:  [' nothing typical everything beautifully done movie story flow scenario everything i highly recommend mystery lovers anyone wants watch good movie']


In [None]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[5 0 0 ... 0 0 0]]


In [None]:
result = loaded_model.predict(tokenize_words)
print(result)

[[0.488618]]


In [None]:
if result >= 0.7:
    print('positive')
else:
    print('negative')

negative


#Plot The Result

In [None]:
test_metrics = model.evaluate(x_train, y_train)



In [None]:

test_accuracy = test_metrics[1] 
test_loss = test_metrics[0]
print("Test Accuracy :", test_accuracy)

Test Accuracy : 0.5088281035423279


In [None]:
import matplotlib.pyplot as plt

y_train = checkpoint.checkpoint['accu']
x_train = checkpoint.checkpoint['val_accu']

epochs = range(1, len(train_acc) + 1)

plt.plot(epochs, y_train, 'bo', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training & Validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

AttributeError: ignored

In [None]:
train_loss = history.history['loss']
val_loss = history.history['val_loss']

plt.plot(epochs, train_loss, 'bo', label="Training loss")
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training & Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()