In [None]:
#Import all the libraries needed
import pathlib         # to build the dataset path
import re

import nltk            # to download the stopwords corpus
import numpy as np     # for mathematic equation
import pandas as pd    # to load dataset
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.models import load_model   # load saved model
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int

In [None]:
#Load and preview dataset

In [None]:
# Mount Google Drive and read the IMDB review CSV into `data`.
# Colab-only import: kept beside its single use so the shared import cell
# still runs outside Colab.
from google.colab import drive

drive.mount('/content/drive')
data_dir = pathlib.Path('/content/drive/My Drive/SC549Data/IMDB Dataset.csv')   # path of the CSV file itself, not a directory
data = pd.read_csv(data_dir)

# Fail fast if the columns used by the preprocessing step are missing
missing_columns = {'review', 'sentiment'} - set(data.columns)
assert not missing_columns, 'dataset is missing columns: {}'.format(sorted(missing_columns))

print(data)

Mounted at /content/drive
                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [None]:
#Declaring the English stop words

In [None]:
# Download the NLTK stop-word corpus, then keep the English words in a set
# so the membership test in the preprocessing step is a constant-time lookup.
# (`nltk` and `stopwords` come from the import cell at the top of the notebook.)
nltk.download('stopwords')
english_stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#Preprocessing and Encoding labels

In [None]:
def load_dataset(frame=None, stop_words=None):
    """Clean the review text and encode the sentiment labels.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table with a 'review' text column and a 'sentiment' column holding
        'positive' / 'negative'. Defaults to the notebook-level `data`.
    stop_words : collection of str, optional
        Lower-case words to drop from every review. Defaults to the
        notebook-level `english_stops`.

    Returns
    -------
    x_data : pandas.Series
        One list of lower-case tokens per review.
    y_data : pandas.Series
        1 for 'positive', 0 for 'negative'.

    Raises
    ------
    ValueError
        If a sentiment value is neither 'positive' nor 'negative'.
    """
    if frame is None:
        frame = data
    if stop_words is None:
        stop_words = english_stops

    # PRE-PROCESS REVIEW
    # html tag -> space, so words on either side of a tag are not glued together
    x_data = frame['review'].replace({'<.*?>': ' '}, regex=True)
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex=True)     # remove non alphabet
    # Lower-case BEFORE filtering: the stop words are lower case, so capitalised
    # ones ("The", "I", "This") would otherwise slip through the filter.
    x_data = x_data.str.lower()
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in stop_words])  # remove stop words

    # ENCODE SENTIMENT -> 0 & 1
    y_data = frame['sentiment'].map({'positive': 1, 'negative': 0})
    unmapped = y_data.isna()
    if unmapped.any():
        unexpected = sorted(set(frame['sentiment'][unmapped].astype(str)))
        raise ValueError('unexpected sentiment labels: {}'.format(unexpected))
    y_data = y_data.astype('int64')

    return x_data, y_data

x_data, y_data = load_dataset()

# Sanity-check the preprocessing before anything is trained on it
assert len(x_data) == len(y_data), 'reviews and labels are misaligned'
assert y_data.isin([0, 1]).all(), 'sentiment labels must be encoded as 0/1'

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


In [None]:
#Train/test split: hold out 20% of the reviews for testing
# A fixed random_state makes the split (and every number reported below)
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2, random_state = 42)

# Each heading is followed by the reviews and labels of that split
print('Train Set')
print(x_train, '\n')
print(y_train, '\n')
print('Test Set')
print(x_test, '\n')
print(y_test)

Train Set
27458    [i, bore, story, plot, lines, presented, many,...
44676    [i, plagued, nightmares, involving, sesame, st...
5615     [this, one, worst, movies, i, ever, seen, than...
32856    [to, put, simply, i, enjoyed, film, the, reaso...
2955     [ring, ring, have, horror, directors, hotline,...
                               ...                        
13545    [i, expected, fame, uplifting, film, ended, op...
17546    [carmen, one, best, films, i, ever, seen, it, ...
10016    [a, really, bad, movie, good, moments, qualiti...
48218    [suzumiya, haruhi, utsu, the, melancholy, haru...
32109    [this, movie, perfect, portrayal, the, nutcrac...
Name: review, Length: 40000, dtype: object 

40354    [anyone, enjoys, lynchian, weirdness, twin, pe...
19450    [hollow, point, alright, movie, worth, half, p...
43340    [an, absorbing, exploration, virtual, reality,...
47245    [this, cool, marvel, superhero, game, pays, pr...
26599    [i, seen, mst, k, version, uncut, version, i, ...
 

In [None]:
#Function for choosing the review length used for padding: the mean length of all training reviews (numpy.mean), rounded up

In [None]:
def get_max_length(reviews=None):
    """Return the sequence length used for padding/truncating reviews.

    Despite the name this is not the length of the longest review: it is the
    mean review length, rounded up to the next integer.

    Parameters
    ----------
    reviews : iterable of sized items, optional
        Reviews to measure. Defaults to the notebook-level `x_train`.

    Returns
    -------
    int
        Ceiling of the mean of `len(review)` over all reviews.

    Raises
    ------
    ValueError
        If there are no reviews to measure.
    """
    if reviews is None:
        reviews = x_train

    review_length = [len(review) for review in reviews]
    if not review_length:
        # np.mean([]) would yield NaN and an opaque int-conversion error
        raise ValueError('cannot compute a review length from an empty collection')

    return int(np.ceil(np.mean(review_length)))

In [None]:
#Tokenize and Pad/Truncate Reviews
#post: pad or truncate at the end of a review
#pre: pad or truncate at the start of a review

In [None]:
# ENCODE REVIEW
# The vocabulary is fitted on the training reviews only; both splits are then
# mapped to sequences of word ids with that same vocabulary.
token = Tokenizer(lower=False)    # text was already lower-cased in load_dataset()
token.fit_on_texts(x_train)
x_train, x_test = (token.texts_to_sequences(reviews) for reviews in (x_train, x_test))

max_length = get_max_length()

# Identical padding/truncating policy for both splits
pad_options = dict(maxlen=max_length, padding='post', truncating='post')
x_train = pad_sequences(x_train, **pad_options)
x_test = pad_sequences(x_test, **pad_options)

total_words = 1 + len(token.word_index)   # +1 because id 0 is used for padding

for title, encoded in (('Encoded X Train\n', x_train), ('Encoded X Test\n', x_test)):
    print(title, encoded, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[    1  2611    13 ...  6668    11   513]
 [    1  8029  4004 ...     0     0     0]
 [    8     5   153 ...  6327  6238   905]
 ...
 [   39    15    20 ...   156    20  2308]
 [14720  9936 92224 ...   487   732   887]
 [    8     3   302 ...     0     0     0]] 

Encoded X Test
 [[  154  4333 17927 ...     0     0     0]
 [ 4145   127  2537 ...     0     0     0]
 [  699  6717  4899 ...   637   361  3709]
 ...
 [    2   748     3 ...     0     0     0]
 [  989  9054   219 ...    28   285  1837]
 [ 1196   515   716 ...  1550  3905  5376]] 

Maximum review length:  130


In [None]:
#Build the model

In [None]:
# ARCHITECTURE
# Embedding -> LSTM -> single sigmoid unit for binary sentiment
EMBED_DIM = 32   # size of each word embedding vector
LSTM_OUT = 64    # number of LSTM units

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           2951424   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2976321 (11.35 MB)
Trainable params: 2976321 (11.35 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [None]:
#Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
#Checkpoint callback: save the best model seen during training

In [None]:
# Write the model to disk whenever the monitored metric improves.
# NOTE(review): 'accuracy' is the training accuracy (fit() is given no
# validation data), so "best" here tracks fit to the training set, not
# generalisation. Consider a validation split and 'val_accuracy'.
checkpoint = ModelCheckpoint(
    filepath='models/LSTM.h5',
    monitor='accuracy',
    verbose=1,
    save_best_only=True,
)

In [None]:
#Model Training

In [None]:
# Train for 5 epochs in mini-batches of 128; the checkpoint callback runs during training
model.fit(
    x=x_train,
    y=y_train,
    epochs=5,
    batch_size=128,
    callbacks=[checkpoint],
)

Epoch 1/5
Epoch 1: accuracy improved from -inf to 0.74785, saving model to models/LSTM.h5
Epoch 2/5


  saving_api.save_model(


Epoch 2: accuracy improved from 0.74785 to 0.92360, saving model to models/LSTM.h5
Epoch 3/5
Epoch 3: accuracy improved from 0.92360 to 0.96070, saving model to models/LSTM.h5
Epoch 4/5
Epoch 4: accuracy improved from 0.96070 to 0.97725, saving model to models/LSTM.h5
Epoch 5/5
Epoch 5: accuracy improved from 0.97725 to 0.98453, saving model to models/LSTM.h5


<keras.src.callbacks.History at 0x7f5da478dde0>

In [None]:
#Model testing

In [None]:
# Score the trained model on the held-out test reviews
pred = model.predict(x=x_test)
y_pred = (pred >= 0.5) * 1     # threshold the sigmoid outputs into 0/1 labels

# Count the matches with one vectorised comparison instead of a Python loop.
# ravel() flattens the predictions so they line up element-wise with the labels.
true = int(np.sum(np.ravel(y_pred) == np.asarray(y_test)))

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 8730
Wrong Prediction: 1270
Accuracy: 87.3
