In [None]:
import pandas as pd    
import numpy as np     
from nltk.corpus import stopwords  
from sklearn.model_selection import train_test_split      
from tensorflow.keras.preprocessing.text import Tokenizer  
from tensorflow.keras.preprocessing.sequence import pad_sequences   
from tensorflow.keras.models import Sequential   
from tensorflow.keras.layers import Embedding, LSTM, Dense 
from tensorflow.keras.callbacks import ModelCheckpoint   
from tensorflow.keras.models import load_model
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
data = pd.read_csv("IMDB Dataset.csv", on_bad_lines="skip")

print(data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [10]:
english_stops = set(stopwords.words('english'))
print(list(english_stops)[:20])  # check first 20 stopwords

['too', "she'd", 'and', 'between', "couldn't", "i've", 'what', "weren't", 'your', 'below', "you'll", 'down', "wasn't", 'these', 'until', 've', 'didn', "doesn't", "you're", 'who']


In [11]:
def load_dataset():
    df = pd.read_csv('IMDB Dataset.csv')
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


  y_data = y_data.replace('negative', 0)


In [12]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)


Train Set
45635    [for, first, fifteen, minutes, story, naked, f...
11420    [i, loved, flash, gordon, child, watching, ser...
27686    [if, want, really, terrify, people, choose, de...
43621    [i, agree, messages, book, like, movie, i, rea...
41940    [david, beckham, british, soccer, star, husban...
                               ...                        
30556    [it, extremely, difficult, film, watch, partic...
31116    [if, like, adult, comedy, cartoons, like, sout...
41585    [zane, beringer, keep, edge, seats, i, typical...
3174     [i, found, movie, local, video, store, i, surp...
18591    [all, right, elements, seemed, conspire, make,...
Name: review, Length: 40000, dtype: object 

22508    [i, thirteen, years, old, i, saw, movie, i, ex...
12380    [i, high, hopes, production, one, favourite, w...
5304     [a, thief, night, film, generally, ignored, mo...
23653    [tourist, trap, genuinely, spooky, low, budget...
29073    [this, film, great, i, love, way, mixes, dark,...
 

In [13]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))


In [14]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[  204    23  3444 ...     0     0     0]
 [    1   338  2820 ...    81   151 10652]
 [   55    88    13 ...     8   131  6946]
 ...
 [ 7138 40559   296 ...     0     0     0]
 [    1   162     3 ...     0     0     0]
 [  199   113   709 ...     0     0     0]] 

Encoded X Test
 [[   1 9645   71 ...    0    0    0]
 [   1  209 1892 ...    0    0    0]
 [  39 2960  215 ...    0    0    0]
 ...
 [2723 1667 1802 ...    0    0    0]
 [  49  686    1 ...    0    0    0]
 [5210 1925    4 ...  213  192  624]] 

Maximum review length:  130


In [15]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())



None


In [16]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [31]:
model.fit(x_train, y_train, batch_size = 128, epochs = 7, callbacks=[checkpoint])

Epoch 1/7
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 209ms/step - accuracy: 0.6050 - loss: 0.6384
Epoch 1: accuracy did not improve from 0.82995
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 209ms/step - accuracy: 0.6049 - loss: 0.6384
Epoch 2/7
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step - accuracy: 0.6354 - loss: 0.5896
Epoch 2: accuracy did not improve from 0.82995
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 210ms/step - accuracy: 0.6357 - loss: 0.5894
Epoch 3/7
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 209ms/step - accuracy: 0.7463 - loss: 0.5125
Epoch 3: accuracy did not improve from 0.82995
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 209ms/step - accuracy: 0.7460 - loss: 0.5128
Epoch 4/7
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step - accuracy: 0.6193 - loss: 0.6097
Epoch 4: accuracy did not improve from 



[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 210ms/step - accuracy: 0.7970 - loss: 0.4295
Epoch 6/7
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step - accuracy: 0.9309 - loss: 0.1967
Epoch 6: accuracy improved from 0.85550 to 0.93273, saving model to models/LSTM.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 211ms/step - accuracy: 0.9309 - loss: 0.1967
Epoch 7/7
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step - accuracy: 0.9653 - loss: 0.1159
Epoch 7: accuracy improved from 0.93273 to 0.96350, saving model to models/LSTM.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 210ms/step - accuracy: 0.9653 - loss: 0.1159


<keras.src.callbacks.history.History at 0x7d98f5e66720>

In [32]:
# Predict probabilities
y_pred_prob = model.predict(x_test, batch_size=128)

# Convert probabilities to class labels (0 or 1)
y_pred = (y_pred_prob > 0.5).astype("int32")

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {:.2f}%'.format(true / len(y_pred) * 100))

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 91ms/step
Correct Prediction: 8687
Wrong Prediction: 1313
Accuracy: 86.87%


In [33]:
loaded_model = load_model('models/LSTM.h5')



In [45]:
review = str(input('Movie Review: '))

Movie Review: Nothing was typical about this. Everything was beautifully done in this movie, the story, the flow, the scenario, everything. I highly recommend it for mystery lovers, for anyone who wants to watch a good movie!


In [46]:
# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  Nothing was typical about this Everything was beautifully done in this movie the story the flow the scenario everything I highly recommend it for mystery lovers for anyone who wants to watch a good movie
Filtered:  ['nothing typical everything beautifully done movie story flow scenario everything i highly recommend mystery lovers anyone wants watch good movie']


In [47]:

tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[  76  692  172 1191  127    3   15 2700 2649  172    1  447  281  682
  1733  153  395   33    9    3    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]]


In [48]:
result = loaded_model.predict(tokenize_words)
print(result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[[0.9206993]]


In [49]:

if result >= 0.7:
    print('positive')
else:
    print('negative')

positive
