In [40]:
import pandas as pd # to load dataset
import numpy as np # for mathematic equation
from nltk.corpus import stopwords # to get collection of stopwords
from sklearn.model_selection import train_test_split # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences # to do padding or truncating
from tensorflow.keras.models import Sequential # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint # save model
from tensorflow.keras.models import load_model # load saved model
import re

In [41]:
data = pd.read_csv('IMDB Dataset.csv')
print(data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [42]:
english_stops = set(stopwords.words('english'))

In [43]:
def load_dataset():
    df = pd.read_csv('IMDB Dataset.csv')
    x_data = df['review'] # Reviews/Input
    y_data = df['sentiment'] # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True) # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True) # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops]) # remove stop word
    x_data = x_data.apply(lambda review: [w.lower() for w in review]) # lower case

# ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)
    
    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


In [44]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
6432     [while, exploring, caves, wife, doctor, bitten...
17197    [entertainment, tonight, going, hill, last, ye...
34155    [hilarious, clean, light, hearted, quote, wort...
6595     [i, remember, much, movie, except, distinctly,...
43530    [people, please, bother, watch, movie, this, m...
                               ...                        
45032    [this, one, movies, get, better, every, time, ...
45268    [those, prophetic, words, spoken, william, hol...
47553    [this, first, time, writer, director, comes, a...
40419    [one, dark, night, typical, teen, horror, film...
31082    [this, one, best, martial, art, kung, fu, movi...
Name: review, Length: 40000, dtype: object 

49433    [who, ever, put, review, bad, taste, quite, fu...
37158    [wow, i, hate, sound, opinionated, anyone, rat...
19057    [i, seen, movie, since, i, teenager, i, grew, ...
43333    [and, all, through, house, special, crypt, epi...
16726    [you, help, marvel, hitchcock, early, work, sa...
 

In [45]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))
    
    return int(np.ceil(np.mean(review_length)))

In [46]:
# ENCODE REVIEW
token = Tokenizer(lower=False) # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1 # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[  367  5447 10484 ...     0     0     0]
 [  618  4420    79 ...     0     0     0]
 [  483  2012   553 ...     0     0     0]
 ...
 [    8    23    10 ...     0     0     0]
 [    5   355   217 ...   228    80    26]
 [    8     5    45 ...     0     0     0]] 

Encoded X Test
 [[  725    51   181 ...     0     0     0]
 [ 1228     1   634 ...  4304   316  1315]
 [    1    38     3 ...     0     0     0]
 ...
 [  298   221 58561 ...     0     0     0]
 [  145  5678   476 ...     0     0     0]
 [   63  1693   422 ... 29600  8620   643]] 

Maximum review length:  130


In [47]:
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM))  # Remove input_length argument
model.add(LSTM(LSTM_OUT, input_shape=(max_length,)))  # Add input_length here
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.build(input_shape=(None, max_length))  # Assuming max_length is the length of your input sequences

print(model.summary())

  super().__init__(**kwargs)


None


In [49]:
checkpoint = ModelCheckpoint(
    'models/LSTM.keras',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [50]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step - accuracy: 0.5367 - loss: 0.6782
Epoch 1: accuracy improved from -inf to 0.54492, saving model to models/LSTM.keras
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 138ms/step - accuracy: 0.5367 - loss: 0.6782
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step - accuracy: 0.5184 - loss: 0.6914
Epoch 2: accuracy did not improve from 0.54492
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 140ms/step - accuracy: 0.5184 - loss: 0.6914
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.5754 - loss: 0.6716
Epoch 3: accuracy improved from 0.54492 to 0.60067, saving model to models/LSTM.keras
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 143ms/step - accuracy: 0.5755 - loss: 0.6715
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/ste

<keras.src.callbacks.history.History at 0x14c3f24d190>

In [51]:
loaded_model = load_model('models/LSTM.keras')

In [52]:
review = str(input('Movie Review: '))

Movie Review: 36807


In [53]:
# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  
Filtered:  ['']


In [54]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')

print(tokenize_words)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [55]:
result = loaded_model.predict(tokenize_words)
print(result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 457ms/step
[[0.77677774]]


In [56]:
if result >= 0.7:
    print('positive')
else:
    print('negative')

positive
