In [2]:
import os
import json
import pandas as pd

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
#load dataset
data = pd.read_csv("IMDB Dataset.csv")

In [4]:
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [5]:
data.shape

(50000, 2)

In [6]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

There is no class imbalance. the dataset is completedly balanced

In [7]:
#encode sentiment to number
data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)

  data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)


In [8]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [9]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [10]:
print(train_data.shape)
print(test_data.shape)

(40000, 2)
(10000, 2)


Data-Preprocessing

In [24]:
# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])  # Fit on training data only

# Process training data
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=200)
Y_train = train_data["sentiment"]

# Process test data
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=200)
Y_test = test_data["sentiment"]

In [25]:
print(X_train)

[[1935    1 1200 ...  205  351 3856]
 [   3 1651  595 ...   89  103    9]
 [   0    0    0 ...    2  710   62]
 ...
 [   0    0    0 ... 1641    2  603]
 [   0    0    0 ...  245  103  125]
 [   0    0    0 ...   70   73 2062]]


In [26]:
Y_train = train_data["sentiment"]
Y_test = test_data["sentiment"]

Build LSTM Model

In [27]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))



In [28]:
model.summary()

In [29]:
#compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [30]:
history = model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m299s[0m 595ms/step - accuracy: 0.7291 - loss: 0.5308 - val_accuracy: 0.8472 - val_loss: 0.3767
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m303s[0m 606ms/step - accuracy: 0.8571 - loss: 0.3493 - val_accuracy: 0.8596 - val_loss: 0.3452
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 623ms/step - accuracy: 0.8774 - loss: 0.3090 - val_accuracy: 0.8687 - val_loss: 0.3148
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m292s[0m 583ms/step - accuracy: 0.8941 - loss: 0.2646 - val_accuracy: 0.8695 - val_loss: 0.3189
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 577ms/step - accuracy: 0.9127 - loss: 0.2214 - val_accuracy: 0.8759 - val_loss: 0.3138


In [31]:
model.save("sentiment.h5")



Model Evaluation    

In [32]:
loss, acc = model.evaluate(X_test, Y_test)
print("Loss: ", loss)
print("Accuracy: ", acc)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 80ms/step - accuracy: 0.8796 - loss: 0.2942
Loss:  0.3001149594783783
Accuracy:  0.8823999762535095


Build a Predictive System:

In [49]:
def predict_sentiment(review):
    #tokenise and pad the reviews
    seq = tokenizer.texts_to_sequences([review])
    pad_seq = pad_sequences(seq, maxlen=200)
    prediction = model.predict(pad_seq)
    sentiment = "positive"  if prediction[0][0] > 0.5 else "negative"
    return sentiment  

Test

In [50]:
sentence = "Very bad, worst"
senti = predict_sentiment(sentence)
print(senti)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
negative


In [51]:
sentence = "the movie is fantastic, I loved it"
senti = predict_sentiment(sentence)
print(senti)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
positive
