In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the IMDB reviews CSV into a pandas DataFrame.
# NOTE(review): the absolute "/content/..." path looks Colab-specific — confirm before running elsewhere.
data = pd.read_csv("/content/IMDB Dataset.csv")

Exploratory Data Analysis

In [3]:
# Number of rows and columns, as a (rows, columns) tuple
data.shape

(50000, 2)

In [4]:
# Preview the first 5 rows of the dataset
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Display column dtypes, non-null counts and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
# Summary statistics per column (count / unique / top / freq for these text columns)
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [7]:
# Count how many reviews carry each sentiment label (class-balance check)
data["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [9]:
# Label-encode the target column:
#   positive -> 1
#   negative -> 0
# `replace` on an object column relies on pandas' deprecated silent downcasting
# to end up with integers; `map` plus an explicit `astype` yields int64 on every
# pandas version. The identity entries (1 -> 1, 0 -> 0) keep the cell idempotent
# when it is re-run on an already-encoded column. Any label outside the mapping
# becomes NaN and makes `astype` raise, so unexpected values surface here
# instead of reaching the model.
data["sentiment"] = (
    data["sentiment"]
    .map({"positive": 1, "negative": 0, 1: 1, 0: 0})
    .astype("int64")
)


In [10]:
# `%pip` installs into the environment of the running kernel, whereas `!pip`
# runs whichever `pip` is first on the shell PATH.
%pip install tensorflow scikit-learn




In [11]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Shape of the training split
train_data.shape

(40000, 2)

In [14]:
# Shape of the test split
test_data.shape

(10000, 2)

In [15]:
# Build the vocabulary from the training reviews only, so the test split never
# influences it. The tokenizer is configured with num_words=5000.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

In [16]:
# Encode every review as a sequence of word ids, then pad/truncate each
# sequence to a fixed length of 200 ids. Train is processed before test.
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split["review"]), maxlen=200)
    for split in (train_data, test_data)
)

In [17]:
# Inspect the padded training sequences
X_train

array([[1935,    1, 1200, ...,  205,  351, 3856],
       [   3, 1651,  595, ...,   89,  103,    9],
       [   0,    0,    0, ...,    2,  710,   62],
       ...,
       [   0,    0,    0, ..., 1641,    2,  603],
       [   0,    0,    0, ...,  245,  103,  125],
       [   0,    0,    0, ...,   70,   73, 2062]], dtype=int32)

In [18]:
# Inspect the padded test sequences
X_test

array([[   0,    0,    0, ...,  995,  719,  155],
       [  12,  162,   59, ...,  380,    7,    7],
       [   0,    0,    0, ...,   50, 1088,   96],
       ...,
       [   0,    0,    0, ...,  125,  200, 3241],
       [   0,    0,    0, ..., 1066,    1, 2305],
       [   0,    0,    0, ...,    1,  332,   27]], dtype=int32)

In [19]:
# Targets: the sentiment column of each split.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

In [None]:
#LSTM MODEL BUILDING

In [20]:
# LSTM sentiment classifier: word ids -> 128-d embeddings -> LSTM(128) -> sigmoid score.
# `input_dim=5000` matches the Tokenizer's `num_words`; the build shape below
# matches the `maxlen=200` used when padding the sequences.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
# NOTE(review): a non-zero `recurrent_dropout` may rule out the fused GPU LSTM
# kernel and slow training — confirm against the Keras LSTM documentation.
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))
# `Embedding(input_length=...)` is deprecated in Keras 3 and only emits a warning.
# Declare the sequence length by building the model instead, so it has weights
# (and a meaningful summary) before training starts.
model.build(input_shape=(None, 200))



In [21]:
# Call the method: a bare `model.summary` only displays the bound-method repr.
model.summary()

In [22]:
# Binary classification setup: Adam optimiser, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
# Train for 5 epochs in batches of 64; 20% of the training data is held out
# for per-epoch validation.
model.fit(
    X_train,
    Y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 533ms/step - accuracy: 0.7116 - loss: 0.5532 - val_accuracy: 0.7741 - val_loss: 0.4702
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 539ms/step - accuracy: 0.8510 - loss: 0.3573 - val_accuracy: 0.8554 - val_loss: 0.3347
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m339s[0m 573ms/step - accuracy: 0.8739 - loss: 0.3096 - val_accuracy: 0.8633 - val_loss: 0.3269
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m271s[0m 543ms/step - accuracy: 0.8929 - loss: 0.2707 - val_accuracy: 0.8680 - val_loss: 0.3271
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 532ms/step - accuracy: 0.9063 - loss: 0.2358 - val_accuracy: 0.8496 - val_loss: 0.4057


<keras.src.callbacks.history.History at 0x7e0fe8216990>

In [23]:
# Persist the trained model to "model_imdb.h5" in the working directory.
model.save("model_imdb.h5")



In [24]:
import joblib
# Persist the fitted tokenizer so inference can reuse the training vocabulary.
# NOTE(review): joblib files are pickle-based — only load them from trusted sources.
joblib.dump(tokenizer, "tokenizer.pkl")

['tokenizer.pkl']

In [25]:
# Evaluate on the held-out test split; unpacks the loss and the compiled accuracy metric.
loss, accuracy = model.evaluate(X_test, Y_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 127ms/step - accuracy: 0.8567 - loss: 0.3842


In [26]:
# Show the test loss followed by the test accuracy.
print(loss, accuracy)

0.3818220794200897 0.8603000044822693


Building a Predictive System

In [27]:
def predictive_system(review, threshold=0.5):
  """Classify a single movie review as "positive" or "negative".

  The review is encoded with the notebook-level `tokenizer`, padded/truncated
  to 200 token ids (the same `maxlen` used for the training data) and scored
  by the notebook-level `model`.

  Args:
    review: Raw review text.
    threshold: Score above which the review is labelled "positive". Must lie
      in [0, 1]; defaults to 0.5, the previously hard-coded cut-off.

  Returns:
    "positive" if the model's score is strictly greater than `threshold`,
    otherwise "negative".

  Raises:
    ValueError: If `threshold` is outside [0, 1].
  """
  if not 0.0 <= threshold <= 1.0:
    raise ValueError(f"threshold must be between 0 and 1, got {threshold!r}")

  # The tokenizer works on a batch of texts, so wrap the single review in a list.
  sequences = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequences, maxlen=200)
  prediction = model.predict(padded_sequence)

  # One row per input review, one sigmoid output per row: take that scalar.
  score = float(prediction[0][0])
  return "positive" if score > threshold else "negative"

In [28]:
# Spot-check the classifier on a hand-written, favourably worded review.
predictive_system("Brilliantly executed with a heartwarming story. Highly recommend it!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 322ms/step


'positive'

In [29]:
# Spot-check the classifier on a hand-written, unfavourably worded review.
predictive_system("I couldn't sit through the entire film. It was boring and overly predictable.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step


'negative'

In [30]:
# Spot-check the classifier on a second unfavourably worded review.
predictive_system("What a disaster! Poor script, weak performances, and lackluster direction.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step


'negative'

In [31]:
# Spot-check the classifier on a second favourably worded review.
predictive_system("An absolute masterpiece! The visuals were stunning, and the music was phenomenal.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step


'positive'

In [32]:
# Spot-check the classifier on a third favourably worded review.
predictive_system("This is one of the best movies I’ve seen in years. Incredible performances by the cast!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step


'positive'