In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("IMDB_Dataset.csv")

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data.shape

(50000, 2)

In [5]:
type(data)

pandas.core.frame.DataFrame

In [6]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [7]:
data["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [8]:
#Supervised Learning
#1.Regression
#2.Classification

In [9]:
#one hot encoding or label encoder
# positive -> 1
# negative -> 0

In [10]:
data.replace({"sentiment":{"positive":1, "negative":0}}, inplace=True)

  data.replace({"sentiment":{"positive":1, "negative":0}}, inplace=True)


In [11]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [12]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0
49999,No one expects the Star Trek movies to be high...,0


In [13]:
data["sentiment"].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

In [14]:
# we use LSTM -> long short term memory

In [15]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [16]:
train_data, test_data = train_test_split(data, test_size = 0.2, random_state=42)

In [17]:
train_data.shape


(40000, 2)

In [18]:
test_data.shape

(10000, 2)

In [20]:
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(train_data["review"])                    

In [21]:
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen = 200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen = 200)

In [22]:
X_train

array([[1935,    1, 1200, ...,  205,  351, 3856],
       [   3, 1651,  595, ...,   89,  103,    9],
       [   0,    0,    0, ...,    2,  710,   62],
       ...,
       [   0,    0,    0, ..., 1641,    2,  603],
       [   0,    0,    0, ...,  245,  103,  125],
       [   0,    0,    0, ...,   70,   73, 2062]], dtype=int32)

In [23]:
X_test

array([[   0,    0,    0, ...,  995,  719,  155],
       [  12,  162,   59, ...,  380,    7,    7],
       [   0,    0,    0, ...,   50, 1088,   96],
       ...,
       [   0,    0,    0, ...,  125,  200, 3241],
       [   0,    0,    0, ..., 1066,    1, 2305],
       [   0,    0,    0, ...,    1,  332,   27]], dtype=int32)

In [24]:
Y_train = train_data["sentiment"]
Y_test = test_data["sentiment"]

In [25]:
Y_train

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64

In [26]:
# LSTM model building


In [31]:
model = Sequential()
model.add(Embedding(input_dim = 5000, output_dim = 128, input_length = 200))
model.add(LSTM(128, dropout = 0.2, recurrent_dropout = 0.2))
model.add(Dense(1, activation = "sigmoid"))


In [33]:
model.build(input_shape=(None, 200))

In [34]:
model.summary()

In [35]:
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])

In [36]:
model.fit(X_train, Y_train, epochs = 5, batch_size = 64, validation_split = 0.2)


Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 217ms/step - accuracy: 0.7249 - loss: 0.5335 - val_accuracy: 0.8351 - val_loss: 0.3752
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 215ms/step - accuracy: 0.8464 - loss: 0.3650 - val_accuracy: 0.8565 - val_loss: 0.3369
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 239ms/step - accuracy: 0.8775 - loss: 0.3068 - val_accuracy: 0.8260 - val_loss: 0.3992
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 266ms/step - accuracy: 0.8761 - loss: 0.3052 - val_accuracy: 0.8303 - val_loss: 0.4696
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 282ms/step - accuracy: 0.8365 - loss: 0.3768 - val_accuracy: 0.8615 - val_loss: 0.3223


<keras.src.callbacks.history.History at 0x1698cba50>

In [37]:
model.save("model.h5")



In [44]:
import joblib
joblib.dump(tokenizer, "tokenizer.pkl")

['tokenizer.pkl']

In [45]:
loss, accuracy = model.evaluate(X_test, Y_test, verbose=1)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 57ms/step - accuracy: 0.8675 - loss: 0.3139


In [46]:
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 0.3099576532840729
Test Accuracy: 0.8716999888420105


In [47]:
#Building Predictive System

In [62]:
  def predictive_system(review):
      sequences = tokenizer.texts_to_sequences([review])
      padded_sequence = pad_sequences(sequences, maxlen = 200)
      prediction = model.predict(padded_sequence)
      sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
      return sentiment

In [64]:
predictive_system("This movie was very good")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step


'positive'

In [66]:
predictive_system("unique and ever green")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step


'positive'

In [67]:
predictive_system("boring movie")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


'negative'