In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the IMDB review dataset from the local archive folder into a DataFrame.
data = pd.read_csv(filepath_or_buffer="archive/IMDB Dataset.csv")

In [3]:
# Check the (rows, columns) dimensions of the loaded frame.
data.shape

(50000, 2)

In [4]:
# Count reviews per sentiment label to check the class balance.
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [5]:
# Preview the first rows of the raw data (review text plus string label).
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
# Encode the string labels as integers: positive -> 1, negative -> 0.
#
# This replaces DataFrame.replace(..., inplace=True): swapping strings for ints
# through replace() depends on pandas silently downcasting the object column,
# which pandas 2.2+ deprecates with a FutureWarning (the stray echo of the
# statement in this cell's output is consistent with that warning).
# Values that are not keys of the mapping (e.g. labels already encoded by an
# earlier run of this cell) pass through unchanged, so re-running is harmless;
# the explicit astype makes the resulting integer dtype deliberate and fails
# loudly if an unexpected non-numeric label is present.
SENTIMENT_LABELS={"positive":1,"negative":0}
data["sentiment"]=data["sentiment"].map(lambda label: SENTIMENT_LABELS.get(label,label)).astype("int64")

  data.replace({"sentiment":{"positive":1,"negative":0}},inplace=True)


In [7]:
# Preview the frame again to confirm the sentiment column is now numeric.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [8]:
# Re-count the labels after encoding; the per-class totals should be unchanged.
data["sentiment"].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

In [9]:
 from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [10]:
# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# shuffle (and therefore the split) reproducible across runs.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Size of the training split.
train_data.shape

(40000, 2)

In [12]:
# Size of the held-out test split.
test_data.shape

(10000, 2)

In [13]:
# Fit the vocabulary on the training reviews only, so nothing from the test
# split influences the word index. num_words=5000 caps the vocabulary that is
# used when texts are later converted to integer sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts=train_data["review"])

In [14]:
# Turn each split's reviews into integer sequences with the fitted tokenizer
# and pad/truncate every sequence to exactly 200 tokens. The same expression is
# applied to the training split first and the test split second.
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split["review"]), maxlen=200)
    for split in (train_data, test_data)
)

In [15]:
# Inspect the padded training matrix (one row of token ids per review).
X_train

array([[1935,    1, 1200, ...,  205,  351, 3856],
       [   3, 1651,  595, ...,   89,  103,    9],
       [   0,    0,    0, ...,    2,  710,   62],
       ...,
       [   0,    0,    0, ..., 1641,    2,  603],
       [   0,    0,    0, ...,  245,  103,  125],
       [   0,    0,    0, ...,   70,   73, 2062]])

In [16]:
# Target vectors: the encoded sentiment column of each split.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

In [17]:
# Sentiment classifier: Embedding -> LSTM -> single sigmoid unit.
model=Sequential()
# input_dim=5000 matches Tokenizer(num_words=5000), so every token id produced
# by the tokenizer has an embedding row.
# The `input_length` argument is intentionally not passed: Keras 3 deprecates
# it on Embedding (it only triggers a warning), and the sequence length is
# already fixed by the explicit model.build(...) call below.
model.add(Embedding(input_dim=5000,output_dim=128))
# NOTE(review): per the Keras LSTM docs a non-zero recurrent_dropout rules out
# the cuDNN fast path on GPU; keep it only if the regularisation is worth the
# slower epochs - confirm against the training hardware.
model.add(LSTM(128,dropout=0.2,recurrent_dropout=0.2))
# One sigmoid output: a score in (0, 1) read as the probability of "positive".
model.add(Dense(1,activation='sigmoid'))
# Build with the padded sequence length (200) so summary() can report shapes
# and parameter counts before any data is seen.
model.build(input_shape=(None, 200))



In [18]:
# Print the layer-by-layer summary of the built model.
model.summary()

In [19]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# tracked as the reporting metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [20]:
# Train for 10 epochs in batches of 64, reserving 20% of the training arrays
# for per-epoch validation metrics.
model.fit(
    x=X_train,
    y=Y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m300s[0m 589ms/step - accuracy: 0.7218 - loss: 0.5364 - val_accuracy: 0.8497 - val_loss: 0.3988
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m293s[0m 531ms/step - accuracy: 0.8601 - loss: 0.3400 - val_accuracy: 0.8646 - val_loss: 0.3270
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m287s[0m 574ms/step - accuracy: 0.8785 - loss: 0.2987 - val_accuracy: 0.8673 - val_loss: 0.3321
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m310s[0m 549ms/step - accuracy: 0.8917 - loss: 0.2732 - val_accuracy: 0.8347 - val_loss: 0.3750
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m287s[0m 575ms/step - accuracy: 0.9005 - loss: 0.2598 - val_accuracy: 0.8671 - val_loss: 0.3394
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m290s[0m 581ms/step - accuracy: 0.9152 - loss: 0.2193 - val_accuracy: 0.8714 - val_loss: 0.3234
Epoc

<keras.src.callbacks.history.History at 0x21305def9e0>

In [21]:
# Persist the trained network to disk.
# NOTE(review): Keras 3 documents .h5 as a legacy format and recommends the
# native .keras format - keep HDF5 only if downstream loaders expect it.
model.save(filepath="model.h5")



In [23]:
import joblib

# Persist the fitted tokenizer next to the model so inference code can encode
# new text with the same word index.
joblib.dump(value=tokenizer, filename="tokenizer.pkl")

['tokenizer.pkl']

In [22]:
# Score the trained model on the held-out test split; returns [loss, accuracy].
model.evaluate(x=X_test, y=Y_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 110ms/step - accuracy: 0.8725 - loss: 0.3499


[0.3445354104042053, 0.8755999803543091]

In [33]:
def predictive_system(review,threshold=0.5,maxlen=200):
    """Classify a single review as ``"positive"`` or ``"negative"``.

    The review is encoded with the notebook-level ``tokenizer``, padded or
    truncated to ``maxlen`` tokens with ``pad_sequences`` and scored by the
    notebook-level ``model``.

    Args:
        review: Text of the review to classify.
        threshold: Score above which the review is labelled positive.
            Defaults to 0.5, the cut-off that was previously hard-coded.
        maxlen: Sequence length handed to ``pad_sequences``. Defaults to 200,
            the length the training inputs were padded to.

    Returns:
        ``"positive"`` when the model score is strictly greater than
        ``threshold``, otherwise ``"negative"``.
    """
    sequences=tokenizer.texts_to_sequences([review])
    padded_sequence=pad_sequences(sequences,maxlen=maxlen)
    # One review in -> one row out; the row holds the single sigmoid score.
    score=model.predict(padded_sequence)[0][0]
    return "positive" if score>threshold else "negative"

In [39]:
# Smoke-test the helper on a short, clearly positive phrase.
predictive_system(review="A virtual masterpiece")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step


'positive'