In [121]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [122]:
df=pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [123]:
#checking the class distribution i.e check how many positives and negatives are there in dataset
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [124]:
#convert potive to 1 and negative to -1
df.replace({"sentiment":{"positive":1, "negative":0}}, inplace=True)

  df.replace({"sentiment":{"positive":1, "negative":0}}, inplace=True)


In [125]:
df['sentiment'] = df['sentiment'].astype(int) 

In [126]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [127]:
#split data into traing and testing data
from sklearn.model_selection import train_test_split
train_data, test_data=train_test_split(df,test_size=0.2,random_state=23)

In [128]:
print(train_data.shape)
print(test_data.shape)

(40000, 2)
(10000, 2)


In [129]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [130]:
#Tokenizing: converting the text into tokens
tokens=Tokenizer(num_words=5000)  
#num_words: assigns numbers to the words in an order(considering common words) and the parameter limits the vocabulary size to the top 5000 most frequent words in the dataset

In [131]:
tokens.fit_on_texts(train_data['review'])
#fit_on_texts method takes a list or Series of text data i.e, reviews 
#Tokenizes the text into words, counts the frequency of each word and builds a word index that maps each word to a unique integer.

In [132]:
x_train=pad_sequences(tokens.texts_to_sequences(train_data['review']), maxlen=200)
x_test=pad_sequences(tokens.texts_to_sequences(test_data['review']), maxlen=200)
#pad_seqyences ensures all sequences are of the same length (max_len=200).
#i.e if it is shorter than 200, it is padded with zeros at the beginning and if it is longer than 200, it is truncated.

#tokens.texts_to_sequences() converts each review in the train and test data into a sequence of integers.
#and each word in the review is replaced by its corresponding integer ID from the tokenizer’s word index.

In [133]:
x_train

array([[   0,    0,    0, ..., 1167,  636, 2699],
       [  33,   77,   21, ...,   49, 2970,   17],
       [   0,    0,    0, ...,   77,  225,  416],
       ...,
       [   0,    0,    0, ...,   59,  138,  243],
       [   1,   80,  825, ...,  338,   10,  165],
       [ 906, 1382,   37, ...,    5,    1, 1110]], dtype=int32)

In [134]:
x_test

array([[   0,    0,    0, ...,  913,    8,    9],
       [  20,  344,   18, ...,   25,   76, 2699],
       [   0,    0,    0, ...,  180,  206, 2370],
       ...,
       [   0,    0,    0, ...,   11, 1987,   17],
       [  11,   19,  182, ...,   57, 1238, 1045],
       [ 414,   54,  281, ...,    1,  654, 2786]], dtype=int32)

In [135]:
x_train.shape

(40000, 200)

In [136]:
x_test.shape

(10000, 200)

In [137]:
y_train=train_data["sentiment"]
y_test=test_data["sentiment"]

In [138]:
y_train

20198    0
34103    1
40179    1
34586    0
30725    0
        ..
9704     0
11190    1
26569    0
9256     1
41555    0
Name: sentiment, Length: 40000, dtype: int64

In [139]:
y_test

49466    1
11621    0
39058    1
10033    0
22076    0
        ..
16074    0
26432    1
17868    0
36795    0
39347    0
Name: sentiment, Length: 10000, dtype: int64

In [140]:
'''
LSTM: Long Short Term Memory
Its a type of RNN and used for sequentical datasets like time series and text data
LSTM has the dependency of text in data
'''

'\nLSTM: Long Short Term Memory\nIts a type of RNN and used for sequentical datasets like time series and text data\nLSTM has the dependency of text in data\n'

In [141]:
#build the model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))


'''
Dropout rate for the input connections to prevent overfitting (20% of input neurons are randomly dropped during training).
Dropout rate for the recurrent connections within the LSTM (20% of recurrent neurons are randomly dropped).

Embedding Layer: Converts word indices to dense word embeddings.
LSTM Layer: Captures sequential dependencies and long-term context from the text.
Dense Layer: Produces the final probability for classification.

'''



'\nDropout rate for the input connections to prevent overfitting (20% of input neurons are randomly dropped during training).\nDropout rate for the recurrent connections within the LSTM (20% of recurrent neurons are randomly dropped).\n\nEmbedding Layer: Converts word indices to dense word embeddings.\nLSTM Layer: Captures sequential dependencies and long-term context from the text.\nDense Layer: Produces the final probability for classification.\n\n'

In [142]:
#model.build(input_shape=(None, 200))

In [143]:
model.summary()

In [144]:
#compile the model
#from keras.optimizers import Adam
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])

In [145]:
#train the model
model.fit(x_train,y_train, epochs=2, batch_size=64, validation_split=0.2)
#20%of training data is used for validation

Epoch 1/2
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 282ms/step - accuracy: 0.7289 - loss: 0.5277 - val_accuracy: 0.8069 - val_loss: 0.4309
Epoch 2/2
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 282ms/step - accuracy: 0.8265 - loss: 0.3945 - val_accuracy: 0.8551 - val_loss: 0.3529


<keras.src.callbacks.history.History at 0x7d6454d40fd0>

In [146]:
#evaluation of model
loss,accuracy=model.evaluate(x_test, y_test)
print(f"Test Loss:{loss}")
print(f"Test Accuracy:{accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 78ms/step - accuracy: 0.8485 - loss: 0.3576
Test Loss:0.3400472402572632
Test Accuracy:0.8593000173568726


In [147]:
#build a predictive system
def predict_sentiment(review):
    #tokenize and pad the review
    sequence=tokens.texts_to_sequences([review])
    padded_sequence= pad_sequences(sequence, maxlen=200)
    prediction=model.predict(padded_sequence)
    sentiment="positive" if prediction[0][0]>0.5 else "negative"
    return sentiment


In [148]:
#example usage:
new_review="This movie was fantastic. I loved it."
sentiment=predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 236ms/step
The sentiment of the review is: positive


In [149]:
#example usage:
new_review="This movie was bad."
sentiment=predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
The sentiment of the review is: negative


In [167]:
new_review="Rajdeep killed ramya"
sentiment=predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
The sentiment of the review is: negative
