# **Sentiment Analysis on IMDB Reviews with LSTM**

## **Import Dependencies**

In [29]:
import os
import json

from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## **Data Collection (Kaggle API)**

In [8]:
# Configure the Kaggle API credentials: copy kaggle.json from the current
# directory into ~/.kaggle and restrict the copy to owner read/write (chmod 600).
# kaggle.json contains the account's API credentials, so keep it out of version control.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [9]:
# Download the IMDB 50k movie-review dataset archive with the Kaggle CLI.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [10]:
# Extract the compressed dataset into the current working directory.
dataset = '/content/imdb-dataset-of-50k-movie-reviews.zip'

# Bind the archive to `archive`, not `zip`: a notebook-level variable called
# `zip` would shadow the builtin zip() for every cell run afterwards.
with ZipFile(dataset, 'r') as archive:
  archive.extractall()
  print('The dataset is extracted')

The dataset is extracted


In [11]:
!ls

'IMDB Dataset.csv'   imdb-dataset-of-50k-movie-reviews.zip   kaggle.json   sample_data


## Loading the Dataset

In [12]:
# Load the extracted CSV into a DataFrame.
data = pd.read_csv('/content/IMDB Dataset.csv')

In [13]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [14]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [15]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [16]:
## Encoding
# Map the string labels to integers (positive -> 1, negative -> 0).
# The column is assigned back instead of using replace(..., inplace=True), and
# the cast to int64 is explicit so the labels are numeric no matter how the
# installed pandas version infers the dtype after `replace`.
# Re-running the cell is a no-op; an unexpected label makes the cast raise
# instead of silently leaving strings in the column.
label_map = {'positive': 1, 'negative': 0}
data['sentiment'] = data['sentiment'].replace(label_map).astype('int64')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [19]:
## Split data into training and testing data
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle so the
# split is the same on every run.
train_data, test_data = train_test_split(data, test_size=0.2,random_state=0)

In [20]:
print(train_data.shape)
print(test_data.shape)

(40000, 2)
(10000, 2)


## **Data Preprocessing**

In [22]:
## Tokenize Text data
# Fit the tokenizer (vocabulary capped with num_words=5000) on the training
# reviews only, so the test reviews do not influence the word index.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['review'])

# Turn each split into integer sequences, then bring every sequence to a
# fixed length of 200 so both splits form rectangular arrays.
train_sequences = tokenizer.texts_to_sequences(train_data['review'])
test_sequences = tokenizer.texts_to_sequences(test_data['review'])

X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)

In [31]:
print(X_train)
print("---------------------------------------")
print(X_test)

[[   0    0    0 ...    6  681   68]
 [  21  335 1825 ...   72  681 1419]
 [   7    9   45 ...    7  701  155]
 ...
 [   0    0    0 ...    1   79  661]
 [   0    0    0 ...    3 2550 1419]
 [   0    0    0 ...  265  853  267]]
---------------------------------------
[[   0    0    0 ...    4    1  278]
 [   0    0    0 ...   52   52  346]
 [   0    0    0 ...   41   11   56]
 ...
 [  71    3  592 ...  297  715  440]
 [   0    0    0 ...  412   26   50]
 [   0    0    0 ...   32  310 1360]]


In [30]:
# Target labels for each split: the integer-encoded `sentiment` column
# (1 = positive, 0 = negative).
y_train = train_data['sentiment']
y_test = test_data['sentiment']

## **LSTM Model**

In [33]:
# Binary sentiment classifier built as a single layer list:
#   Embedding : token ids (vocabulary of 5000, sequences of 200) -> 128-d vectors
#   LSTM      : 128 units, with input and recurrent dropout of 0.2
#   Dense     : one sigmoid unit giving a score in [0, 1]
# NOTE(review): `input_length` is reported as deprecated in newer Keras
# releases — confirm against the installed version before upgrading.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=200),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])


In [34]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 128)          640000    
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 771713 (2.94 MB)
Trainable params: 771713 (2.94 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [35]:
## Compile the model
# Adam optimizer with binary cross-entropy loss; accuracy is tracked as the metric.
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])

**Training The Model**

In [36]:
# Train for 5 epochs with batches of 64, holding out 20% of the training data
# for validation (validation_split=0.2).
model.fit(X_train,y_train,epochs=5,batch_size=64,validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x788b0b3bf6a0>

**Model Evaluation**

In [37]:
# Score the trained model on the held-out test split; the result is unpacked
# into the loss and the accuracy metric configured at compile time.
loss, accuracy = model.evaluate(X_test,y_test)
print(f'Loss: {loss}\nAccuracy: {accuracy}')

Loss: 0.42226821184158325
Accuracy: 0.8709999918937683


## **Building A Predictive System**

In [49]:
from collections.abc import Sequence
def predict_sentiment(review, threshold=0.5, max_len=200):
  """Classify a single review as 'positive' or 'negative'.

  Relies on the notebook-level `tokenizer` and `model` objects, so both must
  already be fitted/trained when this function is called.

  Args:
    review: Raw review text to classify.
    threshold: Scores strictly below this value are labelled 'negative';
      every other score is labelled 'positive'. Defaults to 0.5.
    max_len: Sequence length passed to `pad_sequences`. It should match the
      length used when preparing the training data (200 in this notebook).

  Returns:
    The string 'negative' or 'positive'.
  """
  # The tokenizer works on a batch of texts, so wrap the single review in a list.
  sequence = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequence, maxlen=max_len)

  # verbose=0 keeps a one-sample prediction from printing a progress bar.
  prediction = model.predict(padded_sequence, verbose=0)

  # One review in, one output unit: the score sits at position [0][0].
  score = prediction[0][0]
  return 'negative' if score < threshold else 'positive'

In [51]:
## Example Usage
# Smoke-test predict_sentiment on a short, positive review.
new_review = "I liked it!!"
sentiment = predict_sentiment(new_review)

print(f'The sentiment of the review is: {sentiment}')

The sentiment of the review is: positive


In [55]:
## Example Usage
# NOTE(review): the output recorded for this cell is "negative" even though the
# sentence reads as clearly positive. Check how the model behaves on very short
# reviews before relying on predict_sentiment for inputs like this one.
new_review = "That's an amazing movie."
sentiment = predict_sentiment(new_review)

print(f'The sentiment of the review is: {sentiment}')

The sentiment of the review is: negative
