In [1]:
!pip install kaggle




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Importing the Dependencies

In [52]:
import json
import os
from zipfile import ZipFile

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

**Data Collection - Kaggle API**

In [4]:
# Load the Kaggle API credentials from the local kaggle.json file.
# The context manager closes the file handle as soon as parsing finishes, and the
# explicit encoding keeps the read independent of the platform's default code page.
# NOTE(review): kaggle.json holds a secret API key — keep it out of version control.
with open("kaggle.json", encoding="utf-8") as credentials_file:
    kaggle_dictionary = json.load(credentials_file)

In [5]:
# Export the Kaggle credentials as environment variables, one
# (environment variable, credentials key) pair at a time.
for env_name, credential_key in (("KAGGLE_USERNAME", "username"), ("KAGGLE_KEY", "key")):
    os.environ[env_name] = kaggle_dictionary[credential_key]

In [6]:
# Download the IMDB 50k movie-review dataset archive into the working directory using the Kaggle CLI.
# NOTE(review): presumably authenticates via the KAGGLE_USERNAME / KAGGLE_KEY environment variables — confirm.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to D:\sahith projects\Sentiment analysis




  0%|          | 0.00/25.7M [00:00<?, ?B/s]
  4%|3         | 1.00M/25.7M [00:01<00:38, 673kB/s]
  8%|7         | 2.00M/25.7M [00:01<00:18, 1.35MB/s]
 12%|#1        | 3.00M/25.7M [00:02<00:11, 2.00MB/s]
 16%|#5        | 4.00M/25.7M [00:02<00:09, 2.47MB/s]
 19%|#9        | 5.00M/25.7M [00:02<00:07, 2.83MB/s]
 23%|##3       | 6.00M/25.7M [00:02<00:06, 3.10MB/s]
 27%|##7       | 7.00M/25.7M [00:03<00:05, 3.31MB/s]
 31%|###1      | 8.00M/25.7M [00:03<00:05, 3.23MB/s]
 35%|###5      | 9.00M/25.7M [00:03<00:05, 3.16MB/s]
 39%|###8      | 10.0M/25.7M [00:04<00:05, 3.09MB/s]
 43%|####2     | 11.0M/25.7M [00:04<00:05, 2.81MB/s]
 47%|####6     | 12.0M/25.7M [00:04<00:05, 2.80MB/s]
 51%|#####     | 13.0M/25.7M [00:05<00:04, 3.20MB/s]
 54%|#####4    | 14.0M/25.7M [00:05<00:03, 3.10MB/s]
 58%|#####8    | 15.0M/25.7M [00:05<00:03, 3.54MB/s]
 62%|######2   | 16.0M/25.7M [00:06<00:02, 3.67MB/s]
 66%|######6   | 17.0M/25.7M [00:06<00:02, 3.72MB/s]
 70%|#######   | 18.0M/25.7M [00:06<00:02, 3.76MB/s]
 7

In [9]:
# List the working directory (Windows `dir`) to confirm the zip archive was downloaded.
!dir

 Volume in drive D is DATA
 Volume Serial Number is EACC-602E

 Directory of D:\sahith projects\Sentiment analysis

25-12-2024  18:12    <DIR>          .
25-12-2024  16:40    <DIR>          ..
25-12-2024  16:45    <DIR>          .ipynb_checkpoints
19-10-2019  19:25        26,962,657 imdb-dataset-of-50k-movie-reviews.zip
25-12-2024  17:58                68 kaggle.json
25-12-2024  18:12             7,845 Sentiment analysis by IMDB Reviews.ipynb
               3 File(s)     26,970,570 bytes
               3 Dir(s)  488,234,999,808 bytes free


In [11]:
# Unpack every member of the downloaded archive into the current working directory
with ZipFile("imdb-dataset-of-50k-movie-reviews.zip", mode="r") as imdb_archive:
    imdb_archive.extractall(path=os.getcwd())

In [12]:
# Names-only listing to confirm that "IMDB Dataset.csv" was extracted.
!dir/b

.ipynb_checkpoints
IMDB Dataset.csv
imdb-dataset-of-50k-movie-reviews.zip
kaggle.json
Sentiment analysis by IMDB Reviews.ipynb


Loading the Dataset

In [13]:
# Read the extracted CSV into a DataFrame (columns: review text and sentiment label).
data = pd.read_csv("IMDB Dataset.csv")

In [14]:
# (rows, columns) of the loaded frame
data.shape

(50000, 2)

In [15]:
# Preview the first rows: review text alongside its sentiment label
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [36]:
# Check the class balance of the sentiment column.
# NOTE(review): the saved output lists the labels as 1/0, so this cell was last executed
# after the labels had already been encoded; on a fresh run it reports positive/negative.
data["sentiment"].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

In [37]:
# Encode the labels as integers: "positive" -> 1, "negative" -> 0.
# The result is rebound instead of mutated in place, and the column is cast explicitly:
# newer pandas releases deprecate the silent object -> int downcast inside `replace`,
# so without the cast the column could stay `object` dtype. The cast also fails loudly
# if an unexpected label is present. Re-running the cell is a no-op.
data = data.replace({"sentiment": {"positive": 1, "negative": 0}}).astype({"sentiment": "int64"})

In [38]:
# Preview again to check how the sentiment column looks after the label encoding
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [39]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
train_data, test_data = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)

In [40]:
# Show the (rows, columns) of each split, one per line
print(train_data.shape, test_data.shape, sep="\n")

(40000, 2)
(10000, 2)


Data Preprocessing

Tokenize text data

In [42]:
# Fit the vocabulary on the training reviews only, so nothing from the test split
# influences the word index.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])


def _encode_reviews(reviews):
    """Turn raw review texts into integer sequences padded/truncated to 200 tokens."""
    return pad_sequences(tokenizer.texts_to_sequences(reviews), maxlen=200)


X_train = _encode_reviews(train_data["review"])
X_test = _encode_reviews(test_data["review"])

In [43]:
# Show the padded integer matrices for both splits (NumPy abbreviates large arrays)
print(X_train, X_test, sep="\n")

[[1935    1 1200 ...  205  351 3856]
 [   3 1651  595 ...   89  103    9]
 [   0    0    0 ...    2  710   62]
 ...
 [   0    0    0 ... 1641    2  603]
 [   0    0    0 ...  245  103  125]
 [   0    0    0 ...   70   73 2062]]
[[   0    0    0 ...  995  719  155]
 [  12  162   59 ...  380    7    7]
 [   0    0    0 ...   50 1088   96]
 ...
 [   0    0    0 ...  125  200 3241]
 [   0    0    0 ... 1066    1 2305]
 [   0    0    0 ...    1  332   27]]


In [44]:
# Target vectors: the encoded sentiment column of each split
Y_train = train_data.loc[:, "sentiment"]
Y_test = test_data.loc[:, "sentiment"]

In [70]:
# Inspect the training labels (a pandas Series that keeps the original row index).
# The `numpy` import that used to sit here now lives in the import cell at the top.
print(Y_train)

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64


LSTM - Long Short-Term Memory

In [71]:
# build the model

# Seed the Python, NumPy and TensorFlow random generators so weight initialisation
# and dropout are repeatable between runs, as far as the backend allows.
# Uses the same seed as the train/test split.
tf.keras.utils.set_random_seed(42)

model = Sequential()
# input_dim=5000 matches Tokenizer(num_words=5000). The former `input_length=100`
# argument was dropped: it is deprecated in Keras 3 and contradicted the 200-token
# padding applied to the data.
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit: a score in (0, 1) for the positive class (label 1)
model.add(Dense(1, activation="sigmoid"))

# Create the weights up front so `model.summary()` can report parameter counts.
# Shape is (batch, sequence_length): any batch size, 200 tokens to match
# `pad_sequences(..., maxlen=200)`. No dummy array is needed — `build` only takes a shape.
model.build(input_shape=(None, 200))

In [65]:
# Record the TensorFlow version this notebook was run with.
# `tf` is imported in the import cell at the top of the notebook.
print(tf.__version__)

2.18.0


In [73]:
# Print the layer-by-layer summary of the model
model.summary()

In [74]:
# Configure training: Adam optimizer, binary cross-entropy loss for the 0/1 target,
# and accuracy as the reported metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

Training the Model

In [75]:
# Train for 5 epochs in mini-batches of 64, holding out 20% of the training rows
# for validation. Left as the last expression so the History object is displayed.
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m271s[0m 532ms/step - accuracy: 0.7213 - loss: 0.5328 - val_accuracy: 0.8158 - val_loss: 0.4319
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 520ms/step - accuracy: 0.8408 - loss: 0.3789 - val_accuracy: 0.8518 - val_loss: 0.3485
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m283s[0m 567ms/step - accuracy: 0.8712 - loss: 0.3195 - val_accuracy: 0.8627 - val_loss: 0.3302
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m285s[0m 571ms/step - accuracy: 0.8889 - loss: 0.2787 - val_accuracy: 0.8716 - val_loss: 0.3083
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m274s[0m 549ms/step - accuracy: 0.8761 - loss: 0.2992 - val_accuracy: 0.8668 - val_loss: 0.3306


<keras.src.callbacks.history.History at 0x29eb7fb7320>

Model Evaluation

In [79]:
# Score the model on the held-out test split, then report both metrics
loss, accuracy = model.evaluate(x=X_test, y=Y_test)
print(f"Test Loss: {loss}", f"Test Accuracy: {accuracy}", sep="\n")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 169ms/step - accuracy: 0.8761 - loss: 0.3149
Test Loss: 0.31580615043640137
Test Accuracy: 0.8755000233650208


Building a Predictive System

In [90]:
def predict_sentiment(review, threshold=0.5):
    """Classify a single review as "positive" or "negative".

    Uses the notebook-level `tokenizer` and `model` objects, so both must have
    been created (and the model trained) before this is called.

    Args:
        review: Raw review text for one review.
        threshold: Score the model output must strictly exceed for the review
            to be labelled "positive". Defaults to 0.5.

    Returns:
        "positive" if the predicted score is greater than `threshold`,
        otherwise "negative".
    """
    # tokenize and pad the review; maxlen matches the padding applied to the training data
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=200)
    # one input row in, one row with a single sigmoid score out
    prediction = model.predict(padded_sequence)
    score = float(prediction[0][0])
    return "positive" if score > threshold else "negative"

In [91]:
# Sanity check on a review with strongly positive wording
new_review = "This movie is extraordinary and beautiful. I enjoyed it !"
sentiment = predict_sentiment(review=new_review)
print("The sentiment of the review is: " + sentiment)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
The sentiment of the review is: positive


In [94]:
# Sanity check on a short, lukewarm review
new_review = "This movie is ok !"
sentiment = predict_sentiment(review=new_review)
print("The sentiment of the review is: " + sentiment)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
The sentiment of the review is: negative


In [95]:
# Sanity check on a short recommendation
new_review = "This movie is worth watch !"
sentiment = predict_sentiment(review=new_review)
print("The sentiment of the review is: " + sentiment)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step
The sentiment of the review is: positive


In [93]:
# Sanity check on a review with strongly negative wording
new_review = "This movie is worst and time waste!"
sentiment = predict_sentiment(review=new_review)
print("The sentiment of the review is: " + sentiment)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step
The sentiment of the review is: negative
