## IMPORTING THE DATASET

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split as tests
import re
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.datasets import imdb
from tensorflow.keras.utils import to_categorical



## PREPARING DATA

In [2]:
max_feature =1000
maxlen=80 # maximum lenght of the sentensce or letter is 80
batch_size=32

In [3]:
(x_train,y_train),(x_test,y_test)=imdb.load_data(num_words=max_feature)

In [31]:
print(x_train[1])

[125  68   2   2  15 349 165   2  98   5   4 228   9  43   2   2  15 299
 120   5 120 174  11 220 175 136  50   9   2 228   2   5   2 656 245   2
   5   4   2 131 152 491  18   2  32   2   2  14   9   6 371  78  22 625
  64   2   9   8 168 145  23   4   2  15  16   4   2   5  28   6  52 154
 462  33  89  78 285  16 145  95]


In [5]:
print(y_train)

[1 0 0 ... 0 1 0]


In [6]:
x_train=sequence.pad_sequences(x_train,maxlen=maxlen)
x_test=sequence.pad_sequences(x_test,maxlen=maxlen)
print(x_train.shape)
print(x_test.shape)

(25000, 80)
(25000, 80)


In [7]:
print(y_train.shape)
print(y_test.shape)

(25000,)
(25000,)


In [8]:
x_train[1]

array([125,  68,   2,   2,  15, 349, 165,   2,  98,   5,   4, 228,   9,
        43,   2,   2,  15, 299, 120,   5, 120, 174,  11, 220, 175, 136,
        50,   9,   2, 228,   2,   5,   2, 656, 245,   2,   5,   4,   2,
       131, 152, 491,  18,   2,  32,   2,   2,  14,   9,   6, 371,  78,
        22, 625,  64,   2,   9,   8, 168, 145,  23,   4,   2,  15,  16,
         4,   2,   5,  28,   6,  52, 154, 462,  33,  89,  78, 285,  16,
       145,  95])

## VISUALIZE THE DATA

In [9]:
# Define index shift
INDEX_FROM = 3  

# Load IMDB word index
word_to_id = imdb.get_word_index()

# Explicitly shift all word IDs using a loop
updated_word_to_id = {}  # Empty dictionary
for word, idx in word_to_id.items():  # Loop through each word in the dataset
    updated_word_to_id[word] = idx + INDEX_FROM  # Adjust the index
word_to_id = updated_word_to_id  # Update the dictionary

# Add special tokens explicitly
word_to_id["<PAD>"] = 0    # Padding token
word_to_id["<START>"] = 1  # Start token
word_to_id["<UNK>"] = 2    # Unknown word token

# Explicitly create reverse mapping (ID to Word)
id_to_word = {}  # Empty dictionary for reverse lookup
for word, idx in word_to_id.items():  # Loop through dictionary items
    id_to_word[idx] = word  # Assign reversed key-value pairs

# Function to decode a tokenized review using an explicit loop
def decode_review(encoded_review):
    decoded_words = []  # Store decoded words
    for id in encoded_review:  # Loop through each ID in the review
        word = id_to_word.get(id, "<UNK>")  # Lookup the word, default to <UNK>
        decoded_words.append(word)  # Add the word to the list
    return " ".join(decoded_words)  # Convert list into sentence

# Example: Decoding the second review from x_train using the function
print(decode_review(x_train[1]))

off their <UNK> <UNK> that men actually <UNK> them and the music is just <UNK> <UNK> that plays over and over again in almost every scene there is <UNK> music <UNK> and <UNK> taking away <UNK> and the <UNK> still doesn't close for <UNK> all <UNK> <UNK> this is a truly bad film whose only <UNK> is to look back on the <UNK> that was the <UNK> and have a good old laugh at how bad everything was back then


In [17]:
print("MODEL BUILDING")
model=Sequential()
model.add(Embedding(max_feature,8))
model.add(LSTM(16,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(1,activation="sigmoid"))
model.summary()

MODEL BUILDING


In [20]:
model.compile(loss="binary_crossentropy",optimizer='adam',metrics=['accuracy'])
model

<Sequential name=sequential_6, built=False>

In [22]:
model.fit(x_train,y_train,batch_size=128,epochs=25,validation_data=(x_test,y_test))

Epoch 1/25
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.8227 - loss: 0.4061 - val_accuracy: 0.8052 - val_loss: 0.4213
Epoch 2/25
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.8234 - loss: 0.4034 - val_accuracy: 0.7989 - val_loss: 0.4270
Epoch 3/25
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.8268 - loss: 0.3969 - val_accuracy: 0.8014 - val_loss: 0.4243
Epoch 4/25
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.8262 - loss: 0.3964 - val_accuracy: 0.8035 - val_loss: 0.4211
Epoch 5/25
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.8243 - loss: 0.3940 - val_accuracy: 0.8060 - val_loss: 0.4148
Epoch 6/25
[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.8287 - loss: 0.3877 - val_accuracy: 0.8076 - val_loss: 0.4135
Epoch 7/25
[1m196/196

<keras.src.callbacks.history.History at 0x11f169ab050>

In [23]:
model

<Sequential name=sequential_6, built=True>

## TESTING

In [24]:
score,acc=model.evaluate(x_test,y_test,batch_size=128)
print(f'test score{score}')
print(f"test accuracy{acc}")

[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8040 - loss: 0.4205
test score0.41366448998451233
test accuracy0.80867999792099


## PREDICTING

In [30]:
prediction=model.predict(x_train[1:2])
print('prediction value:',prediction[0])
print('Test Label:',y_train[1:2])
# if predicted is near 1 means the reviwes is positive
# if predicted is near 0 means the reviews is negative

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
prediction value: [0.11404576]
Test Label: [0]
