In [1]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 19% 5.00M/25.7M [00:00<00:00, 50.9MB/s]
100% 25.7M/25.7M [00:00<00:00, 152MB/s] 


In [2]:
!unzip imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [3]:
import pandas as pd

In [4]:
IMDB=pd.read_csv('IMDB Dataset.csv')
IMDB.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [13]:
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [14]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
lemmatizer=WordNetLemmatizer()

In [16]:
def preprocess_text(text):
  text=text.lower()

  #remove HTML tags
  text=re.sub(r'<.*?>',"",text)

  #remove non alphapitichal text
  text = re.sub(r'[^a-zA-Z\s]', '', text)

  words=nltk.word_tokenize(text)

  words=[lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]

  text=' '.join(words)

  return text

In [17]:
IMDB["cleaning_review"]=IMDB['review'].apply(preprocess_text)

In [19]:
IMDB.head()

Unnamed: 0,review,sentiment,cleaning_review
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode you...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 

In [22]:
Tokenizer=Tokenizer(num_words=5000)
Tokenizer.fit_on_texts(IMDB['cleaning_review'])

In [23]:
sequences = Tokenizer.texts_to_sequences(IMDB['review'])

In [24]:
x=pad_sequences(sequences,maxlen=100)

In [25]:
x.shape

(50000, 100)

In [26]:
y=IMDB['sentiment'].apply(lambda x:1 if x=='positive' else 0).values

In [29]:
y.shape

(50000,)

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [40]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((40000, 100), (10000, 100), (40000,), (10000,))

In [63]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,SimpleRNN,Dense,Input,Dropout,BatchNormalization

In [69]:
embedding_size = 100

model = Sequential()
model.add(Embedding(vocab_sz + 1, embedding_size, mask_zero=True, input_length=maxlen))

# Add a first RNN layer
model.add(SimpleRNN(100, return_sequences=True,recurrent_dropout=0.2,dropout=0.2)) 
model.add(Dropout(0.3))  
model.add(BatchNormalization()) 

# Add a second RNN layer
model.add(SimpleRNN(100, return_sequences=True,recurrent_dropout=0.2,dropout=0.2))
model.add(Dropout(0.3))
model.add(BatchNormalization())

# Add a third RNN layer
model.add(SimpleRNN(100,recurrent_dropout=0.2,dropout=0.2))
model.add(Dropout(0.3))

# Add a fully connected layer
model.add(Dense(64, activation='relu'))  
model.add(BatchNormalization())
model.add(Dropout(0.3)) 

# Output layer
model.add(Dense(2, activation='softmax'))
model.summary()

In [70]:
model.summary()

In [71]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [72]:
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss=SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

In [73]:
model.fit(
    X_train,y_train,
    epochs=25,
    batch_size=32,
    validation_data=(X_test,y_test)
)

Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 12ms/step - accuracy: 0.6617 - loss: 0.5803 - val_accuracy: 0.8589 - val_loss: 0.3175
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - accuracy: 0.8739 - loss: 0.3102 - val_accuracy: 0.8639 - val_loss: 0.3238
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - accuracy: 0.8888 - loss: 0.2765 - val_accuracy: 0.8642 - val_loss: 0.3147
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 11ms/step - accuracy: 0.8969 - loss: 0.2619 - val_accuracy: 0.8710 - val_loss: 0.3125
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 11ms/step - accuracy: 0.9001 - loss: 0.2558 - val_accuracy: 0.8669 - val_loss: 0.3323
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 11ms/step - accuracy: 0.9032 - loss: 0.2416 - val_accuracy: 0.8635 - val_loss: 0.3505
Epoc

<keras.src.callbacks.history.History at 0x7a66571bee00>

In [84]:
predection=model.predict(X_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [86]:
loss,accuracy=model.evaluate(X_test,y_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8652 - loss: 0.3274


In [82]:
y_test[500]

0

In [88]:
model.save("IMDB_model.keras")

In [89]:
from tensorflow.keras.models import load_model

In [90]:
model=load_model("IMDB_model.keras")

  saveable.load_own_variables(weights_store.get(inner_path))


In [117]:
def preprocess_given_text(text, tokenizer):
    sequence = tokenizer.texts_to_sequences([text])

    padded_sequence = pad_sequences(sequence, maxlen=100)

    return padded_sequence

In [118]:
def predict_sentiment(text):
   padded_sequence = preprocess_given_text(text, Tokenizer)

   prediction = model.predict(padded_sequence)

   if prediction[0][0] > prediction[0][1]:
       return "Negative"
   else:
       return "Positive"

In [124]:
predict_sentiment("I couldn't even finish watching it. It was that bad—poor acting, predictable plot, and awful dialogue.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step


'Negative'