In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('IMDB.csv')
df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [3]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [4]:
df.duplicated().sum()

277

In [5]:
df = df.drop_duplicates()

In [6]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [7]:
def clean_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stop words
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    
    # Join tokens back into a single string
    text = ' '.join(tokens)
    
    return text

In [8]:
df['text'] = df['text'].apply(clean_text)

In [9]:
x = df['text']
y = df['label']

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [11]:
tookenizer = Tokenizer(num_words=5000)
tookenizer.fit_on_texts(x_train)

In [12]:
x_train_seq = tookenizer.texts_to_sequences(x_train)
x_test_seq = tookenizer.texts_to_sequences(x_test)

In [13]:
max_len = 100
x_train_padded = pad_sequences(x_train_seq, maxlen=max_len, padding='post', truncating='post')
x_test_padded = pad_sequences(x_test_seq, maxlen=max_len, padding='post', truncating='post')

In [14]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
optimizer = Adam(learning_rate=0.001, clipvalue=1.0)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.build(input_shape=(None, max_len))
model.summary()



In [15]:
history = model.fit(
    x_train_padded, y_train,
    epochs=5,
    batch_size=64,
    validation_data=(x_test_padded, y_test)
)

Epoch 1/5
[1m497/497[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 153ms/step - accuracy: 0.5849 - loss: 0.6681 - val_accuracy: 0.6519 - val_loss: 0.6461
Epoch 2/5
[1m497/497[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 154ms/step - accuracy: 0.6619 - loss: 0.6339 - val_accuracy: 0.7281 - val_loss: 0.5710
Epoch 3/5
[1m497/497[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 156ms/step - accuracy: 0.7389 - loss: 0.5423 - val_accuracy: 0.7904 - val_loss: 0.4855
Epoch 4/5
[1m497/497[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 156ms/step - accuracy: 0.8030 - loss: 0.4537 - val_accuracy: 0.8341 - val_loss: 0.3824
Epoch 5/5
[1m497/497[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 157ms/step - accuracy: 0.8577 - loss: 0.3451 - val_accuracy: 0.8609 - val_loss: 0.3407


In [16]:
loss, accuracy = model.evaluate(x_test_padded, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 30ms/step - accuracy: 0.8647 - loss: 0.3347
Test Loss: 0.34066689014434814
Test Accuracy: 0.8609188199043274
