In [19]:
# Importing Libraries
import pandas as pd
import re
import nltk
import keras
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
# Downloading NLTK Resources
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pradeesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Pradeesh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
# Load Dataset
data = pd.read_csv("IMDB Dataset.csv")

In [22]:
def clean_review(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [23]:
# Apply Cleaning
data['review'] = data['review'].apply(clean_review)

In [24]:
# Remove Stopwords
stop_words = set(stopwords.words('english'))
data['review'] = data['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [25]:
# Lemmatization
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

In [26]:
def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

data['review'] = data['review'].apply(lemmatize_text)

In [27]:
# Encode Sentiment
encoder = LabelEncoder()
data['sentiment'] = encoder.fit_transform(data['sentiment'])

In [28]:
# Train-Test Split
X = data['review'].values
y = data['sentiment'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# Tokenization and Padding
tokenizer = Tokenizer(num_words=3000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

In [30]:

train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=200)


test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=200)


In [31]:
# Model Building
model = keras.Sequential([
    keras.layers.Embedding(3000, 100, input_length=200),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

In [32]:
# Compile Model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 100)          300000    
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              84480     
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 24)                3096      
                                                                 
 dense_3 (Dense)             (None, 1)                 25        
                                                                 
Total params: 387,601
Trainable params: 387,601
Non-trainable params: 0
_________________________________________________________________


In [33]:
# Train Model
history = model.fit(train_padded, y_train, epochs=5, verbose=1, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [34]:
# Predictions
pred_probs = model.predict(test_padded)
pred_labels = [1 if prob >= 0.5 else 0 for prob in pred_probs]



In [37]:
# Evaluation
print("Accuracy on test set:", accuracy_score(y_test, pred_labels))
print(classification_report(y_test, pred_labels))

Accuracy on test set: 0.87
              precision    recall  f1-score   support

           0       0.91      0.82      0.86      4961
           1       0.84      0.92      0.88      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [38]:
# Inference
sample_sentences = [
    "The movie was very touching and heart whelming", 
    "I have never seen a terrible movie like this", 
    "The movie plot is terrible but it had good acting"
]

In [36]:
sample_sequences = tokenizer.texts_to_sequences(sample_sentences)
sample_padded = pad_sequences(sample_sequences, padding='post', maxlen=200)

sample_preds = model.predict(sample_padded)
sample_labels = ['Positive' if p >= 0.5 else 'Negative' for p in sample_preds]

for sentence, label in zip(sample_sentences, sample_labels):
    print(f"{sentence}\nPredicted Sentiment: {label}\n")

The movie was very touching and heart whelming
Predicted Sentiment: Positive

I have never seen a terrible movie like this
Predicted Sentiment: Negative

The movie plot is terrible but it had good acting
Predicted Sentiment: Negative

