In [None]:
!pip install contractions

In [2]:
#Importing all the necessary libraries
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
import contractions
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import regularizers

In [3]:
# Seed every random number generator used in this notebook
# (TensorFlow, NumPy and Python's built-in random module) with the same value.
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

In [None]:
# Load the IMDB 50k movie-review dataset.
# Source: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
# Download the CSV, upload it to your environment and point file_path at it.
file_path = './IMDB_Dataset.csv'  # adjust to match your environment
imdb_data = pd.read_csv(file_path)

# Sanity check: the frame's dimensions, followed by its first 25 rows
print(imdb_data.shape, imdb_data.head(25), sep='\n')

In [None]:
# Fetch NLTK's stopword corpus, then keep the English entries in a set
# (clean_text tests each word for membership in it).
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

In [6]:
# Encode the sentiment labels as integers: "positive" -> 1, "negative" -> 0.
# The identity entries (1 -> 1, 0 -> 0) keep this cell safe to re-run: without
# them a second run would map the already-encoded integers to NaN and the
# integer cast below would fail.
sentiment_codes = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
encoded_sentiment = imdb_data['sentiment'].map(sentiment_codes)

# Fail with a readable message if the column holds a label outside the mapping
unmapped = encoded_sentiment.isna()
if unmapped.any():
    unexpected_labels = imdb_data.loc[unmapped, 'sentiment'].unique().tolist()
    raise ValueError(f"Unexpected sentiment labels: {unexpected_labels}")

# Ensure the column ends up with an integer dtype
imdb_data['sentiment'] = encoded_sentiment.astype(int)

In [7]:
#Function to clean a review before tokenization
def clean_text(text):
    """Normalize one raw review string for tokenization.

    Steps, in order:
      1. Replace the literal '<br />' HTML tag with a space.
      2. Expand contractions (e.g. "don't" -> "do not").
      3. Drop every character that is not an ASCII letter or whitespace
         (digits and punctuation are removed, not just numbers).
      4. Lowercase the text.
      5. Remove words found in the module-level ``stop_words`` set.

    Contractions are expanded *before* punctuation is stripped: once the
    apostrophe is gone, a form such as "I'll" has already collapsed into
    "ill" and can no longer be recognized as a contraction.

    Args:
        text: The raw review as a string.

    Returns:
        The cleaned review: lowercase words separated by single spaces.
    """
    text = re.sub('<br />', ' ', text)  # HTML line-break removal
    text = contractions.fix(text)  # must run while apostrophes are still present
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # keep only letters and whitespace
    text = text.lower()
    # NOTE(review): NLTK's English stopword list presumably contains negations
    # such as "not" -- confirm that dropping them is intended for sentiment work.
    return ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
# Run every review through clean_text and store the result back in the frame
cleaned_reviews = imdb_data['review'].apply(clean_text)
imdb_data['review'] = cleaned_reviews

# Preview the first five rows after cleaning
imdb_data.head(5)

In [None]:
# Hold out 15% of the reviews for testing; the remaining 85% are used for training
X, y = imdb_data['review'], imdb_data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.15,
)

# Report how many samples landed in each split
print(f"Training set: {len(X_train)} reviews and {len(y_train)} sentiments")
print(f"Testing set: {len(X_test)} reviews and {len(y_test)} sentiments")

In [None]:
# Length every review sequence is padded (or cut) to, for easier processing
maxlen = 150

# Fit the tokenizer on the training reviews only, with the vocabulary
# limited to 30,000 words
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(X_train)

# Convert the reviews of each split into sequences of word indices
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Bring every sequence to the same length
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (X_train_seq, X_test_seq)
)

print(f"Padded training data shape: {X_train_pad.shape}")
print(f"Padded testing data shape: {X_test_pad.shape}")

In [None]:
# Hyperparameters for the network: input length, vocabulary size, embedding width
maxlen = 150
max_vocab_size = 30000
embedding_dim = 200

# Architecture: embedding -> dropout -> bidirectional LSTM -> dense (ReLU)
# -> dropout -> single sigmoid unit for the binary sentiment output.
# The LSTM and the hidden dense layer both carry an L2 kernel penalty.
model = Sequential([
    Embedding(
        input_dim=max_vocab_size,
        output_dim=embedding_dim,
        input_length=maxlen,
    ),
    Dropout(0.2),
    Bidirectional(
        LSTM(
            128,
            dropout=0.2,
            recurrent_dropout=0.2,
            kernel_regularizer=regularizers.l2(0.01),
        )
    ),
    Dense(
        128,
        activation='relu',
        kernel_regularizer=regularizers.l2(0.01),
    ),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Show the layer-by-layer summary
model.summary()

# Both callbacks watch the validation loss:
#   - early_stopping ends training after 3 epochs without improvement and
#     restores the best weights seen;
#   - reduce_lr halves the learning rate after 2 epochs without improvement.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
)

# Fit for up to 5 epochs, setting aside 10% of the training data for validation
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.1,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)

# Measure loss and accuracy on the held-out test split
loss, accuracy = model.evaluate(x=X_test_pad, y=y_test, verbose=1)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Side-by-side training curves: accuracy on the left, loss on the right
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: training vs. validation accuracy per epoch
accuracy_ax.plot(history.history['accuracy'], label='Training Accuracy')
accuracy_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
accuracy_ax.set_xlabel('Epochs')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend()
accuracy_ax.set_title('Model Accuracy')

# Right panel: training vs. validation loss per epoch
loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
loss_ax.set_title('Model Loss')

plt.show()

# Predict on the test split and threshold the sigmoid outputs at 0.5
# to obtain a flat array of 0/1 labels
predictions = (model.predict(X_test_pad) > 0.5).astype(int).flatten()

# Show the first ten test reviews (first 100 characters each)
# next to their true and predicted labels
print("Sample Predictions:")
for i in range(10):
    print(
        f"Review: {X_test.iloc[i][:100]}...",
        f"True Sentiment: {y_test.iloc[i]}, Predicted Sentiment: {predictions[i]}",
        "-" * 80,
        sep="\n",
    )

In [None]:
# Define a function to preprocess handwritten reviews
def preprocess_review(review, tokenizer, maxlen):
    """Turn one raw review string into a padded index sequence for the model.

    The text is cleaned with ``clean_text`` -- the same function applied to the
    training reviews -- so that inference input always goes through exactly the
    same normalization as the data the tokenizer and model were fitted on.

    Args:
        review: The raw review text.
        tokenizer: The fitted tokenizer used to map words to indices.
        maxlen: Length the resulting sequence is padded/truncated to.

    Returns:
        The output of ``pad_sequences`` for this single review (one row of
        ``maxlen`` indices).
    """
    # Clean the review with the shared training-time pipeline
    cleaned_review = clean_text(review)

    # Tokenize and pad the review (wrapped in a list: the tokenizer expects a batch)
    review_seq = tokenizer.texts_to_sequences([cleaned_review])
    return pad_sequences(review_seq, maxlen=maxlen)

# Function to predict sentiment for a list of reviews
def predict_reviews(reviews, model, tokenizer, maxlen):
    """Predict "Positive"/"Negative" for each raw review.

    All reviews are preprocessed first and then scored with a single
    ``model.predict`` call, instead of invoking the model once per review.

    Args:
        reviews: Iterable of raw review strings.
        model: Trained model whose ``predict`` returns one score per row.
        tokenizer: Fitted tokenizer, forwarded to ``preprocess_review``.
        maxlen: Sequence length, forwarded to ``preprocess_review``.

    Returns:
        A list of ``(review, sentiment)`` tuples in input order, where
        ``sentiment`` is "Positive" when the score exceeds 0.5 and "Negative"
        otherwise. An empty input yields an empty list.
    """
    reviews = list(reviews)  # allow any iterable and permit a second pass
    if not reviews:
        # Nothing to score; avoid calling the model on an empty batch
        return []

    # Each preprocessed review is a single padded row; stack them into one batch
    batch = np.vstack(
        [preprocess_review(review, tokenizer, maxlen) for review in reviews]
    )
    scores = model.predict(batch, verbose=0)

    return [
        (review, "Positive" if score[0] > 0.5 else "Negative")
        for review, score in zip(reviews, scores)
    ]

# Example reviews written by hand (i.e. not taken from the IMDB dataset),
# used below to try the trained model on unseen text. They mix clearly
# positive, clearly negative and mixed-opinion statements.
handwritten_reviews = [
    "I absolutely loved this movie! The acting was great and the story was so touching.",
    "This film was a complete waste of time. The plot was terrible, and the acting was even worse.",
    "It had some good moments, but overall, it was just okay.",
    "One of the best movies I've seen in a long time. Highly recommended!",
    "I couldn't finish watching it. It was too boring.",
    "I absolutely hated this movie! The acting was bad and the story was so boring.",
    "The movie had some interesting twists, but the pacing was too slow for my liking.",
    "A masterpiece! The cinematography was stunning, and the storyline was gripping.",
    "It was alright, but I expected more. It didn't live up to the hype.",
    "An emotional rollercoaster. I was on the edge of my seat the entire time.",
    "The special effects were fantastic, but the story didn't resonate with me.",
    "I found the plot to be too predictable and lacking any real excitement.",
    "An entertaining film, but not something I'd watch again.",
    "I loved the soundtrack! It really added to the atmosphere of the movie.",
    "A solid movie, but it didn't stand out compared to others in the genre.",
    "Too much CGI for my taste. It felt like the plot was secondary to the effects.",
    "The ending was so satisfying! Everything came together beautifully.",
    "It had its moments, but overall, it was too cheesy for me.",
    "A visually stunning film with a powerful performance from the lead actor.",
    "The pacing was uneven, but I enjoyed the overall concept of the film.",
    "One of those films that sticks with you long after it's over. Truly impactful.",
    "I wasn't a fan of the humor. It felt forced and out of place in the storyline.",
    "The direction was brilliant, and the performances were top-notch.",
    "It was a fun, lighthearted movie, perfect for a family night.",
    "The plot was weak, but the characters were strong and well-developed.",
    "I didn't understand the hype. It didn't live up to my expectations."
]

# Score the handwritten reviews with the trained model
handwritten_results = predict_reviews(handwritten_reviews, model, tokenizer, maxlen)

# Print each review (first 100 characters), numbered from 1, with its predicted label
print("Predictions for Handwritten Reviews:")
for number, (text, label) in enumerate(handwritten_results, start=1):
    print(
        f"Review {number}: {text[:100]}...",
        f"Predicted Sentiment: {label}",
        "-" * 80,
        sep="\n",
    )