In [1]:
# Part A: IMDb Movie Review Sentiment Analysis
# 1. Data Exploration and Preprocessing 
# Tasks:
# Analyze the dataset: This includes checking for missing values, identifying class imbalances, and analyzing the length and structure of the reviews.
# Data cleaning and preprocessing:
# Remove stop words, punctuation, and special characters.
# Tokenize the text (split the text into individual words).
# Perform lemmatization and stemming to reduce words to their base form.
# Use vectorization techniques like Bag-of-Words (BoW) and TF-IDF to convert text into numerical features.

# Steps:
# Load the dataset and check for trends:

import pandas as pd

# Load the IMDb dataset 
df = pd.read_csv('imdb.csv')  

# Display basic information about the dataset
print(f"Dataset shape: {df.shape}")
print(df.head())  # First few rows

# Check for missing values
print(f"Missing values:\n{df.isnull().sum()}")

# Check the distribution of sentiments
print(f"Sentiment distribution:\n{df['sentiment'].value_counts()}")

Dataset shape: (50000, 2)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Missing values:
review       0
sentiment    0
dtype: int64
Sentiment distribution:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [3]:
# step 2
# Text Preprocessing:
# Remove stop words, punctuation, and special characters.
# Tokenization: Split the reviews into individual words.
# Lemmatization: Convert words to their base form.
# Vectorization: Use TF-IDF or Bag-of-Words to convert text into a numerical format.

import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to clean and preprocess text
def preprocess_text(text):
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    
    # Tokenization and lemmatization
    tokens = word_tokenize(text.lower())  # Convert text to lowercase and tokenize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

# Apply preprocessing to each review
df['cleaned_review'] = df['review'].apply(preprocess_text)

# Display the cleaned data
df.head()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching 1 oz episode y...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


In [5]:
# step 3
# Vectorization:
# TF-IDF (Term Frequency-Inverse Document Frequency) is a common technique for transforming text into numerical 
# features that capture the importance of terms in a document.

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)  # Limit the features for simplicity
X = vectorizer.fit_transform(df['cleaned_review']).toarray()
y = df['sentiment'].map({'positive': 1, 'negative': 0})  # Convert sentiment to binary (1=positive, 0=negative)

# Check the shape of the transformed features
print(f"Shape of feature matrix X: {X.shape}")

Shape of feature matrix X: (50000, 5000)


In [7]:
# Task 2. Feature Engineering 
# Tasks:
# Feature extraction: Besides TF-IDF, we can extract additional textual features such as word count, character count, and average word length.

# Steps:1 
# Extracting textual features:

# Additional features: word count, character count, average word length
df['word_count'] = df['cleaned_review'].apply(lambda x: len(x.split()))
df['char_count'] = df['cleaned_review'].apply(lambda x: len(x.replace(" ", "")))  # Remove spaces for char count
df['avg_word_length'] = df['char_count'] / df['word_count']

# Combine these features with the TF-IDF features
X_additional = df[['word_count', 'char_count', 'avg_word_length']].values

# Concatenate the features
import numpy as np
X_final = np.hstack((X, X_additional))  # Combine TF-IDF features and additional features

print(f"Shape of final feature matrix: {X_final.shape}")

Shape of final feature matrix: (50000, 5003)


In [None]:
# 3. Model Development 
# Tasks:
# Build and train classification models: Experiment with various classification algorithms such as:
# Logistic Regression
# Naive Bayes
# Support Vector Machine (SVM)
# Random Forest
# Neural Networks (e.g., LSTM, BERT)

# step 1 Steps:
# Train models using different algorithms:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)
# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# Evaluate the Logistic Regression model
print("Logistic Regression:")
print(f"Accuracy: {accuracy_score(y_test, lr_pred)}")
print(classification_report(y_test, lr_pred))

# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

# Evaluate Naive Bayes model
print("Naive Bayes:")
print(f"Accuracy: {accuracy_score(y_test, nb_pred)}")
print(classification_report(y_test, nb_pred))

# Support Vector Machine
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

# Evaluate SVM model
print("Support Vector Machine:")
print(f"Accuracy: {accuracy_score(y_test, svm_pred)}")
print(classification_report(y_test, svm_pred))

# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Evaluate Random Forest model
print("Random Forest:")
print(f"Accuracy: {accuracy_score(y_test, rf_pred)}")
print(classification_report(y_test, rf_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression:
Accuracy: 0.8632
              precision    recall  f1-score   support

           0       0.87      0.85      0.86      4961
           1       0.86      0.88      0.87      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Naive Bayes:
Accuracy: 0.8246
              precision    recall  f1-score   support

           0       0.82      0.82      0.82      4961
           1       0.83      0.83      0.83      5039

    accuracy                           0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000



In [None]:
# task 4 4. Model Evaluation 
# Tasks:
# Evaluate the model’s performance using accuracy, precision, recall, F1-score, and confusion matrix.
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion Matrix for Logistic Regression as an example
conf_matrix = confusion_matrix(y_test, lr_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix - Logistic Regression')
plt.show()

# Displaying classification report for better metrics visualization
print("Classification Report (Logistic Regression):")
print(classification_report(y_test, lr_pred))
