In [1]:
import re
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
import sklearn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

In [2]:
# Read the labelled movie-review CSV (expected in the working directory) into a DataFrame.
DATA_PATH = "MovieReviewTrainingDatabase.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,sentiment,review
0,Positive,With all this stuff going down at the moment w...
1,Positive,'The Classic War of the Worlds' by Timothy Hin...
2,Negative,The film starts with a manager (Nicholas Bell)...
3,Negative,It must be assumed that those who praised this...
4,Positive,Superbly trashy and wondrously unpretentious 8...


In [4]:
# Number of rows in the dataset.
len(df.index)

25000

In [5]:
# Encode the text labels as integers (1 = positive, 0 = negative) so they are easier to interpret.

def conv_sentiment(sentiment):
    """Return 1 when ``sentiment`` equals "Positive"; every other value maps to 0."""
    return 1 if sentiment == "Positive" else 0

# Derive the numeric label column from the textual sentiment column.
sentiment_labels = df["sentiment"]
df["sentiment_binary"] = sentiment_labels.apply(conv_sentiment)

In [6]:
# Preview the first five rows, now including sentiment_binary (head() defaults to n=5).
df.head()

Unnamed: 0,sentiment,review,sentiment_binary
0,Positive,With all this stuff going down at the moment w...,1
1,Positive,'The Classic War of the Worlds' by Timothy Hin...,1
2,Negative,The film starts with a manager (Nicholas Bell)...,0
3,Negative,It must be assumed that those who praised this...,0
4,Positive,Superbly trashy and wondrously unpretentious 8...,1


In [7]:
# Print column dtypes and non-null counts for the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   sentiment         25000 non-null  object
 1   review            25000 non-null  object
 2   sentiment_binary  25000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [8]:
# The text label is redundant once sentiment_binary exists, so drop that column.

df = df.drop(columns="sentiment")

In [9]:
# Preview the first five rows after dropping the text label (head() defaults to n=5).
df.head()

Unnamed: 0,review,sentiment_binary
0,With all this stuff going down at the moment w...,1
1,'The Classic War of the Worlds' by Timothy Hin...,1
2,The film starts with a manager (Nicholas Bell)...,0
3,It must be assumed that those who praised this...,0
4,Superbly trashy and wondrously unpretentious 8...,1


In [10]:
# Lowercase every review in place.

lowercased_reviews = df['review'].str.lower()
df['review'] = lowercased_reviews

In [11]:
# Preview the first five rows after lowercasing (head() defaults to n=5).
df.head()

Unnamed: 0,review,sentiment_binary
0,with all this stuff going down at the moment w...,1
1,'the classic war of the worlds' by timothy hin...,1
2,the film starts with a manager (nicholas bell)...,0
3,it must be assumed that those who praised this...,0
4,superbly trashy and wondrously unpretentious 8...,1


In [12]:
# Remove digits, punctuation and surrounding whitespace, tokenize, and drop English stopwords.

@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    return frozenset(stopwords.words("english"))


def clean(review):
    """Normalise one review string into space-separated tokens.

    Steps, in order:
      1. drop every digit character and strip leading/trailing whitespace;
      2. tokenize with ``word_tokenize`` and discard tokens that are stopwords;
      3. from each remaining token remove every character that is not a word
         character, whitespace, ``!`` or ``-``, then treat ``-`` as a separator;
      4. discard resulting pieces that are stopwords or empty;
      5. join the surviving pieces with single spaces.

    Filtering again *after* punctuation is stripped means pieces that only
    become stopwords once cleaned (for example the halves of a hyphenated
    word) are removed in the same call, and punctuation-only tokens no longer
    leave runs of spaces behind. Running the function on its own output is
    therefore expected to change little or nothing.

    Args:
        review: the raw review text (a ``str``).

    Returns:
        The cleaned text as a single string.
    """
    stopword_set = _english_stopwords()
    without_digits = ''.join(ch for ch in review if not ch.isdigit())

    kept = []
    for token in word_tokenize(without_digits.strip()):
        if token in stopword_set:
            continue
        # Same character whitelist as before; hyphens become word separators.
        stripped = re.sub(r'[^\w\s\!-]', "", token).replace("-", " ")
        # split() also discards tokens that were reduced to the empty string.
        kept.extend(part for part in stripped.split() if part not in stopword_set)
    return ' '.join(kept)

In [13]:
# Run clean over every review twice in a row; the second pass works on the first pass's output.
cleaned_once = df['review'].apply(clean)
df['review'] = cleaned_once.apply(clean)

In [14]:
# Spot check: print one cleaned review (row label 31) to inspect the spacing between words.
sample_review = df.loc[31, 'review']
print(sample_review)

simon pegg plays rude crude often control celebrity journalist brought england work big american magazine course winning ways create sorts complications amusing fact based comedy co stars kristen dunst looking rather grown danny huston jeff bridges works primarily like simon pegg despite bad behavior completely understand kristen dunst continues talk despite frequent screw ups liked film end nice way cap evening sitting couch watching movies


In [15]:
# Split into train/test sets, then build TF-IDF features.

from sklearn.feature_extraction.text import TfidfVectorizer

# Raw (still textual) reviews and their binary sentiment labels.
X, y = df['review'], df['sentiment_binary']

# 80/20 split, stratified on the label so both sets keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
    stratify=y,
)

# Fit the vocabulary and IDF weights on the training reviews only,
# then reuse the fitted vectorizer to transform the test reviews.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [16]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features, using the default hyper-parameters.
model = LogisticRegression()
model.fit(X=X_train_vectorized, y=y_train)

In [17]:
from sklearn.metrics import accuracy_score

# Score the logistic-regression model fitted in the previous cell on the held-out test set.
y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
# The label previously read "Logistic Regressionm" (typo).
print(f"Logistic Regression Accuracy: {accuracy:.4f}")

Logistic Regressionm Accuracy: 0.8892


In [18]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the TF-IDF features; fit() returns the fitted estimator.
model = MultinomialNB().fit(X_train_vectorized, y_train)
y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f"Multinomial Naive Bayes Accuracy: {accuracy:.4f}")

Multinomial Naive Bayes Accuracy: 0.8652


In [19]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees and a fixed seed.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params).fit(X_train_vectorized, y_train)
y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.4f}")

Random Forest Accuracy: 0.8476


In [20]:
from sklearn.svm import SVC

# SVM trained on a 15% subsample of the training set, because the full set takes
# too long to train (author's note: a 30% subsample only reached about 87% accuracy).
X_sampled, _, y_sampled, _ = train_test_split(
    X_train_vectorized,
    y_train,
    test_size=0.85,
    random_state=42,
)

# RBF kernel chosen because the data is unlikely to be linearly separable.
model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42).fit(X_sampled, y_sampled)
y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with RBF kernel: {accuracy:.4f}")

Accuracy with RBF kernel: 0.8532


In [26]:
import xgboost
from xgboost import XGBClassifier

# Gradient-boosted trees; the hyper-parameters below are exploratory settings.
xgb_params = {
    'eval_metric': 'logloss',
    'n_estimators': 250,       # number of trees
    'learning_rate': 0.3,      # learning rate
    'max_depth': 9,            # maximum depth of each tree
    'subsample': 0.9,          # fraction of samples used per tree
    'colsample_bytree': 0.9,   # fraction of features used per tree
    'gamma': 1,                # minimum loss reduction for a split
    'reg_lambda': 1,           # L2 regularization
    'random_state': 42,
}
model = XGBClassifier(**xgb_params).fit(X_train_vectorized, y_train)
y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with XGBoost: {accuracy:.4f}")

Accuracy with XGBoost: 0.8542


In [34]:
import lightgbm
from lightgbm import LGBMClassifier

# LightGBM with 500 boosting rounds and a fixed seed. Author's note: it scores slightly
# better than XGBoost here, probably due to optimizations for sparse data.
lgb_model = LGBMClassifier(n_estimators=500, random_state=42).fit(X_train_vectorized, y_train)

# Predict on the held-out test set and report accuracy.
y_pred_lgb = lgb_model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred_lgb)
print(f"Accuracy with LightGBM: {accuracy:.4f}")

[LightGBM] [Info] Number of positive: 10000, number of negative: 10000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.396783 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 442724
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 10161
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Accuracy with LightGBM: 0.8714


In [39]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler  # imported but not used in this cell

# Keep 20% of the training rows so the network trains quickly.
X_sampled, _, y_sampled, _ = train_test_split(
    X_train_vectorized,
    y_train,
    test_size=0.8,
    random_state=42,
)

# Small network configured for short training time.
mlp_params = {
    'hidden_layer_sizes': (50,),   # a single hidden layer of 50 neurons
    'max_iter': 150,               # cap on training iterations
    'solver': 'adam',              # Adam optimizer
    'early_stopping': True,        # stop when the validation score stops improving
    'validation_fraction': 0.1,    # 10% of the training sample held out for early stopping
    'alpha': 0.0001,               # small L2 regularization
    'random_state': 42,
}
mlp_model = MLPClassifier(**mlp_params)

# Fit on the subsample, then predict on the full held-out test set.
mlp_model.fit(X_sampled, y_sampled)
y_pred_mlp = mlp_model.predict(X_test_vectorized)

In [40]:
# Accuracy of the MLP predictions on the held-out test set.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_mlp)
print(f"Accuracy with NN: {accuracy:.4f}")

Accuracy with NN: 0.8606
