In [2]:
# Import all necessary libraries for NLP
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load in and read IMDB dataset, output first 5 rows to test
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# Explore and clean data - counts of positive vs negative reviews
df['sentiment'].value_counts()


In [6]:
# Explore and clean data - sum of null values
df.isnull().sum()


Unnamed: 0,0
review,0
sentiment,0


In [9]:
# Preprocess data: clean and tokenize by removing all HTML tags and keep only lowercase characters (remove all other characters). Store in function
def clean_text(text):
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    return text

df['review_clean'] = df['review'].apply(clean_text)

In [10]:
# Turn text into numeric values using TF-IDF vectorization, map sentiment values as 1 and 0
tfidf = TfidfVectorizer(
    max_features = 5000, # Limit to top 5000 most common terms for processing time and memory
    ngram_range=(1,2), # Captures unigrams and bigrams (single words like 'data' as well as 2 consecutive words like 'data science' for more context)
    stop_words='english' # Remove English stop words like 'a', 'an', 'the', 'is', 'are'
)

X = tfidf.fit_transform(df['review_clean'])
y = df['sentiment'].map({'positive': 1, 'negative': 0})

In [11]:
# Split data into training and testing set, set 20% aside for testing, startify y if sentiment proportion is imbalanced
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [12]:
# Train classification model: Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

In [13]:
# Train classification model: Naive Bayes
nb = MultinomialNB()
nb.fit(X_train, y_train)

In [14]:
# Run predictions and evaluate models: Logistic Regression
log_reg_pred = log_reg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, log_reg_pred))
print("Logistic Regression Classification Report:\n", classification_report(y_test, log_reg_pred))

Logistic Regression Accuracy: 0.8899
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.88      0.89      5000
           1       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [15]:
# Run predictions and evaluate models: Naive Bayes
nb_pred = nb.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_pred))
print("Naive Bayes Classification Report:\n", classification_report(y_test, nb_pred))

Naive Bayes Accuracy: 0.8587
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.84      0.86      5000
           1       0.85      0.88      0.86      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [16]:
# Visualize most common positive vs negative words - logistic regression
feature_names_log_reg = tfidf.get_feature_names_out()
coefficients = log_reg.coef_[0]

top_positive_words_log_reg = np.argsort(coefficients)[-20:]
top_negative_words_log_reg = np.argsort(coefficients)[:20]

print("Top Positive Words:")
print([feature_names_log_reg[word] for word in top_positive_words_log_reg])

print("Top Negative Words:")
print([feature_names_log_reg[word] for word in top_negative_words_log_reg])

Top Positive Words:
['perfectly', 'definitely', 'gem', 'funniest', 'fun', 'enjoyable', 'fantastic', 'today', 'enjoyed', 'superb', 'loved', 'favorite', 'brilliant', 'hilarious', 'wonderful', 'best', 'amazing', 'perfect', 'great', 'excellent']
Top Negative Words:
['worst', 'awful', 'bad', 'waste', 'boring', 'poor', 'dull', 'terrible', 'poorly', 'worse', 'horrible', 'disappointment', 'fails', 'disappointing', 'annoying', 'unfortunately', 'instead', 'lame', 'avoid', 'lacks']


In [17]:
# Visualize most common positive vs negative words - Naive Bayes
feature_names_nb = tfidf.get_feature_names_out()

log_prob_positive = nb.feature_log_prob_[1]
log_prob_negative = nb.feature_log_prob_[0]

top_positive_idx = np.argsort(log_prob_positive)[-20:]
top_positive_words_nb = feature_names_nb[top_positive_idx]

top_negative_idx = np.argsort(log_prob_negative)[-20:]
top_negative_words_nb = feature_names_nb[top_negative_idx]

print("Top Positive Words:")
print(top_positive_words_nb)

print("Top Negative Words:")
print(top_negative_words_nb)

Top Positive Words:
['characters' 'way' 'seen' 'watch' 'films' 'movies' 'think' 'life'
 'people' 'best' 'really' 'love' 'time' 'just' 'story' 'like' 'good'
 'great' 'movie' 'film']
Top Negative Words:
['characters' 'did' 'worst' 'better' 'watch' 'people' 'make' 'acting'
 'movies' 'plot' 'story' 'don' 'time' 'really' 'good' 'just' 'bad' 'like'
 'film' 'movie']


We built 2 different sentiment classifier models (logistic regression and Naive Bayes) to classify positive and negative sentiment in IMDb movie reviews. To do this, we split the dataset into training and testing sets to train/fit each model and then test/evaluate the models respectively. Cleaning and preprocessing of the data was required, which included tokenization and TF-IDF vectorization. After analyzing both models' performance, it seems the logistic regression model performed better - with higher accuracy, precision, recall, and F1-scores. Upon analyzing which words were most indicative in determining positive vs negative sentiment, the logistic regression model seemed more accurate (using words like 'enjoyable', 'fantastic', 'fun' for positive reviews and 'worst', 'awful', 'bad' for negative reviews) than the naive bayes model (used more neutral words such as 'characters', 'watch', and 'people' in classifying both positive and negative reviews).