In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [6]:
# Attach Google Drive to the Colab filesystem; the dataset path used below lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Load the pickled IMDB review DataFrame from the mounted Drive and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so this path
# should only ever point at a pickle produced by a trusted source.
IMDB_PICKLE_PATH = '/content/drive/My Drive/Northwestern MS/453 - Natural Language Processing/imdb/imdb_dataset.pkl'
df = pd.read_pickle(IMDB_PICKLE_PATH)
df.head()

Unnamed: 0,review,sentiment,processed
0,One of the other reviewers has mentioned that ...,positive,"[[reviewer, mention, watch, oz, episode, hook,..."
1,A wonderful little production. <br /><br />The...,positive,"[[wonderful, little, production, filming, tech..."
2,I thought this was a wonderful way to spend ti...,positive,"[[think, wonderful, way, spend, time, hot, sum..."
3,Basically there's a family where a little boy ...,negative,"[[basically, family, little, boy, jake, think,..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[[petter, matteis, love, time, money, visually..."


In [8]:
# Rebuild one whitespace-delimited string per review from the nested token lists so TF-IDF can consume it
def _join_token_groups(token_groups):
    """Flatten a sequence of token sequences into a single space-separated string."""
    return ' '.join(' '.join(group) for group in token_groups)


df['processed_text'] = df['processed'].apply(_join_token_groups)

In [9]:
# Hold out 30% of the reviews, then halve that hold-out: 70% train / 15% validation / 15% test.
# Both calls use the same fixed seed so the partition is reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    df['processed_text'],
    df['sentiment'],
    test_size=0.3,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)


In [10]:
# Vectorize the text with TF-IDF over unigrams, bigrams and trigrams (ngram_range=(1, 3)),
# capped at 5000 features. The vectorizer is fitted on the training split only; the
# validation and test splits are transformed with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=5000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(split) for split in (X_val, X_test)
)

In [11]:
# Tune the smoothing parameter (alpha) of Multinomial Naive Bayes with 5-fold
# cross-validation on the training split, selecting by accuracy.
nb_params = dict(alpha=[0.1, 0.5, 1.0, 2.0])
nb_classifier = MultinomialNB()
nb_grid = GridSearchCV(
    estimator=nb_classifier,
    param_grid=nb_params,
    scoring='accuracy',
    cv=5,
)
nb_grid.fit(X_train_tfidf, y_train)
print("Best Naive Bayes Params:", nb_grid.best_params_)

# Check the selected configuration against the held-out validation split.
y_val_pred_nb = nb_grid.predict(X_val_tfidf)
nb_val_accuracy = accuracy_score(y_val, y_val_pred_nb)
print("Naive Bayes Validation Accuracy:", nb_val_accuracy)
print(classification_report(y_val, y_val_pred_nb))

Best Naive Bayes Params: {'alpha': 2.0}
Naive Bayes Validation Accuracy: 0.8529333333333333
              precision    recall  f1-score   support

    negative       0.86      0.84      0.85      3689
    positive       0.85      0.87      0.86      3811

    accuracy                           0.85      7500
   macro avg       0.85      0.85      0.85      7500
weighted avg       0.85      0.85      0.85      7500



In [12]:
# Hyperparameter tuning for Logistic Regression.
# The grid covers the inverse regularisation strength C and the penalty type (L1 vs L2),
# scored by accuracy with 5-fold cross-validation on the training split.
# liblinear is the only solver in the grid; it accepts both penalties.
lr_params = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}
# liblinear uses random_state to shuffle the data, so fix the seed (same value as the
# train/validation/test split) to make the search results repeatable across runs.
lr_classifier = LogisticRegression(max_iter=1000, random_state=42)
lr_grid = GridSearchCV(lr_classifier, lr_params, cv=5, scoring='accuracy')
lr_grid.fit(X_train_tfidf, y_train)
print("Best Logistic Regression Params:", lr_grid.best_params_)

# Check the selected configuration against the held-out validation split.
y_val_pred_lr = lr_grid.predict(X_val_tfidf)
print("Logistic Regression Validation Accuracy:", accuracy_score(y_val, y_val_pred_lr))
print(classification_report(y_val, y_val_pred_lr))

Best Logistic Regression Params: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Logistic Regression Validation Accuracy: 0.8830666666666667
              precision    recall  f1-score   support

    negative       0.89      0.87      0.88      3689
    positive       0.88      0.90      0.89      3811

    accuracy                           0.88      7500
   macro avg       0.88      0.88      0.88      7500
weighted avg       0.88      0.88      0.88      7500



In [13]:
# Predict the held-out test split with both tuned searches
# (Naive Bayes first, then Logistic Regression).
y_test_pred_nb, y_test_pred_lr = (
    tuned_search.predict(X_test_tfidf) for tuned_search in (nb_grid, lr_grid)
)

In [14]:
# Report test-set accuracy and the per-class metrics for Naive Bayes.
nb_test_accuracy = accuracy_score(y_test, y_test_pred_nb)
nb_test_report = classification_report(y_test, y_test_pred_nb)
print("Naive Bayes Test Accuracy:", nb_test_accuracy)
print(nb_test_report)

Naive Bayes Test Accuracy: 0.8534666666666667
              precision    recall  f1-score   support

    negative       0.87      0.83      0.85      3722
    positive       0.84      0.88      0.86      3778

    accuracy                           0.85      7500
   macro avg       0.85      0.85      0.85      7500
weighted avg       0.85      0.85      0.85      7500



In [15]:
# Report test-set accuracy and the per-class metrics for Logistic Regression.
lr_test_accuracy = accuracy_score(y_test, y_test_pred_lr)
lr_test_report = classification_report(y_test, y_test_pred_lr)
print("Logistic Regression Test Accuracy:", lr_test_accuracy)
print(lr_test_report)

Logistic Regression Test Accuracy: 0.8853333333333333
              precision    recall  f1-score   support

    negative       0.90      0.87      0.88      3722
    positive       0.88      0.90      0.89      3778

    accuracy                           0.89      7500
   macro avg       0.89      0.89      0.89      7500
weighted avg       0.89      0.89      0.89      7500



Conclusion:
Overall Accuracy: On the test set, Logistic Regression reaches 88.53% accuracy versus 85.35% for Naive Bayes, a gap of about 3.2 percentage points.

Precision and Recall:

Logistic Regression shows higher precision and recall for both the negative and positive classes compared to Naive Bayes. This indicates that Logistic Regression is more effective at correctly identifying both positive and negative sentiments.
Specifically, Logistic Regression achieves a precision of 0.90 for the negative class and 0.88 for the positive class, while Naive Bayes achieves 0.87 and 0.84, respectively.
In terms of recall, Logistic Regression scores 0.87 for the negative class and 0.90 for the positive class, whereas Naive Bayes scores 0.83 and 0.88, respectively.
F1-Score: Logistic Regression also shows superior F1-scores for both classes, reflecting a better balance between precision and recall compared to Naive Bayes.

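Macro and Weighted Averages: The macro and weighted averages for precision, recall, and F1-score are higher for Logistic Regression (0.89 versus 0.85 in every case), indicating better overall performance across both classes. Because the two classes are almost evenly represented in the test set (3,722 negative and 3,778 positive reviews), the macro and weighted averages coincide.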

Summary:
Logistic Regression is the more effective model for this sentiment analysis task, as evidenced by its higher accuracy, precision, recall, and F1-scores across both the negative and positive sentiment classes compared to the Naive Bayes classifier. The validation results show the same gap (88.31% versus 85.29% accuracy), so the advantage is not specific to the test split. Therefore, for this specific dataset and task, Logistic Regression is recommended over Naive Bayes.