In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.utils import resample

In [18]:
# Load the dataset
df = pd.read_csv('imdb_master.csv', encoding='ISO-8859-1')

In [19]:
# Drop the unnamed leftmost column
df = df.drop(df.columns[0], axis=1)

In [20]:
# Filter out 'unsup' rows
df = df[df['label'] != 'unsup']

In [21]:
# Shuffle the data
df = df.sample(frac=1, random_state=42)

In [22]:
# Take the first 100 reviews
df_sample = df.head(10000)

In [23]:
# Separate the data into features and labels
X = df_sample['review']
y = df_sample['label'].apply(lambda x: 1 if x == 'pos' else 0)  # Convert labels to binary (1 for positive, 0 for negative)

In [24]:
# Check the distribution of labels
print(y.value_counts())  # Ensure there are both 0s and 1s

label
0    5055
1    4945
Name: count, dtype: int64


In [25]:
# Handle class imbalance (if any)
df_combined = pd.concat([X, y], axis=1)
df_majority = df_combined[df_combined['label'] == 0]
df_minority = df_combined[df_combined['label'] == 1]

In [26]:
# Upsample the minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,    # sample with replacement
                                 n_samples=len(df_majority), # to match majority class
                                 random_state=42) 

In [27]:
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [28]:
# Separate back into features and labels
X_upsampled = df_upsampled['review']
y_upsampled = df_upsampled['label']

In [29]:
# Prepare models
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier()
}

In [30]:
# Prepare the vectorizer
tfidf = TfidfVectorizer(max_features=5000)

In [31]:
# Function to calculate and print metrics
def print_metrics(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)
    conf_matrix = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred, zero_division=1)

    print(f"\n{model_name} Metrics:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Classification Report:\n{report}")

# Function to run sentiment analysis using the specified model
def run_sentiment_analysis(X, y, model_name, model):
    # Using traditional ML models
    X_tfidf = tfidf.fit_transform(X)
    model.fit(X_tfidf, y)
    predictions = model.predict(X_tfidf)
    
    # Print metrics
    print_metrics(y, predictions, model_name)


In [32]:
# Run sentiment analysis with each model
for model_name, model in models.items():
    run_sentiment_analysis(X_upsampled, y_upsampled, model_name, model)


Logistic Regression Metrics:
Accuracy: 0.93
Precision: 0.92
Recall: 0.95
F1-Score: 0.93
Confusion Matrix:
[[4638  417]
 [ 276 4779]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      5055
           1       0.92      0.95      0.93      5055

    accuracy                           0.93     10110
   macro avg       0.93      0.93      0.93     10110
weighted avg       0.93      0.93      0.93     10110


Naive Bayes Metrics:
Accuracy: 0.89
Precision: 0.89
Recall: 0.88
F1-Score: 0.89
Confusion Matrix:
[[4528  527]
 [ 593 4462]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.90      0.89      5055
           1       0.89      0.88      0.89      5055

    accuracy                           0.89     10110
   macro avg       0.89      0.89      0.89     10110
weighted avg       0.89      0.89      0.89     10110


Support Vector Machine Metrics:
Accurac