In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.utils import resample
from transformers import pipeline, AutoTokenizer

OSError: [WinError 126] The specified module could not be found. Error loading "c:\Users\sanja\OneDrive\Documents\ADA project\.venv\Lib\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.

In [None]:
# Load the dataset
df = pd.read_csv('imdb_master.csv', encoding='ISO-8859-1')

In [None]:
# Drop the unnamed leftmost column
df = df.drop(df.columns[0], axis=1)

In [None]:
# Filter out 'unsup' rows
df = df[df['label'] != 'unsup']

In [None]:
# Shuffle the data
df = df.sample(frac=1, random_state=42)

In [None]:
# Take the first 100 reviews
df_sample = df.head(100)

In [None]:
# Separate the data into features and labels
X = df_sample['review']
y = df_sample['label'].apply(lambda x: 1 if x == 'pos' else 0)  # Convert labels to binary (1 for positive, 0 for negative)

In [None]:
# Check the distribution of labels
print(y.value_counts())  # Ensure there are both 0s and 1s

label
0    50
1    50
Name: count, dtype: int64


In [None]:
# Handle class imbalance (if any)
df_combined = pd.concat([X, y], axis=1)
df_majority = df_combined[df_combined['label'] == 0]
df_minority = df_combined[df_combined['label'] == 1]

In [None]:
# Upsample the minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,    # sample with replacement
                                 n_samples=len(df_majority), # to match majority class
                                 random_state=42) 

In [None]:
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [None]:
# Separate back into features and labels
X_upsampled = df_upsampled['review']
y_upsampled = df_upsampled['label']

In [None]:
# Prepare models
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'DistilBERT (Transformers)': pipeline("sentiment-analysis")
}

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [None]:
# Prepare the vectorizer
tfidf = TfidfVectorizer(max_features=5000)

In [None]:
# Function to calculate and print metrics
def print_metrics(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred)

    print(f"\n{model_name} Metrics:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Classification Report:\n{report}")

In [None]:
# Tokenizer for DistilBERT
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Create a function to run sentiment analysis using the specified model
def run_sentiment_analysis(X, y, model_name, model):
    if model_name == 'DistilBERT (Transformers)':
        # Tokenize and truncate the inputs
        inputs = tokenizer(list(X), padding=True, truncation=True, max_length=512, return_tensors="pt")
        results = model(inputs["input_ids"])
        predictions = [1 if res['label'] == 'POSITIVE' else 0 for res in results]
    else:
        # Using traditional ML models
        X_tfidf = tfidf.fit_transform(X)
        model.fit(X_tfidf, y)
        predictions = model.predict(X_tfidf)
    
    # Print metrics
    print_metrics(y, predictions, model_name)

In [None]:
# Run sentiment analysis with each model
for model_name, model in models.items():
    run_sentiment_analysis(X_upsampled, y_upsampled, model_name, model)


Logistic Regression Metrics:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-Score: 1.00
Confusion Matrix:
[[50  0]
 [ 0 50]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        50

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100


Naive Bayes Metrics:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-Score: 1.00
Confusion Matrix:
[[50  0]
 [ 0 50]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        50

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100


Support Vector Machine Metrics:
Accuracy: 1.00
Precisio

ImportError: Unable to convert output to PyTorch tensors format, PyTorch is not installed.