In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from transformers import pipeline

In [11]:
# Load the dataset
df = pd.read_csv('imdb_master.csv', encoding='ISO-8859-1')

# Drop the unnamed leftmost column
df = df.drop(df.columns[0], axis=1)

In [12]:
df = df.sample(frac=1, random_state=42)

In [13]:
# Take the first 100 reviews
df_sample = df.head(100)

# Separate the data into training and test sets
X = df_sample['review']
y = df_sample['label'].apply(lambda x: 1 if x == 'pos' else 0)  # Convert labels to binary (1 for positive, 0 for negative)


In [14]:
# Check the distribution of labels
print(y.value_counts())  # Ensure there are both 0s and 1s

label
0    79
1    21
Name: count, dtype: int64


In [15]:
# Prepare models
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'DistilBERT (Transformers)': pipeline("sentiment-analysis")
}

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [16]:
# Prepare the vectorizer
tfidf = TfidfVectorizer(max_features=5000)


In [17]:
# Function to calculate and print metrics
def print_metrics(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred)

    print(f"\n{model_name} Metrics:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Classification Report:\n{report}")


In [18]:
# Create a function to run sentiment analysis using the specified model
def run_sentiment_analysis(X, y, model_name, model):
    if model_name == 'DistilBERT (Transformers)':
        # Using a pre-trained DistilBERT model
        results = model(list(X))
        predictions = [1 if res['label'] == 'POSITIVE' else 0 for res in results]
    else:
        # Using traditional ML models
        X_tfidf = tfidf.fit_transform(X)
        model.fit(X_tfidf, y)
        predictions = model.predict(X_tfidf)
    
    # Print metrics
    print_metrics(y, predictions, model_name)

In [20]:
# Run sentiment analysis with each model
for model_name, model in models.items():
    run_sentiment_analysis(X, y, model_name, model)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Logistic Regression Metrics:
Accuracy: 0.79
Precision: 0.00
Recall: 0.00
F1-Score: 0.00
Confusion Matrix:
[[79  0]
 [21  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        79
           1       0.00      0.00      0.00        21

    accuracy                           0.79       100
   macro avg       0.40      0.50      0.44       100
weighted avg       0.62      0.79      0.70       100


Naive Bayes Metrics:
Accuracy: 0.79
Precision: 0.00
Recall: 0.00
F1-Score: 0.00
Confusion Matrix:
[[79  0]
 [21  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        79
           1       0.00      0.00      0.00        21

    accuracy                           0.79       100
   macro avg       0.40      0.50      0.44       100
weighted avg       0.62      0.79      0.70       100


Support Vector Machine Metrics:
Accuracy: 0.94
Precisio

InvalidArgumentError: Exception encountered when calling layer 'embeddings' (type TFEmbeddings).

{{function_node __wrapped__ResourceGather_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[0,512] = 512 is not in [0, 512) [Op:ResourceGather] name: 

Call arguments received by layer 'embeddings' (type TFEmbeddings):
  • input_ids=tf.Tensor(shape=(1, 834), dtype=int32)
  • position_ids=None
  • inputs_embeds=None
  • training=False