In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
import torch
import random
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os
# Set random seeds for reproducibility
seed_value = 42
torch.manual_seed(seed_value)
torch.cuda.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value) 
np.random.seed(seed_value)
random.seed(seed_value)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False



# Load the sentiment dataset
df = pd.read_csv('Sentimental.csv',encoding='ISO-8859-1')

# Drop rows with missing text data
df.dropna(subset=['text'], inplace=True)

# Split data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Define models and their corresponding tokenizers
models = [
    ('BERT Base Uncased', 'bert-base-uncased'),
    ('RoBERTa Base', 'roberta-base'),
    ('DistilBERT Base Uncased', 'distilbert-base-uncased'),
    ('Twitter RoBERTa', 'cardiffnlp/twitter-roberta-base-sentiment'),

]

# Define a dictionary to store evaluation metrics and predictions for each model
evaluation_results = {}

# Iterate over models
for model_name, model_type in models:
    print(f"Evaluating {model_name}...")
    
    # Load tokenizer and model
    if 'roberta' in model_type:
        tokenizer = RobertaTokenizer.from_pretrained(model_type)
        model = RobertaForSequenceClassification.from_pretrained(model_type, num_labels=len(df['sentiment'].unique()))
    elif 'distilbert' in model_type:
        tokenizer = DistilBertTokenizer.from_pretrained(model_type)
        model = DistilBertForSequenceClassification.from_pretrained(model_type, num_labels=len(df['sentiment'].unique()))
    elif 'twitter-roberta' in model_type:  # Add Twitter-specific model condition
        tokenizer = AutoTokenizer.from_pretrained(model_type)
        model = AutoModelForSequenceClassification.from_pretrained(model_type)
    else:
        tokenizer = BertTokenizer.from_pretrained(model_type)
        model = BertForSequenceClassification.from_pretrained(model_type, num_labels=len(df['sentiment'].unique()))
    
    # Tokenize text and convert labels to indices
    test_encodings = tokenizer(test_df['text'].tolist(), truncation=True, padding=True)
    test_labels = torch.tensor(test_df['sentiment'].astype('category').cat.codes.tolist())
    
    # Create PyTorch dataset and data loader
    test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']), torch.tensor(test_encodings['attention_mask']), test_labels)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)
    
    # Evaluate the model on the test data
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=1)
            predictions.extend(predicted_labels.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='weighted')
    recall = recall_score(true_labels, predictions, average='weighted')
    f1 = f1_score(true_labels, predictions, average='weighted')
    
    # Store evaluation results including predictions and true labels
    evaluation_results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Predictions': predictions,
        'True Labels': true_labels
    }
    evaluation_results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Predictions': predictions,

    }
    
    evaluation_results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,

    }
# Print evaluation metrics in tabular form
print("Evaluation Metrics:")
evaluation_df = pd.DataFrame.from_dict(evaluation_results, orient='index')
print(evaluation_df)


Evaluating BERT Base Uncased...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating RoBERTa Base...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, msg_start, len(result))


Evaluating DistilBERT Base Uncased...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating Twitter RoBERTa...


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Evaluation Metrics:
                         Accuracy  Precision    Recall  F1 Score
BERT Base Uncased        0.393211   0.237368  0.393211  0.246992
RoBERTa Base             0.302687   0.091620  0.302687  0.140663
DistilBERT Base Uncased  0.295615   0.306951  0.295615  0.161411
Twitter RoBERTa          0.711457   0.715022  0.711457  0.704682
