In [2]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from transformers import Trainer, TrainingArguments, RobertaTokenizer, RobertaForSequenceClassification
from datasets import Dataset
from sklearn.metrics import accuracy_score

# Load the cleaned dataset from 'cleaned_dataset.csv'
df = pd.read_csv('cleaned_reviews.csv')

# Create HuggingFace Dataset from the pandas DataFrame
dataset = Dataset.from_pandas(df[['ProcessedText', 'Sentiment']])

# Label encoding: Convert 'positive' to 0 and 'negative' to 1
label_encoder = {'positive': 0, 'negative': 1}
dataset = dataset.map(lambda x: {'labels': label_encoder[x['Sentiment']]}, remove_columns=['Sentiment'])

# Split the dataset into train and validation
train_ds, eval_ds = dataset.train_test_split(test_size=0.2).values()

# Load a tokenizer and model (Roberta for sequence classification)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

# Tokenize datasets
def tokenize_function(examples):
    return tokenizer(examples['ProcessedText'], padding='max_length', truncation=True, max_length=512)

train_ds = train_ds.map(tokenize_function, batched=True)
eval_ds = eval_ds.map(tokenize_function, batched=True)

# TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',           
    evaluation_strategy="epoch",      
    learning_rate=2e-5,               
    per_device_train_batch_size=16,   
    per_device_eval_batch_size=16,    
    num_train_epochs=3,               
    weight_decay=0.01,                
    push_to_hub=False,                
    logging_dir='./logs',            
    logging_steps=10,                 
)

def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)
    return {
        'accuracy': accuracy_score(p.label_ids, preds),
        'f1': classification_report(p.label_ids, preds, output_dict=True)['macro avg']['f1-score']
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train model
trainer.train()

# Evaluate model 
trainer.evaluate()

predictions, labels, _ = trainer.predict(eval_ds)
predicted_labels = predictions.argmax(axis=-1)

# classification report and confusion matrix
print("Classification Report:\n", classification_report(labels, predicted_labels))
cm = confusion_matrix(labels, predicted_labels)
print("Confusion Matrix:\n", cm)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  trainer = Trainer(


  0%|          | 0/150 [00:00<?, ?it/s]

{'loss': 0.9471, 'grad_norm': 6.38163423538208, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.2}
{'loss': 0.7367, 'grad_norm': 8.004007339477539, 'learning_rate': 1.7333333333333336e-05, 'epoch': 0.4}
{'loss': 0.6834, 'grad_norm': 4.16328239440918, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.6}
{'loss': 0.6168, 'grad_norm': 6.591975212097168, 'learning_rate': 1.4666666666666666e-05, 'epoch': 0.8}
{'loss': 0.5141, 'grad_norm': 12.136236190795898, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.4423254430294037, 'eval_accuracy': 0.82, 'eval_f1': 0.81998199819982, 'eval_runtime': 105.2845, 'eval_samples_per_second': 1.9, 'eval_steps_per_second': 0.123, 'epoch': 1.0}
{'loss': 0.4265, 'grad_norm': 31.69100570678711, 'learning_rate': 1.2e-05, 'epoch': 1.2}
{'loss': 0.3714, 'grad_norm': 16.750341415405273, 'learning_rate': 1.0666666666666667e-05, 'epoch': 1.4}
{'loss': 0.3483, 'grad_norm': 15.036824226379395, 'learning_rate': 9.333333333333334e-06, 'epoch': 1.6}
{'loss': 0.4174, 'grad_norm': 38.17802429199219, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.8}
{'loss': 0.3066, 'grad_norm': 19.477554321289062, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.39956527948379517, 'eval_accuracy': 0.845, 'eval_f1': 0.8448098921178444, 'eval_runtime': 111.2702, 'eval_samples_per_second': 1.797, 'eval_steps_per_second': 0.117, 'epoch': 2.0}
{'loss': 0.337, 'grad_norm': 23.309993743896484, 'learning_rate': 5.333333333333334e-06, 'epoch': 2.2}
{'loss': 0.2702, 'grad_norm': 18.441246032714844, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.4}
{'loss': 0.2407, 'grad_norm': 1.7277811765670776, 'learning_rate': 2.666666666666667e-06, 'epoch': 2.6}
{'loss': 0.2597, 'grad_norm': 54.012001037597656, 'learning_rate': 1.3333333333333334e-06, 'epoch': 2.8}
{'loss': 0.1626, 'grad_norm': 17.924144744873047, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.466318815946579, 'eval_accuracy': 0.855, 'eval_f1': 0.8545600441335038, 'eval_runtime': 61.2469, 'eval_samples_per_second': 3.265, 'eval_steps_per_second': 0.212, 'epoch': 3.0}
{'train_runtime': 5042.9065, 'train_samples_per_second': 0.476, 'train_steps_per_second': 0.03, 'train_loss': 0.44256450653076174, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86       103
           1       0.87      0.82      0.85        97

    accuracy                           0.85       200
   macro avg       0.86      0.85      0.85       200
weighted avg       0.86      0.85      0.85       200

Confusion Matrix:
 [[91 12]
 [17 80]]


Model results file was too long so i have them compressed into a zip folder on my google drive. Here's the link to access them : https://drive.google.com/file/d/1yofFDytjDqGvvfDX3BQD3xJ3X9ZK6SDI/view?usp=sharing  