In [None]:
from google.colab import files

# Open Colab's upload dialog. The result is used below as a mapping keyed by
# the names of the uploaded files.
uploaded = files.upload()

# Fail with an actionable message instead of a bare IndexError when nothing
# comes back from the dialog (e.g. it was cancelled).
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select the annotated CSV.")

# Only the first uploaded file is used; its name is later passed to
# pd.read_csv. (The variable name is kept as-is because later cells use it.)
annoted = list(uploaded.keys())[0]

In [None]:
!pip -q install transformers datasets fsspec==2024.10.0
import pandas as pd
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch

In [None]:
# Load the annotated CSV selected in the upload cell.
raw_df = pd.read_csv(annoted)

# Coerce the label column to numbers first: if the CSV contains any stray
# text in this column pandas reads it as strings, and a plain
# isin([0, 1, 2]) would then silently match nothing.
sentiment = pd.to_numeric(raw_df['Manual_Sentiment'], errors='coerce')

# Keep only rows with a valid class id (0, 1 or 2) AND a non-missing text,
# since a missing 'Body' would make the tokenizer fail later on.
keep_row = sentiment.isin([0, 1, 2]) & raw_df['Body'].notna()

# Build the cleaned frame in one chain (no assignment on a filtered slice,
# which is what triggers pandas' SettingWithCopyWarning).
df = (
    raw_df.loc[keep_row]
    .assign(
        Manual_Sentiment=sentiment[keep_row].astype(int),
        Body=lambda frame: frame['Body'].astype(str),
    )
    .reset_index(drop=True)
)

df.head()

In [None]:
# Tokenizer loaded from the same 'roberta-base' checkpoint name as the model.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def tokenize_function(examples):
    """Tokenize the 'Body' entries of a batch of examples.

    Args:
        examples: A batch (mapping of column name to values) that contains a
            'Body' column holding the texts to encode.

    Returns:
        The tokenizer output for the batch, with sequences padded to the
        maximum length and truncated when they are too long.
    """
    encode_options = {"padding": "max_length", "truncation": True}
    return tokenizer(examples['Body'], **encode_options)

# Fixed seed so the train/validation partition (and therefore the reported
# validation metrics) is the same on every run.
SPLIT_SEED = 42

# Convert the cleaned DataFrame into a datasets.Dataset and tokenize it in
# batches.
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Expose the target column under the name 'labels' and drop the raw text,
# which is no longer needed once it has been tokenized.
tokenized_dataset = tokenized_dataset.rename_column('Manual_Sentiment', 'labels')
tokenized_dataset = tokenized_dataset.remove_columns(['Body'])

# 80/20 split; the 'test' part is used as the validation set.
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

In [None]:
# Seed torch BEFORE the model is created: with num_labels=3 on the base
# checkpoint the classification head is newly initialised, and without a seed
# set at this point its starting weights differ from run to run.
SEED = 42
torch.manual_seed(SEED)

# RoBERTa encoder with a 3-way classification head (classes 0, 1, 2).
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # NOTE(review): with a small annotated set, 400 warmup steps may exceed
    # the total number of optimizer steps (rows / 16 * 3 epochs), in which
    # case the learning rate never reaches its peak -- confirm against the
    # dataset size.
    warmup_steps=400,
    weight_decay=0.01,
    logging_steps=10,
    # Evaluate and checkpoint once per epoch; both strategies must match for
    # load_best_model_at_end to work.
    eval_strategy="epoch",
    learning_rate=6e-5,
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Made explicit so training uses the same seed as the model init above.
    seed=SEED,
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)
trainer.train()

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(p):
    """Compute classification metrics from an evaluation prediction object.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the class is
            taken as the argmax over the last axis) and ``label_ids`` (the
            reference class ids).

    Returns:
        dict with 'accuracy' and the weighted-average 'f1', 'precision' and
        'recall'.
    """
    predicted_classes = p.predictions.argmax(-1)
    true_classes = p.label_ids

    # The fourth returned element (support) is not needed here.
    prf_scores = precision_recall_fscore_support(
        true_classes, predicted_classes, average='weighted'
    )
    weighted_precision, weighted_recall, weighted_f1 = prf_scores[:3]

    return dict(
        accuracy=accuracy_score(true_classes, predicted_classes),
        f1=weighted_f1,
        precision=weighted_precision,
        recall=weighted_recall,
    )

# Rebuild the Trainer around the already fine-tuned model, this time with a
# metrics callback, so that evaluate() reports accuracy/F1/precision/recall
# in addition to the loss.
evaluation_setup = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**evaluation_setup)
eval_results = trainer.evaluate()

# Print every reported metric on its own indented line.
print("Evaluation results:")
for metric_name in eval_results:
    metric_value = eval_results[metric_name]
    print(f"  {metric_name}: {metric_value}")