In [None]:
pip install accelerate -U



In [None]:
# 🔧 Install necessary packages
#!pip install -q transformers datasets accelerate

# Imports
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch

# Load the cleaned corpus and keep only rows that have both text and a label
df = pd.read_parquet('/content/drive/MyDrive/NLP News Bias Data/data_newsbias_cleaned.parquet')
df = df.dropna(subset=['text', 'bias_label']).copy()
df['bias_label'] = df['bias_label'].astype(str)

# Balance the dataset: draw the same number of rows from every class.
# DataFrameGroupBy.sample keeps the 'bias_label' column in the result and
# avoids the deprecation warning that groupby(...).apply(...) emits when the
# applied function operates on the grouping column.
# The per-class size is capped at the smallest class so that sampling without
# replacement cannot raise ValueError when a class has fewer rows than asked.
# NOTE: the exact rows drawn may differ from the earlier apply-based version.
SAMPLES_PER_CLASS = 50_000
smallest_class_size = int(df['bias_label'].value_counts().min())
n_per_class = min(SAMPLES_PER_CLASS, smallest_class_size)
if n_per_class < SAMPLES_PER_CLASS:
    print(f"Smallest class has only {smallest_class_size} rows; sampling {n_per_class} per class.")
df = (
    df.groupby('bias_label')
      .sample(n=n_per_class, random_state=42)
      .reset_index(drop=True)
)

# Label encoding: ids follow the alphabetical order of the label strings
label2id = {label: idx for idx, label in enumerate(sorted(df['bias_label'].unique()))}
id2label = {v: k for k, v in label2id.items()}
df['label'] = df['bias_label'].map(label2id)

# Stratified 80/20 split so both parts keep the class balance
train_df, test_df = train_test_split(df, stratify=df['label'], test_size=0.2, random_state=42)

# Load tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

def tokenize(example):
    """Tokenize the ``"text"`` field of ``example`` with the module-level tokenizer.

    Sequences are truncated to at most 256 tokens and no padding is applied
    at this stage.
    """
    encode_options = {"truncation": True, "padding": False, "max_length": 256}
    return tokenizer(example["text"], **encode_options)

# Convert to Hugging Face Datasets.
# preserve_index=False keeps the shuffled pandas index (left over from
# train_test_split) from being stored as an extra dataset column.
train_ds = Dataset.from_pandas(train_df[['text', 'label']], preserve_index=False)
test_ds = Dataset.from_pandas(test_df[['text', 'label']], preserve_index=False)

# Tokenize in batches; padding is deferred to the collator below
train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

# Pad dynamically per batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load model. The size of the classification head is derived from the label
# map so it cannot drift out of sync with id2label / label2id.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# Training arguments: evaluate and checkpoint once per epoch, then reload the
# checkpoint with the highest "accuracy" (the key returned by compute_metrics).
training_args = TrainingArguments(
    output_dir="./roberta-news-bias",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# 📏 Metrics
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` is unpacked as ``(logits, labels)``. The predicted class is
    the argmax of the logits along axis 1. Returns a dict with the single key
    ``"accuracy"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    accuracy = accuracy_score(gold, predicted)
    return {"accuracy": accuracy}

# Trainer
# NOTE(review): test_ds is used both for per-epoch evaluation / best-checkpoint
# selection and for the final report below, so the final numbers are not from
# data unseen during model selection. Consider carving out a separate
# validation split for eval_dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    # `tokenizer=` is deprecated on Trainer; `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save fine-tuned model and tokenizer to Google Drive
best_model_dir = "/content/drive/MyDrive/roberta-news-bias/checkpoint-best"
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

# Final evaluation: predict once, take the argmax once, reuse for both reports
preds = trainer.predict(test_ds)
pred_labels = np.argmax(preds.predictions, axis=1)
# Order the class names by id so they line up with the integer labels
target_names = [id2label[i] for i in range(len(id2label))]
print("\n Final Accuracy:", accuracy_score(preds.label_ids, pred_labels))
print("\n Classification Report:\n", classification_report(preds.label_ids, pred_labels, target_names=target_names))


  .apply(lambda g: g.sample(50_000, random_state=42))
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4109,0.43238,0.848967
2,0.3206,0.467838,0.861033
3,0.2654,0.528175,0.873967



🎯 Final Accuracy: 0.8739666666666667

📋 Classification Report:
               precision    recall  f1-score   support

      center       0.89      0.87      0.88     10000
        left       0.84      0.89      0.87     10000
       right       0.89      0.86      0.87     10000

    accuracy                           0.87     30000
   macro avg       0.87      0.87      0.87     30000
weighted avg       0.87      0.87      0.87     30000

