In [1]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import (
    set_seed,
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
)

# Disable WANDB
# NOTE: recent transformers releases flag this variable as deprecated; it is
# kept here so this section still turns Weights & Biases off on its own.
os.environ["WANDB_DISABLED"] = "true"

# 1. Reproducibility
SEED = 42
# set_seed() seeds Python's `random`, NumPy and PyTorch (CPU and every CUDA
# device), so separate per-library seeding calls would be redundant.
set_seed(SEED)
# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# 2. Load data without dropping rows; fill missing, cast to str
train_df = pd.read_csv("/kaggle/input/5790finalproj-dataset/train.csv", encoding="latin1")
test_df  = pd.read_csv("/kaggle/input/5790finalproj-dataset/test.csv",  encoding="latin1")

for df in (train_df, test_df):
    for col in ["Question", "CorrectAnswer", "Response"]:
        # Missing text becomes an empty string so the row is kept.
        df[col] = df[col].fillna("").astype(str)

# 3. Map labels {-1→0, 0→1, 1→2}
label_map = {-1: 0, 0: 1, 1: 2}
for split_name, df in (("train", train_df), ("test", test_df)):
    mapped = df["label"].map(label_map)
    unmapped = mapped.isna()
    if unmapped.any():
        # Series.map() yields NaN for every value that is not a key of
        # label_map (this also happens when the mapping is applied twice).
        # Fail here with a clear message instead of passing NaN class ids on.
        bad_values = df.loc[unmapped, "label"].unique().tolist()
        raise ValueError(
            f"{split_name} split has {int(unmapped.sum())} row(s) whose label is "
            f"not one of {sorted(label_map)}: {bad_values}"
        )
    df["label"] = mapped.astype("int64")

# 4. Stratified train/validation split (80/20)
train_data, val_data = train_test_split(
    train_df, test_size=0.2, stratify=train_df["label"], random_state=SEED
)

# 5. Tokenizer, model, collator
MODEL_NAME = "roberta-base"
tokenizer  = RobertaTokenizerFast.from_pretrained(
    MODEL_NAME, add_prefix_space=True
)
model      = RobertaForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=3
)
# Pads each batch at collation time; the dataset items themselves are unpadded.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 6. Dataset using pair-encoding API
class ShortAnswerDataset(Dataset):
    """Map-style dataset that tokenizes one graded response per item.

    Each item calls the tokenizer with two text segments: the question joined
    to the reference answer by a single space, and the student response.

    Args:
        df: DataFrame with ``Question``, ``CorrectAnswer``, ``Response`` and
            ``label`` columns. Its index is discarded and renumbered.
        tokenizer: Callable accepting ``(text, text_pair, truncation=...,
            max_length=...)`` and returning a mapping that supports item
            assignment.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_len=256):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Renumber rows 0..n-1 so the integer passed to __getitem__ can be
        # used directly as an index label.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        record = self.df.loc[idx]
        prompt = " ".join((record["Question"], record["CorrectAnswer"]))
        encoded = self.tokenizer(
            prompt,
            record["Response"],
            truncation=True,
            max_length=self.max_len,
        )
        # Attach the class id under the "labels" key as a 0-dim long tensor.
        encoded["labels"] = torch.tensor(record["label"], dtype=torch.long)
        return encoded

# Wrap every split in the pair-encoding dataset (default max_len).
train_ds, val_ds, test_ds = (
    ShortAnswerDataset(frame, tokenizer)
    for frame in (train_data, val_data, test_df)
)

# 7. Weighted sampler for class imbalance
train_labels = train_data["label"]
# Number of training rows per class id, ordered by class id.
counts = train_labels.value_counts().sort_index()
# Inverse-frequency weight per class: rarer classes get larger weights.
class_weights = 1.0 / counts
# One weight per training row, looked up from that row's class.
sample_weights = train_labels.map(class_weights).to_numpy()
# Draw as many samples per epoch as there are training rows, with replacement.
train_sampler = WeightedRandomSampler(
    sample_weights,
    len(sample_weights),
    replacement=True,
)

# 8. Focal Loss
class FocalLoss(torch.nn.Module):
    """Focal loss for multi-class classification on raw logits.

    Per sample the loss is ``alpha_t * (1 - p_t) ** gamma * CE``, where ``CE``
    is the cross-entropy of the sample and ``p_t = exp(-CE)`` is the
    probability assigned to its true class. The per-sample values are averaged.

    Args:
        gamma: Focusing exponent; ``0`` reduces the loss to (alpha-scaled)
            cross-entropy.
        alpha: Either a scalar applied to every sample, or a 1-D sequence /
            tensor with one weight per class, in which case each sample is
            scaled by the weight of its target class.
    """

    def __init__(self, gamma=2.0, alpha=1.0):
        super().__init__()
        self.gamma = gamma
        alpha_tensor = torch.as_tensor(alpha, dtype=torch.float32)
        if alpha_tensor.ndim == 0:
            # Scalar weight: store it exactly as given.
            self.alpha = alpha
        else:
            # Per-class weights: a buffer follows the module across devices.
            self.register_buffer("alpha", alpha_tensor)

    def forward(self, logits, targets):
        """Return the mean focal loss.

        Args:
            logits: Unnormalised class scores, as accepted by
                ``F.cross_entropy``.
            targets: Integer class ids, as accepted by ``F.cross_entropy``.
        """
        ce = F.cross_entropy(logits, targets, reduction="none")
        # exp(-CE) recovers the softmax probability of the true class.
        pt = torch.exp(-ce)
        alpha = self.alpha
        if torch.is_tensor(alpha) and alpha.ndim > 0:
            # Pick each sample's weight from its target class.
            alpha = alpha.to(device=ce.device, dtype=ce.dtype)[targets]
        return (alpha * (1 - pt) ** self.gamma * ce).mean()

loss_fn = FocalLoss(gamma=2.0, alpha=1.0)

# 9. Metrics with per-class breakdown
def compute_metrics(pred):
    """Compute accuracy plus per-class precision, recall and F1.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores, one row per example).

    Returns:
        Dict with ``accuracy`` and, for each original label name ``-1``,
        ``0`` and ``1`` (class ids 0, 1 and 2 respectively), the keys
        ``precision_<name>``, ``recall_<name>`` and ``f1_<name>``.
    """
    y_true = pred.label_ids
    # The predicted class is the column with the highest score.
    y_hat = np.asarray(pred.predictions).argmax(axis=1)

    metrics = {"accuracy": accuracy_score(y_true, y_hat)}

    # Pin the class ids so every class gets an entry even when it is absent
    # from this evaluation batch; undefined ratios are reported as 0.
    precision, recall, f_score, _support = precision_recall_fscore_support(
        y_true, y_hat, labels=[0, 1, 2], zero_division=0
    )
    for class_id, name in enumerate(("-1", "0", "1")):
        metrics[f"precision_{name}"] = precision[class_id]
        metrics[f"recall_{name}"] = recall[class_id]
        metrics[f"f1_{name}"] = f_score[class_id]
    return metrics

# 10. Training arguments with warmup_steps & cosine schedule
training_args = TrainingArguments(
    output_dir             = "./results",
    eval_strategy          = "epoch",
    save_strategy          = "epoch",
    # Keep at most two checkpoints on disk instead of one per epoch.
    save_total_limit       = 2,
    load_best_model_at_end = True,
    metric_for_best_model  = "f1_1",          # track f1 for label=2 (“correct”)
    greater_is_better      = True,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size  = 32,
    learning_rate          = 2e-5,
    weight_decay           = 0.01,
    warmup_steps           = 500,
    lr_scheduler_type      = "cosine",
    num_train_epochs       = 10,
    # Mixed precision needs a CUDA device; fall back to fp32 when none exists.
    fp16                   = torch.cuda.is_available(),
    logging_dir            = "./logs",
    logging_strategy       = "epoch",
    # Turn off external experiment trackers explicitly rather than relying
    # on the deprecated WANDB_DISABLED environment variable.
    report_to              = "none",
)

# 11. Custom Trainer hooking sampler & focal loss
class CustomTrainer(Trainer):
    """Trainer that draws training batches from a custom sampler and
    optimises the module-level ``loss_fn`` instead of the model's own loss.

    Args:
        sampler: Optional sampler for the training DataLoader. When ``None``
            the stock Trainer dataloader is used.
        **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, sampler=None, **kwargs):
        super().__init__(**kwargs)
        self.train_sampler = sampler

    def get_train_dataloader(self):
        """Build the training DataLoader, using the custom sampler if given."""
        if self.train_sampler is None:
            # No custom sampler: defer to the stock implementation, which
            # shuffles. A bare DataLoader without a sampler would iterate the
            # dataset sequentially.
            return super().get_train_dataloader()
        # NOTE(review): this loader is not passed through the Trainer's
        # accelerator wrapping — fine for single-process runs; confirm before
        # using distributed training.
        return DataLoader(
            self.train_dataset,
            sampler=self.train_sampler,
            # train_batch_size is the per-device size scaled by the number of
            # GPUs, so each device really receives per_device_train_batch_size.
            batch_size=self.args.train_batch_size,
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and score its logits with ``loss_fn``.

        Args:
            model: Model to call with the batch.
            inputs: Batch mapping; must contain a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Extra arguments passed by the Trainer; accepted and
                ignored.

        Raises:
            KeyError: If ``inputs`` has no ``"labels"`` entry.
        """
        labels = inputs["labels"]
        # Withhold the labels so the model does not compute its built-in
        # loss, and do so on a copy so the caller's batch stays untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        loss = loss_fn(outputs.logits, labels.to(outputs.logits.device))
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model           = model,
    args            = training_args,
    train_dataset   = train_ds,
    eval_dataset    = val_ds,
    data_collator   = data_collator,
    compute_metrics = compute_metrics,
    sampler         = train_sampler,
    # Stop once the tracked metric has not improved for 2 evaluations in a row.
    callbacks       = [EarlyStoppingCallback(early_stopping_patience=2)],
)

# 12. Train
trainer.train()

# 13. Evaluate on test set
preds_out = trainer.predict(test_ds)
y_true = preds_out.label_ids
y_pred = np.argmax(preds_out.predictions, axis=1)

# Pin the class ids so the report always has three rows matching
# target_names and the confusion matrix is always 3x3, even if a class never
# occurs in y_true or y_pred.
class_ids = [0, 1, 2]

print("Test Classification Report:")
print(classification_report(
    y_true, y_pred,
    labels=class_ids,
    target_names=["Incorrect(-1)","Partial(0)","Correct(1)"],
    zero_division=0,
))
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred, labels=class_ids))

2025-05-10 22:45:25.261513: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746917125.626942      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746917125.741485      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,Precision -1,Recall -1,F1 -1,Precision 0,Recall 0,F1 0,Precision 1,Recall 1,F1 1
1,0.4944,0.475285,0.446667,0.460621,0.965,0.623586,0.178571,0.384615,0.243902,1.0,0.012658,0.025
2,0.3584,0.169319,0.844444,0.902857,0.79,0.842667,0.189189,0.538462,0.28,0.903361,0.907173,0.905263
3,0.1497,0.177719,0.875556,0.980892,0.77,0.862745,0.205882,0.538462,0.297872,0.899614,0.983122,0.939516
4,0.0853,0.203756,0.888889,0.873786,0.9,0.8867,0.190476,0.307692,0.235294,0.96861,0.911392,0.93913
5,0.0737,0.168039,0.915556,0.92,0.92,0.92,0.333333,0.307692,0.32,0.941176,0.945148,0.943158
6,0.0668,0.405351,0.928889,0.983516,0.895,0.937173,0.5,0.307692,0.380952,0.903846,0.991561,0.945674
7,0.0959,0.420019,0.937778,0.973684,0.925,0.948718,0.375,0.230769,0.285714,0.928571,0.987342,0.957055
8,0.0487,0.372202,0.942222,0.968912,0.935,0.951654,0.5,0.230769,0.315789,0.932271,0.987342,0.959016
9,0.0428,0.34571,0.942222,0.979058,0.935,0.956522,0.428571,0.230769,0.3,0.928571,0.987342,0.957055
10,0.0256,0.364463,0.935556,0.978723,0.92,0.948454,0.375,0.230769,0.285714,0.92126,0.987342,0.953157


Test Classification Report:
               precision    recall  f1-score   support

Incorrect(-1)       0.93      0.90      0.92     16614
   Partial(0)       0.12      0.20      0.15       320
   Correct(1)       0.90      0.91      0.91     13532

     accuracy                           0.90     30466
    macro avg       0.65      0.67      0.66     30466
 weighted avg       0.91      0.90      0.90     30466

Confusion Matrix:
[[15003   343  1268]
 [  124    65   131]
 [ 1051   120 12361]]
