In [2]:
import os
import re
import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
os.environ["WANDB_DISABLED"] = "true"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

2025-05-01 01:30:24.382926: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746063024.602058      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746063024.665034      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# 2. Load data with encoding
train_df = pd.read_csv("/kaggle/input/5790finalproj-dataset/train.csv", encoding="latin1")
test_df  = pd.read_csv("/kaggle/input/5790finalproj-dataset/test.csv",  encoding="latin1")


# 3. Map labels 
label_map = {-1: 0, 0: 1, 1: 2}
train_df["label_id"] = train_df["label"].map(label_map)
test_df["label_id"]  = test_df["label"].map(label_map)


# 4. Text normalization
def normalize(text):
    text = str(text)
    text = re.sub(r"\s+", " ", text)           # collapse whitespace
    text = re.sub(r"[^A-Za-z0-9 ]", "", text)  # remove punctuation
    return text.strip().lower()


# 5. Prepare (Question,Reference,Response) triples and labels
triples = [
    (normalize(q), normalize(r), normalize(s))
    for q,r,s in zip(train_df["Question"], train_df["CorrectAnswer"], train_df["Response"])
]
labels  = train_df["label_id"].tolist()


# 6. Stratified 90/10 train/validation split
train_data, val_data, train_labels, val_labels = train_test_split(
    triples, labels,
    test_size=0.1,
    random_state=42,
    stratify=labels
)

In [4]:
# 7. Tokenizer & Dataset
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

class SADataset(torch.utils.data.Dataset):
    def __init__(self, data, labels, tokenizer, max_length=128):
        self.data = data
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        q, ref, rsp = self.data[idx]
        enc = self.tokenizer(
            q + " " + ref,    # segment A
            rsp,              # segment B
            truncation=True,
            padding="max_length",
            max_length=self.max_length
        )
        item = {k: torch.tensor(v) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_ds = SADataset(train_data, train_labels, tokenizer)
val_ds   = SADataset(val_data,   val_labels,   tokenizer)

# Prepare test set
q_test = [normalize(q) for q in test_df["Question"]]
r_test = [normalize(r) for r in test_df["CorrectAnswer"]]
s_test = [normalize(s) for s in test_df["Response"]]
y_test = test_df["label_id"].tolist()
test_ds = SADataset(list(zip(q_test, r_test, s_test)), y_test, tokenizer)


# 8. Compute class weights for focal loss
cw = compute_class_weight("balanced", classes=[0,1,2], y=train_labels)
alpha = torch.tensor(cw, dtype=torch.float, device=device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
# 9. Focal Loss implementation
class FocalLoss(nn.Module):
    def __init__(self, alpha, gamma=2.0, reduction="mean"):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, labels):
        ce = F.cross_entropy(logits, labels, weight=self.alpha, reduction="none")
        pt = torch.exp(-ce)
        loss = (1 - pt) ** self.gamma * ce
        return loss.mean() if self.reduction=="mean" else loss.sum()


# 10. Custom Trainer to plug in focal loss
class FocalTrainer(Trainer):
    def compute_loss(
        self,
        model,
        inputs,
        return_outputs: bool = False,
        num_items_in_batch: int = None       # <- add this parameter
    ):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.logits
        # compute focal loss (alpha/gamma defined elsewhere)
        loss = FocalLoss(alpha, gamma=2.0)(logits, labels.to(logits.device))
        # return both loss and outputs if asked
        return (loss, outputs) if return_outputs else loss


# 11. Metrics: accuracy + macro-F1
def compute_metrics(pred):
    preds  = pred.predictions.argmax(-1)
    labels = pred.label_ids
    return {
        "accuracy": accuracy_score(labels, preds),
        "macro_f1": f1_score(labels, preds, average="macro")
    }

In [6]:
# 12. Load and prepare BERT model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    id2label={0:"incorrect",1:"partial",2:"correct"},
    label2id={"incorrect":0,"partial":1,"correct":2}
).to(device)


# 13. TrainingArguments (transformers 4.51.1)
training_args = TrainingArguments(
    output_dir="./bert_sa",
    run_name="sa_grading",
    report_to="none",                  
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    logging_strategy="steps",
    logging_steps=200,
    dataloader_pin_memory=False,      
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True
)

trainer = FocalTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


# 14. Train & Validation Evaluation
trainer.train()
print(trainer.evaluate())

# 15. Final Test Performance
preds_out = trainer.predict(test_ds)
preds     = preds_out.predictions.argmax(-1)
print(classification_report(y_test, preds, target_names=["incorrect","partial","correct"]))

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,Macro F1
200,0.6007,0.51941,0.951111,0.643022
400,0.3824,0.492994,0.933333,0.632456
600,0.2877,0.46381,0.946667,0.772704
800,0.1502,0.348241,0.942222,0.731838
1000,0.094,0.458349,0.955556,0.768096
1200,0.0371,0.414171,0.946667,0.776282


{'eval_loss': 0.4141712486743927, 'eval_accuracy': 0.9466666666666667, 'eval_macro_f1': 0.7762820512820513, 'eval_runtime': 1.204, 'eval_samples_per_second': 186.876, 'eval_steps_per_second': 6.644, 'epoch': 10.0}
              precision    recall  f1-score   support

   incorrect       0.92      0.92      0.92     16614
     partial       0.08      0.35      0.13       320
     correct       0.94      0.86      0.90     13532

    accuracy                           0.89     30466
   macro avg       0.65      0.71      0.65     30466
weighted avg       0.92      0.89      0.90     30466

