In [None]:
# `os` is not imported anywhere else in this file, so bring it into scope here;
# without it the assignment below fails with NameError.
import os

# Must be set before `transformers` is imported further down, because the
# library reads this switch to decide whether it may contact the Hub
# (see the Hugging Face "offline mode" documentation).
os.environ["TRANSFORMERS_OFFLINE"] = "1"

In [None]:
import torch
import pickle
import logging
import numpy as np
import pandas as pd
import evaluate
import nlpaug.augmenter.word as naw

from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from torch import nn

from datasets import load_dataset, Dataset, ClassLabel
from transformers import (
    RobertaTokenizer, RobertaModel, RobertaForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding,
    RobertaPreTrainedModel, AutoConfig
)
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score

# Module-wide logger; INFO level so progress messages from the pipeline show up.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Global hyperparameters and switches
class Settings:
    """Hyperparameters and feature switches read by the pipeline.

    All values are plain class attributes, so they can be read either from
    the class itself or from an instance (``Settings()``).
    """

    # --- model identity and output location ---
    MODEL_NAME = "roberta-base"   # checkpoint name passed to from_pretrained
    SAVE_DIR = "my_best_model"    # sub-directory under ./trained_models

    # --- feature switches ---
    AUGMENT = False          # True -> train split is extended via augment_train
    EARLY_STOP = True        # True -> an EarlyStoppingCallback is attached
    FNN_ENABLED = False      # True -> ExtendedRoberta replaces the stock classifier
    FREEZE_MODEL = True      # True -> lock_parameters is applied after LoRA wrapping
    WEIGHT_DECAY_ON = False  # True -> weight decay of 0.01, otherwise 0
    MC_DROPOUT = False       # NOTE(review): not read by any code visible in this file

    # --- tokenisation and batching ---
    MAX_LENGTH = 64
    TRAIN_BATCH = 32
    EVAL_BATCH = 64

    # --- optimisation schedule ---
    EPOCHS = 1
    LEARNING_RATE = 5e-6
    EARLY_STOP_PATIENCE = 5

    # NOTE(review): the two values below are not read by any code visible here.
    DROPOUT_ITER = 10
    TRAIN_LAST_LAYERS = 2

    # Keyword arguments forwarded verbatim to peft.LoraConfig.
    LORA_CFG = {
        "r": 3,
        "lora_alpha": 6,
        "lora_dropout": 0.05,
        "bias": "none",
        "target_modules": ["query", "value"],
        "task_type": "SEQ_CLS",
    }

# Custom Roberta with additional feed-forward layers
class ExtendedRoberta(RobertaPreTrainedModel):
    """RoBERTa encoder followed by a stacked feed-forward classification head.

    The head maps ``config.hidden_size -> 512 -> 512 -> 256 -> config.num_labels``
    with GELU and ``Dropout(0.2)`` after every hidden layer.
    """

    def __init__(self, config):
        super().__init__(config)
        self.roberta = RobertaModel(config)

        # Build the head layer by layer; the resulting nn.Sequential has the
        # same module order (Linear, GELU, Dropout, ..., Linear) as writing
        # the layers out by hand.
        widths = [config.hidden_size, 512, 512, 256]
        head_layers = []
        for in_dim, out_dim in zip(widths[:-1], widths[1:]):
            head_layers += [nn.Linear(in_dim, out_dim), nn.GELU(), nn.Dropout(0.2)]
        head_layers.append(nn.Linear(widths[-1], config.num_labels))
        self.fnn = nn.Sequential(*head_layers)

        self.num_labels = config.num_labels
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Return ``{"logits": ...}``, plus ``"loss"`` when ``labels`` is given.

        Extra keyword arguments are accepted but ignored. The loss is
        cross-entropy with label smoothing of 0.1.
        """
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 along the sequence axis is used as the sentence summary.
        first_token_state = encoder_out.last_hidden_state[:, 0]
        logits = self.fnn(first_token_state)

        if labels is None:
            return {"logits": logits}

        criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
        return {"loss": criterion(logits, labels), "logits": logits}

# Preprocessing helpers
def tokenize_set(tokenizer, data, length):
    """Tokenize the ``"text"`` column of ``data`` and drop that column.

    Args:
        tokenizer: Callable applied to a batch of texts.
        data: Object exposing ``map(fn, batched=..., remove_columns=...)``.
        length: Value used both as truncation limit and padding target.

    Returns:
        Whatever ``data.map`` returns for the batched tokenization.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": length,
    }

    def map_fn(batch):
        return tokenizer(batch["text"], **tokenizer_options)

    encoded = data.map(map_fn, batched=True, remove_columns=["text"])
    return encoded

def augment_train(data):
    """Double the training split with synonym-augmented copies of each text.

    Augmentation is best-effort: when the augmenter fails for an example (or
    returns nothing), the original example is kept unchanged. The number of
    such fallbacks is logged once at the end instead of being hidden.

    Args:
        data: Mapping with ``"train"`` and ``"test"`` datasets; each training
            example must have a ``"text"`` field.

    Returns:
        ``(combined_train, test)`` where ``combined_train`` is the original
        training rows followed by their augmented counterparts, and ``test``
        is ``data["test"]`` untouched.
    """
    augmenter = naw.SynonymAug(aug_src='wordnet')
    failed = 0

    def augment_fn(example):
        nonlocal failed
        try:
            aug = augmenter.augment(example["text"])
        except Exception:
            # Deliberately not a bare `except:` so Ctrl-C / SystemExit still
            # interrupt a long-running map instead of being swallowed.
            failed += 1
            return example
        if isinstance(aug, list):
            if not aug:
                # An empty result carries no text to use; keep the original.
                failed += 1
                return example
            aug = aug[0]
        return {"text": aug}

    aug_data = data["train"].map(augment_fn)
    if failed:
        logging.getLogger(__name__).warning(
            "Augmentation failed for %d example(s); original text was kept", failed
        )
    combined = Dataset.from_list(data["train"].to_list() + aug_data.to_list())
    return combined, data["test"]

# Model evaluation metric
def compute_accuracy(pred):
    """Compute classification accuracy for the Trainer's ``compute_metrics`` hook.

    The score is calculated directly with NumPy so that no metric script has
    to be loaded on every evaluation pass.

    Args:
        pred: Pair ``(predictions, labels)``; ``predictions`` holds per-class
            scores with classes on axis 1, ``labels`` holds the reference
            class ids.

    Returns:
        ``{"accuracy": float}`` — the fraction of rows whose arg-max class
        equals the reference label.
    """
    predictions, labels = pred
    predicted_ids = np.argmax(predictions, axis=1)
    matches = predicted_ids == np.asarray(labels)
    return {"accuracy": float(np.mean(matches))}

# Monte Carlo Dropout prediction function
def mc_dropout(model, dataset, collator, device, repeat, batch_size=64):
    """Predict classes by averaging logits over several stochastic passes.

    The model is switched to train mode so that dropout stays active, the
    dataset is scored ``repeat`` times without gradient tracking, and the
    logits are averaged across passes before taking the arg-max. The model's
    previous train/eval mode is restored before returning, even on error.

    Args:
        model: Module called as ``model(**batch)``; its output must expose
            logits either as a ``"logits"`` key (dict-like output) or as a
            ``.logits`` attribute.
        dataset: Dataset handed to ``DataLoader``.
        collator: ``collate_fn`` producing a dict of tensors per batch.
        device: Device the batch tensors are moved to.
        repeat: Number of passes over the dataset; must be at least 1.
        batch_size: Batch size for the loader (defaults to 64).

    Returns:
        1-D NumPy array of predicted class ids, one per dataset row.

    Raises:
        ValueError: If ``repeat`` is smaller than 1.
    """
    if repeat < 1:
        raise ValueError(f"repeat must be >= 1, got {repeat}")

    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collator)
    was_training = model.training
    model.train()  # keep dropout layers stochastic during the passes
    passes = []
    try:
        with torch.no_grad():
            for _ in range(repeat):
                chunks = []
                for batch in loader:
                    inputs = {k: v.to(device) for k, v in batch.items()}
                    out = model(**inputs)
                    # Dict-style outputs (e.g. ExtendedRoberta) have no
                    # `.logits` attribute, so check for the key first.
                    logits = out["logits"] if isinstance(out, dict) else out.logits
                    chunks.append(logits.cpu().numpy())
                passes.append(np.concatenate(chunks, axis=0))
    finally:
        # Do not leak train mode into whatever the caller does next.
        model.train(was_training)
    return np.argmax(np.mean(passes, axis=0), axis=1)

# Freeze all except LoRA and classifier
def lock_parameters(model):
    """Disable gradients for every parameter outside the LoRA/classifier set.

    A parameter is left untouched when its name contains ``"lora"`` or
    ``"classifier"``; all others get ``requires_grad = False``. Nothing is
    ever switched back on here.
    """
    log.info("Freezing base layers")
    keep_markers = ("lora", "classifier")
    for name, param in model.named_parameters():
        if any(marker in name for marker in keep_markers):
            continue
        param.requires_grad = False

# Model loading and prediction helpers
def fetch_model_and_tokenizer(path):
    """Load a saved tokenizer and 4-label classifier for inference.

    Args:
        path: Run name; files are read from
            ``./trained_models/<path>/final_model``.

    Returns:
        ``(tokenizer, model)`` with the model moved to ``DEVICE`` and put in
        eval mode.
    """
    log.info("Fetching model and tokenizer")
    checkpoint_dir = f"./trained_models/{path}/final_model"
    tokenizer = RobertaTokenizer.from_pretrained(checkpoint_dir)
    model = RobertaForSequenceClassification.from_pretrained(checkpoint_dir, num_labels=4)
    model = model.to(DEVICE)
    model.eval()
    return tokenizer, model


def retrieve_test_data(path):
    """Unpickle and return the object stored in the file at ``path``.

    NOTE(review): unpickling can execute arbitrary code; only use this on
    files from a trusted source.
    """
    with open(path, "rb") as handle:
        payload = pickle.load(handle)
    return payload

def encode_custom_data(data, tokenizer):
    """Tokenize the ``"text"`` column of ``data`` to a fixed length of 128.

    Args:
        data: Object exposing ``map(fn, batched=..., remove_columns=...)``.
        tokenizer: Callable applied to a batch of texts.

    Returns:
        Whatever ``data.map`` returns, with the ``"text"`` column removed.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 128,
        "padding": "max_length",
    }

    def enc_fn(batch):
        return tokenizer(batch["text"], **tokenizer_options)

    encoded = data.map(enc_fn, batched=True, remove_columns=["text"])
    return encoded

def infer(model, data_loader):
    """Run the model over ``data_loader`` and return the predicted class ids.

    Args:
        model: Module called as ``model(**batch)``; its output must expose
            logits either as a ``"logits"`` key (dict-like output) or as a
            ``.logits`` attribute.
        data_loader: Iterable of dict batches whose values are tensors.

    Returns:
        1-D CPU tensor with the arg-max class of every row, in loader order.
        An empty loader yields an empty ``torch.long`` tensor.
    """
    log.info("Starting inference loop")
    preds = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            output = model(**batch)
            # ExtendedRoberta returns a plain dict, which has no `.logits`
            # attribute; check for the key first.
            logits = output["logits"] if isinstance(output, dict) else output.logits
            preds.append(logits.argmax(dim=1).cpu())
    if not preds:
        # torch.cat refuses an empty list; report "no predictions" instead.
        return torch.empty(0, dtype=torch.long)
    return torch.cat(preds, dim=0)

def export_to_csv(predictions, path):
    """Write predictions to ``path`` as a CSV with ``ID`` and ``Label`` columns.

    ``ID`` is the running row index starting at 0; ``Label`` is
    ``predictions.numpy()``. No index column is written.
    """
    log.info("Writing results to disk")
    row_ids = list(range(len(predictions)))
    frame = pd.DataFrame({"ID": row_ids, "Label": predictions.numpy()})
    frame.to_csv(path, index=False)

# Main pipeline
def run_pipeline():
    """Fine-tune a LoRA-wrapped RoBERTa classifier on AG News and export predictions.

    Steps, all driven by ``Settings``:

    1. Load the tokenizer and the ``ag_news`` dataset (optionally augmenting
       the training split).
    2. Tokenize both splits and rename ``label`` to ``labels``.
    3. Build either ``ExtendedRoberta`` or ``RobertaForSequenceClassification``,
       wrap it with LoRA and optionally freeze the remaining parameters.
    4. Enforce the trainable-parameter budget (fewer than 1,000,000).
    5. Train with the Hugging Face ``Trainer``; the test split doubles as the
       evaluation set that drives best-model selection and early stopping.
    6. Predict on the test split and write ``predictions.csv``.

    Raises:
        ValueError: If the model has 1,000,000 or more trainable parameters.
    """
    cfg = Settings()
    tokenizer = RobertaTokenizer.from_pretrained(cfg.MODEL_NAME)
    tokenizer.model_max_length = cfg.MAX_LENGTH

    # Load the dataset once; augmentation only changes the training split.
    raw = load_dataset("ag_news")
    if cfg.AUGMENT:
        train_data, test_data = augment_train(raw)
    else:
        train_data, test_data = raw["train"], raw["test"]

    train_encoded = tokenize_set(tokenizer, train_data, cfg.MAX_LENGTH).rename_column("label", "labels")
    test_encoded = tokenize_set(tokenizer, test_data, cfg.MAX_LENGTH).rename_column("label", "labels")

    # Fall back to hard-coded class names when the label column does not
    # carry ClassLabel metadata.
    if isinstance(train_encoded.features["labels"], ClassLabel):
        labels = train_encoded.features["labels"].names
    else:
        labels = ["World", "Sports", "Business", "Sci/Tech"]

    num_labels = len(set(train_encoded["labels"]))
    label_map = {i: name for i, name in enumerate(labels)}
    reverse_map = {v: k for k, v in label_map.items()}

    if cfg.FNN_ENABLED:
        conf = AutoConfig.from_pretrained(cfg.MODEL_NAME, num_labels=num_labels)
        model = ExtendedRoberta.from_pretrained(cfg.MODEL_NAME, config=conf)
    else:
        model = RobertaForSequenceClassification.from_pretrained(
            cfg.MODEL_NAME, num_labels=num_labels, id2label=label_map, label2id=reverse_map
        )

    model = get_peft_model(model, LoraConfig(**cfg.LORA_CFG))
    if cfg.FREEZE_MODEL:
        lock_parameters(model)

    # Explicit check instead of `assert`, so the budget is still enforced
    # when Python runs with -O (which strips assert statements).
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    if trainable >= 1_000_000:
        raise ValueError(f"Too many trainable parameters: {trainable}")
    log.info("Trainable parameters: %s", trainable)

    args = TrainingArguments(
        output_dir=f"./trained_models/{cfg.SAVE_DIR}",
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=500,
        save_steps=4000,
        learning_rate=cfg.LEARNING_RATE,
        per_device_train_batch_size=cfg.TRAIN_BATCH,
        per_device_eval_batch_size=cfg.EVAL_BATCH,
        num_train_epochs=cfg.EPOCHS,
        weight_decay=0.01 if cfg.WEIGHT_DECAY_ON else 0.0,
        logging_dir="./logs",
        logging_steps=100,
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        report_to="wandb",
    )

    callbacks = []
    if cfg.EARLY_STOP:
        callbacks.append(EarlyStoppingCallback(early_stopping_patience=cfg.EARLY_STOP_PATIENCE))

    # One collator serves both training and the final prediction loader.
    collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_encoded,
        eval_dataset=test_encoded,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_accuracy,
        callbacks=callbacks,
    )

    trainer.train()

    # === Inference and Save Predictions ===
    test_loader = DataLoader(
        test_encoded.remove_columns("labels"),
        batch_size=cfg.EVAL_BATCH,
        collate_fn=collator,
    )
    model.eval()
    predictions = infer(model, test_loader)
    export_to_csv(predictions, "predictions.csv")



# Script entry point: delegates to run_pipeline().
def main():
    """Run the full pipeline by calling ``run_pipeline``."""
    run_pipeline()

# Start the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

2025-04-21 17:46:41.953406: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-21 17:46:43.490711: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745272004.015069    5483 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745272004.170681    5483 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745272005.452690    5483 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Step,Training Loss,Validation Loss,Accuracy
500,1.3764,1.371926,0.356184
1000,1.3331,1.32709,0.811579
1500,1.2138,1.193648,0.864868
2000,0.9034,0.869584,0.875526
2500,0.5675,0.523323,0.877895
3000,0.4528,0.423279,0.881447
3500,0.4335,0.398339,0.881053


INFO:__main__:Starting inference loop


Evaluating:   0%|          | 0/119 [00:00<?, ?it/s]

INFO:__main__:Writing results to disk
