In [None]:
# Install the Hugging Face `datasets` and `evaluate` packages. All pip output
# (stdout and stderr) is redirected to /dev/null, so a failed install is silent.
!pip install datasets evaluate > /dev/null 2>&1

In [None]:
# ── Standard Library ───────────────────────────────────────────────────────────
import os
import sys
import uuid
import json
import zipfile
import pickle
from collections import OrderedDict
import logging

# ── Third‑Party Libraries ─────────────────────────────────────────────────────
import numpy as np
import random
import pandas as pd
import string
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import evaluate
from evaluate import load

# ── PyTorch ────────────────────────────────────────────────────────────────────
import torch
from torch.utils.data import Dataset, DataLoader
from nltk.corpus import wordnet

# ── Hugging Face Transformers ─────────────────────────────────────────────────
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    logging as hf_logging
)

# ── Hugging Face Datasets ─────────────────────────────────────────────────────
from datasets import Dataset as HFDataset, load_dataset

# ── PEFT (Parameter‑Efficient Fine‑Tuning) ─────────────────────────────────────
from peft import (
    PeftModel,
    PeftConfig,
    LoraConfig,
    TaskType,
    get_peft_model,
)

# ── Safetensors ────────────────────────────────────────────────────────────────
from safetensors.torch import load_file

# ── Logging Configuration ─────────────────────────────────────────────────────
# Root logger emits INFO and above as "timestamp | LEVEL | message".
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")

# Logger shared by every cell below.
log = logging.getLogger(__name__)

# Uncomment the two lines below to reduce Transformers' own logging to errors only.
# os.environ["TRANSFORMERS_VERBOSITY"] = "error"
# hf_logging.set_verbosity_error()

In [None]:
# ── Kaggle credentials ────────────────────────────────────────────────────────
# Credentials are read from the KAGGLE_USERNAME / KAGGLE_KEY environment
# variables instead of being committed to the notebook.
# NOTE(review): an API key used to be hard-coded in this cell. Treat that key as
# leaked and revoke it from the Kaggle account settings.
import shutil

kaggle_config_dir = os.path.expanduser("~/.config/kaggle/")
kaggle_config_path = os.path.join(kaggle_config_dir, "kaggle.json")

kaggle_creds = {
    "username": os.environ.get("KAGGLE_USERNAME"),
    "key": os.environ.get("KAGGLE_KEY"),
}

if all(kaggle_creds.values()):
    # Ensure the Kaggle config directory exists
    os.makedirs(kaggle_config_dir, exist_ok=True)

    # Create the file with owner-only permissions from the start, so the key is
    # never readable by other users, not even briefly.
    fd = os.open(kaggle_config_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as f:
        json.dump(kaggle_creds, f)

    # The mode passed to os.open only applies to newly created files; tighten a
    # pre-existing file as well.
    os.chmod(kaggle_config_path, 0o600)
elif not os.path.exists(kaggle_config_path):
    log.warning(
        "KAGGLE_USERNAME / KAGGLE_KEY are not set and %s does not exist; "
        "Kaggle authentication will fail.",
        kaggle_config_path,
    )

# Remove the "data/" directory if it exists (no shell involved).
shutil.rmtree("data/", ignore_errors=True)

# --- Kaggle API ---
# Imported only after the credentials step above, as in the original cell order.
from kaggle.api.kaggle_api_extended import KaggleApi

In [None]:
def synonym_replacement(text, n=1):
    """Replace up to ``n`` distinct words of ``text`` with a WordNet synonym.

    Only words longer than three characters are candidates. For each candidate
    (visited in random order) the lemmas of its first WordNet synset are used
    as the synonym pool; every case-insensitive occurrence of the word is
    replaced with one randomly chosen lemma.

    Args:
        text: Input sentence; it is split on whitespace.
        n: Maximum number of distinct words to replace. ``n <= 0`` replaces
            nothing.

    Returns:
        The words re-joined with single spaces (whitespace is normalized even
        when nothing is replaced).
    """
    words = text.split()
    if n <= 0 or not words:
        # Without this guard the loop below would still replace one word for
        # n <= 0, because its stop condition is only checked after a replacement.
        return " ".join(words)

    new_words = list(words)
    # dict.fromkeys de-duplicates while keeping first-seen order, so a seeded
    # `random` yields the same shuffle in every process (set order does not).
    candidates = list(dict.fromkeys(word for word in words if len(word) > 3))
    random.shuffle(candidates)

    num_replaced = 0
    for candidate in candidates:
        synsets = wordnet.synsets(candidate)
        if not synsets:
            continue
        # Lemmas of the first synset only; multi-word lemmas use "_" as separator.
        syn_words = [
            lemma.name().replace('_', ' ')
            for lemma in synsets[0].lemmas()
            if lemma.name().lower() != candidate.lower()
        ]
        if not syn_words:
            continue
        synonym = random.choice(syn_words)
        target = candidate.lower()
        # Replace every occurrence, matching case-insensitively.
        new_words = [synonym if word.lower() == target else word for word in new_words]
        num_replaced += 1
        if num_replaced >= n:
            break
    return " ".join(new_words)

def word_dropout(text, dropout_prob=0.1):
    """Drop each whitespace-separated word independently with ``dropout_prob``.

    An input with no words yields ``""``. If every word happens to be dropped,
    one randomly chosen original word is returned so the result is never empty
    for a non-empty input.
    """
    tokens = text.split()
    if len(tokens) == 0:
        return ""

    kept = []
    for token in tokens:
        # A word survives when its draw exceeds the dropout probability.
        if random.random() > dropout_prob:
            kept.append(token)

    if kept:
        return " ".join(kept)
    # Everything was dropped: fall back to a single random word.
    return random.choice(tokens)

def random_swap(text, n=1):
    """Swap two randomly chosen, distinct word positions ``n`` times.

    Texts with fewer than two words are returned untouched (including their
    original whitespace); otherwise the words are re-joined with single spaces.
    """
    tokens = text.split()
    count = len(tokens)
    if count <= 1:
        # Nothing to swap with.
        return text

    positions = range(count)
    for _ in range(n):
        first, second = random.sample(positions, 2)
        tokens[first], tokens[second] = tokens[second], tokens[first]
    return " ".join(tokens)


def add_noise(text, noise_prob=0.05):
    """Corrupt each word of ``text`` with probability ``noise_prob``.

    A corrupted word receives exactly one of: character substitution, character
    insertion, character deletion, adjacent-character swap, insertion of a copy
    of one of its own characters, or random per-character case changes. Words
    that become empty (a deleted single character) are removed from the output.
    """
    noise_kinds = ('substitute', 'insert_char', 'delete_char', 'swap_char', 'repeat_char', 'capitalize')

    def corrupt(word):
        roll = random.random()
        if roll > noise_prob or not word:
            return word

        size = len(word)
        # Both indices are always drawn, whichever noise type is picked next.
        char_idx = random.randint(0, size - 1)   # position of an existing character
        insert_idx = random.randint(0, size)     # insertion point, may be past the end
        kind = random.choice(noise_kinds)

        if kind == 'substitute':
            return word[:char_idx] + random.choice(string.ascii_lowercase) + word[char_idx + 1:]
        if kind == 'insert_char':
            return word[:insert_idx] + random.choice(string.ascii_lowercase + string.punctuation) + word[insert_idx:]
        if kind == 'delete_char':
            return word[:char_idx] + word[char_idx + 1:]
        if kind == 'swap_char':
            if size < 2:
                # A single character has no neighbour to swap with.
                return word
            pos = random.randint(0, size - 2)
            chars = list(word)
            chars[pos], chars[pos + 1] = chars[pos + 1], chars[pos]
            return "".join(chars)
        if kind == 'repeat_char':
            # Copies the character at char_idx into position insert_idx.
            return word[:insert_idx] + word[char_idx] + word[insert_idx:]
        if kind == 'capitalize':
            return ''.join(c.upper() if random.random() < 0.5 else c.lower() for c in word)

        return word  # Fallback

    corrupted = [corrupt(token) for token in text.split()]
    # Skip words that were deleted down to the empty string.
    return " ".join(token for token in corrupted if token)


# Define your preferred default augmentation settings here
def rich_text_augment(
    text,
    synonym_prob=0.15,
    dropout_prob=0.05,
    swap_prob=0.05,
    noise_prob=0.0,  # character noise is off by default
):
    """Run ``text`` through a chain of independently gated augmentations.

    Each stage (synonym replacement, word dropout, random swap, character
    noise) fires with its own probability, so one text may receive several
    augmentations or none. ``dropout_prob`` and ``noise_prob`` act both as the
    gate for their stage and as the per-word probability inside it.
    """
    stages = (
        (synonym_prob, lambda t: synonym_replacement(t, n=1)),  # at most one word replaced
        (dropout_prob, lambda t: word_dropout(t, dropout_prob)),
        (swap_prob, lambda t: random_swap(t, n=1)),
        (noise_prob, lambda t: add_noise(t, noise_prob)),
    )
    for probability, transform in stages:
        if random.random() < probability:
            text = transform(text)
    return text



In [None]:
# --- Custom Dataset for the Competition Test File ---
class AGNewsTestDataset(Dataset):
    """Torch dataset over the texts stored in the competition's pickle file.

    The pickle may hold a Hugging Face ``Dataset`` (read via ``text_column``),
    a plain list of texts, or a dict with a list under ``'text'``, ``'data'``
    or ``'description'``. Items are tokenized lazily in ``__getitem__`` and
    carry their position as ``'index'``.

    NOTE: ``pickle.load`` can execute arbitrary code; only open files from a
    trusted source.
    """

    def __init__(self, pkl_file, tokenizer, max_length=512, text_column="text"):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_column = text_column
        self.texts = []

        try:
            with open(pkl_file, 'rb') as handle:
                payload = pickle.load(handle)

            self.texts = self._texts_from(payload)

            if not self.texts:
                raise ValueError(f"No text data extracted from {pkl_file}")

        except Exception as e:
            # Log every failure (I/O, unpickling, validation) and re-raise it.
            log.error("Failed to load test pickle '%s': %s", pkl_file, e)
            raise

    def _texts_from(self, payload):
        """Return the text collection from an unpickled payload, or raise."""
        if HFDataset and isinstance(payload, HFDataset):
            if self.text_column not in payload.column_names:
                raise ValueError(f"Missing column '{self.text_column}'; cols: {payload.column_names}")
            return payload[self.text_column]

        if isinstance(payload, list):
            return payload

        if isinstance(payload, dict):
            # First of the known keys that holds a list wins.
            for key in ('text', 'data', 'description'):
                if key in payload and isinstance(payload[key], list):
                    return payload[key]
            raise ValueError(f"No list field found; keys: {list(payload.keys())}")

        raise TypeError(f"Unsupported pickle type: {type(payload)}")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Unpadded encoding; padding is left to whatever collates the batch.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors=None,
        )
        item = {name: value for name, value in encoded.items() if name != 'token_type_ids'}
        item['index'] = idx
        return item


class AGNewsDataModule:
    """Prepare AG News splits and the competition test set, and expose DataLoaders.

    Responsibilities:
      * download/cache the ``ag_news`` dataset and the Kaggle competition files
        (``prepare_data``),
      * filter, optionally augment and tokenize the training split, tokenize
        the validation split, and load the unlabelled test pickle (``setup``),
      * build ``DataLoader`` objects that pad dynamically via
        ``DataCollatorWithPadding``.

    Args:
        model_name_or_path: Tokenizer to load with ``AutoTokenizer``.
        data_dir: Root directory for the HF cache and the competition files.
        competition_name: Kaggle competition slug; also names the sub-directory
            and the downloaded zip.
        batch_size: Batch size of the training loader.
        test_batch_size: Batch size of the validation and test loaders.
        num_workers: Worker count for the DataLoaders and, when positive,
            the process count for ``datasets`` ``filter``/``map``.
        max_seq_length: Truncation length for tokenization.
        val_split_percentage: If > 0, fraction of the train split held out for
            validation; otherwise the official ``ag_news`` test split is used.
        filter_max_words: Drop training texts with more words than this
            (falsy disables the filter).
        filter_max_nonalpha_ratio: Drop training texts whose share of
            non-alphanumeric, non-space characters exceeds this (``None``
            disables the filter).
        apply_augmentation: Run ``rich_text_augment`` on training texts before
            tokenizing them.
    """

    def __init__(
        self,
        model_name_or_path="roberta-base",
        data_dir="./data_agnews",
        competition_name="deep-learning-spring-2025-project-2",
        batch_size=32,
        test_batch_size=32,
        num_workers=2,
        max_seq_length=512,
        val_split_percentage=0.0,
        filter_max_words=256,
        filter_max_nonalpha_ratio=0.1,
        apply_augmentation=True,
    ):
        self.model_name_or_path = model_name_or_path
        self.data_dir = data_dir
        self.competition_name = competition_name
        self.batch_size = batch_size
        self.test_batch_size = test_batch_size
        self.num_workers = num_workers
        self.max_seq_length = max_seq_length
        self.val_split_percentage = val_split_percentage
        self.filter_max_words = filter_max_words
        self.filter_max_nonalpha_ratio = filter_max_nonalpha_ratio
        self.apply_augmentation = apply_augmentation

        # Derived paths: <data_dir>/<competition>/{<competition>.zip, test_unlabelled.pkl}
        self.competition_path = os.path.join(self.data_dir, self.competition_name)
        self.zip_path = os.path.join(self.competition_path, f"{self.competition_name}.zip")
        self.test_pkl = os.path.join(self.competition_path, "test_unlabelled.pkl")
        self.hf_cache_dir = os.path.join(self.data_dir, "hf_cache")

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        # Populated by setup().
        self.train_dataset = None
        self.val_dataset = None
        self.predict_dataset = None

    def _map_num_proc(self):
        """Return a ``num_proc`` value accepted by ``datasets`` ``filter``/``map``.

        ``num_workers=0`` is valid for a DataLoader but ``datasets`` requires a
        positive ``num_proc``; ``None`` selects its single-process path.
        """
        workers = self.num_workers
        return workers if workers and workers > 0 else None

    def _keep_example(self, ex):
        """Return True if ``ex["text"]`` passes the word-count and symbol-ratio filters."""
        if self.filter_max_words and len(ex["text"].split()) > self.filter_max_words:
            return False
        if self.filter_max_nonalpha_ratio is not None:
            nonalpha = sum(1 for c in ex["text"] if not c.isalnum() and not c.isspace())
            # max(1, ...) avoids dividing by zero for empty texts.
            if nonalpha / max(1, len(ex["text"])) > self.filter_max_nonalpha_ratio:
                return False
        return True

    def _tokenize_function_train(self, examples):
        """Tokenize a batch of training texts, augmenting them first when enabled."""
        texts = examples["text"]
        if self.apply_augmentation:
            # Runs inside the one-off map() in setup(), so each example is
            # augmented once, not re-sampled per epoch.
            texts = [rich_text_augment(t) for t in texts]
        return self.tokenizer(
            texts,
            truncation=True,
            padding=False,
            max_length=self.max_seq_length,
        )

    def _tokenize_function_eval(self, examples):
        """Tokenize a batch of evaluation texts without augmentation."""
        return self.tokenizer(
            examples["text"],
            truncation=True,
            padding=False,
            max_length=self.max_seq_length,
        )

    def prepare_data(self):
        """Download ``ag_news`` into the HF cache and fetch the competition files."""
        load_dataset("ag_news", cache_dir=self.hf_cache_dir, trust_remote_code=True)
        self.download_competition_data()

    def setup(self, stage=None):
        """Build the datasets for ``stage``.

        ``None`` builds everything; ``"fit"`` or ``"validate"`` build the train
        and validation sets; ``"test"`` loads the competition test set.
        """
        if stage in (None, "fit", "validate"):
            ds = load_dataset("ag_news", cache_dir=self.hf_cache_dir, trust_remote_code=True)
            train_raw, test_raw = ds['train'], ds['test']

            if self.val_split_percentage > 0:
                split = train_raw.train_test_split(test_size=self.val_split_percentage, seed=42)
                train_src, val_src = split['train'], split['test']
            else:
                # No hold-out requested: validate on the official test split.
                train_src, val_src = train_raw, test_raw

            num_proc = self._map_num_proc()

            # Filtering is applied to the training data only.
            train_f = train_src.filter(self._keep_example, num_proc=num_proc)
            self.train_dataset = train_f.map(
                self._tokenize_function_train, batched=True,
                remove_columns=["text"], num_proc=num_proc
            )
            self.train_dataset.set_format("torch")

            val_t = val_src.map(
                self._tokenize_function_eval, batched=True,
                remove_columns=["text"], num_proc=num_proc
            )
            self.val_dataset = val_t
            self.val_dataset.set_format("torch")

        if stage in (None, "test"):
            if self.predict_dataset is None:
                self.predict_dataset = AGNewsTestDataset(
                    self.test_pkl, self.tokenizer, self.max_seq_length
                )

    def get_train_loader(self):
        """Return a shuffled DataLoader over the training set (set up lazily)."""
        # `is None` rather than truthiness: an empty dataset must not re-run setup.
        if self.train_dataset is None:
            self.setup("fit")
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            collate_fn=self.data_collator,
            pin_memory=torch.cuda.is_available(),
        )

    def get_val_loader(self):
        """Return an unshuffled DataLoader over the validation set (set up lazily)."""
        if self.val_dataset is None:
            self.setup("validate")
        return DataLoader(
            self.val_dataset,
            batch_size=self.test_batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            collate_fn=self.data_collator,
            pin_memory=torch.cuda.is_available(),
        )

    def get_competition_test_loader(self):
        """Return an unshuffled DataLoader over the competition test set (set up lazily)."""
        if self.predict_dataset is None:
            self.setup("test")
        return DataLoader(
            self.predict_dataset,
            batch_size=self.test_batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            collate_fn=self.data_collator,
            pin_memory=torch.cuda.is_available(),
        )

    def download_competition_data(self):
        """Ensure the competition test pickle exists, downloading it via Kaggle if needed.

        Download problems are logged, not raised; the final existence check is
        what fails the call.

        Raises:
            FileNotFoundError: If the test pickle is still missing afterwards.
        """
        if not os.path.exists(self.test_pkl):
            os.makedirs(self.competition_path, exist_ok=True)
            try:
                from kaggle.api.kaggle_api_extended import KaggleApi
                api = KaggleApi()
                api.authenticate()
                api.competition_download_files(self.competition_name, path=self.competition_path)
                if os.path.exists(self.zip_path):
                    with zipfile.ZipFile(self.zip_path, 'r') as z:
                        z.extractall(self.competition_path)
                    os.remove(self.zip_path)
                else:
                    log.warning("Zip not found after download: %s", self.zip_path)
            except ImportError:
                log.warning("Kaggle API missing; download manually to %s", self.test_pkl)
            except Exception as e:
                log.error("Kaggle download/extract failed: %s", e)

        if not os.path.exists(self.test_pkl):
            raise FileNotFoundError(f"Missing test file: {self.test_pkl}")


In [None]:
# ── Run configuration ─────────────────────────────────────────────────────────
MODEL_ID = "roberta-base"
COMPETITION_ID = "deep-learning-spring-2025-project-2"  # double-check this ID against the Kaggle competition
DATA_DIR = "./agnews_data"
BATCH_SIZE = 32
TEST_BATCH_SIZE = 32

# Build the data module. Both training-set filters are passed as None, which
# disables them; augmentation of the training texts stays enabled.
data_module = AGNewsDataModule(
    model_name_or_path=MODEL_ID,
    data_dir=DATA_DIR,
    competition_name=COMPETITION_ID,
    batch_size=BATCH_SIZE,
    test_batch_size=TEST_BATCH_SIZE,
    num_workers=2,
    max_seq_length=512,
    val_split_percentage=0.0,
    filter_max_words=None,           # no word-count filter
    filter_max_nonalpha_ratio=None,  # no non-alphanumeric-ratio filter
    apply_augmentation=True,
)

# Download the HF dataset and the competition files if needed, then build the
# train, validation and competition-test datasets.
data_module.prepare_data()
data_module.setup()


In [None]:
# Build a DataFrame preview of the competition test set: decoded text plus the
# original row index of each sample.
# test_df is initialised up front so it is always defined: it stays None when
# the preview cannot be built, which later cells can check for.
test_df = None
try:
    predict_ds = data_module.predict_dataset
    if predict_ds:
        test_data_for_df = []
        # Cap the preview at 8000 rows; samples beyond the cap are not included.
        num_samples_to_show = min(8000, len(predict_ds))
        for i in range(num_samples_to_show):
            sample = predict_ds[i]  # dict produced by the dataset's __getitem__
            text = data_module.tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
            # Fall back to None if the sample carries no 'index' entry.
            original_index = sample.get('index', None)
            test_data_for_df.append({'Decoded Text': text, 'Original Index': original_index})

        test_df = pd.DataFrame(test_data_for_df)
    else:
        print("Competition test dataset not loaded or empty.")
except Exception as e:
    print(f"Error displaying competition test samples: {e}")

# Show the first rows when a preview exists (evaluates to None otherwise).
test_df.head() if test_df is not None else None

In [None]:
# ── Model / tokenizer ──────────────────────────────────────────────────────────
def load_model_and_tokenizer(base_model_name, adapter_path, num_labels, device):
    """Load the base classifier, wrap it with the PEFT adapter and return it in eval mode.

    Args:
        base_model_name: Model id/path for the tokenizer and base model.
        adapter_path: Directory holding the adapter config and an
            ``adapter_model.safetensors`` or ``adapter_model.bin`` file.
        num_labels: Number of output classes for the classification head.
        device: Device the model is moved to.

    Returns:
        ``(model, tokenizer)``.
    """
    log.info("Loading tokenizer and base model")
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    base_model = AutoModelForSequenceClassification.from_pretrained(
        base_model_name, num_labels=num_labels, return_dict=True
    )

    log.info("Building PEFT wrapper")
    model = PeftModel.from_pretrained(base_model, adapter_path, is_trainable=False)

    # Manual weight load: prefer the .safetensors file, else fall back to .bin.
    # NOTE(review): PeftModel.from_pretrained presumably already loads these
    # weights; with strict=False a key mismatch only produces the warning below.
    safetensors_path = os.path.join(adapter_path, "adapter_model.safetensors")
    if os.path.exists(safetensors_path):
        weights_path = safetensors_path
    else:
        weights_path = os.path.join(adapter_path, "adapter_model.bin")

    if weights_path.endswith(".safetensors"):
        state = load_file(weights_path)
    else:
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        state = torch.load(weights_path, map_location="cpu")

    missing, unexpected = model.load_state_dict(state, strict=False)
    if missing or unexpected:
        log.warning(f"Missing: {len(missing)}, unexpected: {len(unexpected)} keys")

    model.to(device).eval()
    return model, tokenizer

# ── Device helper ──────────────────────────────────────────────────────────────
def get_device():
    """Return the CUDA device when available, otherwise the CPU, logging the choice."""
    if not torch.cuda.is_available():
        log.info("CPU selected")
        return torch.device("cpu")

    log.info(f"CUDA: {torch.cuda.get_device_name(0)}")
    return torch.device("cuda")

# ── Inference ------------------------------------------------------------------
def run_inference(model, tokenizer, loader, device):
    """Predict a class id for every sample yielded by ``loader``.

    Only batch entries named in ``tokenizer.model_input_names`` are passed to
    the model; the batch's ``"index"`` entry is collected alongside the argmax
    of the logits.

    Returns:
        ``(indices, predictions)`` as two parallel Python lists.
    """
    predictions = []
    indices = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(loader, desc="Predict"):
            model_inputs = {}
            for name, tensor in batch.items():
                if name in tokenizer.model_input_names:
                    model_inputs[name] = tensor.to(device)
            output = model(**model_inputs)
            predictions.extend(output.logits.argmax(-1).cpu().tolist())
            indices.extend(batch["index"].tolist())
    return indices, predictions

# ── CSV helpers ────────────────────────────────────────────────────────────────
def save_predictions(idxs, preds, path):
    """Write the ``ID``/``Label`` submission CSV, sorted by ``ID``, and return the frame."""
    submission = pd.DataFrame({"ID": idxs, "Label": preds})
    submission = submission.sort_values("ID")
    submission.to_csv(path, index=False)
    log.info(f"Saved submission → {path}  ({len(submission)} rows)")
    return submission

def merge_with_test_data(test_df, sub_df):
    """Join decoded test texts with their predictions and name the labels.

    Rows are matched on ``test_df["Original Index"] == sub_df["ID"]`` (inner
    join); both key columns are dropped and ``Label`` becomes a human-readable
    ``Predicted Label``.
    """
    label_names = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

    merged = test_df.merge(sub_df, left_on="Original Index", right_on="ID")
    merged = merged.drop(columns=["Original Index", "ID"])
    merged = merged.rename(columns={"Label": "Predicted Label"})
    merged["Predicted Label"] = merged["Predicted Label"].map(label_names)
    return merged

# ── Pipeline ───────────────────────────────────────────────────────────────────
def inference_pipeline(output_dir, adapter_path, num_labels, eval_metrics, test_df, data_module):
    """Run the adapter on the competition test set and write the result CSVs.

    Args:
        output_dir: Directory for the CSV files; created if it does not exist.
        adapter_path: Directory of the trained PEFT adapter.
        num_labels: Number of classes of the classification head.
        eval_metrics: Mapping whose ``"eval_accuracy"`` (default 0.0) is
            embedded in the output file names.
        test_df: Optional frame with ``Decoded Text`` / ``Original Index``
            columns; when given, a readable text + predicted-label CSV is
            written as well.
        data_module: Object providing ``get_competition_test_loader()``.

    Returns:
        The merged view DataFrame, or ``None`` when ``test_df`` is ``None``.
    """
    device = get_device()
    acc = eval_metrics.get("eval_accuracy", 0.0)

    # Make sure the target directory exists before anything is written to it;
    # an empty string means "current directory" and needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    sub_path = os.path.join(output_dir, f"submission_acc_new_{acc:.4f}.csv")
    view_path = os.path.join(output_dir, f"view_df_acc_new_{acc:.4f}.csv")

    model, tok = load_model_and_tokenizer("roberta-base", adapter_path, num_labels, device)
    loader = data_module.get_competition_test_loader()

    idxs, preds = run_inference(model, tok, loader, device)
    sub_df = save_predictions(idxs, preds, sub_path)

    if test_df is None:
        return None

    view_df = merge_with_test_data(test_df, sub_df)
    view_df.to_csv(view_path, index=False)
    log.info(f"Saved view_df → {view_path}")
    return view_df

# Reproduce Final Kaggle Submission Results Using Checkpoints

In [None]:
# Where the result CSVs go, and where the trained adapter checkpoint lives.
output_dir = "./results/713ce1e9/results"
adapter_dir = "./results/713ce1e9/trained_adapters"

# No evaluation metrics are passed, so the accuracy in the file names is 0.0000.
view_df = inference_pipeline(
    output_dir=output_dir,
    adapter_path=adapter_dir,
    num_labels=4,
    eval_metrics={},
    test_df=test_df,
    data_module=data_module,
)
