In [7]:
!pip install tabpfn

Collecting tabpfn
  Downloading tabpfn-2.1.3-py3-none-any.whl.metadata (27 kB)
Collecting pydantic-settings>=2.10.1 (from tabpfn)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting eval-type-backport>=0.2.2 (from tabpfn)
  Downloading eval_type_backport-0.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings>=2.10.1->tabpfn)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.1->tabpfn)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.1->tabpfn)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.1->tabpfn)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting n

In [2]:
import os
import json
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaConfig, RobertaTokenizer, RobertaModel, get_linear_schedule_with_warmup
import torch.nn as nn
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, matthews_corrcoef, confusion_matrix
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.utils.class_weight import compute_class_weight
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from tabpfn import TabPFNClassifier
import random
import logging
from typing import Dict, List, Tuple, Any
from dataclasses import dataclass

# =============================================================================
# STEP 1: GLOBAL CONFIGURATION AND SETUP
# =============================================================================
# Root logging setup: timestamped, INFO-level messages for the whole notebook.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- Global Configuration ---
BASE_URL = "/kaggle/input/dataset"    # directory holding df_org.csv / df_syn.csv
OUTPUT_DIR = "/kaggle/working/"       # where model checkpoints are written
SYNTHETIC_DATA_SIZE = 0               # number of synthetic rows to add; 0 disables them
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
DEVICE = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda")
print(f"Using device: {DEVICE}")

# --- SEEDS FOR 5 RUNS ---
SEEDS = [42, 83, 456, 789, 101]

def set_seed(seed_value):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    Args:
        seed_value: Integer seed applied to every random number generator.
            When CUDA is available, all CUDA devices are seeded as well.
    """
    # Same seeding order as before: random -> numpy -> torch (CPU).
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)
    logger.info(f"Global seed set to {seed_value}")

# =============================================================================
# STEP 2: DATA PREPARATION CLASSES AND FUNCTIONS
# =============================================================================
class DataPreprocessor:
    """Loads the original/synthetic CSVs and builds seed-dependent splits.

    Args:
        base_url: Directory containing ``df_org.csv`` and ``df_syn.csv``.
        synthetic_data_size: Maximum number of class-0 synthetic rows added to
            the training set; a value <= 0 disables synthetic data.
        random_seed: Seed passed to every sampling / splitting call.
        class0_holdout_size: Number of class-0 original rows placed in the
            test set and, separately, in the validation set. Defaults to 41,
            the value that used to be hard-coded.
    """

    def __init__(self, base_url, synthetic_data_size, random_seed, class0_holdout_size=41):
        self.base_url = base_url
        self.synthetic_data_size = synthetic_data_size
        self.random_seed = random_seed
        self.class0_holdout_size = class0_holdout_size

    @staticmethod
    def _concat_nonempty(frames):
        """Row-concatenate ``frames``, skipping empty ones but keeping their columns.

        Passing empty frames to ``pd.concat`` raises a FutureWarning and makes
        the result dtypes depend on the pandas version. Empty frames are
        therefore dropped before concatenating, and the union of all columns
        (in first-seen order) is restored afterwards so the output schema does
        not change.
        """
        columns = []
        for frame in frames:
            for col in frame.columns:
                if col not in columns:
                    columns.append(col)

        non_empty = [frame for frame in frames if not frame.empty]
        if not non_empty:
            return pd.DataFrame(columns=columns)
        return pd.concat(non_empty, ignore_index=True).reindex(columns=columns)

    def prepare_datasets(self):
        """Loads and splits data into train, validation, and test sets.

        Returns:
            Tuple ``(train_df, valid_df, test_df)``. Validation and test only
            contain original rows; synthetic rows and the original rows they
            were derived from go to the training set only.

        Raises:
            ValueError: Propagated from ``DataFrame.sample`` when there are
                fewer class-0 rows than the two holdouts require.
        """
        logger.info(f"Preparing datasets with seed: {self.random_seed}")
        df_org = pd.read_csv(f"{self.base_url}/df_org.csv")
        df_syn = pd.read_csv(f"{self.base_url}/df_syn.csv")

        # Only class-0 synthetic rows are ever used.
        df_syn_class0 = df_syn[df_syn['orig_label'] == 0].reset_index(drop=True)
        if self.synthetic_data_size > 0:
            df_syn_class0 = df_syn_class0.sample(
                n=min(self.synthetic_data_size, len(df_syn_class0)),
                random_state=self.random_seed
            )
        else:
            # Synthetic data disabled: keep the schema but no rows.
            df_syn_class0 = df_syn_class0.iloc[0:0]

        # Originals that spawned a selected synthetic row must stay out of
        # validation/test, so they are routed straight into training.
        matched_codes = set(df_syn_class0["orig_code"].unique()) if not df_syn_class0.empty else set()
        df_org_match = df_org[df_org["code"].isin(matched_codes)].copy()
        df_org_nonmatch = df_org[~df_org["code"].isin(matched_codes)].copy()

        df_org_match["source"] = "original"
        df_org_nonmatch["source"] = "original"

        df_syn_renamed = df_syn_class0.rename(columns={"synthetic_code": "code", "orig_label": "label"})
        df_syn_renamed["source"] = "synthetic"

        class0_df = df_org_nonmatch[df_org_nonmatch["label"] == 0].copy()
        nonclass0_df = df_org_nonmatch[df_org_nonmatch["label"] != 0].copy()

        # Class 0 gets fixed-size holdouts instead of a proportional split.
        holdout = self.class0_holdout_size
        class0_test = class0_df.sample(n=holdout, random_state=self.random_seed)
        class0_val = class0_df.drop(class0_test.index).sample(n=holdout, random_state=self.random_seed)
        class0_train = class0_df.drop(class0_test.index).drop(class0_val.index)

        # Remaining classes: 15% test, then 17.65% of the remaining 85%
        # (0.1765 * 0.85 ~= 0.15 of the total) for validation.
        nonclass0_trainval, nonclass0_test = train_test_split(
            nonclass0_df, test_size=0.15, random_state=self.random_seed, stratify=nonclass0_df["label"]
        )
        nonclass0_train, nonclass0_val = train_test_split(
            nonclass0_trainval, test_size=0.1765, random_state=self.random_seed, stratify=nonclass0_trainval["label"]
        )

        train_nonmatch = self._concat_nonempty([class0_train, nonclass0_train])
        valid_df = self._concat_nonempty([class0_val, nonclass0_val])
        test_df = self._concat_nonempty([class0_test, nonclass0_test])

        train_df = self._concat_nonempty([train_nonmatch, df_org_match, df_syn_renamed])

        logger.info(f"Data prepared: Train size={len(train_df)}, Val size={len(valid_df)}, Test size={len(test_df)}")
        return train_df, valid_df, test_df

class BugSeverityDataset(Dataset):
    """Custom PyTorch Dataset for loading code, numerical features, and labels.

    Args:
        data: Sequence of dict records; each needs ``"code"`` and ``"label"``
            and may carry the metric columns listed in ``NUMERIC_FEATURES``.
        tokenizer: Tokenizer exposing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``eos_token`` and ``pad_token_id``.
        block_size: Fixed length of the returned token-id sequence.
    """

    # Order fixes the layout of the 10-element numerical feature vector.
    NUMERIC_FEATURES = (
        "sloc_robust", "proxy_indentation_robust",
        "mcCabe_robust", "mcClure_robust",
        "nested_block_depth_robust", "difficulty_robust",
        "maintainability_index_robust", "fan_out_robust",
        "readability_robust", "effort_robust",
    )

    def __init__(self, data, tokenizer, block_size=512):
        self.examples = data
        self.tokenizer = tokenizer
        self.block_size = block_size

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        """Return ``(input_ids, num_features, label)`` tensors for record ``idx``."""
        js = self.examples[idx]

        # Reserve two positions for the CLS / EOS markers, then pad to block_size.
        code_tokens = self.tokenizer.tokenize(str(js["code"]))[:self.block_size - 2]
        tokens = [self.tokenizer.cls_token] + code_tokens + [self.tokenizer.eos_token]
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        padding_length = self.block_size - len(input_ids)
        input_ids += [self.tokenizer.pad_token_id] * padding_length

        num_features = torch.tensor(
            [js.get(name, 0.0) for name in self.NUMERIC_FEATURES], dtype=torch.float
        )
        # Records built from a DataFrame carry NaN (not a missing key) for
        # absent values, which the .get() default cannot catch; a single NaN
        # would propagate through the model. Treat NaN like a missing feature.
        num_features = torch.where(
            torch.isnan(num_features), torch.zeros_like(num_features), num_features
        )

        return (
            torch.tensor(input_ids),
            num_features,
            torch.tensor(js["label"], dtype=torch.long)
        )

def convert_df_to_json_format(df):
    """Turn a DataFrame into a list with one ``{column: value}`` dict per row.

    This is the record layout that ``BugSeverityDataset`` indexes into.
    """
    records = df.to_dict(orient="records")
    return records

# =============================================================================
# STEP 3: PYTORCH MODEL, TRAINING, AND EVALUATION
# =============================================================================
class ConcatClsModel(nn.Module):
    """Encoder plus a linear head over [first-token embedding ; 10 numeric features].

    Args:
        encoder: Module whose output exposes ``last_hidden_state`` and whose
            ``config.pad_token_id`` identifies padding positions.
        config: Object providing ``hidden_dropout_prob``, ``hidden_size`` and
            ``num_labels``.
    """

    def __init__(self, encoder, config):
        super().__init__()
        self.encoder = encoder
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # +10: width of the numerical feature vector concatenated in forward().
        self.out_layer = nn.Linear(config.hidden_size + 10, config.num_labels)

    def forward(self, input_ids, num_features, labels=None):
        """Return class probabilities, preceded by the CE loss when ``labels`` is given."""
        pad_id = self.encoder.config.pad_token_id
        # Attend to everything that is not padding.
        mask = (input_ids != pad_id).long()
        hidden = self.encoder(input_ids=input_ids, attention_mask=mask).last_hidden_state
        fused = torch.cat((hidden[:, 0, :], num_features), dim=-1)
        logits = self.out_layer(self.dropout(fused))
        probs = logits.softmax(dim=-1)

        if labels is None:
            return probs
        # The loss is computed on raw logits, not on the softmax output.
        return nn.CrossEntropyLoss()(logits, labels), probs

def run_pytorch_training(train_loader, valid_loader, seed, output_dir):
    """Fine-tunes the CodeBERT-based classifier for one seed and keeps the best epoch.

    Args:
        train_loader: DataLoader yielding ``(input_ids, num_features, labels)`` batches.
        valid_loader: DataLoader with the same batch layout, used to pick the
            best epoch by macro-F1.
        seed: Seed of the current run; used here for logging and for the
            checkpoint file name.
        output_dir: Directory that receives ``best_model_seed_{seed}.pt``
            (created if it does not exist).

    Returns:
        Path to the saved ``state_dict`` of the epoch with the highest
        validation macro-F1. A checkpoint is always written by this call.
    """
    logger.info(f"Starting PyTorch training for seed {seed}.")

    # Fixed hyperparameters carried over from the original script.
    best_params = {
        'lr': 4.818976027099782e-05,
        'weight_decay': 0.00010045918919119982,
        'warmup_ratio': 0.28606124459699783,
        'dropout': 0.16316509043013103,
        'epochs': 8,
    }

    config = RobertaConfig.from_pretrained("microsoft/codebert-base", num_labels=4, hidden_dropout_prob=best_params['dropout'])
    encoder = RobertaModel.from_pretrained("microsoft/codebert-base", config=config, add_pooling_layer=False)
    model = ConcatClsModel(encoder, config).to(DEVICE)

    optimizer = AdamW(model.parameters(), lr=best_params['lr'], weight_decay=best_params['weight_decay'])
    # The scheduler is stepped once per batch, so its horizon is batches * epochs.
    total_steps = len(train_loader) * best_params['epochs']
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(total_steps * best_params['warmup_ratio']), num_training_steps=total_steps
    )

    # Start below any reachable F1 so the first epoch always writes a
    # checkpoint. With the previous start value of 0 and a strict ">", a run
    # whose validation F1 stayed at 0 returned a path that was never written
    # (or that still held a checkpoint from an earlier session).
    best_f1 = float("-inf")
    os.makedirs(output_dir, exist_ok=True)
    best_model_path = os.path.join(output_dir, f"best_model_seed_{seed}.pt")

    for epoch in range(best_params['epochs']):
        model.train()
        for batch in train_loader:
            input_ids, num_features, labels = [b.to(DEVICE) for b in batch]
            optimizer.zero_grad()
            loss, _ = model(input_ids, num_features, labels)
            loss.backward()
            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        model.eval()
        preds, labels_all = [], []
        with torch.no_grad():
            for batch in valid_loader:
                input_ids, num_features, labels = [b.to(DEVICE) for b in batch]
                _, probs = model(input_ids, num_features, labels)
                preds.extend(torch.argmax(probs, dim=1).cpu().tolist())
                labels_all.extend(labels.cpu().tolist())
        val_f1 = f1_score(labels_all, preds, average="macro")

        logger.info(f"Seed {seed} | Epoch {epoch+1}/{best_params['epochs']} | Val F1: {val_f1:.4f}")
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), best_model_path)

    return best_model_path
    
# =============================================================================
# STEP 4: MAIN EXECUTION LOOP (one fine-tuning run per seed)
# =============================================================================
# Placeholder for per-seed results; nothing in this cell appends to it.
all_runs_results = []
# A single tokenizer instance is shared by the datasets of every seed.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

# One independent data split + fine-tuning run per seed. A failing seed is
# logged with its traceback and skipped so the remaining seeds still run.
for seed in SEEDS:
    try:
        set_seed(seed)

        # --- Data Prep ---
        preprocessor = DataPreprocessor(BASE_URL, SYNTHETIC_DATA_SIZE, seed)
        train_df, valid_df, test_df = preprocessor.prepare_datasets()

        train_dataset = BugSeverityDataset(convert_df_to_json_format(train_df), tokenizer)
        valid_dataset = BugSeverityDataset(convert_df_to_json_format(valid_df), tokenizer)
        # Built for completeness; not consumed anywhere in this cell.
        test_dataset = BugSeverityDataset(convert_df_to_json_format(test_df), tokenizer)

        # Only the training loader is shuffled; validation order is fixed.
        train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)
        valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False, num_workers=2)

        # --- PyTorch Training ---
        # Returns the path of the checkpoint with the best validation macro-F1.
        best_model_path = run_pytorch_training(train_loader, valid_loader, seed, OUTPUT_DIR)

    except Exception as e:
        logger.error(f"Run for seed {seed} failed: {e}", exc_info=True)
        continue

2025-09-01 00:04:07.140225: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756685047.165605      91 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756685047.173374      91 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

  train_df = pd.concat([train_nonmatch, df_org_match, df_syn_renamed], ignore_index=True)


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing RobertaModel: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing RobertaModel: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Extracting embeddings:   0%|          | 0/2339 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/501 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/502 [00:00<?, ?it/s]

Training Downstream Models (Seed 0):   0%|          | 0/10 [00:00<?, ?it/s]


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025066 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 197500
[LightGBM] [Info] Number of data points in the train set: 2840, number of used features: 778
[LightGBM] [Info] Start training from score -2.496238
[LightGBM] [Info] Start training from score -0.473390
[LightGBM] [Info] Start training from score -2.442171
[LightGBM] [Info] Start training from score -1.571437




tabpfn-v2-classifier-finetuned-zk73skhh.(…):   0%|          | 0.00/29.0M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/37.0 [00:00<?, ?B/s]

KeyError: 'Model'

In [3]:
import os
import json
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaConfig, RobertaTokenizer, RobertaModel
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, matthews_corrcoef, confusion_matrix
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from tabpfn import TabPFNClassifier
import random
import logging
from typing import Dict, List, Tuple, Any
from dataclasses import dataclass

# =============================================================================
# STEP 1: GLOBAL CONFIGURATION AND SETUP
# =============================================================================
# Root logging setup: timestamped, INFO-level messages for the whole notebook.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Global Configuration ---
BASE_URL = "/kaggle/input/dataset"    # directory holding df_org.csv / df_syn.csv
OUTPUT_DIR = "/kaggle/working/"       # where the fine-tuned checkpoints live
SYNTHETIC_DATA_SIZE = 0               # number of synthetic rows to add; 0 disables them
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# --- SEEDS AND SAVED MODELS ---
SEEDS = [42, 83, 456, 789, 101]
# Derived from OUTPUT_DIR and SEEDS (instead of five hand-written paths) so the
# mapping cannot drift from the "best_model_seed_{seed}.pt" naming scheme the
# training step uses when it writes checkpoints.
SAVED_MODEL_PATHS = {
    seed: os.path.join(OUTPUT_DIR, f"best_model_seed_{seed}.pt") for seed in SEEDS
}

def set_seed(seed_value):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    Args:
        seed_value: Integer seed applied to every random number generator.
            When CUDA is available, all CUDA devices are seeded as well.
    """
    # Same seeding order as before: random -> numpy -> torch (CPU).
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)
    logger.info(f"Global seed set to {seed_value}")

# =============================================================================
# STEP 2: DATA PREPARATION CLASSES AND FUNCTIONS
# =============================================================================
class DataPreprocessor:
    """Loads the original/synthetic CSVs and builds seed-dependent splits.

    Args:
        base_url: Directory containing ``df_org.csv`` and ``df_syn.csv``.
        synthetic_data_size: Maximum number of class-0 synthetic rows added to
            the training set; a value <= 0 disables synthetic data.
        random_seed: Seed passed to every sampling / splitting call.
        class0_holdout_size: Number of class-0 original rows placed in the
            test set and, separately, in the validation set. Defaults to 41,
            the value that used to be hard-coded.
    """

    def __init__(self, base_url, synthetic_data_size, random_seed, class0_holdout_size=41):
        self.base_url = base_url
        self.synthetic_data_size = synthetic_data_size
        self.random_seed = random_seed
        self.class0_holdout_size = class0_holdout_size

    @staticmethod
    def _concat_nonempty(frames):
        """Row-concatenate ``frames``, skipping empty ones but keeping their columns.

        Passing empty frames to ``pd.concat`` raises a FutureWarning and makes
        the result dtypes depend on the pandas version. Empty frames are
        therefore dropped before concatenating, and the union of all columns
        (in first-seen order) is restored afterwards so the output schema does
        not change.
        """
        columns = []
        for frame in frames:
            for col in frame.columns:
                if col not in columns:
                    columns.append(col)

        non_empty = [frame for frame in frames if not frame.empty]
        if not non_empty:
            return pd.DataFrame(columns=columns)
        return pd.concat(non_empty, ignore_index=True).reindex(columns=columns)

    def prepare_datasets(self):
        """Loads and splits data into train, validation, and test sets.

        Returns:
            Tuple ``(train_df, valid_df, test_df)``. Validation and test only
            contain original rows; synthetic rows and the original rows they
            were derived from go to the training set only.

        Raises:
            ValueError: Propagated from ``DataFrame.sample`` when there are
                fewer class-0 rows than the two holdouts require.
        """
        logger.info(f"Preparing datasets with seed: {self.random_seed}")
        df_org = pd.read_csv(f"{self.base_url}/df_org.csv")
        df_syn = pd.read_csv(f"{self.base_url}/df_syn.csv")

        # Only class-0 synthetic rows are ever used.
        df_syn_class0 = df_syn[df_syn['orig_label'] == 0].reset_index(drop=True)
        if self.synthetic_data_size > 0:
            df_syn_class0 = df_syn_class0.sample(
                n=min(self.synthetic_data_size, len(df_syn_class0)),
                random_state=self.random_seed
            )
        else:
            # Synthetic data disabled: keep the schema but no rows.
            df_syn_class0 = df_syn_class0.iloc[0:0]

        # Originals that spawned a selected synthetic row must stay out of
        # validation/test, so they are routed straight into training.
        matched_codes = set(df_syn_class0["orig_code"].unique()) if not df_syn_class0.empty else set()
        df_org_match = df_org[df_org["code"].isin(matched_codes)].copy()
        df_org_nonmatch = df_org[~df_org["code"].isin(matched_codes)].copy()

        df_org_match["source"] = "original"
        df_org_nonmatch["source"] = "original"

        df_syn_renamed = df_syn_class0.rename(columns={"synthetic_code": "code", "orig_label": "label"})
        df_syn_renamed["source"] = "synthetic"

        class0_df = df_org_nonmatch[df_org_nonmatch["label"] == 0].copy()
        nonclass0_df = df_org_nonmatch[df_org_nonmatch["label"] != 0].copy()

        # Class 0 gets fixed-size holdouts instead of a proportional split.
        holdout = self.class0_holdout_size
        class0_test = class0_df.sample(n=holdout, random_state=self.random_seed)
        class0_val = class0_df.drop(class0_test.index).sample(n=holdout, random_state=self.random_seed)
        class0_train = class0_df.drop(class0_test.index).drop(class0_val.index)

        # Remaining classes: 15% test, then 17.65% of the remaining 85%
        # (0.1765 * 0.85 ~= 0.15 of the total) for validation.
        nonclass0_trainval, nonclass0_test = train_test_split(
            nonclass0_df, test_size=0.15, random_state=self.random_seed, stratify=nonclass0_df["label"]
        )
        nonclass0_train, nonclass0_val = train_test_split(
            nonclass0_trainval, test_size=0.1765, random_state=self.random_seed, stratify=nonclass0_trainval["label"]
        )

        train_nonmatch = self._concat_nonempty([class0_train, nonclass0_train])
        valid_df = self._concat_nonempty([class0_val, nonclass0_val])
        test_df = self._concat_nonempty([class0_test, nonclass0_test])

        train_df = self._concat_nonempty([train_nonmatch, df_org_match, df_syn_renamed])

        logger.info(f"Data prepared: Train size={len(train_df)}, Val size={len(valid_df)}, Test size={len(test_df)}")
        return train_df, valid_df, test_df

class BugSeverityDataset(Dataset):
    """Custom PyTorch Dataset for loading code, numerical features, and labels.

    Args:
        data: Sequence of dict records; each needs ``"code"`` and ``"label"``
            and may carry the metric columns listed in ``NUMERIC_FEATURES``.
        tokenizer: Tokenizer exposing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``eos_token`` and ``pad_token_id``.
        block_size: Fixed length of the returned token-id sequence.
    """

    # Order fixes the layout of the 10-element numerical feature vector.
    NUMERIC_FEATURES = (
        "sloc_robust", "proxy_indentation_robust",
        "mcCabe_robust", "mcClure_robust",
        "nested_block_depth_robust", "difficulty_robust",
        "maintainability_index_robust", "fan_out_robust",
        "readability_robust", "effort_robust",
    )

    def __init__(self, data, tokenizer, block_size=512):
        self.examples = data
        self.tokenizer = tokenizer
        self.block_size = block_size

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        """Return ``(input_ids, num_features, label)`` tensors for record ``idx``."""
        js = self.examples[idx]

        # Reserve two positions for the CLS / EOS markers, then pad to block_size.
        code_tokens = self.tokenizer.tokenize(str(js["code"]))[:self.block_size - 2]
        tokens = [self.tokenizer.cls_token] + code_tokens + [self.tokenizer.eos_token]
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        padding_length = self.block_size - len(input_ids)
        input_ids += [self.tokenizer.pad_token_id] * padding_length

        num_features = torch.tensor(
            [js.get(name, 0.0) for name in self.NUMERIC_FEATURES], dtype=torch.float
        )
        # Records built from a DataFrame carry NaN (not a missing key) for
        # absent values, which the .get() default cannot catch; a single NaN
        # would propagate through the model. Treat NaN like a missing feature.
        num_features = torch.where(
            torch.isnan(num_features), torch.zeros_like(num_features), num_features
        )

        return (
            torch.tensor(input_ids),
            num_features,
            torch.tensor(js["label"], dtype=torch.long)
        )

def convert_df_to_json_format(df):
    """Turn a DataFrame into a list with one ``{column: value}`` dict per row.

    This is the record layout that ``BugSeverityDataset`` indexes into.
    """
    records = df.to_dict(orient="records")
    return records

# =============================================================================
# STEP 3: PYTORCH MODEL
# =============================================================================
class ConcatClsModel(nn.Module):
    """Encoder plus a linear head over [first-token embedding ; 10 numeric features].

    Args:
        encoder: Module whose output exposes ``last_hidden_state`` and whose
            ``config.pad_token_id`` identifies padding positions.
        config: Object providing ``hidden_dropout_prob``, ``hidden_size`` and
            ``num_labels``.
    """

    def __init__(self, encoder, config):
        super().__init__()
        self.encoder = encoder
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # +10: width of the numerical feature vector concatenated in forward().
        self.out_layer = nn.Linear(config.hidden_size + 10, config.num_labels)

    def forward(self, input_ids, num_features, labels=None):
        """Return class probabilities, preceded by the CE loss when ``labels`` is given."""
        pad_id = self.encoder.config.pad_token_id
        # Attend to everything that is not padding.
        mask = (input_ids != pad_id).long()
        hidden = self.encoder(input_ids=input_ids, attention_mask=mask).last_hidden_state
        fused = torch.cat((hidden[:, 0, :], num_features), dim=-1)
        logits = self.out_layer(self.dropout(fused))
        probs = logits.softmax(dim=-1)

        if labels is None:
            return probs
        # The loss is computed on raw logits, not on the softmax output.
        return nn.CrossEntropyLoss()(logits, labels), probs

def evaluate_codebert_model(model, dataloader, device, seed):
    """Evaluates the CodeBERT model and returns all metrics.

    Args:
        model: Classifier that returns class probabilities when called as
            ``model(input_ids, num_features)``.
        dataloader: Yields ``(input_ids, num_features, labels)`` batches.
        device: Device the batches are moved to before the forward pass.
        seed: Seed of the current run; recorded in the metrics and shown in
            the progress bar.

    Returns:
        Metrics dict produced by ``evaluate_predictions`` with
        ``"Model": "CodeBERT"``.
    """
    model.eval()
    all_preds, all_probs, all_labels = [], [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f"Evaluating CodeBERT (Seed {seed})", leave=False):
            input_ids, num_features, labels = [b.to(device) for b in batch]
            probs = model(input_ids, num_features)
            preds = torch.argmax(probs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # The metric definitions (including G-Mean) live in evaluate_predictions;
    # delegating keeps CodeBERT and the downstream models scored identically
    # instead of maintaining two copies of the same formulas.
    return evaluate_predictions(all_labels, all_preds, all_probs, "CodeBERT", seed)

# =============================================================================
# STEP 4: EMBEDDING EXTRACTION
# =============================================================================
def extract_embeddings(model, dataset, tokenizer, device):
    """Build one feature vector per dataset item for the downstream models.

    Each item is run through ``model.encoder`` on its own (batch of one); the
    first-position hidden state is concatenated with the item's numerical
    features.

    Args:
        model: Object exposing ``encoder`` and ``eval()``.
        dataset: Iterable of ``(input_ids, num_features, label)`` tensors.
        tokenizer: Only ``pad_token_id`` is read, to build the attention mask.
        device: Device on which the encoder forward pass runs.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays, one row / entry per item.
    """
    model.eval()
    feature_rows = []
    targets = []
    with torch.no_grad():
        for input_ids, num_features, label in tqdm(dataset, desc="Extracting embeddings", leave=False):
            # Add the batch dimension expected by the encoder.
            batch_ids = input_ids.unsqueeze(0).to(device)
            batch_feats = num_features.unsqueeze(0).to(device)

            pad_mask = batch_ids.ne(tokenizer.pad_token_id).long()
            hidden = model.encoder(input_ids=batch_ids, attention_mask=pad_mask).last_hidden_state
            cls_vec = hidden[:, 0, :].cpu().numpy().squeeze()
            feat_vec = batch_feats.cpu().numpy().flatten()

            feature_rows.append(np.concatenate([cls_vec, feat_vec]))
            targets.append(label.item())

    return np.array(feature_rows), np.array(targets)

# =============================================================================
# STEP 5: CLASSICAL & NON-CLASSICAL MODEL TRAINING AND EVALUATION
# =============================================================================
@dataclass
class ModelConfig:
    """Bundles one downstream estimator with its label and hyperparameters."""
    # Short label identifying the model (e.g. "KNN", "SVM").
    name: str
    # Already-constructed estimator instance.
    model: Any
    # Optional hyperparameter mapping; left as None when the estimator is used
    # as constructed. NOTE(review): presumably applied to `model` before
    # fitting -- the consuming code is outside this view, confirm there.
    params: Dict[str, Any] = None

def evaluate_predictions(y_true, y_pred, y_proba, model_name, seed):
    """Calculates a dictionary of metrics for a model's predictions.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        y_proba: Per-class scores, one row per sample, used for one-vs-rest ROC-AUC.
        model_name: Stored under the ``"Model"`` key.
        seed: Stored under the ``"seed"`` key.

    Returns:
        Dict with accuracy, macro/weighted precision, recall and F1,
        macro/weighted ROC-AUC, MCC and G-Mean (geometric mean of the
        per-class recalls).
    """
    metrics = {
        "Model": model_name,
        "seed": seed,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision_macro": precision_score(y_true, y_pred, average='macro', zero_division=0),
        "Recall_macro": recall_score(y_true, y_pred, average='macro', zero_division=0),
        "F1_macro": f1_score(y_true, y_pred, average='macro', zero_division=0),
        "Precision_weighted": precision_score(y_true, y_pred, average='weighted', zero_division=0),
        "Recall_weighted": recall_score(y_true, y_pred, average='weighted', zero_division=0),
        "F1_weighted": f1_score(y_true, y_pred, average='weighted', zero_division=0),
        "ROC-AUC_macro": roc_auc_score(y_true, y_proba, multi_class='ovr', average='macro'),
        "ROC-AUC_weighted": roc_auc_score(y_true, y_proba, multi_class='ovr', average='weighted'),
        "MCC": matthews_corrcoef(y_true, y_pred)
    }

    # Calculate G-Mean
    cm = confusion_matrix(y_true, y_pred)
    # Two expected situations make NumPy warn here: a class that only appears
    # in y_pred has an all-zero row (0/0 -> NaN, dropped below), and a class
    # that is never predicted correctly has recall 0 (log(0) -> -inf, which
    # yields a G-Mean of 0). Silence those warnings; the value is unchanged.
    with np.errstate(divide="ignore", invalid="ignore"):
        recalls = np.diag(cm) / np.sum(cm, axis=1)
        recalls = recalls[~np.isnan(recalls)]
        metrics["G-Mean"] = np.exp(np.mean(np.log(recalls))) if len(recalls) > 0 else 0.0

    return metrics

def run_downstream_model_training(X_trainval, y_trainval, X_test, y_test, seed):
    """Trains and evaluates a suite of classical and non-classical models.

    Every estimator is fitted on the train+validation features and scored on
    the test features with `evaluate_predictions`.

    Args:
        X_trainval: Feature matrix used for fitting.
        y_trainval: Labels matching `X_trainval`.
        X_test: Feature matrix used for evaluation.
        y_test: Labels matching `X_test`.
        seed: Random seed handed to every estimator that accepts one and
            recorded in each metrics dict.

    Returns:
        list of metric dicts, one per model that trained successfully. A model
        that raises is logged (with traceback) and left out of the list.
    """
    logger.info(f"Starting downstream model training for seed {seed}.")
    results = []

    # Request GPU back-ends only when the notebook itself targets CUDA, so the
    # GPU-capable boosters fall back to CPU instead of failing on a CPU runtime.
    use_gpu = str(DEVICE).startswith("cuda")

    model_configs = [
        ModelConfig("KNN", KNeighborsClassifier(), {'n_neighbors': 3, 'weights': 'distance', 'p': 1}),
        ModelConfig("SVM", SVC(probability=True, random_state=seed), {"C": 2.5, "kernel": "rbf", "gamma": "scale"}),
        ModelConfig("Naive Bayes", GaussianNB(), {"var_smoothing": 1e-8}),
        ModelConfig("Decision Tree", DecisionTreeClassifier(random_state=seed), {"max_depth": 14, "min_samples_split": 4}),
        ModelConfig("RandomForest", RandomForestClassifier(random_state=seed, n_jobs=-1), {"n_estimators": 600, "max_depth": 18, "min_samples_split": 3, "class_weight": "balanced_subsample"}),
        ModelConfig("AdaBoost", AdaBoostClassifier(random_state=seed), {"n_estimators": 500, "learning_rate": 0.85}),
        # `tree_method="gpu_hist"` is deprecated in XGBoost 2.x (the library
        # warns and suggests `tree_method="hist", device="cuda"`), and
        # `use_label_encoder` is no longer a recognised parameter.
        ModelConfig("XGBoost", xgb.XGBClassifier(
            eval_metric='mlogloss', seed=seed, tree_method="hist",
            device="cuda" if use_gpu else "cpu",
        ), {
            "max_depth": 9, "eta": 0.24627429143007107, "subsample": 0.45321841598276075,
            "colsample_bytree": 0.7227038914198726, "lambda": 0.06640744768945579, "alpha": 0.21504472646446163
        }),
        ModelConfig("LightGBM", lgb.LGBMClassifier(random_state=seed), {
            "objective": "multiclass", "metric": "multi_logloss", "learning_rate": 0.085, "max_depth": 7
        }),
        ModelConfig("CatBoost", cb.CatBoostClassifier(
            random_seed=seed, verbose=0, task_type="GPU" if use_gpu else "CPU",
        ), {
            "learning_rate": 0.093, "depth": 7, "l2_leaf_reg": 7.07, "iterations": 829
        }),
        ModelConfig("TabPFN", TabPFNClassifier(device=DEVICE, ignore_pretraining_limits=True))
    ]

    for model_config in tqdm(model_configs, desc=f"Training Downstream Models (Seed {seed})"):
        try:
            model = model_config.model
            if model_config.params:
                model.set_params(**model_config.params)

            model.fit(X_trainval, y_trainval)
            y_pred = model.predict(X_test)
            y_proba = model.predict_proba(X_test)

            metrics = evaluate_predictions(y_test, y_pred, y_proba, model_config.name, seed)
            results.append(metrics)
        except Exception as e:
            # Best effort: one failing model must not abort the whole suite.
            # Keep the traceback so the cause can be diagnosed afterwards.
            logger.error(f"Failed to train {model_config.name} for seed {seed}: {e}", exc_info=True)

    return results

# =============================================================================
# STEP 6: MAIN EXECUTION LOOP
# =============================================================================
# Accumulates one metrics dict per (model, seed): the CodeBERT result is
# appended first, then the downstream-model results are added via extend().
all_results = []  # This will store results for ALL models for ALL seeds
# The tokenizer is loaded once and shared by every seed's datasets.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

# Initialize the base model architecture
# The architecture is built once; each seed only swaps in its own weights via
# load_state_dict() inside the loop below.
config = RobertaConfig.from_pretrained("microsoft/codebert-base", num_labels=4)
# add_pooling_layer=False builds the encoder without the pooler head, which is
# why the checkpoint's pooler.* weights are reported as unused when loading.
encoder = RobertaModel.from_pretrained("microsoft/codebert-base", config=config, add_pooling_layer=False)
base_model = ConcatClsModel(encoder, config).to(DEVICE)

for seed in SEEDS:
    try:
        set_seed(seed)
        
        # --- Data Prep ---
        preprocessor = DataPreprocessor(BASE_URL, SYNTHETIC_DATA_SIZE, seed)
        train_df, valid_df, test_df = preprocessor.prepare_datasets()
        
        train_dataset = BugSeverityDataset(convert_df_to_json_format(train_df), tokenizer)
        valid_dataset = BugSeverityDataset(convert_df_to_json_format(valid_df), tokenizer)
        test_dataset = BugSeverityDataset(convert_df_to_json_format(test_df), tokenizer)
        
        # Only the CodeBERT evaluation consumes this loader; the embedding
        # extraction below is given the datasets directly.
        test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)

        # --- Load Pre-trained Model ---
        # A seed missing from SAVED_MODEL_PATHS raises KeyError here, which the
        # except clause at the bottom of the loop logs before moving on.
        model_path = SAVED_MODEL_PATHS[seed]
        if not os.path.exists(model_path):
            logger.error(f"Model not found: {model_path}. Skipping seed {seed}.")
            continue
            
        logger.info(f"Loading saved model from {model_path} for seed {seed}.")
        base_model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        # Alias, not a copy: `model` and `base_model` are the same object, so
        # the next seed's load_state_dict() overwrites these weights in place.
        model = base_model

        # --- Evaluate CodeBERT Model Itself ---
        logger.info(f"Evaluating CodeBERT model performance for seed {seed}.")
        codebert_metrics = evaluate_codebert_model(model, test_loader, DEVICE, seed)
        all_results.append(codebert_metrics)
        logger.info(f"CodeBERT Seed {seed} Results: Accuracy={codebert_metrics['Accuracy']:.4f}, F1_macro={codebert_metrics['F1_macro']:.4f}")

        # --- Embedding Extraction for Downstream Models ---
        X_train, y_train = extract_embeddings(model, train_dataset, tokenizer, DEVICE)
        X_valid, y_valid = extract_embeddings(model, valid_dataset, tokenizer, DEVICE)
        X_test, y_test = extract_embeddings(model, test_dataset, tokenizer, DEVICE)
        
        # The downstream models are fitted on train and validation combined;
        # the validation split is not used separately in this loop.
        X_trainval = np.vstack([X_train, X_valid])
        y_trainval = np.concatenate([y_train, y_valid])

        # --- Downstream Model Training & Evaluation ---
        downstream_results = run_downstream_model_training(X_trainval, y_trainval, X_test, y_test, seed)
        all_results.extend(downstream_results)
        
        logger.info(f"Successfully completed evaluation for seed {seed}.")

    except Exception as e:
        # Best effort per seed: log the full traceback and carry on with the
        # remaining seeds. Results appended before the failure stay in
        # all_results.
        logger.error(f"Run for seed {seed} failed: {e}", exc_info=True)
        continue

# =============================================================================
# STEP 7: AGGREGATE AND DISPLAY FINAL RESULTS
# =============================================================================
# Summarise the collected metrics: print the per-seed table, print and save the
# per-model "mean ± std" table, and report the best model for every metric.
if all_results:
    # Create a DataFrame from all results (one row per model/seed pair)
    final_results_df = pd.DataFrame(all_results)

    # 1. Display Individual Results for Each Seed
    print("\n" + "="*120)
    print("INDIVIDUAL RESULTS FOR EACH SEED AND MODEL")
    print("="*120)
    with pd.option_context('display.max_rows', None, 'display.width', None, 'display.max_columns', None):
        print(final_results_df.round(4))

    # 2. Calculate and Display Averaged Results
    print("\n\n" + "="*120)
    print("AVERAGED RESULTS ACROSS ALL SEEDS (Mean ± Std)")
    print("="*120)

    # Group by model and calculate mean and std for all metrics.
    # Everything except the two identifier columns is treated as a metric.
    numeric_cols = final_results_df.columns.difference(['Model', 'seed'])
    aggregated = final_results_df.groupby('Model')[numeric_cols].agg(['mean', 'std'])

    # Format results as "mean ± std"
    formatted_results = pd.DataFrame(index=aggregated.index)
    for metric in numeric_cols:
        mean_vals = aggregated[(metric, 'mean')]
        std_vals = aggregated[(metric, 'std')]
        formatted_results[metric] = [f"{m:.4f} ± {s:.4f}" for m, s in zip(mean_vals, std_vals)]

    with pd.option_context('display.max_rows', None, 'display.width', None, 'display.max_columns', None):
        print(formatted_results)

    # 3. Save results to files
    # Make sure the target directory exists first: without this, a missing
    # directory would make to_csv raise only after every seed has been
    # evaluated. An empty OUTPUT_DIR means the current working directory.
    if OUTPUT_DIR:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
    final_results_df.to_csv(os.path.join(OUTPUT_DIR, "all_seeds_individual_results.csv"), index=False)
    formatted_results.to_csv(os.path.join(OUTPUT_DIR, "averaged_results_summary.csv"))

    logger.info(f"Results saved to {OUTPUT_DIR}")

    # 4. Show best performing models for each metric
    print("\n\n" + "="*80)
    print("BEST PERFORMING MODELS FOR EACH METRIC (Based on Average)")
    print("="*80)

    # metric name -> (model name, mean, std) of the model with the highest mean
    best_models = {}
    for metric in numeric_cols:
        best_idx = aggregated[(metric, 'mean')].idxmax()
        best_mean = aggregated.loc[best_idx, (metric, 'mean')]
        best_std = aggregated.loc[best_idx, (metric, 'std')]
        best_models[metric] = (best_idx, best_mean, best_std)
        print(f"{metric:20s}: {best_idx:15s} {best_mean:.4f} ± {best_std:.4f}")

else:
    logger.warning("No results were generated. Cannot create summary.")

print("\nEvaluation completed!")

Using device: cuda


Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing RobertaModel: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  train_df = pd.concat([train_nonmatch, df_org_match, df_syn_renamed], ignore_index=True)


Evaluating CodeBERT (Seed 42):   0%|          | 0/16 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/2339 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/501 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/502 [00:00<?, ?it/s]

Training Downstream Models (Seed 42):   0%|          | 0/10 [00:00<?, ?it/s]


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014589 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 197503
[LightGBM] [Info] Number of data points in the train set: 2840, number of used features: 778
[LightGBM] [Info] Start training from score -2.496238
[LightGBM] [Info] Start training from score -0.473390
[LightGBM] [Info] Start training from score -2.442171
[LightGBM] [Info] Start training from score -1.571437


  train_df = pd.concat([train_nonmatch, df_org_match, df_syn_renamed], ignore_index=True)


Evaluating CodeBERT (Seed 83):   0%|          | 0/16 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/2339 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/501 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/502 [00:00<?, ?it/s]

Training Downstream Models (Seed 83):   0%|          | 0/10 [00:00<?, ?it/s]


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 197500
[LightGBM] [Info] Number of data points in the train set: 2840, number of used features: 778
[LightGBM] [Info] Start training from score -2.496238
[LightGBM] [Info] Start training from score -0.473390
[LightGBM] [Info] Start training from score -2.442171
[LightGBM] [Info] Start training from score -1.571437


  train_df = pd.concat([train_nonmatch, df_org_match, df_syn_renamed], ignore_index=True)


Evaluating CodeBERT (Seed 456):   0%|          | 0/16 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/2339 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/501 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/502 [00:00<?, ?it/s]

Training Downstream Models (Seed 456):   0%|          | 0/10 [00:00<?, ?it/s]


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015065 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 197499
[LightGBM] [Info] Number of data points in the train set: 2840, number of used features: 778
[LightGBM] [Info] Start training from score -2.496238
[LightGBM] [Info] Start training from score -0.473390
[LightGBM] [Info] Start training from score -2.442171
[LightGBM] [Info] Start training from score -1.571437


  train_df = pd.concat([train_nonmatch, df_org_match, df_syn_renamed], ignore_index=True)


Evaluating CodeBERT (Seed 789):   0%|          | 0/16 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/2339 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/501 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/502 [00:00<?, ?it/s]

Training Downstream Models (Seed 789):   0%|          | 0/10 [00:00<?, ?it/s]

  train_df = pd.concat([train_nonmatch, df_org_match, df_syn_renamed], ignore_index=True)


Evaluating CodeBERT (Seed 101):   0%|          | 0/16 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/2339 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/501 [00:00<?, ?it/s]

Extracting embeddings:   0%|          | 0/502 [00:00<?, ?it/s]

Training Downstream Models (Seed 101):   0%|          | 0/10 [00:00<?, ?it/s]


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 197505
[LightGBM] [Info] Number of data points in the train set: 2840, number of used features: 778
[LightGBM] [Info] Start training from score -2.496238
[LightGBM] [Info] Start training from score -0.473390
[LightGBM] [Info] Start training from score -2.442171
[LightGBM] [Info] Start training from score -1.571437





INDIVIDUAL RESULTS FOR EACH SEED AND MODEL
            Model  seed  Accuracy  Precision_macro  Recall_macro  F1_macro  \
0        CodeBERT    42    0.7769           0.7348        0.7198    0.7232   
1             KNN    42    0.7769           0.7281        0.7251    0.7246   
2             SVM    42    0.7849           0.7487        0.7251    0.7324   
3     Naive Bayes    42    0.7530           0.6958        0.7229    0.7073   
4   Decision Tree    42    0.7470           0.6925        0.6904    0.6883   
5    RandomForest    42    0.7869           0.7627        0.7210    0.7352   
6        AdaBoost    42    0.6952           0.6430        0.6929    0.6556   
7         XGBoost    42    0.7769           0.7386        0.7064    0.7177   
8        LightGBM    42    0.7849           0.7537        0.7080    0.7245   
9        CatBoost    42    0.7849           0.7479        0.7235    0.7309   
10         TabPFN    42    0.7809           0.7401        0.7251    0.7277   
11       CodeBERT   