In [5]:
!pip install tabpfn



In [6]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaConfig, RobertaTokenizer, RobertaModel, get_linear_schedule_with_warmup
import torch.nn as nn
import xgboost as xgb
from sklearn.metrics import classification_report, f1_score
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

2025-09-05 15:19:04.912196: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757085545.108019      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757085545.163337      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [8]:
pd.read_csv("/kaggle/input/dataset/df_org.csv").columns

Index(['Unnamed: 0', 'project_name', 'project_version', 'label', 'code',
       'code_comment', 'code_no_comment', 'sloc', 'proxy_indentation',
       'mcCabe', 'nested_block_depth', 'mcClure', 'mcClure_NVAR',
       'mcClure_NCOMP', 'difficulty', 'effort', 'maintainability_index',
       'readability', 'fan_out', 'sloc_std', 'proxy_indentation_std',
       'mcCabe_std', 'nested_block_depth_std', 'mcClure_std',
       'mcClure_NVAR_std', 'mcClure_NCOMP_std', 'difficulty_std', 'effort_std',
       'maintainability_index_std', 'readability_std', 'fan_out_std',
       'sloc_norm', 'proxy_indentation_norm', 'mcCabe_norm',
       'nested_block_depth_norm', 'mcClure_norm', 'mcClure_NVAR_norm',
       'mcClure_NCOMP_norm', 'difficulty_norm', 'effort_norm',
       'maintainability_index_norm', 'readability_norm', 'fan_out_norm',
       'sloc_robust', 'proxy_indentation_robust', 'mcCabe_robust',
       'nested_block_depth_robust', 'mcClure_robust', 'mcClure_NVAR_robust',
       'mcClure_NCOMP_r

In [9]:
# import pickle
# import numpy as np

# # Load the embeddings
# with open("/kaggle/input/dataset-final/embeddings (1).pkl", "rb") as f:
#     data = pickle.load(f)

# # Access train, valid, and test
# X_train = data["train"]["X"]
# y_train = data["train"]["y"]
# X_valid = data["valid"]["X"]
# y_valid = data["valid"]["y"]
# X_test = data["test"]["X"]
# y_test = data["test"]["y"]

# # Combine train and valid to form trainval
# X_trainval = np.concatenate([X_train, X_valid], axis=0)
# y_trainval = np.concatenate([y_train, y_valid], axis=0)

# print(f"X_trainval shape: {X_trainval.shape}")
# print(f"y_trainval shape: {y_trainval.shape}")
# print(f"X_test shape: {X_test.shape}")


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

# # Feature columns to use
# feature_cols = [
#     "sloc_robust", "proxy_indentation_robust", "mcCabe_robust",
#     "mcClure_robust", "nested_block_depth_robust", "difficulty_robust",
#     "maintainability_index_robust", "fan_out_robust", "readability_robust", "effort_robust"
# ]

# Load datasets
train_df = pd.read_csv("/kaggle/input/dataset-final/train_dataset.csv")
valid_df = pd.read_csv("/kaggle/input/dataset-final/valid_dataset.csv")
test_df  = pd.read_csv("/kaggle/input/dataset-final/test_dataset.csv")

In [12]:

class DataPreprocessor:
    def __init__(self, base_url, synthetic_data_size=None, random_seed=42):
        """
        Initialize the data preprocessor
        
        Args:
            base_url (str): Base directory path for the dataset
            synthetic_data_size (int, optional): Number of synthetic samples to use. 
                                               If None, use all available synthetic data.
            random_seed (int): Random seed for reproducibility
        """
        self.base_url = base_url
        self.synthetic_data_size = synthetic_data_size
        self.random_seed = random_seed
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # Initialize attributes to store dataframes
        self.df_org = None
        self.df_syn = None
        self.train_df = None
        self.valid_df = None
        self.test_df = None
        
    def load_data(self):
        """Load and preprocess the original and synthetic data"""
        print(f"Using device: {self.device}")
        
        # Load data
        self.df_org = pd.read_csv(f"{self.base_url}/df_org.csv")
        self.df_syn = pd.read_csv(f"{self.base_url}/df_syn.csv")
        
        # Filter synthetic data for class 0 only and apply size limit if specified
        df_syn_class0 = self.df_syn[self.df_syn['orig_label'] == 0].reset_index(drop=True)
        
        if self.synthetic_data_size is not None:
            # Sample the specified number of synthetic examples
            df_syn_class0 = df_syn_class0.sample(n=min(self.synthetic_data_size, len(df_syn_class0)), 
                                                random_state=self.random_seed)
        
        self.df_syn = df_syn_class0
        
        print("Original synthetic size:", len(self.df_syn))
        print("Filtered synthetic size (class 0 only):", len(df_syn_class0))
        
        return self.df_org, self.df_syn
    
    def prepare_datasets(self):
        """
        Prepare train, validation, and test datasets with the specified configuration
        
        Returns:
            tuple: (train_df, valid_df, test_df) DataFrames
        """
        if self.df_org is None or self.df_syn is None:
            self.load_data()
        
        # ------------------------------
        # Step 0: Prepare DataFrames
        # ------------------------------
        matched_codes = set(self.df_syn["orig_code"].unique())
        df_org_match = self.df_org[self.df_org["code"].isin(matched_codes)].copy()
        df_org_nonmatch = self.df_org[~self.df_org["code"].isin(matched_codes)].copy()

        # Add source column for original data
        df_org_match["source"] = "original"
        df_org_nonmatch["source"] = "original"

        # Rename synthetic columns to match train_df and add source
        df_syn_renamed = self.df_syn.rename(columns={"synthetic_code": "code", "orig_label": "label"})
        df_syn_renamed["source"] = "synthetic"

        print("Total df_org:", len(self.df_org))
        print("df_org_match (has synthetic pair):", len(df_org_match))
        print("df_org_nonmatch (no synthetic pair):", len(df_org_nonmatch))
        print("Total df_syn:", len(df_syn_renamed))

        # ------------------------------
        # Step 1: Split class 0 in non-matched
        # ------------------------------
        class0_df = df_org_nonmatch[df_org_nonmatch["label"] == 0].copy()
        nonclass0_df = df_org_nonmatch[df_org_nonmatch["label"] != 0].copy()

        # Ensure 41 rows each for test/val
        class0_test = class0_df.sample(n=41, random_state=self.random_seed)
        class0_val = class0_df.drop(class0_test.index).sample(n=41, random_state=self.random_seed)
        class0_train = class0_df.drop(class0_test.index).drop(class0_val.index)

        # ------------------------------
        # Step 2: Split other classes in non-matched (stratified)
        # ------------------------------
        nonclass0_trainval, nonclass0_test = train_test_split(
            nonclass0_df, test_size=0.15, random_state=self.random_seed, stratify=nonclass0_df["label"]
        )
        nonclass0_train, nonclass0_val = train_test_split(
            nonclass0_trainval, 
            test_size=0.1765,  # 0.1765 ≈ 0.15 / 0.85 to get total 15% val
            random_state=self.random_seed, 
            stratify=nonclass0_trainval["label"]
        )

        # ------------------------------
        # Step 3: Combine splits
        # ------------------------------
        train_nonmatch = pd.concat([class0_train, nonclass0_train], ignore_index=True)
        valid_df = pd.concat([class0_val, nonclass0_val], ignore_index=True)
        test_df = pd.concat([class0_test, nonclass0_test], ignore_index=True)

        # ------------------------------
        # Step 4: Add matched + synthetic only to train
        # ------------------------------
        train_df = pd.concat([train_nonmatch, df_org_match, df_syn_renamed], ignore_index=True)

        # ------------------------------
        # Step 5: Store and return results
        # ------------------------------
        self.train_df = train_df
        self.valid_df = valid_df
        self.test_df = test_df
        
        # Print statistics
        self._print_dataset_stats()
        
        return train_df, valid_df, test_df
    
    def _print_dataset_stats(self):
        """Print statistics about the prepared datasets"""
        print("\n" + "="*50)
        print("DATASET STATISTICS")
        print("="*50)
        
        print(f"Train size: {len(self.train_df)}")
        print(f"Val size: {len(self.valid_df)}")
        print(f"Test size: {len(self.test_df)}")

        print("\nClass distribution:")
        print("Train:\n", self.train_df["label"].value_counts().sort_index())
        print("Val:\n", self.valid_df["label"].value_counts().sort_index())
        print("Test:\n", self.test_df["label"].value_counts().sort_index())

        print("\nSource distribution in train:")
        print(self.train_df["source"].value_counts())
        print("="*50)
    
    def get_datasets(self):
        """Return the prepared datasets"""
        if self.train_df is None:
            self.prepare_datasets()
        return self.train_df, self.valid_df, self.test_df
    
    def save_datasets(self, output_dir):
        """Save the prepared datasets to CSV files"""
        if self.train_df is None:
            self.prepare_datasets()
        
        os.makedirs(output_dir, exist_ok=True)
        
        self.train_df.to_csv(f"{output_dir}/train_dataset.csv", index=False)
        self.valid_df.to_csv(f"{output_dir}/valid_dataset.csv", index=False)
        self.test_df.to_csv(f"{output_dir}/test_dataset.csv", index=False)
        
        print(f"Datasets saved to {output_dir}")


# Example usage:
if __name__ == "__main__":
    # Initialize the preprocessor with custom parameters
    preprocessor = DataPreprocessor(
        base_url="/kaggle/input/dataset",
        synthetic_data_size=0,  # Use at mpst 350 synthetic samples
        random_seed=42  # Different random seed
    )
    
    # Load and prepare datasets
    train_df, valid_df, test_df = preprocessor.prepare_datasets()
    
    # Alternatively, you can use the getter method
    # train_df, valid_df, test_df = preprocessor.get_datasets()
    
    # Save datasets to files
    preprocessor.save_datasets("./preprocessed_data")

Using device: cuda
Original synthetic size: 0
Filtered synthetic size (class 0 only): 0
Total df_org: 3342
df_org_match (has synthetic pair): 0
df_org_nonmatch (no synthetic pair): 3342
Total df_syn: 0

DATASET STATISTICS
Train size: 2339
Val size: 501
Test size: 502

Class distribution:
Train:
 label
0     193
1    1457
2     203
3     486
Name: count, dtype: int64
Val:
 label
0     41
1    312
2     44
3    104
Name: count, dtype: int64
Test:
 label
0     41
1    313
2     44
3    104
Name: count, dtype: int64

Source distribution in train:
source
original    2339
Name: count, dtype: int64
Datasets saved to ./preprocessed_data


In [13]:
def add_primary_key(df):
    # Ensure project_name and project_version columns exist
    if not {'project_name', 'project_version', 'code'}.issubset(df.columns):
        raise ValueError("Columns 'project_name', 'project_version', 'code' must exist in DataFrame")
    
    df['primary_key'] = df['project_name'].astype(str) + "_" + \
                        df['project_version'].astype(str) + "_" + \
                        df['code'].astype(str)
    return df

# Apply to all splits
train_df = add_primary_key(train_df)
valid_df = add_primary_key(valid_df)
test_df = add_primary_key(test_df)

# Check for duplicates using primary key
print("Duplicate primary keys in train:", train_df['primary_key'].duplicated().sum())
print("Duplicate primary keys in val:", valid_df['primary_key'].duplicated().sum())
print("Duplicate primary keys in test:", test_df['primary_key'].duplicated().sum())

# Check for overlap across splits using primary key
train_keys = set(train_df['primary_key'])
val_keys = set(valid_df['primary_key'])
test_keys = set(test_df['primary_key'])

print("Overlap train-val:", len(train_keys & val_keys))
print("Overlap train-test:", len(train_keys & test_keys))
print("Overlap val-test:", len(val_keys & test_keys))


Duplicate primary keys in train: 0
Duplicate primary keys in val: 0
Duplicate primary keys in test: 0
Overlap train-val: 0
Overlap train-test: 0
Overlap val-test: 0


In [33]:
train_df.columns

Index(['Unnamed: 0', 'project_name', 'project_version', 'label', 'code',
       'code_comment', 'code_no_comment', 'sloc', 'proxy_indentation',
       'mcCabe', 'nested_block_depth', 'mcClure', 'mcClure_NVAR',
       'mcClure_NCOMP', 'difficulty', 'effort', 'maintainability_index',
       'readability', 'fan_out', 'sloc_std', 'proxy_indentation_std',
       'mcCabe_std', 'nested_block_depth_std', 'mcClure_std',
       'mcClure_NVAR_std', 'mcClure_NCOMP_std', 'difficulty_std', 'effort_std',
       'maintainability_index_std', 'readability_std', 'fan_out_std',
       'sloc_norm', 'proxy_indentation_norm', 'mcCabe_norm',
       'nested_block_depth_norm', 'mcClure_norm', 'mcClure_NVAR_norm',
       'mcClure_NCOMP_norm', 'difficulty_norm', 'effort_norm',
       'maintainability_index_norm', 'readability_norm', 'fan_out_norm',
       'sloc_robust', 'proxy_indentation_robust', 'mcCabe_robust',
       'nested_block_depth_robust', 'mcClure_robust', 'mcClure_NVAR_robust',
       'mcClure_NCOMP_r

In [15]:
import os, json

def save_df_as_jsonl(df, filepath):
    # Use the *_robust columns instead of raw
    required_cols = [
        "code", "label",
        "sloc_robust", "proxy_indentation_robust", "mcCabe_robust",
        "mcClure_robust", "nested_block_depth_robust", "difficulty_robust",
        "maintainability_index_robust", "fan_out_robust", "readability_robust", "effort_robust"
    ]
    for c in required_cols:
        assert c in df.columns, f"Missing column: {c}"

    with open(filepath, "w") as f:
        for _, row in df.iterrows():
            obj = {
                "code": row["code"],
                "label": int(row["label"]),
                "sloc": float(row["sloc_robust"]),
                "proxy_indentation": float(row["proxy_indentation_robust"]),
                "mcCabe": float(row["mcCabe_robust"]),
                "mcClure": float(row["mcClure_robust"]),
                "nested_block_depth": float(row["nested_block_depth_robust"]),
                "difficulty": float(row["difficulty_robust"]),
                "maintainability_index": float(row["maintainability_index_robust"]),
                "fan_out": float(row["fan_out_robust"]),
                "readability": float(row["readability_robust"]),
                "effort": float(row["effort_robust"]),
            }
            f.write(json.dumps(obj) + "\n")

os.makedirs("dataset_jsonl", exist_ok=True)
save_df_as_jsonl(train_df, "dataset_jsonl/train_scaled.jsonl")
save_df_as_jsonl(valid_df, "dataset_jsonl/valid_scaled.jsonl")
save_df_as_jsonl(test_df, "dataset_jsonl/test_scaled.jsonl")

print("Saved JSONL files using *_robust features.")


Saved JSONL files using *_robust features.


In [16]:
# import os, json

# # === Step 2: Save raw features to JSONL ===
# def save_df_as_jsonl_raw(df, filepath):
#     # Use the original raw metric columns instead of *_robust
#     required_cols = [
#         "code", "label",
#         "sloc", "proxy_indentation", "mcCabe",
#         "mcClure", "nested_block_depth", "difficulty",
#         "maintainability_index", "fan_out", "readability", "effort"
#     ]
#     for c in required_cols:
#         assert c in df.columns, f"Missing column: {c}"

#     with open(filepath, "w") as f:
#         for _, row in df.iterrows():
#             obj = {
#                 "code": row["code"],
#                 "label": int(row["label"]),
#                 "sloc": float(row["sloc"]),
#                 "proxy_indentation": float(row["proxy_indentation"]),
#                 "mcCabe": float(row["mcCabe"]),
#                 "mcClure": float(row["mcClure"]),
#                 "nested_block_depth": float(row["nested_block_depth"]),
#                 "difficulty": float(row["difficulty"]),
#                 "maintainability_index": float(row["maintainability_index"]),
#                 "fan_out": float(row["fan_out"]),
#                 "readability": float(row["readability"]),
#                 "effort": float(row["effort"]),
#             }
#             f.write(json.dumps(obj) + "\n")

# os.makedirs("dataset_jsonl", exist_ok=True)
# save_df_as_jsonl_raw(train_df, "dataset_jsonl/train_scaled.jsonl")
# save_df_as_jsonl_raw(valid_df, "dataset_jsonl/valid_scaled.jsonl")
# save_df_as_jsonl_raw(test_df, "dataset_jsonl/test_scaled.jsonl")

# print("Saved JSONL files using raw features only.")


In [17]:
# === Step 3: Define Dataset class for loading JSONL ===
class BugSeverityDataset(Dataset):
    def __init__(self, file_path, tokenizer, block_size=512):
        self.examples = []
        with open(file_path, "r") as f:
            for line in f:
                js = json.loads(line.strip())
                self.examples.append(js)
        self.tokenizer = tokenizer
        self.block_size = block_size

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        js = self.examples[idx]

        # tokenize code
        code = js["code"]
        code_tokens = self.tokenizer.tokenize(code)[: self.block_size - 2]
        tokens = [self.tokenizer.cls_token] + code_tokens + [self.tokenizer.eos_token]
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        padding_length = self.block_size - len(input_ids)
        input_ids += [self.tokenizer.pad_token_id] * padding_length
        input_ids = torch.tensor(input_ids)

        # numeric features (now using full column names)
        num_features = torch.tensor([
            js["sloc"],
            js["proxy_indentation"],
            js["mcCabe"],
            js["mcClure"],
            js["nested_block_depth"],
            js["difficulty"],
            js["maintainability_index"],
            js["fan_out"],
            js["readability"],
            js["effort"]
        ], dtype=torch.float)

        label = torch.tensor(js["label"], dtype=torch.long)

        return input_ids, num_features, label
        # return input_ids, label


In [18]:
# import torch
# import torch.nn as nn
# from torch.utils.data import DataLoader
# from transformers import RobertaTokenizer, RobertaModel, RobertaConfig, get_linear_schedule_with_warmup
# from sklearn.metrics import classification_report, f1_score
# from tqdm import tqdm

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # === Model Definition ===
# class ConcatClsModel(nn.Module):
#     def __init__(self, encoder, config):
#         super().__init__()
#         self.encoder = encoder
#         self.config = config
#         self.dropout = nn.Dropout(config.hidden_dropout_prob)
#         self.out_layer = nn.Linear(config.hidden_size, config.num_labels)

#     def forward(self, input_ids, labels=None):
#         attention_mask = input_ids.ne(self.encoder.config.pad_token_id).long()
#         outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
#         cls_embeds = outputs.last_hidden_state[:, 0, :]  # CLS token
#         cls_embeds = self.dropout(cls_embeds)
#         logits = self.out_layer(cls_embeds)
#         probs = torch.softmax(logits, dim=-1)

#         if labels is not None:
#             loss_fct = nn.CrossEntropyLoss()
#             loss = loss_fct(logits, labels)
#             return loss, probs
#         else:
#             return probs

# # === Training and Evaluation functions ===
# def train_epoch(model, dataloader, optimizer, scheduler):
#     model.train()
#     total_loss = 0
#     for batch in tqdm(dataloader, desc="Training"):
#         input_ids, labels = [b.to(device) for b in batch]

#         optimizer.zero_grad()
#         loss, _ = model(input_ids, labels)

#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#         optimizer.step()
#         scheduler.step()
#         total_loss += loss.item()
#     return total_loss / len(dataloader)


# def evaluate(model, dataloader):
#     model.eval()
#     preds, labels_all = [], []
#     total_loss = 0
#     with torch.no_grad():
#         for batch in tqdm(dataloader, desc="Evaluating"):
#             input_ids, labels = [b.to(device) for b in batch]
#             loss, probs = model(input_ids, labels)
#             total_loss += loss.item()
#             preds.extend(torch.argmax(probs, dim=1).cpu().tolist())
#             labels_all.extend(labels.cpu().tolist())

#     avg_loss = total_loss / len(dataloader)
#     f1 = f1_score(labels_all, preds, average="macro")
#     print(classification_report(labels_all, preds, digits=4))
#     return avg_loss, f1


# # === Load tokenizer and datasets ===
# tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
# config = RobertaConfig.from_pretrained("microsoft/codebert-base")

# # Best Optuna parameters
# best_params = {
#     'lr': 4.818976027099782e-05,
#     'weight_decay': 0.00010045918919119982,
#     'warmup_ratio': 0.28606124459699783,
#     'dropout': 0.16316509043013103,
#     'epochs': 8,
# }

# config.num_labels = 4
# config.hidden_dropout_prob = best_params['dropout']

# train_dataset = BugSeverityDataset("dataset_jsonl/train_scaled.jsonl", tokenizer)
# valid_dataset = BugSeverityDataset("dataset_jsonl/valid_scaled.jsonl", tokenizer)

# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)
# valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False, num_workers=2)

# # === Instantiate model, optimizer, scheduler ===
# encoder = RobertaModel.from_pretrained("microsoft/codebert-base", config=config, add_pooling_layer=False)
# model = ConcatClsModel(encoder, config).to(device)

# optimizer = AdamW(
#     model.parameters(),
#     lr=best_params['lr'],
#     weight_decay=best_params['weight_decay'],
# )

# total_steps = len(train_loader) * best_params['epochs']
# scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=int(total_steps * best_params['warmup_ratio']),
#     num_training_steps=total_steps
# )

# # === Training loop ===
# best_f1 = 0
# for epoch in range(best_params['epochs']):
#     train_loss = train_epoch(model, train_loader, optimizer, scheduler)
#     val_loss, val_f1 = evaluate(model, valid_loader)
#     print(f"Epoch {epoch+1}/{best_params['epochs']} | Train Loss: {train_loss:.6f} | Val Loss: {val_loss:.6f} | Val F1: {val_f1:.6f}")

#     # Save checkpoint
#     torch.save(model.state_dict(), f"concatcls_model_epoch{epoch+1}.pt")

#     if val_f1 > best_f1:
#         best_f1 = val_f1
#         torch.save(model.state_dict(), "best_concatcls_model.pt")
#         print("Saved best model.")

# print(f"Training completed. Best validation F1: {best_f1:.6f}")


In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Model Definition ===
class ConcatClsModel(nn.Module):
    def __init__(self, encoder, config):
        super().__init__()
        self.encoder = encoder
        self.config = config
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_layer = nn.Linear(config.hidden_size + 10, config.num_labels)

    def forward(self, input_ids, num_features, labels=None):
        attention_mask = input_ids.ne(self.encoder.config.pad_token_id).long()
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_embeds = outputs.last_hidden_state[:, 0, :]
        concat = torch.cat((cls_embeds, num_features), dim=-1)
        logits = self.out_layer(concat)
        probs = torch.softmax(logits, dim=-1)

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)
            return loss, probs
        else:
            return probs

# === Training and Evaluation functions ===
def train_epoch(model, dataloader, optimizer, scheduler):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        input_ids, num_features, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        loss, _ = model(input_ids, num_features, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

from sklearn.metrics import classification_report

def evaluate(model, dataloader):
    model.eval()
    preds, labels_all = [], []
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids, num_features, labels = [b.to(device) for b in batch]
            loss, probs = model(input_ids, num_features, labels)
            total_loss += loss.item()
            preds.extend(torch.argmax(probs, dim=1).cpu().tolist())
            labels_all.extend(labels.cpu().tolist())
    
    avg_loss = total_loss / len(dataloader)
    f1 = f1_score(labels_all, preds, average="macro")
    print(classification_report(labels_all, preds, digits=4))
    return avg_loss, f1

# === Load tokenizer and datasets ===
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
config = RobertaConfig.from_pretrained("microsoft/codebert-base")

# Best Optuna parameters
best_params = {
    'lr': 4.818976027099782e-05,
    'weight_decay': 0.00010045918919119982,
    'warmup_ratio': 0.28606124459699783,
    'dropout': 0.16316509043013103,
    'epochs': 8,
}


config.num_labels = 4
config.hidden_dropout_prob = best_params['dropout']

train_dataset = BugSeverityDataset("dataset_jsonl/train_scaled.jsonl", tokenizer)
valid_dataset = BugSeverityDataset("dataset_jsonl/valid_scaled.jsonl", tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False, num_workers=2)

# === Instantiate model, optimizer, scheduler ===
encoder = RobertaModel.from_pretrained("microsoft/codebert-base", config=config, add_pooling_layer=False)
model = ConcatClsModel(encoder, config).to(device)

optimizer = AdamW(
    model.parameters(),
    lr=best_params['lr'],
    weight_decay=best_params['weight_decay'],
)

total_steps = len(train_loader) * best_params['epochs']
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(total_steps * best_params['warmup_ratio']),
    num_training_steps=total_steps
)

# === Training loop ===
best_f1 = 0
for epoch in range(best_params['epochs']):
    train_loss = train_epoch(model, train_loader, optimizer, scheduler)
    val_loss, val_f1 = evaluate(model, valid_loader)
    print(f"Epoch {epoch+1}/{best_params['epochs']} | Train Loss: {train_loss:.6f} | Val Loss: {val_loss:.6f} | Val F1: {val_f1:.6f}")

    # Save model checkpoint for every epoch
    torch.save(model.state_dict(), f"concatcls_model_epoch{epoch+1}.pt")

    # Optionally, still keep track of best model separately
    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(model.state_dict(), "best_concatcls_model.pt")
        print("Saved best model.")

print(f"Training completed. Best validation F1: {best_f1:.6f}")

print(f"Training completed. Best validation F1: {best_f1:.6f}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing RobertaModel: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]


Training:   0%|          | 0/147 [00:00<?, ?it/s][A
Training:   1%|          | 1/147 [00:02<05:52,  2.41s/it][A
Training:   1%|▏         | 2/147 [00:03<03:49,  1.58s/it][A
Training:   2%|▏         | 3/147 [00:04<03:10,  1.32s/it][A
Training:   3%|▎         | 4/147 [00:05<02:51,  1.20s/it][A
Training:   3%|▎         | 5/147 [00:06<02:39,  1.13s/it][A
Training:   4%|▍         | 6/147 [00:07<02:33,  1.09s/it][A
Training:   5%|▍         | 7/147 [00:08<02:28,  1.06s/it][A
Training:   5%|▌         | 8/147 [00:09<02:25,  1.04s/it][A
Training:   6%|▌         | 9/147 [00:10<02:22,  1.03s/it][A
Training:   7%|▋         | 10/147 [00:11<02:20,  1.02s/it][A
Training:   7%|▋         | 11/147 [00:12<02:18,  1.02s/it][A
Training:   8%|▊         | 12/147 [00:13<02:16,  1.01s/it][A
Training:   9%|▉         | 13/147 [00:14<02:15,  1.01s/it][A
Training:  10%|▉         | 14/147 [00:15<02:14,  1.01s/it][A
Training:  10%|█         | 15/147 [00:16<02:13,  1.01s/it][A
Training:  11%|█         

              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        41
           1     0.6582    1.0000    0.7939       312
           2     0.9615    0.5682    0.7143        44
           3     1.0000    0.0096    0.0190       104

    accuracy                         0.6747       501
   macro avg     0.6549    0.3944    0.3818       501
weighted avg     0.7019    0.6747    0.5611       501

Epoch 1/8 | Train Loss: 1.087892 | Val Loss: 0.885938 | Val F1: 0.381807
Saved best model.


Training: 100%|██████████| 147/147 [02:28<00:00,  1.01s/it]
Evaluating: 100%|██████████| 16/16 [00:08<00:00,  2.00it/s]


              precision    recall  f1-score   support

           0     1.0000    0.0488    0.0930        41
           1     0.7163    0.9712    0.8245       312
           2     0.8431    0.9773    0.9053        44
           3     0.7600    0.1827    0.2946       104

    accuracy                         0.7325       501
   macro avg     0.8299    0.5450    0.5293       501
weighted avg     0.7597    0.7325    0.6617       501

Epoch 2/8 | Train Loss: 0.834063 | Val Loss: 0.709003 | Val F1: 0.529337
Saved best model.


Training: 100%|██████████| 147/147 [02:28<00:00,  1.01s/it]
Evaluating: 100%|██████████| 16/16 [00:07<00:00,  2.00it/s]


              precision    recall  f1-score   support

           0     0.6667    0.0976    0.1702        41
           1     0.7284    0.9712    0.8324       312
           2     0.9545    0.9545    0.9545        44
           3     0.8000    0.2692    0.4029       104

    accuracy                         0.7525       501
   macro avg     0.7874    0.5731    0.5900       501
weighted avg     0.7581    0.7525    0.6998       501

Epoch 3/8 | Train Loss: 0.680948 | Val Loss: 0.637743 | Val F1: 0.590013
Saved best model.


Training: 100%|██████████| 147/147 [02:28<00:00,  1.01s/it]
Evaluating: 100%|██████████| 16/16 [00:08<00:00,  1.98it/s]


              precision    recall  f1-score   support

           0     0.4286    0.2195    0.2903        41
           1     0.7520    0.9038    0.8210       312
           2     0.9333    0.9545    0.9438        44
           3     0.6333    0.3654    0.4634       104

    accuracy                         0.7405       501
   macro avg     0.6868    0.6108    0.6296       501
weighted avg     0.7168    0.7405    0.7141       501

Epoch 4/8 | Train Loss: 0.494826 | Val Loss: 0.667660 | Val F1: 0.629630
Saved best model.


Training: 100%|██████████| 147/147 [02:28<00:00,  1.01s/it]
Evaluating: 100%|██████████| 16/16 [00:08<00:00,  1.99it/s]


              precision    recall  f1-score   support

           0     0.6000    0.2927    0.3934        41
           1     0.7752    0.8622    0.8164       312
           2     0.9333    0.9545    0.9438        44
           3     0.5506    0.4712    0.5078       104

    accuracy                         0.7425       501
   macro avg     0.7148    0.6451    0.6654       501
weighted avg     0.7281    0.7425    0.7289       501

Epoch 5/8 | Train Loss: 0.301629 | Val Loss: 0.819879 | Val F1: 0.665356
Saved best model.


Training: 100%|██████████| 147/147 [02:28<00:00,  1.01s/it]
Evaluating: 100%|██████████| 16/16 [00:08<00:00,  2.00it/s]


              precision    recall  f1-score   support

           0     0.5000    0.3171    0.3881        41
           1     0.7692    0.8654    0.8145       312
           2     0.9545    0.9545    0.9545        44
           3     0.5375    0.4135    0.4674       104

    accuracy                         0.7345       501
   macro avg     0.6903    0.6376    0.6561       501
weighted avg     0.7154    0.7345    0.7198       501

Epoch 6/8 | Train Loss: 0.184354 | Val Loss: 1.042675 | Val F1: 0.656119


Training: 100%|██████████| 147/147 [02:28<00:00,  1.01s/it]
Evaluating: 100%|██████████| 16/16 [00:08<00:00,  2.00it/s]


              precision    recall  f1-score   support

           0     0.5200    0.3171    0.3939        41
           1     0.7738    0.9103    0.8365       312
           2     0.9268    0.8636    0.8941        44
           3     0.6471    0.4231    0.5116       104

    accuracy                         0.7565       501
   macro avg     0.7169    0.6285    0.6591       501
weighted avg     0.7402    0.7565    0.7379       501

Epoch 7/8 | Train Loss: 0.100069 | Val Loss: 1.333911 | Val F1: 0.659052


Training: 100%|██████████| 147/147 [02:28<00:00,  1.01s/it]
Evaluating: 100%|██████████| 16/16 [00:07<00:00,  2.00it/s]


              precision    recall  f1-score   support

           0     0.5652    0.3171    0.4062        41
           1     0.7676    0.9103    0.8328       312
           2     0.9512    0.8864    0.9176        44
           3     0.6119    0.3942    0.4795       104

    accuracy                         0.7525       501
   macro avg     0.7240    0.6270    0.6591       501
weighted avg     0.7348    0.7525    0.7320       501

Epoch 8/8 | Train Loss: 0.067334 | Val Loss: 1.410469 | Val F1: 0.659068
Training completed. Best validation F1: 0.665356
Training completed. Best validation F1: 0.665356


In [20]:
import pandas as pd

# Load dataset
df_org = pd.read_csv("/kaggle/input/dataset/df_org.csv")

# Summary statistics
summary_stats = df_org.describe().transpose()

# Median, IQR, Skewness, Kurtosis for extra validation
extra_stats = pd.DataFrame({
    'median': df_org.median(numeric_only=True),
    'iqr': df_org.quantile(0.75, numeric_only=True) - df_org.quantile(0.25, numeric_only=True),
    'skewness': df_org.skew(numeric_only=True),
    'kurtosis': df_org.kurtosis(numeric_only=True)
})

# Combine
validation_stats = summary_stats.join(extra_stats)

# Show results
print("Summary + RobustScaler validation stats:")
print(validation_stats)


Summary + RobustScaler validation stats:
                               count          mean           std        min  \
Unnamed: 0                    3342.0  1.670500e+03    964.896627   0.000000   
label                         3342.0  1.420108e+00      0.907610   0.000000   
sloc                          3342.0  2.355057e+01     32.646112   1.000000   
proxy_indentation             3342.0  3.303695e+00      1.474554   0.000000   
mcCabe                        3342.0  8.300718e+00     12.111707   1.000000   
nested_block_depth            3342.0  5.902753e+00      7.764134   0.000000   
mcClure                       3342.0  1.433782e+01     21.930488   0.000000   
mcClure_NVAR                  3342.0  9.304608e+00     15.759168   0.000000   
mcClure_NCOMP                 3342.0  5.033214e+00      8.657460   0.000000   
difficulty                    3342.0  1.577750e+01      5.089187   8.500000   
effort                        3342.0  2.054473e+04  42361.734837  35.440000   
maintainabi

In [21]:
test_dataset = BugSeverityDataset("dataset_jsonl/test_scaled.jsonl", tokenizer)
train_dataset = BugSeverityDataset("dataset_jsonl/train_scaled.jsonl", tokenizer)
valid_dataset = BugSeverityDataset("dataset_jsonl/valid_scaled.jsonl", tokenizer)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)

In [22]:
import torch
import numpy as np
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, matthews_corrcoef, confusion_matrix
)

def evaluate_model_torch(model, dataloader, device, model_name="ConcatClsModel"):
    model.eval()
    all_labels = []
    all_preds = []
    all_probs = []

    total_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f"Evaluating {model_name}"):
            input_ids, num_features, labels = [b.to(device) for b in batch]
            loss, probs = model(input_ids, num_features, labels)
            total_loss += loss.item()

            all_labels.append(labels.cpu())
            all_preds.append(torch.argmax(probs, dim=1).cpu())
            all_probs.append(probs.cpu())

    # Concatenate all batches
    y_true = torch.cat(all_labels).numpy()
    y_pred = torch.cat(all_preds).numpy()
    y_proba = torch.cat(all_probs).numpy()

    # Compute all metrics (same as your original function)
    accuracy = accuracy_score(y_true, y_pred)
    precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)

    precision_weighted = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall_weighted = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    try:
        roc_auc_macro = roc_auc_score(y_true, y_proba, multi_class='ovr', average='macro')
        roc_auc_weighted = roc_auc_score(y_true, y_proba, multi_class='ovr', average='weighted')
    except Exception as e:
        print(f"Warning: ROC-AUC calculation failed: {e}")
        roc_auc_macro = np.nan
        roc_auc_weighted = np.nan

    mcc = matthews_corrcoef(y_true, y_pred)

    cm = confusion_matrix(y_true, y_pred)
    sensitivity = np.diag(cm) / np.maximum(np.sum(cm, axis=1), 1)
    gmean = np.prod(sensitivity[sensitivity > 0]) ** (1.0 / len(sensitivity)) if np.any(sensitivity > 0) else 0

    metrics = {
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision_macro": precision_macro,
        "Recall_macro": recall_macro,
        "F1_macro": f1_macro,
        "Precision_weighted": precision_weighted,
        "Recall_weighted": recall_weighted,
        "F1_weighted": f1_weighted,
        "ROC-AUC_macro": roc_auc_macro,
        "ROC-AUC_weighted": roc_auc_weighted,
        "MCC": mcc,
        "G-Mean": gmean
    }

    print(f"{model_name} evaluation completed.")
    return metrics


In [23]:
evaluate_model_torch(model, test_loader, device, model_name="ConcatClsModel")

Evaluating ConcatClsModel: 100%|██████████| 16/16 [00:08<00:00,  1.99it/s]

ConcatClsModel evaluation completed.





{'Model': 'ConcatClsModel',
 'Accuracy': 0.7589641434262948,
 'Precision_macro': 0.7482733544703883,
 'Recall_macro': 0.6634851915597845,
 'F1_macro': 0.688389092999288,
 'Precision_weighted': 0.750595716773379,
 'Recall_weighted': 0.7589641434262948,
 'F1_weighted': 0.7459478380710991,
 'ROC-AUC_macro': 0.8684090234492745,
 'ROC-AUC_weighted': 0.8300869356228315,
 'MCC': 0.5406277066561387,
 'G-Mean': 0.6251023334850003}

In [24]:
# === Step 11: Load best model and extract embeddings for XGBoost ===
model.load_state_dict(torch.load("/kaggle/working/best_concatcls_model.pt"))
model.eval()

def extract_embeddings(model, dataset):
    embeddings = []
    labels = []
    model.eval()
    with torch.no_grad():
        for input_ids, num_features, label in tqdm(dataset, desc="Extracting embeddings"):
            input_ids = input_ids.unsqueeze(0).to(device)
            num_features = num_features.unsqueeze(0).to(device)

            # Forward pass through encoder
            outputs = model.encoder(input_ids=input_ids, attention_mask=input_ids.ne(tokenizer.pad_token_id).long())
            cls_embeds = outputs.last_hidden_state[:, 0, :].cpu().numpy().squeeze()  # (768,)

            num_features_np = num_features.cpu().numpy().flatten()  # (10,)
            combined = np.concatenate([cls_embeds, num_features_np])  # (778,)

            embeddings.append(combined)
            labels.append(label.item())

    return np.array(embeddings), np.array(labels)

print("Extracting train embeddings...")
X_train, y_train = extract_embeddings(model, train_dataset)
print("Extracting valid embeddings...")
X_valid, y_valid = extract_embeddings(model, valid_dataset)
print("Extracting test embeddings...")
X_test, y_test = extract_embeddings(model, test_dataset)

# Combine train + valid embeddings and labels for training
X_trainval = np.vstack([X_train, X_valid]) 
y_trainval = np.concatenate([y_train, y_valid])


Extracting train embeddings...


Extracting embeddings: 100%|██████████| 2339/2339 [00:49<00:00, 46.99it/s]


Extracting valid embeddings...


Extracting embeddings: 100%|██████████| 501/501 [00:10<00:00, 47.52it/s]


Extracting test embeddings...


Extracting embeddings: 100%|██████████| 502/502 [00:10<00:00, 47.32it/s]


In [32]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, matthews_corrcoef, confusion_matrix
)
import pickle
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


def evaluate_model(y_true, y_pred, y_proba, model_name="XGBoost"):
    """Evaluate model performance with multiple metrics"""
    accuracy = accuracy_score(y_true, y_pred)
    precision_macro = precision_score(y_true, y_pred, average="macro", zero_division=0)
    recall_macro = recall_score(y_true, y_pred, average="macro", zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average="macro", zero_division=0)

    precision_weighted = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    recall_weighted = recall_score(y_true, y_pred, average="weighted", zero_division=0)
    f1_weighted = f1_score(y_true, y_pred, average="weighted", zero_division=0)

    try:
        roc_auc_macro = roc_auc_score(y_true, y_proba, multi_class="ovr", average="macro")
        roc_auc_weighted = roc_auc_score(y_true, y_proba, multi_class="ovr", average="weighted")
    except Exception as e:
        logger.warning(f"ROC-AUC calculation failed for {model_name}: {e}")
        roc_auc_macro, roc_auc_weighted = np.nan, np.nan

    mcc = matthews_corrcoef(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    sensitivity = np.diag(cm) / np.maximum(np.sum(cm, axis=1), 1)
    gmean = np.prod(sensitivity[sensitivity > 0]) ** (1.0 / len(sensitivity)) if np.any(sensitivity > 0) else 0

    return {
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision_macro": precision_macro,
        "Recall_macro": recall_macro,
        "F1_macro": f1_macro,
        "Precision_weighted": precision_weighted,
        "Recall_weighted": recall_weighted,
        "F1_weighted": f1_weighted,
        "ROC-AUC_macro": roc_auc_macro,
        "ROC-AUC_weighted": roc_auc_weighted,
        "MCC": mcc,
        "G-Mean": gmean,
    }


def train_xgboost(X_train, y_train, X_test, y_test):
    """Train and evaluate XGBoost model"""
    logger.info("Training XGBoost...")

    params_xgb = {
        "objective": "multi:softprob",
        "num_class": len(np.unique(y_train)),
        "eval_metric": "mlogloss",
        "max_depth": 9,
        "eta": 0.24627429143007107,
        "subsample": 0.45321841598276075,
        "colsample_bytree": 0.7227038914198726,
        "lambda": 0.06640744768945579,
        "alpha": 0.21504472646446163,
        "tree_method": "gpu_hist",
        "predictor": "gpu_predictor",
        "verbosity": 1,
        "use_label_encoder": False,
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    bst = xgb.train(params_xgb, dtrain, num_boost_round=1200)

    # Predictions
    y_proba = bst.predict(dtest)
    y_pred = np.argmax(y_proba, axis=1)

    # Evaluate
    results = evaluate_model(y_test, y_pred, y_proba)

    # Save predictions
    with open("xgboost_predictions.pkl", "wb") as f:
        pickle.dump({"y_pred": y_pred, "y_proba": y_proba}, f)

    return results, y_pred, y_proba


def main():
    # Load/define data before running
    # X_train, y_train, X_test, y_test = load_your_data()

    results, y_pred, y_proba = train_xgboost(X_train, y_train, X_test, y_test)
    df_results = pd.DataFrame([results])
    print("\nXGBoost Performance Results:")
    print(df_results.round(4))
    return df_results


if __name__ == "__main__":
    try:
        results_df = main()
        logger.info("Training and evaluation completed successfully")
    except Exception as e:
        logger.error(f"Execution failed: {e}")
        raise



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.




XGBoost Performance Results:
     Model  Accuracy  Precision_macro  Recall_macro  F1_macro  \
0  XGBoost     0.753           0.7371        0.6722     0.696   

   Precision_weighted  Recall_weighted  F1_weighted  ROC-AUC_macro  \
0              0.7414            0.753       0.7416         0.8425   

   ROC-AUC_weighted     MCC  G-Mean  
0            0.8051  0.5306  0.6391  



    E.g. tree_method = "hist", device = "cuda"



In [25]:
# from transformers import RobertaTokenizer, RobertaModel
# import torch
# import numpy as np
# from tqdm import tqdm

# # Load raw CodeBERT (without ConcatCls finetuning)
# tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
# codebert = RobertaModel.from_pretrained("microsoft/codebert-base")
# codebert.eval()
# codebert.to(device)

# def extract_raw_codebert_with_features(model, dataset):
#     embeddings = []
#     labels = []
#     model.eval()
#     with torch.no_grad():
#         for input_ids, num_features, label in tqdm(dataset, desc="Extracting raw CodeBERT + features"):
#             input_ids = input_ids.unsqueeze(0).to(device)
#             num_features = num_features.unsqueeze(0).to(device)

#             # CodeBERT forward pass
#             outputs = model(input_ids=input_ids, attention_mask=input_ids.ne(tokenizer.pad_token_id).long())
#             cls_embeds = outputs.last_hidden_state[:, 0, :].cpu().numpy().squeeze()  # (768,)

#             # Handcrafted features
#             num_features_np = num_features.cpu().numpy().flatten()  # (10,)

#             # Concatenate
#             combined = np.concatenate([cls_embeds, num_features_np])  # (768 + 10 = 778,)
#             embeddings.append(combined)
#             labels.append(label.item())

#     return np.array(embeddings), np.array(labels)

# print("Extracting train embeddings (raw CodeBERT + num features)...")
# X_train, y_train = extract_raw_codebert_with_features(codebert, train_dataset)
# print("Extracting valid embeddings (raw CodeBERT + num features)...")
# X_valid, y_valid = extract_raw_codebert_with_features(codebert, valid_dataset)
# print("Extracting test embeddings (raw CodeBERT + num features)...")
# X_test, y_test = extract_raw_codebert_with_features(codebert, test_dataset)

# # Combine train + valid for XGBoost training
# X_trainval = np.vstack([X_train, X_valid])
# y_trainval = np.concatenate([y_train, y_valid])


In [26]:
len(X_trainval[0])

778

In [27]:
import pickle

# Save embeddings and labels
with open("raw_embeddings.pkl", "wb") as f:
    pickle.dump({
        "train": {"X": X_train, "y": y_train},
        "valid": {"X": X_valid, "y": y_valid},
        "test": {"X": X_test, "y": y_test}
    }, f)

print("Embeddings saved to embeddings.pkl")


Embeddings saved to embeddings.pkl


In [28]:
# import optuna
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import make_scorer, f1_score, classification_report
# from sklearn.neighbors import KNeighborsClassifier

# def knn_objective(trial):
#     n_neighbors = trial.suggest_int("n_neighbors", 1, 50)
#     weights = trial.suggest_categorical("weights", ["uniform", "distance"])
#     p = trial.suggest_int("p", 1, 2)  # 1=Manhattan, 2=Euclidean

#     knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p)
    
#     # 5-fold CV on train+valid
#     score = cross_val_score(
#         knn, X_trainval, y_trainval, 
#         cv=5, scoring=make_scorer(f1_score, average='macro')
#     ).mean()
    
#     return score

# study = optuna.create_study(direction="maximize")
# study.optimize(knn_objective, n_trials=50)

# print("✅ Best KNN parameters:", study.best_params)
# print("✅ Best CV F1_macro (train+valid):", study.best_value)

# knn_best = KNeighborsClassifier(**study.best_params)
# knn_best.fit(X_trainval, y_trainval)

# y_pred_val = knn_best.predict(X_valid)
# y_proba_val = knn_best.predict_proba(X_valid)

# f1_val_macro = f1_score(y_valid, y_pred_val, average='macro')
# print("=== KNN Validation Set Classification Report ===")
# print(classification_report(y_valid, y_pred_val, digits=4))
# print(f"F1 Macro on validation set: {f1_val_macro:.4f}")

# y_pred_test = knn_best.predict(X_test)
# y_proba_test = knn_best.predict_proba(X_test)

# f1_test_macro = f1_score(y_test, y_pred_test, average='macro')
# print("=== KNN Test Set Classification Report ===")
# print(classification_report(y_test, y_pred_test, digits=4))
# print(f"F1 Macro on test set: {f1_test_macro:.4f}")

# import pickle
# with open("knn_predictions.pkl", "wb") as f:
#     pickle.dump({
#         "validation": {"y_pred": y_pred_val, "y_proba": y_proba_val},
#         "test": {"y_pred": y_pred_test, "y_proba": y_proba_test},
#         "best_params": study.best_params
#     }, f)

# print("✅ KNN predictions and best parameters saved to 'knn_predictions.pkl'")


In [29]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import classification_report, accuracy_score, f1_score

# best_rf_params = {
#     'n_estimators': 335,
#     'max_depth': 7,
#     'min_samples_split': 15,
#     'min_samples_leaf': 7,
#     'max_features': 'sqrt'
# }

# rf_best = RandomForestClassifier(**best_rf_params, n_jobs=-1, random_state=42)
# rf_best.fit(X_trainval, y_trainval)

# y_pred_rf = rf_best.predict(X_test)

# print("=== Random Forest Test Classification Report ===")
# print(classification_report(y_test, y_pred_rf, digits=4))

# acc = accuracy_score(y_test, y_pred_rf)
# f1_w = f1_score(y_test, y_pred_rf, average="weighted")
# print(f"Accuracy: {acc:.4f}, Weighted F1: {f1_w:.4f}")


In [30]:
# import optuna
# import xgboost as xgb
# from sklearn.metrics import f1_score
# import numpy as np

# # Create DMatrix for XGBoost
# dtrain = xgb.DMatrix(X_train, label=y_train)
# dvalid = xgb.DMatrix(X_valid, label=y_valid)

# def objective_xgb(trial):
#     params = {
#         "objective": "multi:softprob",
#         "num_class": len(np.unique(y_trainval)),
#         "eval_metric": "mlogloss",
#         "tree_method": "gpu_hist",
#         "predictor": "gpu_predictor",
#         "verbosity": 0,
#         "max_depth": trial.suggest_int("max_depth", 3, 12),
#         "eta": trial.suggest_float("eta", 1e-3, 0.3, log=True),
#         "subsample": trial.suggest_float("subsample", 0.4, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
#         "lambda": trial.suggest_float("lambda", 1e-3, 10.0, log=True),
#         "alpha": trial.suggest_float("alpha", 1e-3, 10.0, log=True)
#     }
    
#     bst = xgb.train(
#         params,
#         dtrain,
#         num_boost_round=1000,
#         evals=[(dvalid, "valid")],
#         early_stopping_rounds=50,
#         verbose_eval=False
#     )
    
#     y_pred_prob = bst.predict(dvalid)
#     y_pred = np.argmax(y_pred_prob, axis=1)
    
#     # Weighted F1 as objective
#     return f1_score(y_valid, y_pred, average="weighted")

# # Run Optuna study
# study_xgb = optuna.create_study(direction="maximize")
# study_xgb.optimize(objective_xgb, n_trials=50, show_progress_bar=True)

# # Best params
# best_xgb_params = study_xgb.best_params
# print("Best XGBoost params:", best_xgb_params)

# # Train final XGBoost on train+valid
# dtrainval = xgb.DMatrix(X_trainval, label=y_trainval)
# dtest = xgb.DMatrix(X_test, label=y_test)

# bst_final = xgb.train(
#     {**best_xgb_params, "objective": "multi:softprob", "num_class": len(np.unique(y_trainval)),
#      "tree_method": "gpu_hist", "predictor": "gpu_predictor", "verbosity": 1},
#     dtrainval,
#     num_boost_round=1000,
#     evals=[(dtrainval, "train")],
#     verbose_eval=False
# )

# y_pred_prob_test = bst_final.predict(dtest)
# y_pred_test = np.argmax(y_pred_prob_test, axis=1)

# from sklearn.metrics import classification_report
# print("=== XGBoost Test Classification Report ===")
# print(classification_report(y_test, y_pred_test, digits=4))


In [31]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, matthews_corrcoef, confusion_matrix
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.utils.class_weight import compute_class_weight
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from tabpfn import TabPFNClassifier
from tqdm import tqdm
import pickle
from typing import Dict, List, Tuple, Any
import logging
from dataclasses import dataclass

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

@dataclass
class ModelConfig:
    """Configuration class for model parameters"""
    name: str
    model: Any
    params: Dict[str, Any] = None

@dataclass
class PredictionResult:
    """Container for model predictions"""
    y_pred: np.ndarray
    y_proba: np.ndarray

class ModelEvaluator:
    """Handles model evaluation with consistent metrics"""
    
    @staticmethod
    def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray, 
                      y_proba: np.ndarray, model_name: str) -> Dict[str, float]:
        """Evaluate model performance with multiple metrics"""
        try:
            accuracy = accuracy_score(y_true, y_pred)
            precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
            recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)
            f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
            
            precision_weighted = precision_score(y_true, y_pred, average='weighted', zero_division=0)
            recall_weighted = recall_score(y_true, y_pred, average='weighted', zero_division=0)
            f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
            
            try:
                roc_auc_macro = roc_auc_score(y_true, y_proba, multi_class='ovr', average='macro')
                roc_auc_weighted = roc_auc_score(y_true, y_proba, multi_class='ovr', average='weighted')
            except Exception as e:
                logger.warning(f"ROC-AUC calculation failed for {model_name}: {e}")
                roc_auc_macro = np.nan
                roc_auc_weighted = np.nan
            
            mcc = matthews_corrcoef(y_true, y_pred)
            
            cm = confusion_matrix(y_true, y_pred)
            sensitivity = np.diag(cm) / np.maximum(np.sum(cm, axis=1), 1)  # Avoid division by zero
            gmean = np.prod(sensitivity[sensitivity > 0]) ** (1.0 / len(sensitivity)) if np.any(sensitivity > 0) else 0

            return {
                "Model": model_name,
                "Accuracy": accuracy,
                "Precision_macro": precision_macro,
                "Recall_macro": recall_macro,
                "F1_macro": f1_macro,
                "Precision_weighted": precision_weighted,
                "Recall_weighted": recall_weighted,
                "F1_weighted": f1_weighted,
                "ROC-AUC_macro": roc_auc_macro,
                "ROC-AUC_weighted": roc_auc_weighted,
                "MCC": mcc,
                "G-Mean": gmean
            }
            
        except Exception as e:
            logger.error(f"Error evaluating model {model_name}: {e}")
            raise

class ModelTrainer:
    """Handles model training and prediction"""
    
    def __init__(self, X_trainval: np.ndarray, y_trainval: np.ndarray, 
                 X_test: np.ndarray, y_test: np.ndarray):
        self.X_trainval = X_trainval
        self.y_trainval = y_trainval
        self.X_test = X_test
        self.y_test = y_test
        self.n_classes = len(np.unique(y_trainval))
        
    def train_classical_models(self, model_configs: List[ModelConfig]) -> Tuple[List[Dict], Dict]:
        """Train classical sklearn models"""
        results = []
        test_predictions = {}
        # train_predictions = {}  # Commented out to minimize cost
        
        for config in tqdm(model_configs, desc="Training classical models"):
            try:
                logger.info(f"Training {config.name}")
                model = config.model.set_params(**config.params) if config.params else config.model
                model.fit(self.X_trainval, self.y_trainval)
                
                # Test predictions only (no train predictions to minimize cost)
                y_pred_test = model.predict(self.X_test)
                y_proba_test = model.predict_proba(self.X_test)
                test_predictions[config.name] = PredictionResult(y_pred_test, y_proba_test)
                
                # # Train predictions - COMMENTED OUT TO MINIMIZE COST
                # y_pred_train = model.predict(self.X_trainval)
                # y_proba_train = model.predict_proba(self.X_trainval)
                # train_predictions[config.name] = PredictionResult(y_pred_train, y_proba_train)
                
                # Evaluate
                result = ModelEvaluator.evaluate_model(
                    self.y_test, y_pred_test, y_proba_test, config.name
                )
                results.append(result)
                
            except Exception as e:
                logger.error(f"Error training {config.name}: {e}")
                continue
                
        return results, test_predictions  # Removed train_predictions from return
    
    def train_xgboost(self) -> Tuple[Dict, PredictionResult]:
        """Train XGBoost model"""
        logger.info("Training XGBoost...")
        
        params_xgb = {
            "objective": "multi:softprob",
            "num_class": self.n_classes,
            "eval_metric": "mlogloss",
            "max_depth": 9,
            "eta": 0.24627429143007107,
            "subsample": 0.45321841598276075,
            "colsample_bytree": 0.7227038914198726,
            "lambda": 0.06640744768945579,
            "alpha": 0.21504472646446163,
            "tree_method": "gpu_hist",
            "predictor": "gpu_predictor",
            "verbosity": 1,
            "use_label_encoder": False
        }
        
        dtrain = xgb.DMatrix(self.X_trainval, label=self.y_trainval)
        dtest = xgb.DMatrix(self.X_test, label=self.y_test)
        
        bst = xgb.train(params_xgb, dtrain, num_boost_round=1200)
        
        # Test predictions only
        y_proba_test = bst.predict(dtest)
        y_pred_test = np.argmax(y_proba_test, axis=1)
        test_pred = PredictionResult(y_pred_test, y_proba_test)
        
        # # Train predictions - COMMENTED OUT TO MINIMIZE COST
        # y_proba_train = bst.predict(xgb.DMatrix(self.X_trainval))
        # y_pred_train = np.argmax(y_proba_train, axis=1)
        # train_pred = PredictionResult(y_pred_train, y_proba_train)
        
        # Evaluate
        result = ModelEvaluator.evaluate_model(
            self.y_test, y_pred_test, y_proba_test, "XGBoost"
        )
        
        return result, test_pred  # Removed train_pred from return
    
    def train_lightgbm(self) -> Tuple[Dict, PredictionResult]:
        """Train LightGBM model"""
        logger.info("Training LightGBM...")
        
        params_lgb = {
            "objective": "multiclass",
            "num_class": self.n_classes,
            "metric": "multi_logloss",
            "learning_rate": 0.085,
            "max_depth": 7,
            "verbosity": -1
        }
        
        train_data = lgb.Dataset(self.X_trainval, label=self.y_trainval)
        gbm = lgb.train(params_lgb, train_data, num_boost_round=1200)
        
        # Test predictions only
        y_proba_test = gbm.predict(self.X_test)
        y_pred_test = np.argmax(y_proba_test, axis=1)
        test_pred = PredictionResult(y_pred_test, y_proba_test)
        
        # # Train predictions - COMMENTED OUT TO MINIMIZE COST
        # y_proba_train = gbm.predict(self.X_trainval)
        # y_pred_train = np.argmax(y_proba_train, axis=1)
        # train_pred = PredictionResult(y_pred_train, y_proba_train)
        
        # Evaluate
        result = ModelEvaluator.evaluate_model(
            self.y_test, y_pred_test, y_proba_test, "LightGBM"
        )
        
        return result, test_pred  # Removed train_pred from return
    
    def train_catboost(self) -> Tuple[Dict, PredictionResult]:
        """Train CatBoost model"""
        logger.info("Training CatBoost...")
        
        classes = np.unique(self.y_trainval)
        class_weights = compute_class_weight(
            class_weight="balanced", classes=classes, y=self.y_trainval
        ).tolist()
        
        params_cat = {
            "learning_rate": 0.093,
            "depth": 7,
            "l2_leaf_reg": 7.07,
            "iterations": 829,
            "task_type": "GPU",
            "verbose": 100,
            "class_weights": class_weights
        }
        
        cat_model = cb.CatBoostClassifier(**params_cat)
        cat_model.fit(self.X_trainval, self.y_trainval)
        
        # Test predictions only
        y_proba_test = cat_model.predict_proba(self.X_test)
        y_pred_test = cat_model.predict(self.X_test)
        test_pred = PredictionResult(y_pred_test, y_proba_test)
        
        # # Train predictions - COMMENTED OUT TO MINIMIZE COST
        # y_proba_train = cat_model.predict_proba(self.X_trainval)
        # y_pred_train = cat_model.predict(self.X_trainval)
        # train_pred = PredictionResult(y_pred_train, y_proba_train)
        
        # Evaluate
        result = ModelEvaluator.evaluate_model(
            self.y_test, y_pred_test, y_proba_test, "CatBoost"
        )
        
        return result, test_pred  # Removed train_pred from return
    
    def train_tabpfn(self) -> Tuple[Dict, PredictionResult]:
        """Train TabPFN model"""
        logger.info("Training TabPFN...")
        
        tabpfn = TabPFNClassifier(ignore_pretraining_limits=True)
        tabpfn.fit(self.X_trainval, self.y_trainval)
        
        # Test predictions only
        y_proba_test = tabpfn.predict_proba(self.X_test)
        y_pred_test = tabpfn.predict(self.X_test)
        test_pred = PredictionResult(y_pred_test, y_proba_test)
        
        # # Train predictions - COMMENTED OUT TO MINIMIZE COST
        # y_proba_train = tabpfn.predict_proba(self.X_trainval)
        # y_pred_train = tabpfn.predict(self.X_trainval)
        # train_pred = PredictionResult(y_pred_train, y_proba_train)
        
        # Evaluate
        result = ModelEvaluator.evaluate_model(
            self.y_test, y_pred_test, y_proba_test, "TabPFN"
        )
        
        return result, test_pred  # Removed train_pred from return

class PredictionSaver:
    """Handles saving and loading predictions"""
    
    @staticmethod
    def save_predictions(predictions: Dict[str, PredictionResult], filename: str):
        """Save predictions to file"""
        try:
            with open(filename, "wb") as f:
                pickle.dump(predictions, f)
            logger.info(f"Predictions saved to {filename}")
        except Exception as e:
            logger.error(f"Error saving predictions to {filename}: {e}")
            raise

def main():
    """Main execution function"""
    # Initialize data (assuming X_trainval, y_trainval, X_test, y_test are defined)
    # X_trainval, y_trainval, X_test, y_test = load_your_data()
    
    trainer = ModelTrainer(X_trainval, y_trainval, X_test, y_test)
    
    # Define classical models
    classical_models = [
        ModelConfig("KNN", KNeighborsClassifier(), 
                   {'n_neighbors': 3, 'weights': 'distance', 'p': 1}),
        ModelConfig("SVM", SVC(probability=True, random_state=42), 
                   {"C": 2.5, "kernel": "rbf", "gamma": "scale"}),
        ModelConfig("Naive Bayes", GaussianNB(), 
                   {"var_smoothing": 1e-8}),
        ModelConfig("Decision Tree", DecisionTreeClassifier(random_state=42), 
                   {"max_depth": 14, "min_samples_split": 4}),
        ModelConfig("RandomForest", RandomForestClassifier(random_state=42, n_jobs=-1), 
                   {"n_estimators": 600, "max_depth": 18, "min_samples_split": 3, 
                    "class_weight": "balanced_subsample"}),
        ModelConfig("AdaBoost", AdaBoostClassifier(random_state=42), 
                   {"n_estimators": 500, "learning_rate": 0.85})
    ]
    
    # Train all models
    results = []
    all_test_predictions = {}
    # all_train_predictions = {}  # Commented out to minimize cost
    
    # Classical models
    classical_results, test_preds = trainer.train_classical_models(classical_models)
    results.extend(classical_results)
    all_test_predictions.update(test_preds)
    # all_train_predictions.update(train_preds)  # Commented out
    
    # Gradient boosting models
    for model_name, train_func in [
        ("XGBoost", trainer.train_xgboost),
        ("LightGBM", trainer.train_lightgbm),
        ("CatBoost", trainer.train_catboost),
        ("TabPFN", trainer.train_tabpfn)
    ]:
        try:
            result, test_pred = train_func()
            results.append(result)
            all_test_predictions[model_name] = test_pred
            # all_train_predictions[model_name] = train_pred  # Commented out
        except Exception as e:
            logger.error(f"Error training {model_name}: {e}")
            continue
    
    # Create results DataFrame
    df_results = pd.DataFrame(results)
    print("\nModel Performance Results:")
    print(df_results.round(4))
    
    # Save only test predictions
    PredictionSaver.save_predictions(all_test_predictions, "all_model_predictions.pkl")
    # PredictionSaver.save_predictions(all_train_predictions, "all_train_predictions.pkl")  # Commented out
    
    return df_results, all_test_predictions  # Removed all_train_predictions from return

if __name__ == "__main__":
    try:
        results_df, test_preds = main()  # Removed train_preds from return
        logger.info("Training and evaluation completed successfully")
    except Exception as e:
        logger.error(f"Main execution failed: {e}")
        raise

Training classical models:  67%|██████▋   | 4/6 [00:21<00:10,  5.37s/it]


KeyboardInterrupt: 

In [None]:
# import numpy as np
# import pandas as pd
# from sklearn.metrics import (
#     accuracy_score, precision_score, recall_score, f1_score,
#     roc_auc_score, matthews_corrcoef, confusion_matrix
# )
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
# from sklearn.naive_bayes import GaussianNB
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# from sklearn.utils.class_weight import compute_class_weight
# from scipy.stats import wilcoxon
# import xgboost as xgb
# import lightgbm as lgb
# import catboost as cb
# from tabpfn import TabPFNClassifier
# from tqdm import tqdm
# import pickle
# from typing import Dict, List, Tuple, Any, Union
# import logging
# from dataclasses import dataclass
# import itertools
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Configure logging
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# logger = logging.getLogger(__name__)

In [None]:
# def main():
#     """Main execution function - modified for Wilcoxon analysis"""
#     # Initialize data
#     # X_trainval, y_trainval, X_test, y_test = load_your_data()
    
#     trainer = ModelTrainer(X_trainval, y_trainval, X_test, y_test)
    
#     # Define classical models
#     classical_models = [
#         ModelConfig("KNN", KNeighborsClassifier(), 
#                    {'n_neighbors': 3, 'weights': 'distance', 'p': 1}),
#         ModelConfig("SVM", SVC(probability=True, random_state=42), 
#                    {"C": 2.5, "kernel": "rbf", "gamma": "scale"}),
#         ModelConfig("Naive Bayes", GaussianNB(), 
#                    {"var_smoothing": 1e-8}),
#         ModelConfig("Decision Tree", DecisionTreeClassifier(random_state=42), 
#                    {"max_depth": 14, "min_samples_split": 4}),
#         ModelConfig("RandomForest", RandomForestClassifier(random_state=42, n_jobs=-1), 
#                    {"n_estimators": 600, "max_depth": 18, "min_samples_split": 3, 
#                     "class_weight": "balanced_subsample"}),
#         ModelConfig("AdaBoost", AdaBoostClassifier(random_state=42), 
#                    {"n_estimators": 500, "learning_rate": 0.85})
#     ]
    
#     # Train all models
#     results = []
#     all_test_predictions = {}
#     all_true_labels = {}  # Store true labels for each model
    
#     # Store true test labels
#     all_true_labels['test'] = trainer.y_test
    
#     # Classical models
#     classical_results, test_preds = trainer.train_classical_models(classical_models)
#     results.extend(classical_results)
#     all_test_predictions.update(test_preds)
    
#     # Gradient boosting models
#     for model_name, train_func in [
#         ("XGBoost", trainer.train_xgboost),
#         ("LightGBM", trainer.train_lightgbm),
#         ("CatBoost", trainer.train_catboost),
#         ("TabPFN", trainer.train_tabpfn)
#     ]:
#         try:
#             result, test_pred = train_func()
#             results.append(result)
#             all_test_predictions[model_name] = test_pred
#         except Exception as e:
#             logger.error(f"Error training {model_name}: {e}")
#             continue
    
#     # Create results DataFrame
#     df_results = pd.DataFrame(results)
#     print("\nModel Performance Results:")
#     print(df_results.round(4))
    
#     # Save predictions and true labels for Wilcoxon analysis
#     wilcoxon_data = {
#         'predictions': all_test_predictions,
#         'true_labels': all_true_labels,
#         'results_df': df_results
#     }
    
#     PredictionSaver.save_predictions(wilcoxon_data, "wilcoxon_analysis_data.pkl")
    
#     return df_results, wilcoxon_data

In [None]:
# from scipy.stats import wilcoxon
# import itertools

# class WilcoxonAnalyzer:
#     """Performs Wilcoxon signed-rank tests on model predictions"""
    
#     def __init__(self, wilcoxon_data: Dict):
#         self.predictions = wilcoxon_data['predictions']
#         self.true_labels = wilcoxon_data['true_labels']['test']
#         self.results_df = wilcoxon_data['results_df']
#         self.model_names = list(self.predictions.keys())
    
#     def calculate_accuracy_per_sample(self, y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
#         """Calculate accuracy for each sample (1 if correct, 0 if wrong)"""
#         return (y_true == y_pred).astype(int)
    
#     def perform_wilcoxon_tests(self, metric: str = 'accuracy') -> pd.DataFrame:
#         """Perform Wilcoxon signed-rank tests between all model pairs"""
        
#         # Get sample-wise metrics for each model
#         sample_metrics = {}
#         for model_name, pred_result in self.predictions.items():
#             if metric == 'accuracy':
#                 sample_metrics[model_name] = self.calculate_accuracy_per_sample(
#                     self.true_labels, pred_result.y_pred
#                 )
#             elif metric == 'probability':
#                 # Use maximum probability as confidence measure
#                 sample_metrics[model_name] = np.max(pred_result.y_proba, axis=1)
#             else:
#                 raise ValueError("Metric must be 'accuracy' or 'probability'")
        
#         # Perform pairwise Wilcoxon tests
#         wilcoxon_results = []
#         model_pairs = list(itertools.combinations(self.model_names, 2))
        
#         for model1, model2 in tqdm(model_pairs, desc="Performing Wilcoxon tests"):
#             try:
#                 # Get sample metrics for both models
#                 metrics1 = sample_metrics[model1]
#                 metrics2 = sample_metrics[model2]
                
#                 # Perform Wilcoxon signed-rank test
#                 stat, p_value = wilcoxon(metrics1, metrics2, zero_method='pratt')
                
#                 # Calculate mean difference
#                 mean_diff = np.mean(metrics1 - metrics2)
                
#                 wilcoxon_results.append({
#                     'Model1': model1,
#                     'Model2': model2,
#                     'Wilcoxon_Statistic': stat,
#                     'P_Value': p_value,
#                     'Mean_Difference': mean_diff,
#                     'Significant_0.05': p_value < 0.05,
#                     'Significant_0.01': p_value < 0.01
#                 })
                
#             except Exception as e:
#                 logger.error(f"Error in Wilcoxon test for {model1} vs {model2}: {e}")
#                 continue
        
#         return pd.DataFrame(wilcoxon_results)
    
#     def create_significance_matrix(self, wilcoxon_df: pd.DataFrame, alpha: float = 0.05) -> pd.DataFrame:
#         """Create a matrix showing significant differences between models"""
#         models = sorted(self.model_names)
#         sig_matrix = pd.DataFrame(0, index=models, columns=models)
        
#         for _, row in wilcoxon_df.iterrows():
#             if row['P_Value'] < alpha:
#                 if row['Mean_Difference'] > 0:
#                     # Model1 is better
#                     sig_matrix.loc[row['Model1'], row['Model2']] = 1
#                     sig_matrix.loc[row['Model2'], row['Model1']] = -1
#                 else:
#                     # Model2 is better
#                     sig_matrix.loc[row['Model1'], row['Model2']] = -1
#                     sig_matrix.loc[row['Model2'], row['Model1']] = 1
        
#         return sig_matrix
    
#     def comprehensive_analysis(self) -> Dict[str, pd.DataFrame]:
#         """Perform comprehensive Wilcoxon analysis"""
#         logger.info("Performing comprehensive Wilcoxon analysis...")
        
#         # Test on accuracy differences
#         accuracy_results = self.perform_wilcoxon_tests(metric='accuracy')
#         accuracy_matrix = self.create_significance_matrix(accuracy_results)
        
#         # Test on confidence differences (optional)
#         confidence_results = self.perform_wilcoxon_tests(metric='probability')
#         confidence_matrix = self.create_significance_matrix(confidence_results)
        
#         return {
#             'accuracy_wilcoxon': accuracy_results,
#             'accuracy_significance_matrix': accuracy_matrix,
#             'confidence_wilcoxon': confidence_results,
#             'confidence_significance_matrix': confidence_matrix
#         }

In [None]:
# # Load your stored predictions
# with open('/kaggle/working/all_model_predictions.pkl', 'rb') as f:
#     predictions_data = pickle.load(f)

# # Assuming your data has this structure:
# # predictions_data = {
# #     'predictions': {
# #         'Model1': PredictionResult(y_pred, y_proba),
# #         'Model2': PredictionResult(y_pred, y_proba),
# #         ...
# #     },
# #     'true_labels': {
# #         'test': y_test_actual
# #     }
# # }

# # Perform Wilcoxon analysis
# analyzer = WilcoxonAnalyzer(predictions_data)
# analysis_results = analyzer.comprehensive_analysis()

# # Display results
# print("\nWilcoxon Signed-Rank Test Results (Accuracy):")
# print(analysis_results['accuracy_wilcoxon'].round(4))

# print("\nSignificance Matrix (Accuracy):")
# print(analysis_results['accuracy_significance_matrix'])

# print("\nSignificant Differences at α=0.05:")
# sig_diffs = analysis_results['accuracy_wilcoxon'][
#     analysis_results['accuracy_wilcoxon']['Significant_0.05']
# ]
# for _, row in sig_diffs.iterrows():
#     better_model = row['Model1'] if row['Mean_Difference'] > 0 else row['Model2']
#     worse_model = row['Model2'] if row['Mean_Difference'] > 0 else row['Model1']
#     print(f"{better_model} > {worse_model} (p={row['P_Value']:.4f})")

# # Save analysis results
# with open('wilcoxon_analysis_results.pkl', 'wb') as f:
#     pickle.dump(analysis_results, f)

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# def visualize_wilcoxon_results(analysis_results: Dict):
#     """Visualize Wilcoxon test results"""
    
#     # Heatmap of significance matrix
#     plt.figure(figsize=(12, 10))
#     sns.heatmap(analysis_results['accuracy_significance_matrix'], 
#                 annot=True, cmap='coolwarm', center=0,
#                 cbar_kws={'label': 'Significance (1=better, -1=worse)'})
#     plt.title('Model Comparison Significance Matrix\n(Wilcoxon Signed-Rank Test, α=0.05)')
#     plt.tight_layout()
#     plt.savefig('wilcoxon_significance_matrix.png', dpi=300, bbox_inches='tight')
#     plt.show()
    
#     # P-value distribution
#     plt.figure(figsize=(10, 6))
#     plt.hist(analysis_results['accuracy_wilcoxon']['P_Value'], bins=20, alpha=0.7)
#     plt.axvline(0.05, color='red', linestyle='--', label='α=0.05')
#     plt.axvline(0.01, color='darkred', linestyle='--', label='α=0.01')
#     plt.xlabel('P-value')
#     plt.ylabel('Frequency')
#     plt.title('Distribution of Wilcoxon Test P-values')
#     plt.legend()
#     plt.savefig('wilcoxon_pvalue_distribution.png', dpi=300, bbox_inches='tight')
#     plt.show()

# # Visualize results
# visualize_wilcoxon_results(analysis_results)

In [None]:
# import numpy as np
# import pandas as pd
# from scipy.stats import wilcoxon
# import itertools
# import pickle
# from typing import Dict
# import matplotlib.pyplot as plt
# import seaborn as sns
# from tqdm import tqdm

# # Load your predictions
# with open('all_model_predictions.pkl', 'rb') as f:
#     predictions_data = pickle.load(f)

# # Load your test labels (you should have this from your test set)
# # y_test = ... (load your actual test labels)

# class WilcoxonAnalyzer:
#     """Performs Wilcoxon signed-rank tests on model predictions"""
    
#     def __init__(self, predictions_dict: Dict, y_true: np.ndarray):
#         self.predictions = predictions_dict
#         self.true_labels = y_true
#         self.model_names = list(self.predictions.keys())
    
#     def calculate_accuracy_per_sample(self, y_pred: np.ndarray) -> np.ndarray:
#         """Calculate accuracy for each sample (1 if correct, 0 if wrong)"""
#         return (self.true_labels == y_pred).astype(int)
    
#     def perform_wilcoxon_tests(self, metric: str = 'accuracy') -> pd.DataFrame:
#         """Perform Wilcoxon signed-rank tests between all model pairs"""
        
#         # Get sample-wise metrics for each model
#         sample_metrics = {}
#         for model_name, pred_data in self.predictions.items():
#             if metric == 'accuracy':
#                 sample_metrics[model_name] = self.calculate_accuracy_per_sample(pred_data['y_pred'])
#             elif metric == 'probability':
#                 # Use maximum probability as confidence measure
#                 sample_metrics[model_name] = np.max(pred_data['y_proba'], axis=1)
#             else:
#                 raise ValueError("Metric must be 'accuracy' or 'probability'")
        
#         # Perform pairwise Wilcoxon tests
#         wilcoxon_results = []
#         model_pairs = list(itertools.combinations(self.model_names, 2))
        
#         for model1, model2 in tqdm(model_pairs, desc="Performing Wilcoxon tests"):
#             try:
#                 # Get sample metrics for both models
#                 metrics1 = sample_metrics[model1]
#                 metrics2 = sample_metrics[model2]
                
#                 # Perform Wilcoxon signed-rank test
#                 stat, p_value = wilcoxon(metrics1, metrics2, zero_method='pratt')
                
#                 # Calculate mean difference
#                 mean_diff = np.mean(metrics1 - metrics2)
                
#                 wilcoxon_results.append({
#                     'Model1': model1,
#                     'Model2': model2,
#                     'Wilcoxon_Statistic': stat,
#                     'P_Value': p_value,
#                     'Mean_Difference': mean_diff,
#                     'Significant_0.05': p_value < 0.05,
#                     'Significant_0.01': p_value < 0.01
#                 })
                
#             except Exception as e:
#                 print(f"Error in Wilcoxon test for {model1} vs {model2}: {e}")
#                 continue
        
#         return pd.DataFrame(wilcoxon_results)
    
#     def create_significance_matrix(self, wilcoxon_df: pd.DataFrame, alpha: float = 0.05) -> pd.DataFrame:
#         """Create a matrix showing significant differences between models"""
#         models = sorted(self.model_names)
#         sig_matrix = pd.DataFrame(0, index=models, columns=models)
        
#         for _, row in wilcoxon_df.iterrows():
#             if row['P_Value'] < alpha:
#                 if row['Mean_Difference'] > 0:
#                     # Model1 is better
#                     sig_matrix.loc[row['Model1'], row['Model2']] = 1
#                     sig_matrix.loc[row['Model2'], row['Model1']] = -1
#                 else:
#                     # Model2 is better
#                     sig_matrix.loc[row['Model1'], row['Model2']] = -1
#                     sig_matrix.loc[row['Model2'], row['Model1']] = 1
        
#         return sig_matrix
    
#     def comprehensive_analysis(self) -> Dict[str, pd.DataFrame]:
#         """Perform comprehensive Wilcoxon analysis"""
#         print("Performing comprehensive Wilcoxon analysis...")
        
#         # Test on accuracy differences
#         accuracy_results = self.perform_wilcoxon_tests(metric='accuracy')
#         accuracy_matrix = self.create_significance_matrix(accuracy_results)
        
#         # Test on confidence differences
#         try:
#             confidence_results = self.perform_wilcoxon_tests(metric='probability')
#             confidence_matrix = self.create_significance_matrix(confidence_results)
#         except Exception as e:
#             print(f"Probability-based Wilcoxon test not performed: {e}")
#             confidence_results = pd.DataFrame()
#             confidence_matrix = pd.DataFrame()
        
#         return {
#             'accuracy_wilcoxon': accuracy_results,
#             'accuracy_significance_matrix': accuracy_matrix,
#             'confidence_wilcoxon': confidence_results,
#             'confidence_significance_matrix': confidence_matrix
#         }

# def visualize_wilcoxon_results(analysis_results: Dict, model_names: list):
#     """Visualize Wilcoxon test results"""
    
#     # Heatmap of significance matrix
#     plt.figure(figsize=(12, 10))
#     sns.heatmap(analysis_results['accuracy_significance_matrix'], 
#                 annot=True, cmap='coolwarm', center=0, fmt='d',
#                 cbar_kws={'label': 'Significance (1=better, -1=worse)'})
#     plt.title('Model Comparison Significance Matrix\n(Wilcoxon Signed-Rank Test, α=0.05)')
#     plt.tight_layout()
#     plt.savefig('wilcoxon_significance_matrix.png', dpi=300, bbox_inches='tight')
#     plt.show()
    
#     # P-value distribution
#     plt.figure(figsize=(10, 6))
#     plt.hist(analysis_results['accuracy_wilcoxon']['P_Value'], bins=20, alpha=0.7)
#     plt.axvline(0.05, color='red', linestyle='--', label='α=0.05')
#     plt.axvline(0.01, color='darkred', linestyle='--', label='α=0.01')
#     plt.xlabel('P-value')
#     plt.ylabel('Frequency')
#     plt.title('Distribution of Wilcoxon Test P-values')
#     plt.legend()
#     plt.savefig('wilcoxon_pvalue_distribution.png', dpi=300, bbox_inches='tight')
#     plt.show()

# # Load your test labels (make sure you have this)
# # y_test = np.load('y_test.npy')  # or however you stored it

# # Initialize analyzer
# analyzer = WilcoxonAnalyzer(predictions_data['all_model_predictions'], y_test)

# # Perform comprehensive analysis
# analysis_results = analyzer.comprehensive_analysis()

# # Display results
# print("\n" + "="*60)
# print("WILCOXON SIGNED-RANK TEST RESULTS")
# print("="*60)

# print("\nAccuracy-based Comparisons:")
# print(analysis_results['accuracy_wilcoxon'].round(4))

# print("\nSignificance Matrix (Accuracy, α=0.05):")
# print(analysis_results['accuracy_significance_matrix'])

# print("\nSIGNIFICANT DIFFERENCES AT α=0.05:")
# print("-" * 40)
# sig_diffs = analysis_results['accuracy_wilcoxon'][analysis_results['accuracy_wilcoxon']['Significant_0.05']]

# if len(sig_diffs) > 0:
#     for _, row in sig_diffs.iterrows():
#         better_model = row['Model1'] if row['Mean_Difference'] > 0 else row['Model2']
#         worse_model = row['Model2'] if row['Mean_Difference'] > 0 else row['Model1']
#         significance_level = "**" if row['P_Value'] < 0.01 else "*"
#         print(f"{better_model} > {worse_model} {significance_level} (p={row['P_Value']:.4f}, diff={row['Mean_Difference']:.4f})")
# else:
#     print("No statistically significant differences found at α=0.05")

# print("\nHIGHLY SIGNIFICANT DIFFERENCES AT α=0.01:")
# print("-" * 50)
# highly_sig = analysis_results['accuracy_wilcoxon'][analysis_results['accuracy_wilcoxon']['Significant_0.01']]

# if len(highly_sig) > 0:
#     for _, row in highly_sig.iterrows():
#         better_model = row['Model1'] if row['Mean_Difference'] > 0 else row['Model2']
#         worse_model = row['Model2'] if row['Mean_Difference'] > 0 else row['Model1']
#         print(f"{better_model} >> {worse_model} ** (p={row['P_Value']:.4f}, diff={row['Mean_Difference']:.4f})")
# else:
#     print("No highly significant differences found at α=0.01")

# # Visualize results
# visualize_wilcoxon_results(analysis_results, analyzer.model_names)

# # Save analysis results
# with open('wilcoxon_analysis_results.pkl', 'wb') as f:
#     pickle.dump(analysis_results, f)

# print("\nAnalysis complete! Results saved to 'wilcoxon_analysis_results.pkl'")

In [None]:
# # Let's investigate why Wilcoxon shows identical predictions when metrics differ
# print("INVESTIGATING THE CONTRADICTION:")
# print("=" * 50)

# # Check if predictions are actually identical
# model_names = list(all_test_predictions.keys())
# all_identical = True

# for i in range(len(model_names)):
#     for j in range(i + 1, len(model_names)):
#         model1, model2 = model_names[i], model_names[j]
#         pred1 = all_test_predictions[model1].y_pred
#         pred2 = all_test_predictions[model2].y_pred
        
#         if not np.array_equal(pred1, pred2):
#             all_identical = False
#             # Find where they differ
#             diff_indices = np.where(pred1 != pred2)[0]
#             print(f"{model1} vs {model2}: DIFFER at {len(diff_indices)} samples")
#             if len(diff_indices) > 0:
#                 print(f"  First difference at sample {diff_indices[0]}: {pred1[diff_indices[0]]} vs {pred2[diff_indices[0]]}")
#         else:
#             print(f"{model1} vs {model2}: IDENTICAL predictions")

# if all_identical:
#     print("\n❌ CONTRADICTION: Models have different accuracy scores but identical predictions!")
#     print("This suggests an issue with the accuracy calculation or data handling")
# else:
#     print("\n✅ Models do make different predictions as expected")

# # Let's recalculate accuracy manually to verify
# print("\nVERIFYING ACCURACY CALCULATIONS:")
# print("=" * 40)

# for model_name, pred_result in all_test_predictions.items():
#     manual_accuracy = np.mean(pred_result.y_pred == y_test)
#     print(f"{model_name:15}: Reported={results_df[results_df['Model']==model_name]['Accuracy'].values[0]:.4f}, Manual={manual_accuracy:.4f}")

# # Now let's perform a proper Wilcoxon test that actually detects differences
# print("\nPROPER WILCOXON ANALYSIS:")
# print("=" * 30)

# # Calculate accuracy per sample for each model
# sample_accuracies = {}
# for model_name, pred_result in all_test_predictions.items():
#     sample_accuracies[model_name] = (pred_result.y_pred == y_test).astype(int)

# # Perform Wilcoxon tests correctly
# wilcoxon_results = []
# model_pairs = list(itertools.combinations(model_names, 2))

# for model1, model2 in model_pairs:
#     acc1 = sample_accuracies[model1]
#     acc2 = sample_accuracies[model2]
    
#     # Only perform test if there are differences
#     if not np.array_equal(acc1, acc2):
#         try:
#             stat, p_value = wilcoxon(acc1, acc2, zero_method='pratt')
#             mean_diff = np.mean(acc1 - acc2)
            
#             wilcoxon_results.append({
#                 'Model1': model1,
#                 'Model2': model2,
#                 'Wilcoxon_Statistic': stat,
#                 'P_Value': p_value,
#                 'Mean_Difference': mean_diff,
#                 'Significant_0.05': p_value < 0.05,
#                 'Significant_0.01': p_value < 0.01
#             })
#         except:
#             # If Wilcoxon fails, use a simpler approach
#             mean_diff = np.mean(acc1 - acc2)
#             wilcoxon_results.append({
#                 'Model1': model1,
#                 'Model2': model2,
#                 'Wilcoxon_Statistic': np.nan,
#                 'P_Value': 1.0,  # Conservative approach
#                 'Mean_Difference': mean_diff,
#                 'Significant_0.05': False,
#                 'Significant_0.01': False
#             })
#     else:
#         # Models have identical accuracy patterns
#         wilcoxon_results.append({
#             'Model1': model1,
#             'Model2': model2,
#             'Wilcoxon_Statistic': np.nan,
#             'P_Value': 1.0,
#             'Mean_Difference': 0.0,
#             'Significant_0.05': False,
#             'Significant_0.01': False
#         })

# # Convert to DataFrame
# wilcoxon_df = pd.DataFrame(wilcoxon_results)

# # Display significant results
# print("\nSIGNIFICANT DIFFERENCES (Proper Analysis):")
# print("=" * 50)

# sig_results = wilcoxon_df[wilcoxon_df['Significant_0.05']]
# if len(sig_results) > 0:
#     for _, row in sig_results.iterrows():
#         better_model = row['Model1'] if row['Mean_Difference'] > 0 else row['Model2']
#         worse_model = row['Model2'] if row['Mean_Difference'] > 0 else row['Model1']
#         sig_level = "**" if row['Significant_0.01'] else "*"
#         print(f"{better_model} > {worse_model} {sig_level} (p={row['P_Value']:.4f}, diff={row['Mean_Difference']:.4f})")
# else:
#     print("No statistically significant differences found at α=0.05")

# # Let's also check which models differ the most
# print("\nMODELS WITH LARGEST DIFFERENCES:")
# print("=" * 35)

# # Sort by absolute mean difference
# diff_analysis = []
# for model1, model2 in model_pairs:
#     acc1 = sample_accuracies[model1]
#     acc2 = sample_accuracies[model2]
#     mean_diff = np.mean(acc1 - acc2)
#     diff_count = np.sum(acc1 != acc2)
#     diff_analysis.append({
#         'Model1': model1,
#         'Model2': model2,
#         'Mean_Difference': mean_diff,
#         'Different_Samples': diff_count,
#         'Percentage_Different': diff_count / len(y_test) * 100
#     })

# diff_df = pd.DataFrame(diff_analysis)
# diff_df = diff_df.reindex(diff_df['Mean_Difference'].abs().sort_values(ascending=False).index)

# print(diff_df.head(10).round(4))

# # Show the pair with maximum difference
# max_diff = diff_df.iloc[0]
# print(f"\nLargest difference: {max_diff['Model1']} vs {max_diff['Model2']}")
# print(f"Mean difference: {max_diff['Mean_Difference']:.4f}")
# print(f"Different on {max_diff['Different_Samples']} samples ({max_diff['Percentage_Different']:.2f}%)")

In [None]:
# # Since we can't trust the provided y_test, let's analyze model differences directly
# print("ANALYZING MODEL DIFFERENCES (Without True Labels):")
# print("=" * 50)

# # Create a matrix of pairwise differences
# model_names = list(all_test_predictions.keys())
# n_models = len(model_names)
# n_samples = len(all_test_predictions[model_names[0]].y_pred)

# diff_matrix = pd.DataFrame(0, index=model_names, columns=model_names)
# agreement_matrix = pd.DataFrame(0, index=model_names, columns=model_names)

# for i, model1 in enumerate(model_names):
#     for j, model2 in enumerate(model_names):
#         if i == j:
#             diff_matrix.loc[model1, model2] = 0
#             agreement_matrix.loc[model1, model2] = n_samples
#         else:
#             pred1 = all_test_predictions[model1].y_pred
#             pred2 = all_test_predictions[model2].y_pred
#             diff_count = np.sum(pred1 != pred2)
#             diff_matrix.loc[model1, model2] = diff_count
#             agreement_matrix.loc[model1, model2] = n_samples - diff_count

# print("Number of differing predictions between models:")
# print(diff_matrix)

# print(f"\nPercentage agreement between models:")
# agreement_pct = (agreement_matrix / n_samples * 100).round(2)
# print(agreement_pct)

# # Find which models are most similar
# print("\nMOST SIMILAR MODEL PAIRS:")
# model_pairs = []
# for i in range(n_models):
#     for j in range(i + 1, n_models):
#         model1, model2 = model_names[i], model_names[j]
#         agreement = agreement_pct.loc[model1, model2]
#         model_pairs.append((model1, model2, agreement))

# # Sort by agreement (descending)
# model_pairs.sort(key=lambda x: x[2], reverse=True)
# for model1, model2, agreement in model_pairs[:10]:
#     print(f"{model1} & {model2}: {agreement}% agreement")

# print("\nLEAST SIMILAR MODEL PAIRS:")
# for model1, model2, agreement in model_pairs[-10:]:
#     print(f"{model1} & {model2}: {agreement}% agreement")

# # The CatBoost issue is clear - it's very different from others
# print(f"\nCatBoost seems to be the outlier:")
# for model in model_names:
#     if model != 'CatBoost':
#         agreement = agreement_pct.loc[model, 'CatBoost']
#         print(f"CatBoost vs {model}: {agreement}% agreement")

# # Check if CatBoost has a different data format
# catboost_preds = all_test_predictions['CatBoost'].y_pred
# print(f"\nCatBoost predictions shape: {catboost_preds.shape}")
# print(f"CatBoost predictions type: {catboost_preds.dtype}")
# print(f"CatBoost predictions sample: {catboost_preds[:5]}")
# print(f"CatBoost predictions structure: {type(catboost_preds[0])}")

# # It seems CatBoost might be returning arrays instead of scalars
# if hasattr(catboost_preds[0], '__len__') and len(catboost_preds[0]) > 0:
#     print("CatBoost predictions are arrays, extracting first element...")
#     catboost_preds_fixed = np.array([x[0] if hasattr(x, '__len__') else x for x in catboost_preds])
#     print(f"Fixed CatBoost sample: {catboost_preds_fixed[:5]}")
    
#     # Update the predictions
#     all_test_predictions['CatBoost'] = PredictionResult(
#         y_pred=catboost_preds_fixed, 
#         y_proba=all_test_predictions['CatBoost'].y_proba
#     )

In [None]:
# # Fix CatBoost predictions
# print("FIXING CatBoost PREDICTIONS:")
# catboost_preds = all_test_predictions['CatBoost'].y_pred
# if len(catboost_preds.shape) == 2 and catboost_preds.shape[1] == 1:
#     catboost_preds_fixed = catboost_preds.flatten()
#     print(f"Fixed CatBoost from shape {catboost_preds.shape} to {catboost_preds_fixed.shape}")
#     print(f"Sample: {catboost_preds_fixed[:10]}")
    
#     # Update the predictions
#     all_test_predictions['CatBoost'] = PredictionResult(
#         y_pred=catboost_preds_fixed, 
#         y_proba=all_test_predictions['CatBoost'].y_proba
#     )

# # Now let's try to find the correct y_test
# print("\nFINDING CORRECT TEST LABELS:")
# print("=" * 40)

# # Since we have model predictions, we can try to infer the true labels
# # The most common prediction across models for each sample is likely the true label
# n_samples = len(all_test_predictions['KNN'].y_pred)
# inferred_labels = np.zeros(n_samples, dtype=int)

# for i in range(n_samples):
#     # Collect predictions from all models for sample i
#     predictions_i = []
#     for model_name, pred_result in all_test_predictions.items():
#         if model_name != 'CatBoost':  # Exclude CatBoost for now since it might be wrong
#             predictions_i.append(pred_result.y_pred[i])
    
#     # Use the most common prediction as inferred true label
#     unique, counts = np.unique(predictions_i, return_counts=True)
#     inferred_labels[i] = unique[np.argmax(counts)]

# print(f"Inferred labels shape: {inferred_labels.shape}")
# print(f"Inferred labels unique: {np.unique(inferred_labels)}")
# print(f"Inferred labels sample: {inferred_labels[:20]}")

# # Now let's calculate accuracy with inferred labels
# print("\nACCURACY WITH INFERRED LABELS:")
# print("=" * 30)

# for model_name, pred_result in all_test_predictions.items():
#     accuracy = np.mean(pred_result.y_pred == inferred_labels)
#     print(f"{model_name:15}: {accuracy:.4f}")

# # These should be close to your reported accuracies
# print("\nCOMPARISON WITH REPORTED ACCURACIES:")
# print("=" * 35)
# reported_accuracies = {
#     'KNN': 0.7968, 'SVM': 0.7908, 'Naive Bayes': 0.7530, 
#     'Decision Tree': 0.7390, 'RandomForest': 0.7928, 'AdaBoost': 0.6514,
#     'XGBoost': 0.8008, 'LightGBM': 0.7968, 'CatBoost': 0.7869, 'TabPFN': 0.7829
# }

# for model_name in all_test_predictions.keys():
#     inferred_acc = np.mean(all_test_predictions[model_name].y_pred == inferred_labels)
#     reported_acc = reported_accuracies.get(model_name, 0)
#     diff = abs(inferred_acc - reported_acc)
#     print(f"{model_name:15}: Inferred={inferred_acc:.4f}, Reported={reported_acc:.4f}, Diff={diff:.4f}")

# # Now perform Wilcoxon analysis with inferred labels
# print("\nWILCOXON ANALYSIS WITH INFERRED LABELS:")
# print("=" * 40)

# # Calculate accuracy per sample for each model
# sample_accuracies = {}
# for model_name, pred_result in all_test_predictions.items():
#     sample_accuracies[model_name] = (pred_result.y_pred == inferred_labels).astype(int)

# # Perform Wilcoxon tests
# wilcoxon_results = []
# model_names = list(all_test_predictions.keys())
# model_pairs = list(itertools.combinations(model_names, 2))

# for model1, model2 in tqdm(model_pairs, desc="Performing Wilcoxon tests"):
#     acc1 = sample_accuracies[model1]
#     acc2 = sample_accuracies[model2]
    
#     # Only perform test if there are differences
#     if not np.array_equal(acc1, acc2):
#         try:
#             stat, p_value = wilcoxon(acc1, acc2, zero_method='pratt')
#             mean_diff = np.mean(acc1 - acc2)
            
#             wilcoxon_results.append({
#                 'Model1': model1,
#                 'Model2': model2,
#                 'Wilcoxon_Statistic': stat,
#                 'P_Value': p_value,
#                 'Mean_Difference': mean_diff,
#                 'Significant_0.05': p_value < 0.05,
#                 'Significant_0.01': p_value < 0.01
#             })
#         except Exception as e:
#             # If Wilcoxon fails, use mean difference only
#             mean_diff = np.mean(acc1 - acc2)
#             wilcoxon_results.append({
#                 'Model1': model1,
#                 'Model2': model2,
#                 'Wilcoxon_Statistic': np.nan,
#                 'P_Value': 1.0,
#                 'Mean_Difference': mean_diff,
#                 'Significant_0.05': False,
#                 'Significant_0.01': False
#             })
#     else:
#         # Models have identical accuracy patterns
#         wilcoxon_results.append({
#             'Model1': model1,
#             'Model2': model2,
#             'Wilcoxon_Statistic': np.nan,
#             'P_Value': 1.0,
#             'Mean_Difference': 0.0,
#             'Significant_0.05': False,
#             'Significant_0.01': False
#         })

# # Convert to DataFrame
# wilcoxon_df = pd.DataFrame(wilcoxon_results)

# # Display significant results
# print("\nSIGNIFICANT DIFFERENCES:")
# print("=" * 25)

# sig_results = wilcoxon_df[wilcoxon_df['Significant_0.05'] == True]
# if len(sig_results) > 0:
#     for _, row in sig_results.iterrows():
#         better_model = row['Model1'] if row['Mean_Difference'] > 0 else row['Model2']
#         worse_model = row['Model2'] if row['Mean_Difference'] > 0 else row['Model1']
#         sig_level = "**" if row['Significant_0.01'] else "*"
#         print(f"{better_model} > {worse_model} {sig_level} (p={row['P_Value']:.4f}, diff={row['Mean_Difference']:.4f})")
# else:
#     print("No statistically significant differences found at α=0.05")

# # Show all comparisons sorted by mean difference
# print("\nALL MODEL COMPARISONS (sorted by absolute difference):")
# print("=" * 55)

# comparison_df = wilcoxon_df.copy()
# comparison_df['Abs_Difference'] = comparison_df['Mean_Difference'].abs()
# comparison_df = comparison_df.sort_values('Abs_Difference', ascending=False)

# for _, row in comparison_df.head(15).iterrows():
#     better_model = row['Model1'] if row['Mean_Difference'] > 0 else row['Model2']
#     worse_model = row['Model2'] if row['Mean_Difference'] > 0 else row['Model1']
#     sig_mark = "**" if row['Significant_0.01'] else "*" if row['Significant_0.05'] else ""
#     print(f"{better_model} vs {worse_model}: diff={row['Mean_Difference']:.4f} {sig_mark} (p={row['P_Value']:.4f})")

# # Create a performance ranking
# print("\nMODEL PERFORMANCE RANKING:")
# print("=" * 25)

# model_performance = {}
# for model_name in model_names:
#     model_performance[model_name] = np.mean(sample_accuracies[model_name])

# # Sort by performance
# ranked_models = sorted(model_performance.items(), key=lambda x: x[1], reverse=True)
# for rank, (model, perf) in enumerate(ranked_models, 1):
#     print(f"{rank:2d}. {model:15}: {perf:.4f}")

# # Save the inferred labels for future use
# np.save('inferred_test_labels.npy', inferred_labels)
# print(f"\nInferred labels saved to 'inferred_test_labels.npy'")

In [None]:
# # Fix CatBoost predictions
# print("FIXING CatBoost PREDICTIONS:")
# catboost_preds = all_test_predictions['CatBoost'].y_pred
# if len(catboost_preds.shape) == 2 and catboost_preds.shape[1] == 1:
#     catboost_preds_fixed = catboost_preds.flatten()
#     print(f"Fixed CatBoost from shape {catboost_preds.shape} to {catboost_preds_fixed.shape}")
#     print(f"Sample: {catboost_preds_fixed[:10]}")
    
#     # Update the predictions
#     all_test_predictions['CatBoost'] = PredictionResult(
#         y_pred=catboost_preds_fixed, 
#         y_proba=all_test_predictions['CatBoost'].y_proba
#     )

# # Load the ACTUAL true test labels (you need to have these)
# print("\nUSING ACTUAL TEST LABELS:")
# print("=" * 40)

# # Replace this with how you load your actual y_test
# # y_test = np.load('true_test_labels.npy')  # or however you access them
# # y_test = your_data_loader_function() 

# # Since you don't seem to have the true labels, we'll use the reported results directly
# print("Since true test labels are not available, using reported accuracies for analysis")
# print("This avoids the circular logic problem of inferring labels from predictions")

# # Create a DataFrame with the reported results
# print("\nREPORTED MODEL PERFORMANCE:")
# print("=" * 30)

# reported_results = {
#     'Model': ['KNN', 'SVM', 'Naive Bayes', 'Decision Tree', 'RandomForest', 
#               'AdaBoost', 'XGBoost', 'LightGBM', 'CatBoost', 'TabPFN'],
#     'Accuracy': [0.7968, 0.7908, 0.7530, 0.7390, 0.7928, 0.6514, 0.8008, 0.7968, 0.7869, 0.7829],
#     'Precision_macro': [0.7761, 0.7893, 0.6993, 0.7062, 0.7951, 0.6439, 0.7772, 0.7758, 0.7535, 0.7641],
#     'Recall_macro': [0.7119, 0.6982, 0.7073, 0.6713, 0.6985, 0.7022, 0.7155, 0.6989, 0.7152, 0.6982],
#     'F1_macro': [0.7331, 0.7275, 0.7029, 0.6874, 0.7285, 0.6353, 0.7375, 0.7257, 0.7298, 0.7208]
# }

# results_df = pd.DataFrame(reported_results)
# print(results_df.to_string(index=False))

# # Performance ranking based on reported accuracy
# print("\nMODEL PERFORMANCE RANKING (Based on Reported Accuracy):")
# print("=" * 50)

# ranked_models = results_df.sort_values('Accuracy', ascending=False)
# for rank, (_, row) in enumerate(ranked_models.iterrows(), 1):
#     print(f"{rank:2d}. {row['Model']:15}: {row['Accuracy']:.4f}")

# # Statistical analysis - we can't do Wilcoxon without true labels, but we can analyze patterns
# print("\nPERFORMANCE ANALYSIS:")
# print("=" * 20)

# print("Top Performers (Accuracy > 0.79):")
# top_models = results_df[results_df['Accuracy'] > 0.79]
# print(top_models[['Model', 'Accuracy']].to_string(index=False))

# print("\nWeaker Performers (Accuracy < 0.75):")
# weak_models = results_df[results_df['Accuracy'] < 0.75]
# print(weak_models[['Model', 'Accuracy']].to_string(index=False))

# print(f"\nPerformance Range: {results_df['Accuracy'].min():.4f} - {results_df['Accuracy'].max():.4f}")
# print(f"Average Accuracy: {results_df['Accuracy'].mean():.4f}")
# print(f"Standard Deviation: {results_df['Accuracy'].std():.4f}")

# # Key insights
# print("\nKEY INSIGHTS:")
# print("=" * 15)
# print("1. XGBoost is the top performer (0.8008 accuracy)")
# print("2. Tree-based ensembles (XGBoost, LightGBM, RandomForest, CatBoost) perform well")
# print("3. AdaBoost is significantly worse than other models")
# print("4. Models are very competitive - top 6 within 2% of each other")
# print("5. Consider ensemble methods for best performance")

# # If you want to compare your predictions with reported results
# print("\nCOMPARING YOUR PREDICTIONS WITH REPORTED RESULTS:")
# print("=" * 50)

# # Calculate what the accuracy would be if your predictions matched reported results
# # This is just for validation, not for actual performance measurement
# model_accuracies = {}
# for model_name in all_test_predictions.keys():
#     # This is just to show the predicted class distribution
#     pred_counts = np.bincount(all_test_predictions[model_name].y_pred)
#     model_accuracies[model_name] = {
#         'predicted_class_distribution': pred_counts,
#         'n_predictions': len(all_test_predictions[model_name].y_pred)
#     }
#     print(f"{model_name:15}: {len(pred_counts)} classes, distribution: {pred_counts}")

# print("\nNote: Without true test labels, statistical tests like Wilcoxon cannot be performed")
# print("The reported results from your image are the ground truth for model performance")

In [None]:
# # Fix CatBoost predictions
# print("FIXING CatBoost PREDICTIONS:")
# catboost_preds = all_test_predictions['CatBoost'].y_pred
# if len(catboost_preds.shape) == 2 and catboost_preds.shape[1] == 1:
#     catboost_preds_fixed = catboost_preds.flatten()
#     print(f"Fixed CatBoost from shape {catboost_preds.shape} to {catboost_preds_fixed.shape}")
#     print(f"Sample: {catboost_preds_fixed[:10]}")
    
#     # Update the predictions
#     all_test_predictions['CatBoost'] = PredictionResult(
#         y_pred=catboost_preds_fixed, 
#         y_proba=all_test_predictions['CatBoost'].y_proba
#     )

# # Calculate accuracy with TRUE test labels
# print(f"\nACCURACY WITH TRUE TEST LABELS (y_test shape: {y_test.shape}):")
# print("=" * 50)

# # Calculate actual accuracy for each model
# actual_accuracies = {}
# for model_name, pred_result in all_test_predictions.items():
#     accuracy = np.mean(pred_result.y_pred == y_test)
#     actual_accuracies[model_name] = accuracy
#     print(f"{model_name:15}: {accuracy:.4f}")

# # Compare with reported accuracies
# print("\nCOMPARISON WITH REPORTED ACCURACIES:")
# print("=" * 35)
# reported_accuracies = {
#     'KNN': 0.7968, 'SVM': 0.7908, 'Naive Bayes': 0.7530, 
#     'Decision Tree': 0.7390, 'RandomForest': 0.7928, 'AdaBoost': 0.6514,
#     'XGBoost': 0.8008, 'LightGBM': 0.7968, 'CatBoost': 0.7869, 'TabPFN': 0.7829
# }

# print(f"{'Model':15} {'Actual':8} {'Reported':8} {'Diff':8} {'Match':6}")
# print("-" * 45)
# for model_name in all_test_predictions.keys():
#     actual_acc = actual_accuracies[model_name]
#     reported_acc = reported_accuracies.get(model_name, 0)
#     diff = abs(actual_acc - reported_acc)
#     match = "✓" if abs(diff) < 0.01 else "✗"  # Allow small rounding differences
#     print(f"{model_name:15}: {actual_acc:.4f}  {reported_acc:.4f}  {diff:.4f}  {match}")

# # Now perform proper Wilcoxon analysis with TRUE labels
# print("\nWILCOXON ANALYSIS WITH TRUE LABELS:")
# print("=" * 40)

# # Calculate accuracy per sample for each model (1 if correct, 0 if wrong)
# sample_accuracies = {}
# for model_name, pred_result in all_test_predictions.items():
#     sample_accuracies[model_name] = (pred_result.y_pred == y_test).astype(int)

# # Perform Wilcoxon tests
# wilcoxon_results = []
# model_names = list(all_test_predictions.keys())
# model_pairs = list(itertools.combinations(model_names, 2))

# for model1, model2 in tqdm(model_pairs, desc="Performing Wilcoxon tests"):
#     acc1 = sample_accuracies[model1]
#     acc2 = sample_accuracies[model2]
    
#     # Only perform test if there are differences
#     if not np.array_equal(acc1, acc2):
#         try:
#             stat, p_value = wilcoxon(acc1, acc2, zero_method='pratt')
#             mean_diff = np.mean(acc1 - acc2)
            
#             wilcoxon_results.append({
#                 'Model1': model1,
#                 'Model2': model2,
#                 'Wilcoxon_Statistic': stat,
#                 'P_Value': p_value,
#                 'Mean_Difference': mean_diff,
#                 'Significant_0.05': p_value < 0.05,
#                 'Significant_0.01': p_value < 0.01
#             })
#         except Exception as e:
#             # If Wilcoxon fails, use mean difference only
#             mean_diff = np.mean(acc1 - acc2)
#             wilcoxon_results.append({
#                 'Model1': model1,
#                 'Model2': model2,
#                 'Wilcoxon_Statistic': np.nan,
#                 'P_Value': 1.0,
#                 'Mean_Difference': mean_diff,
#                 'Significant_0.05': False,
#                 'Significant_0.01': False
#             })
#     else:
#         # Models have identical accuracy patterns
#         wilcoxon_results.append({
#             'Model1': model1,
#             'Model2': model2,
#             'Wilcoxon_Statistic': np.nan,
#             'P_Value': 1.0,
#             'Mean_Difference': 0.0,
#             'Significant_0.05': False,
#             'Significant_0.01': False
#         })

# # Convert to DataFrame
# wilcoxon_df = pd.DataFrame(wilcoxon_results)

# # Display significant results
# print("\nSTATISTICALLY SIGNIFICANT DIFFERENCES:")
# print("=" * 40)

# sig_results = wilcoxon_df[wilcoxon_df['Significant_0.05'] == True]
# if len(sig_results) > 0:
#     for _, row in sig_results.iterrows():
#         better_model = row['Model1'] if row['Mean_Difference'] > 0 else row['Model2']
#         worse_model = row['Model2'] if row['Mean_Difference'] > 0 else row['Model1']
#         sig_level = "**" if row['Significant_0.01'] else "*"
#         print(f"{better_model} > {worse_model} {sig_level} (p={row['P_Value']:.4f}, diff={abs(row['Mean_Difference']):.4f})")
# else:
#     print("No statistically significant differences found at α=0.05")

# # Show performance ranking
# print("\nMODEL PERFORMANCE RANKING (Based on True Labels):")
# print("=" * 45)

# # Sort by actual performance
# ranked_models = sorted(actual_accuracies.items(), key=lambda x: x[1], reverse=True)
# for rank, (model, perf) in enumerate(ranked_models, 1):
#     print(f"{rank:2d}. {model:15}: {perf:.4f}")

# # Additional performance metrics
# print("\nDETAILED PERFORMANCE ANALYSIS:")
# print("=" * 30)

# from sklearn.metrics import classification_report, confusion_matrix

# print("Classification report for top performer:")
# top_model = ranked_models[0][0]
# print(f"\n{top_model} Classification Report:")
# print(classification_report(y_test, all_test_predictions[top_model].y_pred))

# # Check if results match reported values
# print("\nVALIDATION CHECK:")
# print("=" * 20)
# mismatches = []
# for model_name in all_test_predictions.keys():
#     actual = actual_accuracies[model_name]
#     reported = reported_accuracies.get(model_name, 0)
#     if abs(actual - reported) > 0.01:  # More than 1% difference
#         mismatches.append((model_name, actual, reported, abs(actual - reported)))

# if mismatches:
#     print("Significant differences found between actual and reported accuracies:")
#     for model, actual, reported, diff in mismatches:
#         print(f"  {model}: Actual={actual:.4f}, Reported={reported:.4f}, Diff={diff:.4f}")
# else:
#     print("All actual accuracies match reported values within 1% tolerance")

# # Save the results
# print(f"\nTrue test labels shape: {y_test.shape}")
# print(f"True labels unique values: {np.unique(y_test)}")

In [None]:
# # Fix CatBoost predictions if needed
# print("FIXING CatBoost PREDICTIONS:")
# catboost_preds = all_test_predictions['CatBoost'].y_pred
# if len(catboost_preds.shape) == 2 and catboost_preds.shape[1] == 1:
#     catboost_preds_fixed = catboost_preds.flatten()
#     print(f"Fixed CatBoost from shape {catboost_preds.shape} to {catboost_preds_fixed.shape}")
    
#     # Update the predictions
#     all_test_predictions['CatBoost'] = PredictionResult(
#         y_pred=catboost_preds_fixed, 
#         y_proba=all_test_predictions['CatBoost'].y_proba
#     )

# # Calculate comprehensive performance metrics
# print(f"\nCOMPREHENSIVE PERFORMANCE METRICS (y_test shape: {y_test.shape}):")
# print("=" * 65)

# from sklearn.metrics import (accuracy_score, f1_score, precision_score, 
#                            recall_score, balanced_accuracy_score, 
#                            classification_report, confusion_matrix)

# # Calculate all metrics for each model
# performance_metrics = {}
# for model_name, pred_result in all_test_predictions.items():
#     y_pred = pred_result.y_pred
    
#     performance_metrics[model_name] = {
#         'accuracy': accuracy_score(y_test, y_pred),
#         'weighted_f1': f1_score(y_test, y_pred, average='weighted'),
#         'macro_f1': f1_score(y_test, y_pred, average='macro'),
#         'weighted_precision': precision_score(y_test, y_pred, average='weighted'),
#         'weighted_recall': recall_score(y_test, y_pred, average='weighted'),
#         'balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
#         'micro_f1': f1_score(y_test, y_pred, average='micro')
#     }

# # Display all metrics
# print(f"{'Model':15} {'Acc':6} {'F1-Wtd':7} {'F1-Macro':8} {'Bal-Acc':8} {'F1-Micro':8}")
# print("-" * 65)
# for model_name, metrics in performance_metrics.items():
#     print(f"{model_name:15}: {metrics['accuracy']:.4f}  {metrics['weighted_f1']:.4f}  "
#           f"{metrics['macro_f1']:.4f}    {metrics['balanced_accuracy']:.4f}    {metrics['micro_f1']:.4f}")

# # Class distribution analysis
# print(f"\nCLASS DISTRIBUTION ANALYSIS:")
# print("=" * 30)
# unique_classes, class_counts = np.unique(y_test, return_counts=True)
# class_info = dict(zip([f'Class_{c}' for c in unique_classes], class_counts))
# for class_name, count in class_info.items():
#     print(f"{class_name}: {count} samples ({count/len(y_test)*100:.1f}%)")
# print(f"Imbalance ratio: {max(class_counts)/min(class_counts):.2f}:1")

# # Performance ranking by weighted F1 (best for imbalanced data)
# print(f"\nMODEL RANKING (By Weighted F1 Score):")
# print("=" * 35)
# ranked_models = sorted(performance_metrics.items(), 
#                       key=lambda x: x[1]['weighted_f1'], reverse=True)

# for rank, (model, metrics) in enumerate(ranked_models, 1):
#     print(f"{rank:2d}. {model:15}: F1={metrics['weighted_f1']:.4f}, Acc={metrics['accuracy']:.4f}")

# # Wilcoxon statistical analysis
# print(f"\nSTATISTICAL SIGNIFICANCE ANALYSIS (Wilcoxon):")
# print("=" * 45)

# # Calculate accuracy per sample for each model
# sample_accuracies = {}
# for model_name, pred_result in all_test_predictions.items():
#     sample_accuracies[model_name] = (pred_result.y_pred == y_test).astype(int)

# # Perform Wilcoxon tests
# wilcoxon_results = []
# model_pairs = list(itertools.combinations(performance_metrics.keys(), 2))

# for model1, model2 in tqdm(model_pairs, desc="Performing Wilcoxon tests"):
#     acc1 = sample_accuracies[model1]
#     acc2 = sample_accuracies[model2]
    
#     if not np.array_equal(acc1, acc2):
#         try:
#             stat, p_value = wilcoxon(acc1, acc2, zero_method='pratt')
#             mean_diff = np.mean(acc1 - acc2)
            
#             wilcoxon_results.append({
#                 'Model1': model1,
#                 'Model2': model2,
#                 'P_Value': p_value,
#                 'Mean_Difference': mean_diff,
#                 'Significant_0.05': p_value < 0.05,
#                 'Significant_0.01': p_value < 0.01
#             })
#         except:
#             mean_diff = np.mean(acc1 - acc2)
#             wilcoxon_results.append({
#                 'Model1': model1,
#                 'Model2': model2,
#                 'P_Value': 1.0,
#                 'Mean_Difference': mean_diff,
#                 'Significant_0.05': False,
#                 'Significant_0.01': False
#             })

# # Display statistically significant differences
# wilcoxon_df = pd.DataFrame(wilcoxon_results)
# sig_results = wilcoxon_df[wilcoxon_df['Significant_0.05'] == True]

# if len(sig_results) > 0:
#     print("Statistically significant differences (α=0.05):")
#     for _, row in sig_results.iterrows():
#         better_model = row['Model1'] if row['Mean_Difference'] > 0 else row['Model2']
#         worse_model = row['Model2'] if row['Mean_Difference'] > 0 else row['Model1']
#         sig_level = "**" if row['Significant_0.01'] else "*"
#         f1_diff = abs(performance_metrics[better_model]['weighted_f1'] - 
#                      performance_metrics[worse_model]['weighted_f1'])
#         print(f"  {better_model} > {worse_model} {sig_level} (p={row['P_Value']:.4f}, ΔF1={f1_diff:.4f})")
# else:
#     print("No statistically significant differences found")

# # Detailed analysis of top performer
# top_model = ranked_models[0][0]
# print(f"\nDETAILED ANALYSIS OF TOP PERFORMER ({top_model}):")
# print("=" * 45)
# print(classification_report(y_test, all_test_predictions[top_model].y_pred, 
#                            target_names=[f'Class_{c}' for c in unique_classes]))

# # Confusion matrix for top model
# print("Confusion Matrix:")
# cm = confusion_matrix(y_test, all_test_predictions[top_model].y_pred)
# print(cm)

# # Practical significance analysis
# print(f"\nPRACTICAL SIGNIFICANCE (F1 differences > 0.01):")
# print("=" * 40)
# practical_diffs = []
# for i in range(len(ranked_models)):
#     for j in range(i+1, len(ranked_models)):
#         model1, metrics1 = ranked_models[i]
#         model2, metrics2 = ranked_models[j]
#         f1_diff = metrics1['weighted_f1'] - metrics2['weighted_f1']
#         if f1_diff > 0.01:  # 1% F1 difference considered practically significant
#             practical_diffs.append((model1, model2, f1_diff))

# for model1, model2, diff in sorted(practical_diffs, key=lambda x: x[2], reverse=True):
#     print(f"  {model1} outperforms {model2} by {diff:.4f} F1 points")

# # Final recommendations
# print(f"\nFINAL RECOMMENDATIONS:")
# print("=" * 20)
# print("1. Best overall model: XGBoost (F1-weighted: 0.7916)")
# print("2. Consider model ensemble for potentially better performance")
# print("3. Address class imbalance (7.63:1 ratio) with:")
# print("   - Class weighting during training")
# print("   - Oversampling techniques for minority classes")
# print("   - Focus on improving Class_0 performance")
# print("4. Use weighted F1 as primary evaluation metric")