# AI-Generated Text Detection

Stacked ensemble (DeBERTa-v3 + LightGBM + TF-IDF/SGD) on `merged_ai_human_multisocial_features_train.csv` / `merged_ai_human_multisocial_features_test.csv` when present (0=Human, 1=AI). Flow: setup -> data check -> Model A/B/C CV (OOF) -> stacking -> inference. Text column: `text`; label column: `label`; numerical features like `burstiness`, `perplexity_score`, `lexical_diversity`, `gunning_fog_index`.


In [31]:
# Optional: install dependencies if running in a fresh environment
# !pip install -q pandas numpy torch torchvision torchaudio transformers datasets lightgbm scikit-learn
# For CUDA builds of PyTorch: https://pytorch.org/get-started/locally/


In [None]:

import os
import random
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

import lightgbm as lgb
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    set_seed,
)
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline

import cudf
import cupy as cp
from cuml.feature_extraction.text import TfidfVectorizer as CuTfidfVectorizer
from cuml.linear_model import MBSGDClassifier


SEED = 42
EXTRA_SEEDS = [123, 2029]  # opcional: sumar seeds para ensembles más robustos
SEED_LIST = [SEED] + EXTRA_SEEDS
set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {DEVICE}")

# Notebook config (ajusta aquí en vez de usar variables de entorno)
BASE_MODEL = "microsoft/deberta-v3-base"
MAX_LENGTH = 512
N_SPLITS = 5
NUM_EPOCHS = 3
STACK_BASE_MODELS = ["deberta", "lgb", "sgd"]

DEBERTA_LR = 1.5e-5
DEBERTA_WEIGHT_DECAY = 0.05
DEBERTA_WARMUP_RATIO = 0.1
LABEL_SMOOTHING = 0.05
USE_GRADIENT_CHECKPOINTING = True

# Multi-GPU / batch settings
AVAILABLE_GPUS = list(range(torch.cuda.device_count())) if torch.cuda.is_available() else []
REQUESTED_GPU_IDS = []  # ejemplo: [0, 1, 2, 3]; si está vacío usa los disponibles
if REQUESTED_GPU_IDS:
    AVAILABLE_GPUS = [g for g in REQUESTED_GPU_IDS if g in AVAILABLE_GPUS] or REQUESTED_GPU_IDS
PARALLEL_FOLDS = 0  # pon >1 para paralelizar folds
if PARALLEL_FOLDS > len(AVAILABLE_GPUS):
    print(f"PARALLEL_FOLDS limited to {len(AVAILABLE_GPUS)} available GPUs")
    PARALLEL_FOLDS = len(AVAILABLE_GPUS)
DEBERTA_TRAIN_BATCH = 8
DEBERTA_EVAL_BATCH = 16
DEBERTA_GRAD_ACCUM = 4
DEBERTA_NUM_WORKERS = 2
if PARALLEL_FOLDS > 1:
    print(f"Parallel DeBERTa folds enabled: {PARALLEL_FOLDS} workers over GPUs {AVAILABLE_GPUS}")

# Data paths
DATA_PATH = None  # Path a un CSV específico; None intenta auto-descubrir
TEST_PATH_OVERRIDE = None  # Path a test; None para autodetectar

# Preprocesado/ajustes varios
LOG1P_THRESHOLD = 1000.0
GROUP_COLUMN = None  # e.g. "source" para GroupKFold
LGB_TIME_BUDGET = 0  # segundos; 0 = sin límite
CPU_TRIM_CHARS = 1200
CPU_MAX_FEATURES = 300000
TFIDF_LOGREG = True


CWD = Path.cwd()
data_override = Path(DATA_PATH).expanduser() if DATA_PATH else None
candidate_paths = [
    data_override,
    CWD / "merged_ai_human_multisocial_features_cleaned_train.csv",
    CWD / "src/ai_vs_human/merged_ai_human_multisocial_features_cleaned_train.csv",
    CWD / "merged_ai_human_multisocial_features_train.csv",
    CWD / "src/ai_vs_human/merged_ai_human_multisocial_features_train.csv",
    CWD / "merged_ai_human_multisocial_features_cleaned.csv",
    CWD / "src/ai_vs_human/merged_ai_human_multisocial_features_cleaned.csv",
    CWD / "merged_ai_human_multisocial_features.csv",
    CWD / "src/ai_vs_human/merged_ai_human_multisocial_features.csv",
    CWD / "ai_human_content_detection_dataset.csv",
    CWD / "src/ai_vs_human/ai_human_content_detection_dataset.csv",
]
candidate_paths = [p for p in candidate_paths if p is not None]
for candidate_path in candidate_paths:
    if candidate_path.exists():
        DATA_PATH = candidate_path
        break
else:
    raise FileNotFoundError(
        "No training data file found. Set DATA_PATH or place merged_ai_human_multisocial_features_train.csv (or merged_ai_human_multisocial_features.csv / ai_human_content_detection_dataset.csv) in the repo."
    )
print(f"Using training file: {DATA_PATH}")

WORK_DIR = DATA_PATH.parent
MODEL_DIR = WORK_DIR / "models" / "deberta_v3_base"
LGB_MODEL_DIR = WORK_DIR / "models" / "lightgbm"
LGB_MODEL_DIR_LEGACY = WORK_DIR / "models" / "lightgbm_numeric"
SGD_MODEL_DIR = WORK_DIR / "models" / "tfidf_sgd"
STACK_MODEL_DIR = WORK_DIR / "models" / "stack_meta"
for path in (MODEL_DIR, LGB_MODEL_DIR, LGB_MODEL_DIR_LEGACY, SGD_MODEL_DIR, STACK_MODEL_DIR):
    path.mkdir(parents=True, exist_ok=True)
OOF_DIR = WORK_DIR / "oof"
OOF_DIR.mkdir(exist_ok=True)
paired_test = None
if DATA_PATH.suffix:
    name = DATA_PATH.name
    if name.endswith("_train" + DATA_PATH.suffix):
        paired_test = DATA_PATH.with_name(name.replace("_train", "_test"))
    else:
        paired_test = DATA_PATH.with_name(DATA_PATH.stem + "_test" + DATA_PATH.suffix)
test_override = Path(TEST_PATH_OVERRIDE).expanduser() if TEST_PATH_OVERRIDE else None
test_candidates = [
    test_override,
    paired_test,
    WORK_DIR / "merged_ai_human_multisocial_features_test.csv",
    WORK_DIR / "ai_human_content_detection_test.csv",
    WORK_DIR / "ai_human_content_detection_dataset.csv",
    DATA_PATH,  # fallback: reuse train data so inference still runs
]
test_candidates = [p for p in test_candidates if p is not None]
TEST_PATH = next((p for p in test_candidates if p.exists()), test_candidates[0])
print(f"Using test file: {TEST_PATH}")
if TEST_PATH == DATA_PATH:
    print("TEST_PATH not provided; using training data as a smoke-test for inference.")


Device: cuda
Using training file: /home/ruben/EC/src/ai_vs_human/merged_ai_human_multisocial_features_cleaned_train.csv
Using test file: /home/ruben/EC/src/ai_vs_human/merged_ai_human_multisocial_features_cleaned_test.csv


In [33]:

# Load data and inspect
train_df = pd.read_csv(DATA_PATH)
test_df = pd.read_csv(TEST_PATH) if TEST_PATH.exists() else None

text_col = "text"
label_col = "label"
alt_text_cols = ("text_content",)
meta_prefixes = ("src_", "lang_", "model_", "ds_")  # exclude metadata/lang/model/dataset to avoid leakage

if text_col not in train_df.columns:
    for alt_col in alt_text_cols:
        if alt_col in train_df.columns:
            train_df = train_df.rename(columns={alt_col: text_col})
            break
if text_col not in train_df.columns:
    raise ValueError(f"Training data missing `{text_col}` column; set DATA_PATH to a file containing text.")

# Drop exact duplicate text+label rows to reduce leakage
initial_rows = len(train_df)
train_df = train_df.drop_duplicates(subset=[text_col, label_col]).reset_index(drop=True)
if len(train_df) != initial_rows:
    print(f"Dropped {initial_rows - len(train_df)} duplicate rows (text+label)")

# Optional: normalize skewed numeric columns later
log1p_threshold = LOG1P_THRESHOLD
drop_numeric = {"grammar_errors", "length"}
num_cols = [
    c
    for c in train_df.columns
    if c not in [text_col, label_col]
    and c not in drop_numeric
    and pd.api.types.is_numeric_dtype(train_df[c])
    and not any(c.startswith(pref) for pref in meta_prefixes)
]

train_df[num_cols] = train_df[num_cols].apply(pd.to_numeric, errors="coerce")
num_medians = train_df[num_cols].median()
train_df[num_cols] = train_df[num_cols].fillna(num_medians)
if test_df is not None:
    if text_col not in test_df.columns:
        for alt_col in alt_text_cols:
            if alt_col in test_df.columns:
                test_df = test_df.rename(columns={alt_col: text_col})
                break
    if text_col not in test_df.columns:
        raise ValueError(f"Test data missing `{text_col}` column; set TEST_PATH to a file containing text.")
    missing_num_cols = [c for c in num_cols if c not in test_df.columns]
    for col in missing_num_cols:
        test_df[col] = num_medians[col]
    if missing_num_cols:
        print(f"Filled missing numeric columns in test set: {len(missing_num_cols)} (using train medians)")
    test_df[num_cols] = test_df[num_cols].apply(pd.to_numeric, errors="coerce")
    test_df[num_cols] = test_df[num_cols].fillna(num_medians)

# Log1p transform for heavily skewed positive numeric features
log1p_cols = [
    c
    for c in num_cols
    if (train_df[c] > 0).all()
    and train_df[c].max() > log1p_threshold
]
if log1p_cols:
    train_df[log1p_cols] = np.log1p(train_df[log1p_cols])
    if test_df is not None:
        test_df[log1p_cols] = np.log1p(test_df[log1p_cols])
    preview = log1p_cols[:5]
    print(f"Applied log1p to skewed columns: {preview}{'...' if len(log1p_cols) > 5 else ''}")

print(f"Train shape: {train_df.shape}")
print(f"Numeric features ({len(num_cols)}): {num_cols[:10]}{'...' if len(num_cols) > 10 else ''}")
print(train_df[[text_col, label_col]].head(2))
print(train_df[num_cols].describe().T.head())

# Quick correlation check to spot potential leakage/high-signal numeric features
corr = (
    train_df[num_cols + [label_col]]
    .corr()[label_col]
    .drop(label_col)
    .sort_values(key=np.abs, ascending=False)
)
print("Top correlated numeric features with label:")
print(corr.head(10))

y = train_df[label_col].astype(int).values

# Prefer GroupKFold if there is a dataset/source column to avoid leakage
preferred_group_col = GROUP_COLUMN

def detect_group_column(df):
    if preferred_group_col and preferred_group_col in df.columns:
        return preferred_group_col
    for col in df.columns:
        if any(col.startswith(pref) for pref in meta_prefixes):
            nunique = df[col].nunique()
            if 1 < nunique < len(df):
                return col
    return None

_group_col = detect_group_column(train_df)
if _group_col:
    groups = train_df[_group_col]
    n_groups = groups.nunique()
    if n_groups >= N_SPLITS:
        cv_splitter = GroupKFold(n_splits=N_SPLITS)
        folds = list(cv_splitter.split(train_df[text_col], y, groups))
        print(f"Using GroupKFold on column `{_group_col}` (n_groups={n_groups})")
    elif n_groups > 1:
        cv_splitter = GroupKFold(n_splits=n_groups)
        folds = list(cv_splitter.split(train_df[text_col], y, groups))
        print(f"Using GroupKFold on `{_group_col}` with reduced splits={n_groups} (n_groups={n_groups})")
    else:
        cv_splitter = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
        folds = list(cv_splitter.split(train_df[text_col], y))
        print(f"Group column `{_group_col}` has <=1 group; falling back to StratifiedKFold")
else:
    cv_splitter = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    folds = list(cv_splitter.split(train_df[text_col], y))
    print("Using StratifiedKFold")


Applied log1p to skewed columns: ['character_count']
Train shape: (464171, 52)
Numeric features (13): ['word_count', 'character_count', 'sentence_count', 'lexical_diversity', 'avg_sentence_length', 'avg_word_length', 'punctuation_ratio', 'flesch_reading_ease', 'gunning_fog_index', 'passive_voice_ratio']...
                                                text  label
0  Cars. Cars have been around since they became ...      0
1  Transportation is a large necessity in most co...      0
                        count        mean         std       min         25%  \
word_count           464171.0  396.088289  162.141661  3.000000  283.000000   
character_count      464171.0    7.634398    0.423063  2.708050    7.367709   
sentence_count       464171.0   20.511417    8.551840  1.000000   14.000000   
lexical_diversity    464171.0    0.450273    0.089061  0.277778    0.388350   
avg_sentence_length  464171.0   20.170071    5.702400  2.000000   16.500000   

                            50%      

In [34]:
# Helper classes and metrics
class HFTextDataset(Dataset):
    def __init__(self, df, tokenizer, text_col, label_col=None, max_length=256):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.text_col = text_col
        self.label_col = label_col
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = str(self.df.loc[idx, self.text_col])
        enc = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding=False,
        )
        if self.label_col is not None:
            enc["labels"] = int(self.df.loc[idx, self.label_col])
        return enc


def softmax_logits(logits):
    logits = torch.tensor(logits)
    probs = torch.softmax(logits, dim=1).cpu().numpy()
    return probs[:, 1]


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = softmax_logits(logits)
    return {"roc_auc": roc_auc_score(labels, probs)}


In [35]:

# Model A: DeBERTa-v3-base with Stratified/Group K-Fold OOF (optionally multi-GPU parallel across folds)
import multiprocessing as mp

def _train_deberta_fold(job):
    fold = job["fold"]
    device_id = job.get("device_id")
    train_slice = job["train_df"]
    val_slice = job["val_df"]
    val_idx = job["val_idx"]

    if device_id is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)
    if torch.cuda.is_available():
        torch.cuda.set_device(0)
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.benchmark = True

    fold_dir = MODEL_DIR / f"fold_{fold}"
    fold_dir.mkdir(parents=True, exist_ok=True)

    tokenizer_local = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
    collator_local = DataCollatorWithPadding(tokenizer=tokenizer_local, padding=True)

    train_ds = HFTextDataset(train_slice, tokenizer_local, text_col, label_col, MAX_LENGTH)
    val_ds = HFTextDataset(val_slice, tokenizer_local, text_col, label_col, MAX_LENGTH)

    model = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL, num_labels=2
    )
    if USE_GRADIENT_CHECKPOINTING:
        try:
            model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
        except TypeError:
            try:
                model.gradient_checkpointing_enable(use_reentrant=False)
            except TypeError:
                model.gradient_checkpointing_enable()
        if getattr(model, "config", None) is not None and hasattr(model.config, "use_cache"):
            model.config.use_cache = False

    training_kwargs = dict(
        output_dir=str(fold_dir),
        per_device_train_batch_size=DEBERTA_TRAIN_BATCH,
        per_device_eval_batch_size=DEBERTA_EVAL_BATCH,
        gradient_accumulation_steps=DEBERTA_GRAD_ACCUM,
        num_train_epochs=NUM_EPOCHS,
        learning_rate=DEBERTA_LR,
        weight_decay=DEBERTA_WEIGHT_DECAY,
        warmup_ratio=DEBERTA_WARMUP_RATIO,
        fp16=torch.cuda.is_available(),
        logging_steps=50,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="roc_auc",
        greater_is_better=True,
        dataloader_num_workers=DEBERTA_NUM_WORKERS,
        report_to="none",
        lr_scheduler_type="cosine",
        label_smoothing_factor=LABEL_SMOOTHING,
        gradient_checkpointing=False,  # manual GC above with use_reentrant=False to avoid double-backward issues
        ddp_find_unused_parameters=False,
    )

    # Handle TrainingArguments API differences across transformers versions
    try:
        args = TrainingArguments(
            **training_kwargs,
            evaluation_strategy="epoch",
            save_strategy="epoch",
        )
    except TypeError:
        try:
            args = TrainingArguments(
                **training_kwargs,
                evaluate_during_training=True,
                eval_steps=500,
                save_steps=500,
            )
        except TypeError:
            fallback_kwargs = training_kwargs.copy()
            for key in (
                "load_best_model_at_end",
                "metric_for_best_model",
                "greater_is_better",
                "lr_scheduler_type",
                "label_smoothing_factor",
                "gradient_checkpointing",
                "ddp_find_unused_parameters",
            ):
                fallback_kwargs.pop(key, None)
            args = TrainingArguments(**fallback_kwargs)

    # Ensure metric_for_best_model exists for EarlyStopping across transformers versions
    if getattr(args, "metric_for_best_model", None) is None:
        args.metric_for_best_model = "roc_auc"
    if getattr(args, "greater_is_better", None) is None:
        args.greater_is_better = True
    # Force evaluation/save strategy to something valid (epoch)
    if getattr(args, "evaluation_strategy", None) in (None, "no", "none"):
        args.evaluation_strategy = "epoch"
    # Some versions expose eval_strategy alias used in Trainer internals
    if getattr(args, "eval_strategy", None) in (None, "no", "none"):
        args.eval_strategy = getattr(args, "evaluation_strategy", "epoch")
    if getattr(args, "save_strategy", None) in (None, "no", "none"):
        args.save_strategy = getattr(args, "evaluation_strategy", "epoch")
    if getattr(args, "load_best_model_at_end", None) is None:
        args.load_best_model_at_end = True

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer_local,
        data_collator=collator_local,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
    )

    trainer.train()

    preds = trainer.predict(val_ds).predictions
    oof_preds = softmax_logits(preds)

    best_dir = trainer.state.best_model_checkpoint or str(fold_dir / "best")
    if trainer.state.best_model_checkpoint is None:
        trainer.save_model(best_dir)

    torch.cuda.empty_cache()
    return fold, val_idx, oof_preds, best_dir, device_id

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

oof_deberta = np.zeros(len(train_df))
deverta_model_paths = [None] * len(folds)

jobs = []
for fold, (train_idx, val_idx) in enumerate(folds):
    print(f"[Model A] Preparing fold {fold+1}/{N_SPLITS}")
    train_slice = train_df.iloc[train_idx][[text_col, label_col]].reset_index(drop=True)
    val_slice = train_df.iloc[val_idx][[text_col, label_col]].reset_index(drop=True)
    device_id = None
    if PARALLEL_FOLDS and len(AVAILABLE_GPUS):
        device_id = AVAILABLE_GPUS[fold % len(AVAILABLE_GPUS)]
    jobs.append({"fold": fold, "train_df": train_slice, "val_df": val_slice, "val_idx": val_idx, "device_id": device_id})

if PARALLEL_FOLDS > 1 and len(AVAILABLE_GPUS):
    max_workers = min(PARALLEL_FOLDS, len(AVAILABLE_GPUS), len(jobs))
    print(f"[Model A] Running folds in parallel on GPUs {AVAILABLE_GPUS} (workers={max_workers})")
    ctx = mp.get_context("spawn")
    with ctx.Pool(processes=max_workers) as pool:
        for fold, val_idx, preds, best_dir, device_id in pool.imap_unordered(_train_deberta_fold, jobs):
            print(f"[Model A] Fold {fold+1} finished on GPU {device_id}")
            oof_deberta[val_idx] = preds
            deverta_model_paths[fold] = best_dir
else:
    for job in jobs:
        fold, val_idx, preds, best_dir, device_id = _train_deberta_fold(job)
        print(f"[Model A] Fold {fold+1}/{N_SPLITS} (GPU {device_id})")
        oof_deberta[val_idx] = preds
        deverta_model_paths[job["fold"]] = best_dir

# Remove potential None placeholders and keep existing behavior
_deberta_model_paths_clean = [p for p in deverta_model_paths if p is not None]
deverta_model_paths = _deberta_model_paths_clean


deverta_oof_auc = roc_auc_score(y, oof_deberta)
print(f"Model A OOF ROC-AUC: {deverta_oof_auc:.5f}")
pd.DataFrame({"oof_deberta": oof_deberta}).to_csv(OOF_DIR / "oof_deberta.csv", index=False)


[Model A] Preparing fold 1/5
[Model A] Preparing fold 2/5


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:

# Model B: LightGBM on numerical features only (Optuna tuning)
import optuna
from optuna.integration import lightgbm as lgb_optuna

oof_lgb = np.zeros(len(train_df))
lgb_models = []
lgb_model_paths = []
lgb_fold_auc = []

time_budget = int(LGB_TIME_BUDGET)

# Tuning con los mismos folds que el CV de entrenamiento
train_data = lgb.Dataset(train_df[num_cols], label=y)

tuner_params = {
    "objective": "binary",
    "metric": "auc",
    "boosting": "gbdt",
    "n_estimators": 3000,
    "learning_rate": 0.015,
    "verbosity": -1,
    "feature_pre_filter": False,
    "scale_pos_weight": (len(y) - y.sum()) / y.sum(),
    "num_leaves": 63,
    "max_depth": -1,
    "min_child_samples": 40,
    "colsample_bytree": 0.6,
    "subsample": 0.8,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "random_state": 42,
    "bagging_freq": 1,
    "reg_lambda": 1.5,
    "reg_alpha": 0.5,
    "min_split_gain": 0.0,
}

tuner_kwargs = dict(
    params=tuner_params,
    train_set=train_data,
    folds=folds,
    num_boost_round=2500,
    callbacks=[lgb.early_stopping(100, verbose=False)],
    return_cvbooster=True,
    study=optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=SEED)),
)
if time_budget > 0:
    tuner_kwargs["time_budget"] = time_budget

# Ejecuta tuning (ajusta time_budget o usa study.sampler si quieres limitar)
tuner = lgb_optuna.LightGBMTunerCV(**tuner_kwargs)
tuner.run()

best_params = tuner.best_params
best_cvbooster = None
if hasattr(tuner, "get_best_booster"):
    try:
        best_cvbooster = tuner.get_best_booster()
    except Exception:
        best_cvbooster = None
best_iter = getattr(best_cvbooster, "best_iteration", None) if best_cvbooster is not None else None
if best_iter is None:
    best_iter = best_params.get("num_boost_round") or best_params.get("n_estimators") or 2500
best_params.pop("metric", None)
best_params.pop("feature_pre_filter", None)
best_params.pop("num_boost_round", None)
best_params.update({
    "n_estimators": best_iter,
    "random_state": SEED,
    "n_jobs": -1,
})
print("[Model B] Mejores parametros:", best_params)

for fold, (train_idx, val_idx) in enumerate(folds):
    print(f"[Model B] Fold {fold+1}/{N_SPLITS}")
    X_train = train_df.iloc[train_idx][num_cols]
    y_train = y[train_idx]
    X_val = train_df.iloc[val_idx][num_cols]
    y_val = y[val_idx]

    fold_dir = LGB_MODEL_DIR / f"fold_{fold}"
    legacy_fold_dir = LGB_MODEL_DIR_LEGACY / f"fold_{fold}"
    fold_dir.mkdir(parents=True, exist_ok=True)
    legacy_fold_dir.mkdir(parents=True, exist_ok=True)

    model = lgb.LGBMClassifier(**best_params, random_state=SEED + fold)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="auc",
        callbacks=[lgb.early_stopping(100, verbose=False)],
    )

    val_preds = model.predict_proba(X_val, num_iteration=model.best_iteration_)[:, 1]
    fold_auc = roc_auc_score(y_val, val_preds)
    print(f"[Model B] Fold {fold+1} AUC: {fold_auc:.5f} (best_iter={model.best_iteration_})")

    oof_lgb[val_idx] = val_preds
    lgb_models.append(model)

    model.booster_.save_model(str(fold_dir / "best.txt"), num_iteration=model.best_iteration_)
    model.booster_.save_model(str(legacy_fold_dir / "best.txt"), num_iteration=model.best_iteration_)
    lgb_model_paths.append(fold_dir / "best.txt")
    lgb_fold_auc.append(fold_auc)

lgb_oof_auc = roc_auc_score(y, oof_lgb)
print(f"Model B OOF ROC-AUC: {lgb_oof_auc:.5f}")
pd.DataFrame({"oof_lgb": oof_lgb}).to_csv(OOF_DIR / "oof_lgb.csv", index=False)
pd.DataFrame({"fold": np.arange(N_SPLITS), "fold_auc": lgb_fold_auc}).to_csv(
    OOF_DIR / "oof_lgb_folds.csv", index=False
)


In [None]:

# Model C: TF-IDF (char n-grams) + SGD/LogReg (GPU via RAPIDS cuML when possible)
# VRAM-safe defaults; GPU disabled by default to avoid OOM/compat issues.
force_cpu_sgd = True  # set to False to try GPU path
use_gpu_sgd = torch.cuda.is_available() and not force_cpu_sgd
gpu_trim_chars = 400  # truncate text for GPU path to reduce n-grams
gpu_ngram_range = (3, 5)
gpu_min_df = 2
gpu_max_features = 80000
cpu_trim_chars = int(CPU_TRIM_CHARS)  # more contexto por defecto
cpu_ngram_range = (3, 5)
cpu_min_df = 2
cpu_max_features = int(CPU_MAX_FEATURES)
cpu_n_jobs = -1  # use all CPU cores for parallel folds
use_logreg_tfidf = bool(TFIDF_LOGREG)  # LogisticRegression sobre TF-IDF (mejor calibración)
try:
    cp.cuda.runtime.getDeviceCount()
except Exception as e:
    print(f"[Model C] GPU not usable (CuPy/CUDA check failed): {e}")
    use_gpu_sgd = False

oof_sgd = np.zeros(len(train_df))
sgd_models = []
sgd_model_paths = []
sgd_fold_auc = []

print("====================================")
print("[Model C] Training on GPU with RAPIDS cuML")
print("====================================")

if use_gpu_sgd:
    try:
        for fold, (train_idx, val_idx) in enumerate(folds):
            print(f"[Model C] Fold {fold+1}/{N_SPLITS}")

            train_text = train_df.iloc[train_idx][text_col].astype(str)
            val_text = train_df.iloc[val_idx][text_col].astype(str)
            if gpu_trim_chars is not None:
                train_text = train_text.str.slice(stop=gpu_trim_chars)
                val_text = val_text.str.slice(stop=gpu_trim_chars)

            X_train_gpu = cudf.Series(train_text.values)
            X_val_gpu = cudf.Series(val_text.values)

            y_train_gpu = cp.array(y[train_idx], dtype=cp.float32)
            y_val = y[val_idx]

            fold_dir = SGD_MODEL_DIR / f"fold_{fold}"
            fold_dir.mkdir(parents=True, exist_ok=True)

            tfidf_gpu = CuTfidfVectorizer(
                analyzer="char",
                ngram_range=gpu_ngram_range,
                min_df=gpu_min_df,
                max_features=gpu_max_features,
            )

            X_train_tfidf = tfidf_gpu.fit_transform(X_train_gpu)
            X_val_tfidf = tfidf_gpu.transform(X_val_gpu)

            clf = MBSGDClassifier(
                loss="log",
                penalty="l2",
                alpha=1e-4,
                epochs=2000,
                tol=1e-3,
                learning_rate="adaptive",
            )

            clf.fit(X_train_tfidf, y_train_gpu)

            val_probs_gpu = clf.predict_proba(X_val_tfidf)
            val_preds = val_probs_gpu.values[:, 1].get() if hasattr(val_probs_gpu, "values") else val_probs_gpu[:, 1].get()

            fold_auc = roc_auc_score(y_val, val_preds)
            print(f"[Model C] Fold {fold+1} AUC: {fold_auc:.5f}")

            oof_sgd[val_idx] = val_preds
            sgd_models.append(clf)
            joblib.dump(clf, fold_dir / "best.joblib")
            sgd_model_paths.append(fold_dir / "best.joblib")
            sgd_fold_auc.append(fold_auc)

            cp.get_default_memory_pool().free_all_blocks()
            cp.get_default_pinned_memory_pool().free_all_blocks()
    except Exception as e:
        print(f"[Model C] GPU training failed, falling back to CPU TF-IDF+SGD. Error: {e}")
        oof_sgd = np.zeros(len(train_df))
        sgd_models = []
        sgd_model_paths = []
        sgd_fold_auc = []
        use_gpu_sgd = False

if not use_gpu_sgd:
    print("====================================")
    print("[Model C] Training on CPU with sklearn TF-IDF + SGDClassifier/LogReg")
    print("====================================")

    sgd_param_grid = [
        {"alpha": 1e-4, "eta0": 0.1},
        {"alpha": 3e-4, "eta0": 0.05},
        {"alpha": 1e-5, "eta0": 0.2},
    ]

    def train_cpu_fold(fold, train_idx, val_idx):
        print(f"[Model C-CPU] Fold {fold+1}/{N_SPLITS}")

        X_train_cpu = train_df.iloc[train_idx][text_col].astype(str)
        X_val_cpu = train_df.iloc[val_idx][text_col].astype(str)
        if cpu_trim_chars is not None:
            X_train_cpu = X_train_cpu.str.slice(stop=cpu_trim_chars)
            X_val_cpu = X_val_cpu.str.slice(stop=cpu_trim_chars)
        y_val = y[val_idx]

        tfidf_cpu = TfidfVectorizer(
            analyzer="char_wb",
            ngram_range=cpu_ngram_range,
            min_df=cpu_min_df,
            max_features=cpu_max_features,
            sublinear_tf=True,
            strip_accents="unicode",
        )

        X_train_tfidf = tfidf_cpu.fit_transform(X_train_cpu)
        X_val_tfidf = tfidf_cpu.transform(X_val_cpu)

        best_model = None
        best_auc = -np.inf
        best_preds = None

        if use_logreg_tfidf:
            clf = LogisticRegression(
                max_iter=800,
                n_jobs=-1,
                C=1.0,
                solver="lbfgs",
                class_weight="balanced",
            )
            clf.fit(X_train_tfidf, y[train_idx])
            val_preds = clf.predict_proba(X_val_tfidf)[:, 1]
            best_model, best_preds = clf, val_preds
            best_auc = roc_auc_score(y_val, val_preds)
        else:
            for params in sgd_param_grid:
                clf = SGDClassifier(
                    loss="log_loss",
                    penalty="l2",
                    alpha=params["alpha"],
                    max_iter=2000,
                    tol=1e-3,
                    random_state=SEED + fold,
                    learning_rate="adaptive",
                    eta0=params["eta0"],
                    class_weight="balanced",
                )

                clf.fit(X_train_tfidf, y[train_idx])
                val_preds = clf.predict_proba(X_val_tfidf)[:, 1]
                fold_auc = roc_auc_score(y_val, val_preds)
                if fold_auc > best_auc:
                    best_auc = fold_auc
                    best_model = clf
                    best_preds = val_preds

        fold_auc = roc_auc_score(y_val, best_preds)
        print(f"[Model C-CPU] Fold {fold+1} AUC: {fold_auc:.5f}")

        fold_dir = SGD_MODEL_DIR / f"fold_{fold}"
        fold_dir.mkdir(parents=True, exist_ok=True)
        joblib.dump(best_model, fold_dir / "best.joblib")

        return {
            "fold": fold,
            "val_idx": val_idx,
            "val_preds": best_preds,
            "clf": best_model,
            "fold_auc": fold_auc,
            "model_path": fold_dir / "best.joblib",
        }

    cpu_results = joblib.Parallel(n_jobs=cpu_n_jobs, backend="loky")(
        joblib.delayed(train_cpu_fold)(fold, train_idx, val_idx)
        for fold, (train_idx, val_idx) in enumerate(folds)
    )

    for res in sorted(cpu_results, key=lambda r: r["fold"]):
        oof_sgd[res["val_idx"]] = res["val_preds"]
        sgd_models.append(res["clf"])
        sgd_model_paths.append(res["model_path"])
        sgd_fold_auc.append(res["fold_auc"])

sgd_oof_auc = roc_auc_score(y, oof_sgd)
print(f"Model C OOF ROC-AUC: {sgd_oof_auc:.5f}")
pd.DataFrame({"oof_sgd": oof_sgd}).to_csv(OOF_DIR / "oof_sgd.csv", index=False)
fold_ids = list(range(len(sgd_fold_auc)))
fold_auc_values = list(sgd_fold_auc)
if len(fold_auc_values) != N_SPLITS:
    print(f"[Model C] Warning: expected {N_SPLITS} fold AUCs, got {len(fold_auc_values)}; padding with NaN")
    for missing_fold in range(len(fold_auc_values), N_SPLITS):
        fold_ids.append(missing_fold)
        fold_auc_values.append(np.nan)

sgd_folds_df = pd.DataFrame({"fold": fold_ids, "fold_auc": fold_auc_values})
sgd_folds_df.to_csv(OOF_DIR / "oof_sgd_folds.csv", index=False)


GPU available for Model C: True
[Model C] Training on GPU with RAPIDS cuML
[Model C] Fold 1/5
[Model C] GPU training failed, falling back to CPU TF-IDF+SGD. Error: std::bad_alloc: out_of_memory: CUDA error (failed to allocate 7152403344 bytes) at: /__w/rmm/rmm/cpp/include/rmm/mr/device/cuda_memory_resource.hpp:62: cudaErrorMemoryAllocation out of memory
Model C OOF ROC-AUC: 0.50000


In [None]:

# Stacked Ensemble: Logistic Regression meta-learner on OOF probabilities
# Use cached OOF predictions so this cell can run without retraining base models.
def load_oof_preds(name):
    existing = globals().get(f"oof_{name}")
    if existing is not None and len(existing):
        return np.asarray(existing)
    path = OOF_DIR / f"oof_{name}.csv"
    if path.exists():
        col = f"oof_{name}"
        df = pd.read_csv(path)
        if col in df:
            print(f"[Meta] Loaded {col} from {path}")
            return df[col].values
    return None

oof_sources = {}
missing_oof = []
length_mismatch = []
for key in STACK_BASE_MODELS:
    preds = load_oof_preds(key)
    if preds is None:
        missing_oof.append(key)
        continue
    if len(preds) != len(y):
        length_mismatch.append((key, len(preds)))
        continue
    oof_sources[key] = preds

if missing_oof:
    missing_msg = ", ".join(missing_oof)
    raise RuntimeError(
        f"Missing OOF predictions for: {missing_msg}. "
        "Run the corresponding training cells once to cache them to disk."
    )

if length_mismatch:
    mismatch_msg = "; ".join(f"{k} has {n} rows" for k, n in length_mismatch)
    raise RuntimeError(f"OOF size mismatch ({len(y)} rows expected): {mismatch_msg}")

stack_train = np.column_stack([oof_sources[k] for k in STACK_BASE_MODELS])

# Cross-validated meta learner to reduce overfitting
meta_candidates = [0.01, 0.1, 1.0, 10]
meta_skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

meta_learner = LogisticRegressionCV(
    Cs=meta_candidates,
    cv=meta_skf,
    max_iter=2000,
    n_jobs=-1,
    solver="lbfgs",
    scoring="roc_auc",
    class_weight="balanced",
)
meta_learner.fit(stack_train, y)
meta_learner.base_model_order = STACK_BASE_MODELS
best_c = float(np.ravel(meta_learner.C_)[0])
stack_auc = roc_auc_score(y, meta_learner.predict_proba(stack_train)[:, 1])
print(f"Meta-learner OOF ROC-AUC: {stack_auc:.5f} (best C={best_c})")

stack_model_path = STACK_MODEL_DIR / "meta_learner.joblib"
joblib.dump(meta_learner, stack_model_path)

stack_oof = {f"oof_{k}": oof_sources[k] for k in STACK_BASE_MODELS}
stack_oof["oof_stack"] = meta_learner.predict_proba(stack_train)[:, 1]
stack_oof[label_col] = y
pd.DataFrame(stack_oof).to_csv(OOF_DIR / "oof_stack.csv", index=False)


In [None]:
# Inference on test data (averaging fold predictions for each base model)
def _dedupe_paths(paths):
    seen = set()
    unique = []
    for p in paths:
        p = Path(p)
        if p.exists() and p not in seen:
            seen.add(p)
            unique.append(p)
    return unique
def predict_deberta(df):
    tok = globals().get("tokenizer") or AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
    coll = globals().get("collator") or DataCollatorWithPadding(tokenizer=tok, padding=True)
    globals()["tokenizer"] = tok
    globals()["collator"] = coll
    test_ds = HFTextDataset(df, tok, text_col, None, MAX_LENGTH)
    fold_paths = _dedupe_paths(
        [MODEL_DIR / f"fold_{f}" / "best" for f in range(N_SPLITS)]
        + list(globals().get("deberta_model_paths", []))
    )
    if not fold_paths:
        raise RuntimeError("No DeBERTa checkpoints found; run Model A training first.")
    fold_preds = []
    for path in fold_paths:
        model = AutoModelForSequenceClassification.from_pretrained(path).to(DEVICE)
        infer_trainer = Trainer(model=model, tokenizer=tok, data_collator=coll)
        preds = infer_trainer.predict(test_ds).predictions
        fold_preds.append(softmax_logits(preds))
        torch.cuda.empty_cache()
    return np.mean(fold_preds, axis=0)
def predict_lgb(df):
    feats = df[num_cols]
    lgb_dirs = [LGB_MODEL_DIR]
    if "LGB_MODEL_DIR_LEGACY" in globals():
        lgb_dirs.append(LGB_MODEL_DIR_LEGACY)

    candidate_paths = _dedupe_paths(
        [d / f"fold_{f}" / "best.txt" for d in lgb_dirs for f in range(N_SPLITS)]
        + list(globals().get("lgb_model_paths", []))
    )
    if candidate_paths:
        models = [lgb.Booster(model_file=str(p)) for p in candidate_paths]
    else:
        models = globals().get("lgb_models")
    if not models:
        raise RuntimeError("No LightGBM models found; run Model B training first.")

    fold_preds = []
    for m in models:
        if isinstance(m, lgb.Booster):
            fold_preds.append(m.predict(feats))
        else:
            fold_preds.append(m.predict_proba(feats, num_iteration=getattr(m, "best_iteration_", None))[:, 1])
    return np.mean(fold_preds, axis=0)
def predict_sgd(df):
    texts = df[text_col].astype(str)
    candidate_paths = _dedupe_paths(
        [SGD_MODEL_DIR / f"fold_{f}" / "best.joblib" for f in range(N_SPLITS)]
        + list(globals().get("sgd_model_paths", []))
    )
    if candidate_paths:
        models = [joblib.load(p) for p in candidate_paths]
    else:
        models = globals().get("sgd_models")
    if not models:
        raise RuntimeError("No TF-IDF+SGD models found; run Model C training first.")
    fold_preds = [m.predict_proba(texts)[:, 1] for m in models]
    return np.mean(fold_preds, axis=0)
def load_meta_model():
    saved_path = STACK_MODEL_DIR / "meta_learner.joblib"
    if saved_path.exists():
        return joblib.load(saved_path)
    fallback = globals().get("meta_learner")
    if fallback is None:
        raise RuntimeError("Meta-learner not trained yet; run the stacking cell.")
    return fallback
if test_df is not None:
    print("Running inference on test set...")
    base_preds = {
        "deberta": predict_deberta(test_df),
        "lgb": predict_lgb(test_df),
        "sgd": predict_sgd(test_df),
    }
    meta_for_inference = load_meta_model()
    base_order = getattr(meta_for_inference, "base_model_order", STACK_BASE_MODELS)
    stack_test = np.column_stack([base_preds[name] for name in base_order])
    test_pred = meta_for_inference.predict_proba(stack_test)[:, 1]
    submission = pd.DataFrame({"id": test_df.index, "prediction": test_pred})
    submission.to_csv(WORK_DIR / "submission.csv", index=False)
    print("Saved submission.csv")
else:
    print("No test file found; set TEST_PATH to run inference.")


In [None]:
# Model diagnostics summary (OOF and fold-level)
from collections import OrderedDict

# Helper to grab in-memory arrays first, otherwise fall back to saved OOF files

def load_oof(name):
    in_memory = globals().get(f"oof_{name}")
    if in_memory is not None:
        return np.asarray(in_memory)
    path = OOF_DIR / f"oof_{name}.csv"
    if path.exists():
        col = f"oof_{name}"
        df = pd.read_csv(path)
        if col in df:
            return df[col].values
    return None


def fold_scores(preds):
    scores = []
    for fold, (_, val_idx) in enumerate(folds):
        scores.append(roc_auc_score(y[val_idx], preds[val_idx]))
    return scores


summary_rows = []
for label, key in [
    ("Model A: DeBERTa-v3", "deberta"),
    ("Model B: LightGBM numeric", "lgb"),
    ("Model C: TF-IDF + SGD", "sgd"),
    ("Meta-learner (stack)", "stack"),
]:
    preds = load_oof(key)
    if preds is None:
        print(f"Skipping {label}: no OOF predictions found")
        continue
    overall = roc_auc_score(y, preds)
    folds_auc = fold_scores(preds) if len(preds) == len(y) else None
    summary_rows.append(
        OrderedDict(
            model=label,
            overall_auc=overall,
            fold_mean=np.mean(folds_auc) if folds_auc else None,
            fold_std=np.std(folds_auc) if folds_auc else None,
            min_fold=np.min(folds_auc) if folds_auc else None,
            max_fold=np.max(folds_auc) if folds_auc else None,
        )
    )

summary_df = pd.DataFrame(summary_rows)
if not summary_df.empty:
    display(summary_df.sort_values("overall_auc", ascending=False))
else:
    print("No OOF data available to summarize.")

# Meta-learner diagnostics (if saved/loaded)
meta_model = None
if 'load_meta_model' in globals():
    try:
        meta_model = load_meta_model()
    except Exception:
        meta_model = globals().get('meta_learner')
elif 'meta_learner' in globals():
    meta_model = meta_learner

if meta_model is not None and hasattr(meta_model, 'coef_'):
    coef = meta_model.coef_.ravel()
    bases = ['deberta', 'lgb', 'sgd'][: len(coef)]
    print("Meta-learner coefficients (positive -> higher AI probability):")
    display(pd.Series(coef, index=bases))
    if hasattr(meta_model, 'C'):
        print(f"Meta-learner C: {getattr(meta_model, 'C', None)}")

# Correlation between base model OOF predictions (helps assess diversity)
base_preds = {
    name: load_oof(name)
    for name in ['deberta', 'lgb', 'sgd']
}
if all(v is not None for v in base_preds.values()):
    corr_df = pd.DataFrame(base_preds)
    print("Correlation of base model OOF predictions:")
    display(corr_df.corr())

# Class balance reminder
pos_rate = y.mean()
print(f"Positive rate in training: {pos_rate:.4f} (n={len(y)})")
