In [None]:
!pip install memory_profiler optuna

# Imports

In [None]:
import os
import time
import gzip
import math
import json
import shutil
import zipfile
import requests
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from pandas.errors import ParserError

# Optional GPU memory
try:
    import cupy as cp
    from cupy.cuda import runtime as cuda_rt
    GPU_OK = True
except Exception:
    GPU_OK = False

import psutil
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from memory_profiler import memory_usage

# Baselines
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

try:
    import xgboost as xgb
    XGB_OK = True
except Exception:
    XGB_OK = False

try:
    import lightgbm as lgb
    LGB_OK = True
except Exception:
    LGB_OK = False

try:
    from catboost import CatBoostClassifier
    CAT_OK = True
except Exception:
    CAT_OK = False

# from adaptive_bayes import AdaptiveBayes
from improved_adaptive_bayes import AdaptiveBayes

# Functions

In [None]:
def _gpu_mem_info():
    """Return ``(free_bytes, total_bytes)`` as reported by ``cuda_rt.memGetInfo()``.

    When the optional CuPy import failed (``GPU_OK`` is false) there is
    nothing to query, so ``(None, None)`` is returned instead.
    """
    if GPU_OK:
        free_bytes, total_bytes = cuda_rt.memGetInfo()
        return free_bytes, total_bytes
    return None, None

def _proc_rss_mb():
    """Return the resident set size (RSS) of the current process in MiB."""
    bytes_per_mib = 1024 * 1024
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / bytes_per_mib

def _measure_run(fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)`` once and report timing and memory figures.

    The call is executed through ``memory_profiler.memory_usage`` so that
    process memory is sampled while it runs. GPU free memory is read before
    and after the call when CuPy is available.

    Returns:
        dict with keys ``elapsed_s``, ``cpu_rss_mb_before``,
        ``cpu_rss_mb_after``, ``cpu_rss_delta_mb``, ``cpu_peak_mb``,
        ``gpu_mem_delta_mb`` (``None`` without a GPU) and ``ret`` (the value
        returned by ``fn``).
    """
    # Baselines taken before the call starts.
    rss_start = _proc_rss_mb()
    gpu_free_start, _gpu_total_start = _gpu_mem_info()
    started_at = time.perf_counter()

    profiled = memory_usage(
        (lambda: fn(*args, **kwargs), (), {}),
        max_iterations=1,
        interval=0.1,
        retval=True,
    )
    # With retval=True the profiler hands back (samples, return_value).
    if isinstance(profiled, tuple) and len(profiled) == 2:
        samples, result = profiled
    else:
        samples, result = profiled, None

    elapsed = time.perf_counter() - started_at
    rss_end = _proc_rss_mb()
    gpu_free_end, _gpu_total_end = _gpu_mem_info()

    # Peak is expressed relative to the RSS measured before the call.
    if samples:
        peak_over_baseline = max(samples) - rss_start
    else:
        peak_over_baseline = 0.0

    # A drop in free GPU memory is reported as a positive delta, in MiB.
    if gpu_free_start is None or gpu_free_end is None:
        gpu_used_mb = None
    else:
        gpu_used_mb = (gpu_free_start - gpu_free_end) / (1024*1024)

    return {
        "elapsed_s": elapsed,
        "cpu_rss_mb_before": rss_start,
        "cpu_rss_mb_after": rss_end,
        "cpu_rss_delta_mb": rss_end - rss_start,
        "cpu_peak_mb": peak_over_baseline,
        "gpu_mem_delta_mb": gpu_used_mb,
        "ret": result
    }

# ---------------------------
# Dataset loaders (paths expected)
# ---------------------------
def load_creditcard_fraud(path_csv):
    """Load the credit-card fraud CSV and split it into features and labels.

    The ``Class`` column is the binary target; every other column is used as
    a feature.

    Returns:
        ``(X, y)`` where ``X`` is a float64 array and ``y`` an int32 array.
    """
    frame = pd.read_csv(path_csv)
    labels = frame['Class']
    features = frame.drop(columns=['Class'])
    X = features.to_numpy().astype(np.float64)
    y = labels.astype(np.int32).to_numpy()
    return X, y

def load_higgs(path_gz):
    """Load a gzip-compressed, headerless HIGGS CSV.

    The first column is the label and all remaining columns are features.

    Returns:
        ``(X, y)`` where ``X`` is a float64 array and ``y`` an int32 array.
    """
    with gzip.open(path_gz, mode='rt') as handle:
        table = pd.read_csv(handle, header=None)
    label_column = table.iloc[:, 0]
    feature_columns = table.iloc[:, 1:]
    return (
        feature_columns.values.astype(np.float64),
        label_column.astype(np.int32).values,
    )

def load_susy(path_gz):
    """Load a gzip-compressed, headerless SUSY CSV (label in the first column).

    Returns:
        ``(X, y)`` where ``X`` is a float64 array built from columns 1..end
        and ``y`` an int32 array built from column 0.
    """
    with gzip.open(path_gz, 'rt') as stream:
        raw = pd.read_csv(stream, header=None)
    targets = raw.iloc[:, 0].astype(np.int32).to_numpy()
    inputs = raw.iloc[:, 1:].to_numpy().astype(np.float64)
    return inputs, targets

def load_kddcup99(path_csv, drop_cats=True):
    """Load a headerless KDDCup99 CSV whose last column is the label.

    Args:
        path_csv: path to the CSV file (no header row).
        drop_cats: when true, keep only the numeric feature columns; when
            false, one-hot encode the non-numeric feature columns.

    Returns:
        ``(X, y)`` where ``X`` is a float64 feature matrix and ``y`` an int32
        label vector. A non-numeric label is mapped to 0 for ``'normal.'``
        and 1 for anything else; a numeric label is cast to int32 as-is.
    """
    df = pd.read_csv(path_csv, header=None)
    label_col = df.iloc[:, -1]
    # Separate the label by position first, so numeric-column filtering can
    # never remove a genuine feature in its place.
    features = df.iloc[:, :-1]

    if drop_cats:
        if pd.api.types.is_numeric_dtype(label_col):
            y = label_col.astype(np.int32).values
        else:
            y = (label_col.astype(str) != 'normal.').astype(np.int32).values
        X = features.select_dtypes(include=[np.number]).values.astype(np.float64)
    else:
        # One-hot encode the categorical feature columns.
        y = (label_col.astype(str) != 'normal.').astype(np.int32).values
        X = pd.get_dummies(features).values.astype(np.float64)
    return X, y

def load_covertype(path_csv):
    """Load the Covertype dataset from a CSV with or without a header row.

    A file whose first row consists solely of numbers is treated as
    headerless, so that row is kept as a sample instead of being consumed as
    column names. Otherwise the first row is used as the header.

    The target is the ``Cover_Type`` column when present, else the last
    column. Labels are shifted by -1 and kept multiclass.

    Returns:
        ``(X, y)`` where ``X`` is a float64 feature matrix and ``y`` an int32
        label vector.
    """
    def _looks_numeric(name):
        try:
            float(str(name))
        except ValueError:
            return False
        return True

    # Read only the header line to decide whether it is really a header.
    # pandas de-duplicates repeated names as "0.1", "0.2", ... which still
    # parse as numbers, so a numeric data row is detected reliably.
    probe = pd.read_csv(path_csv, nrows=0)
    headerless = len(probe.columns) > 0 and all(_looks_numeric(c) for c in probe.columns)

    df = pd.read_csv(path_csv, header=None if headerless else 0)
    target_col = 'Cover_Type' if 'Cover_Type' in df.columns else df.columns[-1]
    y = np.asarray(df[target_col], dtype=np.int32) - 1
    X = df.drop(columns=[target_col]).values.astype(np.float64)
    return X, y

def create_synthetic_hepmass(n_samples=500000, n_features=28, seed=42):
    """Create a synthetic binary-classification dataset in the style of HEPMASS.

    Args:
        n_samples: number of rows to generate.
        n_features: number of feature columns; at least 6, because the target
            is built from columns 0 through 5.
        seed: value passed to ``np.random.seed`` before sampling, so repeated
            calls with the same arguments return identical data. Note that
            this reseeds NumPy's global random state.

    Returns:
        ``(X, y)`` where ``X`` is a float64 array of shape
        ``(n_samples, n_features)`` and ``y`` an int32 array of 0/1 labels.

    Raises:
        ValueError: if ``n_features`` is smaller than 6 or ``n_samples`` is
            not positive.
    """
    if n_features < 6:
        raise ValueError("n_features must be at least 6")
    if n_samples <= 0:
        raise ValueError("n_samples must be positive")

    print("Creating synthetic HEPMASS-like dataset...")
    np.random.seed(seed)

    # Start from independent standard-normal features.
    X = np.random.randn(n_samples, n_features).astype(np.float64)

    # Make columns 1 and 2 non-linear functions of earlier columns plus noise.
    X[:, 1] = X[:, 0] ** 2 + 0.5 * np.random.randn(n_samples)
    X[:, 2] = X[:, 0] * X[:, 1] + 0.3 * np.random.randn(n_samples)

    # Target: sign of a noisy mix of linear, interaction and sine terms.
    signal = (0.3 * X[:, 0] + 0.2 * X[:, 1] - 0.1 * X[:, 2] +
              0.15 * X[:, 3] * X[:, 4] + 0.1 * np.sin(X[:, 5]))
    noise = 0.5 * np.random.randn(n_samples)
    y = (signal + noise > 0).astype(np.int32)

    print(f"Synthetic dataset: X={X.shape}, y={y.shape}, class balance={np.mean(y):.3f}")
    return X, y

def _read_hepmass_frame(path_csv):
    """Return the first multi-column parse of ``path_csv``, or ``None``.

    Several encodings and parser settings are tried in turn. A parse that
    yields a single column is rejected, because it usually means the wrong
    delimiter was used and the next setting should be tried.
    """
    encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
    for encoding in encodings:
        parsing_options = [
            # Standard comma-separated file
            {'encoding': encoding, 'sep': ','},
            # Whitespace / tab separated
            {'encoding': encoding, 'sep': r'\s+', 'engine': 'python'},
            # Skip malformed lines
            {'encoding': encoding, 'sep': ',', 'on_bad_lines': 'skip'},
            # No header row
            {'encoding': encoding, 'sep': ',', 'header': None, 'on_bad_lines': 'skip'},
        ]
        try:
            for options in parsing_options:
                try:
                    df = pd.read_csv(path_csv, **options)
                except (ParserError, pd.errors.EmptyDataError):
                    continue
                if df.shape[1] > 1:
                    print(f"Successfully loaded HEPMASS with encoding: {encoding}, options: {options}")
                    return df
        except UnicodeDecodeError:
            # Wrong encoding for this file: move on to the next one.
            continue
    return None


def load_hepmass(path_csv):
    """Load HEPMASS robustly, falling back to synthetic data when parsing fails.

    The label column is looked up by name (``'# label'``, ``'type'``,
    ``'label'``). If none is present, the last column is used when it is
    int64/float64 with at most 10 distinct values, otherwise the first
    column. Only numeric feature columns are kept.

    Returns:
        ``(X, y)`` as NumPy arrays: ``X`` float64 with NaN/inf replaced by
        finite values, ``y`` int32. Labels with more than two distinct values
        are binarized against their median. When the file cannot be parsed
        into a usable table, the result of ``create_synthetic_hepmass()`` is
        returned instead.
    """
    df = _read_hepmass_frame(path_csv)

    if df is None or df.empty:
        print("Failed to load HEPMASS, creating synthetic dataset...")
        return create_synthetic_hepmass()

    print(f"HEPMASS loaded: {df.shape}, columns: {list(df.columns)}")

    # Locate the target column, by name first and by position otherwise.
    label_name = next(
        (name for name in ('# label', 'type', 'label') if name in df.columns),
        None,
    )
    if label_name is not None:
        labels = df[label_name]
        features = df.drop(columns=[label_name])
    else:
        last_col = df.iloc[:, -1]
        if last_col.dtype in ['int64', 'float64'] and last_col.nunique() <= 10:
            labels, features = last_col, df.iloc[:, :-1]
        else:
            labels, features = df.iloc[:, 0], df.iloc[:, 1:]

    # Return plain ndarrays, like every other loader in this file.
    y = labels.astype(np.int32).to_numpy()
    X = features.select_dtypes(include=[np.number]).values.astype(np.float64)

    if X.shape[0] == 0 or X.shape[1] == 0:
        print("Empty feature matrix, creating synthetic...")
        return create_synthetic_hepmass()

    # Collapse multi-valued labels to binary if needed.
    n_classes = len(np.unique(y))
    if n_classes > 2:
        print(f"Converting {n_classes} classes to binary")
        y = (y > np.median(y)).astype(np.int32)

    # Replace NaN/inf values with finite numbers.
    X = np.nan_to_num(X, nan=0.0, posinf=1e6, neginf=-1e6)

    return X, y

def load_avazu(path_csv, sample_n=None, max_rows=1_000_000):
    """Load the head of the Avazu CTR file and turn every column into a number.

    Args:
        path_csv: path to the Avazu CSV (anything ``pd.read_csv`` accepts).
        sample_n: if given and smaller than the number of rows read, draw
            that many rows at random (``random_state=42``).
        max_rows: number of leading rows to read from the file.

    Returns:
        ``(X, y)`` where ``y`` is the int32 ``click`` column (or the second
        column when ``click`` is absent) and ``X`` is a float64 matrix of the
        remaining columns. Numeric columns are cast directly; all other
        columns are hashed into ``[0, 1_000_003)``.
    """
    # nrows reads the same leading rows as taking the first chunk of a
    # chunked reader, without leaving an open iterator behind.
    df = pd.read_csv(path_csv, nrows=max_rows)
    if sample_n is not None and len(df) > sample_n:
        df = df.sample(sample_n, random_state=42)

    # Competition format is 'id','click',...; fall back to the second column.
    label_col = 'click' if 'click' in df.columns else df.columns[1]
    y = df[label_col].astype(np.int32).values
    X = df.drop(columns=[label_col])

    # Hashing trick. The built-in hash() is salted per process for strings,
    # so pandas' deterministic hashing is used to keep features reproducible.
    modulus = np.uint64(1_000_003)
    columns = []
    for _, series in X.items():
        is_plain_number = (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )
        if is_plain_number:
            columns.append(series.astype(np.float64).values)
        else:
            hashed = pd.util.hash_pandas_object(series.astype(str), index=False).to_numpy()
            columns.append((hashed % modulus).astype(np.float64))
    X_num = np.vstack(columns).T
    return X_num, y

def train_eval_one_old(model_name, model_ctor, X_train, y_train, X_test, y_test, is_multiclass=False, use_gpu=False):
    """Legacy routine: fit one model, predict on the test split, score it.

    ``model_ctor`` is a dict of bound methods with the keys ``'fit'``,
    ``'predict'`` and optionally ``'predict_proba'``. AdaptiveBayes inputs
    are standardized first; XGBoost inputs (and ``y_test``) are moved to
    CuPy arrays; every other model gets the data unchanged. Probabilities
    are always requested on the unmodified ``X_test``.

    Returns:
        dict with the model name, fit/predict/proba timings, fit memory
        figures, accuracy and (binary tasks only) ROC AUC.
    """
    # Choose the inputs each model family is trained and evaluated on.
    if model_name == "AdaptiveBayes":
        scaler = StandardScaler()
        fit_inputs = scaler.fit_transform(X_train)
        predict_inputs = scaler.transform(X_test)
        fit_targets = y_train
    elif model_name == "XGBoost":
        fit_inputs = cp.asarray(X_train)
        fit_targets = cp.asarray(y_train)
        predict_inputs = cp.asarray(X_test)
        y_test = cp.asarray(y_test)
    else:
        # Plain training for all other models
        fit_inputs, fit_targets, predict_inputs = X_train, y_train, X_test

    fit_stats = _measure_run(model_ctor['fit'], fit_inputs, fit_targets)
    pred_stats = _measure_run(model_ctor['predict'], predict_inputs)
    predictions = pred_stats["ret"]

    # Probabilities (and AUC) only for binary tasks with predict_proba.
    auc = None
    proba_time = None
    if not is_multiclass and 'predict_proba' in model_ctor:
        proba_stats = _measure_run(model_ctor['predict_proba'], X_test)
        raw_proba = proba_stats["ret"]
        scores = raw_proba[:, 1] if y_prob_shape(raw_proba) else raw_proba
        auc = roc_auc_score(y_test, scores)
        proba_time = proba_stats["elapsed_s"]

    acc = accuracy_score(y_test, predictions)
    return {
        "model": model_name,
        "fit_s": fit_stats["elapsed_s"],
        "pred_s": pred_stats["elapsed_s"],
        "proba_s": proba_time,
        "cpu_peak_mb_fit": fit_stats["cpu_peak_mb"],
        "gpu_mem_mb_fit": fit_stats["gpu_mem_delta_mb"],
        "acc": acc,
        "auc": auc
    }

def _to_numpy(values):
    """Return ``values`` as a NumPy array, copying device arrays to the host.

    Only objects exposing ``__cuda_array_interface__`` (e.g. CuPy arrays) are
    converted via ``.get()``. A bare ``hasattr(values, 'get')`` test would
    also match pandas Series and dicts, whose ``.get`` requires a key.
    """
    if hasattr(values, '__cuda_array_interface__') and hasattr(values, 'get'):
        return values.get()
    return np.asarray(values)


def train_eval_one(model_name, model_ctor, X_train, y_train, X_test, y_test, is_multiclass=False, use_gpu=False):
    """Fit one model, predict on the test split and collect timing and scores.

    Args:
        model_name: display name; ``"AdaptiveBayes"`` additionally triggers
            standardization of the inputs.
        model_ctor: dict of bound methods with keys ``'fit'``, ``'predict'``
            and optionally ``'predict_proba'``.
        X_train, y_train, X_test, y_test: the train/test split.
        is_multiclass: when true, probabilities and AUC are skipped.
        use_gpu: accepted for interface compatibility; not used here.

    Returns:
        dict with keys ``model``, ``fit_s``, ``pred_s``, ``proba_s``,
        ``cpu_peak_mb_fit``, ``gpu_mem_mb_fit``, ``acc`` and ``auc``
        (``proba_s``/``auc`` are ``None`` when probabilities are skipped).
    """
    # sklearn metrics need host-side NumPy labels.
    y_test_np = _to_numpy(y_test)

    # AdaptiveBayes is trained and evaluated on standardized features.
    if model_name == "AdaptiveBayes":
        scaler = StandardScaler()
        train_inputs = scaler.fit_transform(X_train)
        test_inputs = scaler.transform(X_test)
    else:
        train_inputs, test_inputs = X_train, X_test

    fit_stats = _measure_run(model_ctor['fit'], train_inputs, y_train)
    pred_stats = _measure_run(model_ctor['predict'], test_inputs)
    y_pred = _to_numpy(pred_stats["ret"])

    auc = None
    proba_time = None
    if not is_multiclass and 'predict_proba' in model_ctor:
        proba_stats = _measure_run(model_ctor['predict_proba'], test_inputs)
        y_prob = _to_numpy(proba_stats["ret"])
        # An (n, 2) probability matrix is reduced to the positive-class
        # column; a 1-D score vector is used as returned.
        if y_prob_shape(y_prob):
            y_prob = y_prob[:, 1]
        auc = roc_auc_score(y_test_np, y_prob)
        proba_time = proba_stats["elapsed_s"]

    acc = accuracy_score(y_test_np, y_pred)

    return {
        "model": model_name,
        "fit_s": fit_stats["elapsed_s"],
        "pred_s": pred_stats["elapsed_s"],
        "proba_s": proba_time,
        "cpu_peak_mb_fit": fit_stats["cpu_peak_mb"],
        "gpu_mem_mb_fit": fit_stats["gpu_mem_delta_mb"],
        "acc": acc,
        "auc": auc
    }

def y_prob_shape(arr):
    """Return True when ``arr`` is a 2-D array with more than one column."""
    if arr.ndim != 2:
        return False
    n_columns = arr.shape[1]
    return n_columns > 1

def make_models(use_gpu):
    """Build the list of ``(name, methods)`` pairs used by the benchmark.

    ``methods`` is a dict holding the estimator's bound ``fit``, ``predict``
    and ``predict_proba``. XGBoost, LightGBM and CatBoost are included only
    when their optional imports succeeded. ``use_gpu`` selects the GPU
    configuration of the libraries that have one.
    """
    def _method_table(estimator):
        # Bound methods consumed by the evaluation routine.
        return {
            "fit": estimator.fit,
            "predict": estimator.predict,
            "predict_proba": estimator.predict_proba,
        }

    registry = []

    # AdaptiveBayes
    adaptive = AdaptiveBayes(base_lr=1e-3, eps=1e-10, device='gpu' if use_gpu else 'cpu')
    registry.append(("AdaptiveBayes", _method_table(adaptive)))

    # XGBoost: same settings on both devices, plus "device" on GPU
    if XGB_OK:
        xgb_params = {
            "n_estimators": 300,
            "max_depth": 8,
            "learning_rate": 0.1,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "tree_method": "hist",
            "eval_metric": "auc",
        }
        if use_gpu:
            xgb_params["device"] = "cuda"
        booster = xgb.XGBClassifier(**xgb_params)
        registry.append(("XGBoost", _method_table(booster)))

    # Random Forest
    forest = RandomForestClassifier(n_estimators=300, n_jobs=-1, max_depth=None)
    registry.append(("RandomForest", _method_table(forest)))

    # Neural Net (sklearn MLP)
    net = MLPClassifier(
        hidden_layer_sizes=(256, 128),
        batch_size=512,
        max_iter=20,
        solver='adam',
        early_stopping=True,
        random_state=42,
    )
    registry.append(("MLP", _method_table(net)))

    # LightGBM: device-specific hyperparameters
    if LGB_OK:
        if use_gpu:
            lgbm = lgb.LGBMClassifier(
                n_estimators=300,
                num_leaves=511,
                learning_rate=0.01,
                subsample=0.8,
                colsample_bytree=0.8,
                device_type='gpu',
                max_bin=127,
                min_data_in_leaf=100,      # minimum number of rows per leaf
                min_gain_to_split=0.01,    # minimum gain required to split
                verbose=-1                 # silence library warnings
            )
        else:
            lgbm = lgb.LGBMClassifier(
                n_estimators=500,
                num_leaves=255,
                learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                device_type='cpu',
                n_jobs=-1
            )
        registry.append(("LightGBM", _method_table(lgbm)))

    # CatBoost
    if CAT_OK:
        cat = CatBoostClassifier(
            iterations=500, depth=8, learning_rate=0.1, verbose=False,
            task_type="GPU" if use_gpu else "CPU"
        )
        registry.append(("CatBoost", _method_table(cat)))

    # Logistic Regression
    logreg = LogisticRegression(max_iter=200, solver='saga', n_jobs=-1)
    registry.append(("LogisticRegression", _method_table(logreg)))

    return registry

def run_benchmark(datasets_config, use_gpu=False, test_size=0.2, val_size=0.0, output_csv="results.csv"):
    """Train and evaluate every model on every configured dataset.

    Each entry of ``datasets_config`` is a dict with the keys ``name``,
    ``loader`` and ``path`` and the optional keys ``multiclass`` and
    ``sample_n``. The dataset named ``"Synthetic"`` is generated rather than
    loaded, and ``"Avazu"`` receives ``sample_n`` directly in its loader; all
    other datasets are loaded from ``path`` and then subsampled to
    ``sample_n`` rows if they are larger. ``val_size`` is accepted but not
    used.

    Returns:
        DataFrame with one row per (dataset, model); it is also written to
        ``output_csv``.
    """
    collected = []
    for entry in datasets_config:
        name, loader, path = entry["name"], entry["loader"], entry["path"]
        is_multiclass = entry.get("multiclass", False)
        sample_n = entry.get("sample_n")

        print(f"Loading {name} ...")
        if name == "Synthetic":
            X, y = create_synthetic_hepmass()
        elif name == "Avazu":
            X, y = load_avazu(path, sample_n=sample_n)
        else:
            X, y = loader(path)
            needs_subsample = sample_n is not None and len(X) > sample_n
            if needs_subsample:
                rng = np.random.RandomState(42)
                keep = rng.choice(len(X), size=sample_n, replace=False)
                X, y = X[keep], y[keep]
        print(f"{name}: X={X.shape}, y={y.shape}")

        # Stratify binary tasks only.
        stratify_on = None if is_multiclass else y
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=stratify_on
        )

        # Fresh estimators for every dataset.
        for model_name, methods in make_models(use_gpu=use_gpu):
            print(f"Training {model_name} on {name} ...")
            stats = train_eval_one(
                model_name, methods, X_tr, y_tr, X_te, y_te,
                is_multiclass=is_multiclass, use_gpu=use_gpu,
            )
            stats["dataset"] = name
            collected.append(stats)
            print(stats)

    results = pd.DataFrame(collected)
    results.to_csv(output_csv, index=False)
    print(f"Saved results to {output_csv}")
    return results

def download_file(url, dest, chunk_size=2**20, timeout=60):
    """Stream ``url`` to the local path ``dest``.

    The payload is written to ``<dest>.part`` and moved into place only after
    the transfer finished, so an interrupted download never leaves a
    truncated file at ``dest`` that later "already exists" checks would
    mistake for a complete one.

    Args:
        url: address to fetch.
        dest: destination file path.
        chunk_size: number of bytes requested per streamed chunk.
        timeout: seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block the call forever.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeouts.
    """
    tmp_path = f"{dest}.part"
    completed = False
    try:
        # The context manager releases the connection even on errors.
        with requests.get(url, stream=True, timeout=timeout) as resp:
            resp.raise_for_status()
            with open(tmp_path, 'wb') as f:
                for chunk in resp.iter_content(chunk_size):
                    if chunk:
                        f.write(chunk)
        os.replace(tmp_path, dest)
        completed = True
    finally:
        # Remove the partial file when the transfer did not finish.
        if not completed and os.path.exists(tmp_path):
            os.remove(tmp_path)

def download_all_datasets(data_dir='data/'):
    """Download the benchmark datasets into ``data_dir``, skipping existing files.

    Each dataset is fetched only when its final file is missing. Zip
    archives have a single member extracted and gzip files are decompressed;
    the downloaded archive is removed afterwards. HEPMASS is not downloaded
    by this function.
    """
    os.makedirs(data_dir, exist_ok=True)

    def in_data_dir(filename):
        return os.path.join(data_dir, filename)

    def fetch_direct(message, url, filename):
        # Store the payload exactly as served.
        target = in_data_dir(filename)
        if os.path.exists(target):
            return
        print(message)
        download_file(url, target)

    def fetch_zip_member(message, url, zip_name, member):
        # Download a zip, extract one member into data_dir, delete the zip.
        if os.path.exists(in_data_dir(member)):
            return
        print(message)
        archive = in_data_dir(zip_name)
        download_file(url, archive)
        with zipfile.ZipFile(archive) as zf:
            zf.extract(member, path=data_dir)
        os.remove(archive)

    def fetch_gunzipped(message, url, gz_name, out_name):
        # Download a .gz, decompress it to out_name, delete the .gz.
        target = in_data_dir(out_name)
        if os.path.exists(target):
            return
        print(message)
        compressed = in_data_dir(gz_name)
        download_file(url, compressed)
        with gzip.open(compressed, 'rb') as f_in, open(target, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.remove(compressed)

    # Credit Card Fraud (Dropbox-hosted copy; alternative to the Kaggle download)
    fetch_direct(
        "Downloading CreditCardFraud ...",
        "https://www.dropbox.com/s/b44o3t3ehmnx2b7/creditcard.csv?dl=1",
        "creditcard.csv",
    )

    # HIGGS (UCI)
    fetch_zip_member(
        "Downloading HIGGS ...",
        "https://archive.ics.uci.edu/static/public/280/higgs.zip",
        "higgs.zip",
        "HIGGS.csv.gz",
    )

    # SUSY (UCI)
    fetch_zip_member(
        "Downloading SUSY ...",
        "https://archive.ics.uci.edu/static/public/279/susy.zip",
        "susy.zip",
        "SUSY.csv.gz",
    )

    # KDDCup99 (10 percent), fetched from Figshare
    fetch_gunzipped(
        "Downloading KDDCup99 ...",
        "https://figshare.com/ndownloader/files/5976042",
        "kddcup.data_10_percent.gz",
        "kddcup.data_10_percent.csv",
    )

    # Covertype (UCI), decompressed to a CSV file
    fetch_gunzipped(
        "Downloading Covertype ...",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz",
        "covtype.data.gz",
        "covtype.csv",
    )

    # Avazu CTR (Kaggle dataset API endpoint); the payload is kept as a .zip
    fetch_direct(
        "Downloading Avazu...",
        "https://www.kaggle.com/api/v1/datasets/download/wuyingwen06/avazu-ctr-train",
        "avazu-ctr-train.zip",
    )

    print("Done downloading all datasets.")


In [None]:
# Download HEPMASS into the data directory if it is not there yet.
data_dir='data/'
os.makedirs(data_dir, exist_ok=True)
# Earlier variant, kept for reference: fetch a substitute dataset and, if that
# fails, write a random dummy table so the benchmark can still run.
# hepmass_url = "https://www.openml.org/data/get_csv/2419/BNG_balance-scale.csv"  # temporary replacement
# hepmass_csv = os.path.join(data_dir, "HEPMASS_train.csv")
# if not os.path.exists(hepmass_csv):
#     print("Downloading HEPMASS (alternative dataset)...")
#     try:
#         download_file(hepmass_url, hepmass_csv)
#     except Exception as e:
#         print(f"Failed to download HEPMASS: {e}")
#         # Create a stub so the whole benchmark is not interrupted
#         print("Creating dummy HEPMASS dataset...")
#         np.random.seed(42)
#         X_dummy = np.random.randn(10000, 28).astype(np.float64)
#         y_dummy = np.random.randint(0, 2, 10000).astype(np.int32)
#         dummy_df = pd.DataFrame(X_dummy)
#         dummy_df['label'] = y_dummy
#         dummy_df.to_csv(hepmass_csv, index=False)
# NOTE(review): the URL below points at a .zip archive, but download_file
# writes the bytes verbatim to a path ending in .csv. The file is therefore
# still a zip archive and probably needs extracting before it can be read
# as CSV — confirm the archive layout and add an extraction step.
hepmass_url = "https://archive.ics.uci.edu/static/public/347/hepmass.zip"
hepmass_csv = os.path.join(data_dir, "HEPMASS_train.csv")
if not os.path.exists(hepmass_csv):
    print("Downloading HEPMASS ...")
    download_file(hepmass_url, hepmass_csv)

Downloading HEPMASS ...


# Main body

In [None]:
# Fetch every benchmark dataset into data/; files that already exist are skipped.
download_all_datasets("data/")

Downloading CreditCardFraud ...
Downloading HIGGS ...
Downloading SUSY ...
Downloading KDDCup99 ...
Downloading Covertype ...
Downloading Avazu...
Done downloading all datasets.


In [None]:
# Example configuration; update paths to local files.
# Each entry gives the dataset name, its loader, the file path, whether the
# task is multiclass, and optionally sample_n (a cap on the number of rows).
# NOTE: run_benchmark generates the "Synthetic" dataset by calling
# create_synthetic_hepmass() directly, so its "path" and "sample_n" values
# are not used.
# NOTE: load_avazu reads only the first 1,000,000 rows of the file, so
# sample_n=2_000_000 never triggers subsampling for "Avazu".
datasets = [
    {"name": "CreditCardFraud", "loader": load_creditcard_fraud, "path": "data/creditcard.csv", "multiclass": False},
    {"name": "HIGGS", "loader": load_higgs, "path": "data/HIGGS.csv.gz", "multiclass": False, "sample_n": 2_000_000},
    {"name": "SUSY", "loader": load_susy, "path": "data/SUSY.csv.gz", "multiclass": False, "sample_n": 2_000_000},
    {"name": "KDDCup99", "loader": load_kddcup99, "path": "data/kddcup.data_10_percent.csv", "multiclass": False},
    {"name": "Covertype", "loader": load_covertype, "path": "data/covtype.csv", "multiclass": True},
    {"name": "Synthetic", "loader": create_synthetic_hepmass, "path": "", "multiclass": False, "sample_n": 1_000_000},
    {"name": "Avazu", "loader": load_avazu, "path": "data/avazu-ctr-train.zip", "multiclass": False, "sample_n": 2_000_000},
]
# Use the GPU code paths whenever the optional CuPy import succeeded.
use_gpu = GPU_OK
run_benchmark(datasets, use_gpu=use_gpu, output_csv="benchmark_results_default_impr_method.csv")

Loading CreditCardFraud ...
CreditCardFraud: X=(284807, 30), y=(284807,)
Training AdaptiveBayes on CreditCardFraud ...
{'model': 'AdaptiveBayes', 'fit_s': 0.05635435099975439, 'pred_s': 0.04977769300012369, 'proba_s': 0.06038043499984269, 'cpu_peak_mb_fit': 0.03125, 'gpu_mem_mb_fit': 62.0, 'acc': 0.3375408166848074, 'auc': np.float64(0.9201881969726551), 'dataset': 'CreditCardFraud'}
Training XGBoost on CreditCardFraud ...
{'model': 'XGBoost', 'fit_s': 0.9806480430002011, 'pred_s': 0.061298986000110744, 'proba_s': 0.0693144349997965, 'cpu_peak_mb_fit': 61.74609375, 'gpu_mem_mb_fit': 0.0, 'acc': 0.9995435553526912, 'auc': np.float64(0.9702255937546657), 'dataset': 'CreditCardFraud'}
Training RandomForest on CreditCardFraud ...
{'model': 'RandomForest', 'fit_s': 112.21955732800006, 'pred_s': 0.23611885399986932, 'proba_s': 0.2286607930000173, 'cpu_peak_mb_fit': 245.55078125, 'gpu_mem_mb_fit': 0.0, 'acc': 0.9996137776061234, 'auc': np.float64(0.951532496439769), 'dataset': 'CreditCardFrau



{'model': 'LightGBM', 'fit_s': 8.181892825999967, 'pred_s': 0.3385448619997078, 'proba_s': 0.3571057760000258, 'cpu_peak_mb_fit': 226.2421875, 'gpu_mem_mb_fit': 0.0, 'acc': 0.9996137776061234, 'auc': np.float64(0.9643095807540798), 'dataset': 'CreditCardFraud'}
Training LogisticRegression on CreditCardFraud ...




{'model': 'LogisticRegression', 'fit_s': 35.42261782800006, 'pred_s': 0.11521076700000776, 'proba_s': 0.1163425009999628, 'cpu_peak_mb_fit': 0.30078125, 'gpu_mem_mb_fit': 0.0, 'acc': 0.9982795547909132, 'auc': np.float64(0.6287717992374214), 'dataset': 'CreditCardFraud'}
Loading HIGGS ...
HIGGS: X=(2000000, 28), y=(2000000,)
Training AdaptiveBayes on HIGGS ...
{'model': 'AdaptiveBayes', 'fit_s': 0.4678551649999463, 'pred_s': 0.3237371809996148, 'proba_s': 0.2665048519997981, 'cpu_peak_mb_fit': 512.15234375, 'gpu_mem_mb_fit': 382.0, 'acc': 0.5251875, 'auc': np.float64(0.5182948206159017), 'dataset': 'HIGGS'}
Training XGBoost on HIGGS ...
{'model': 'XGBoost', 'fit_s': 5.150578544999917, 'pred_s': 0.2686151630000495, 'proba_s': 0.2703554750000876, 'cpu_peak_mb_fit': 12.69140625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.75044, 'auc': np.float64(0.8333018787578815), 'dataset': 'HIGGS'}
Training RandomForest on HIGGS ...
{'model': 'RandomForest', 'fit_s': 723.3384623279999, 'pred_s': 9.79765656100016



{'model': 'MLP', 'fit_s': 492.20039164599984, 'pred_s': 3.529795269000715, 'proba_s': 3.580806881999706, 'cpu_peak_mb_fit': 929.66796875, 'gpu_mem_mb_fit': 0.0, 'acc': 0.7568575, 'auc': np.float64(0.8392266119082334), 'dataset': 'HIGGS'}
Training LightGBM on HIGGS ...




{'model': 'LightGBM', 'fit_s': 32.27151433899962, 'pred_s': 5.746609759999956, 'proba_s': 5.597645398000168, 'cpu_peak_mb_fit': 76.81640625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.7386025, 'auc': np.float64(0.8198107249192417), 'dataset': 'HIGGS'}
Training LogisticRegression on HIGGS ...
{'model': 'LogisticRegression', 'fit_s': 17.468885637000312, 'pred_s': 0.331150534999324, 'proba_s': 0.357544923000205, 'cpu_peak_mb_fit': 0.1015625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.64197, 'auc': np.float64(0.6850412539218178), 'dataset': 'HIGGS'}
Loading SUSY ...
SUSY: X=(2000000, 18), y=(2000000,)
Training AdaptiveBayes on SUSY ...
{'model': 'AdaptiveBayes', 'fit_s': 0.36277801699998236, 'pred_s': 0.24852813000052265, 'proba_s': 0.2522159489999467, 'cpu_peak_mb_fit': 255.97265625, 'gpu_mem_mb_fit': 2.0, 'acc': 0.61073, 'auc': np.float64(0.6487821523516125), 'dataset': 'SUSY'}
Training XGBoost on SUSY ...
{'model': 'XGBoost', 'fit_s': 3.5692136050001864, 'pred_s': 0.26523584900041897, 'proba_s': 0.2654011300



{'model': 'LightGBM', 'fit_s': 28.218637543999648, 'pred_s': 5.588264336000066, 'proba_s': 5.5705148029992415, 'cpu_peak_mb_fit': 76.6640625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.80267, 'auc': np.float64(0.8752886454242074), 'dataset': 'SUSY'}
Training LogisticRegression on SUSY ...
{'model': 'LogisticRegression', 'fit_s': 40.2706511209999, 'pred_s': 0.30824020099953486, 'proba_s': 0.37714368799970543, 'cpu_peak_mb_fit': 0.0703125, 'gpu_mem_mb_fit': 0.0, 'acc': 0.7887375, 'auc': np.float64(0.8580859027864128), 'dataset': 'SUSY'}
Loading KDDCup99 ...
KDDCup99: X=(494021, 37), y=(494021,)
Training AdaptiveBayes on KDDCup99 ...
{'model': 'AdaptiveBayes', 'fit_s': 0.20811140699970565, 'pred_s': 0.22783889299989823, 'proba_s': 0.21371238900064782, 'cpu_peak_mb_fit': 3.97265625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.7588279945346895, 'auc': np.float64(0.8370580306070832), 'dataset': 'KDDCup99'}
Training XGBoost on KDDCup99 ...
{'model': 'XGBoost', 'fit_s': 1.351349642999594, 'pred_s': 0.219175578000431



{'model': 'MLP', 'fit_s': 96.08444196500022, 'pred_s': 1.5815654560001349, 'proba_s': 1.6468639579998126, 'cpu_peak_mb_fit': 14.5625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.9969839583017054, 'auc': np.float64(0.9985354687876211), 'dataset': 'KDDCup99'}
Training LightGBM on KDDCup99 ...




{'model': 'LightGBM', 'fit_s': 5.278575725000337, 'pred_s': 0.5715220359998057, 'proba_s': 0.5560120449999886, 'cpu_peak_mb_fit': 76.4296875, 'gpu_mem_mb_fit': 0.0, 'acc': 0.9995647993522595, 'auc': np.float64(0.9999978494820682), 'dataset': 'KDDCup99'}
Training LogisticRegression on KDDCup99 ...




{'model': 'LogisticRegression', 'fit_s': 76.73566490699977, 'pred_s': 0.2220165070002622, 'proba_s': 0.22484291900036624, 'cpu_peak_mb_fit': 0.0703125, 'gpu_mem_mb_fit': 0.0, 'acc': 0.8890845604979505, 'auc': np.float64(0.8603640413337217), 'dataset': 'KDDCup99'}
Loading Covertype ...
Covertype: X=(581011, 54), y=(581011,)
Training AdaptiveBayes on Covertype ...
{'model': 'AdaptiveBayes', 'fit_s': 0.24771340600000258, 'pred_s': 0.26046444999974483, 'proba_s': None, 'cpu_peak_mb_fit': 0.0, 'gpu_mem_mb_fit': 0.0, 'acc': 0.39666790014027176, 'auc': None, 'dataset': 'Covertype'}
Training XGBoost on Covertype ...
{'model': 'XGBoost', 'fit_s': 11.327623303999644, 'pred_s': 0.3308461950000492, 'proba_s': None, 'cpu_peak_mb_fit': 1.46484375, 'gpu_mem_mb_fit': 0.0, 'acc': 0.9197352908272592, 'auc': None, 'dataset': 'Covertype'}
Training RandomForest on Covertype ...
{'model': 'RandomForest', 'fit_s': 72.16039070399984, 'pred_s': 1.8523380080005154, 'proba_s': None, 'cpu_peak_mb_fit': 1284.60546



{'model': 'LightGBM', 'fit_s': 82.16132544100037, 'pred_s': 10.201838325000608, 'proba_s': None, 'cpu_peak_mb_fit': 77.33203125, 'gpu_mem_mb_fit': 0.0, 'acc': 0.9350447062468267, 'auc': None, 'dataset': 'Covertype'}
Training LogisticRegression on Covertype ...




{'model': 'LogisticRegression', 'fit_s': 228.0298378460002, 'pred_s': 0.30337894199965376, 'proba_s': None, 'cpu_peak_mb_fit': 22.578125, 'gpu_mem_mb_fit': 0.0, 'acc': 0.6920389318692288, 'auc': None, 'dataset': 'Covertype'}
Loading Synthetic ...
Creating synthetic HEPMASS-like dataset...
Synthetic dataset: X=(500000, 28), y=(500000,), class balance=0.615
Synthetic: X=(500000, 28), y=(500000,)
Training AdaptiveBayes on Synthetic ...
{'model': 'AdaptiveBayes', 'fit_s': 0.22422455600008107, 'pred_s': 0.23421662199962157, 'proba_s': 0.23084265499983303, 'cpu_peak_mb_fit': 0.0, 'gpu_mem_mb_fit': 0.0, 'acc': 0.53571, 'auc': np.float64(0.5292535033780479), 'dataset': 'Synthetic'}
Training XGBoost on Synthetic ...
{'model': 'XGBoost', 'fit_s': 2.4456621609997455, 'pred_s': 0.27357532999940304, 'proba_s': 0.2724409000002197, 'cpu_peak_mb_fit': 5.5390625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.68526, 'auc': np.float64(0.7385931095177556), 'dataset': 'Synthetic'}
Training RandomForest on Synthetic ...




{'model': 'LightGBM', 'fit_s': 16.41590242700022, 'pred_s': 1.2777580570000282, 'proba_s': 1.2860616030002348, 'cpu_peak_mb_fit': 76.4453125, 'gpu_mem_mb_fit': 0.0, 'acc': 0.67989, 'auc': np.float64(0.731728526785842), 'dataset': 'Synthetic'}
Training LogisticRegression on Synthetic ...
{'model': 'LogisticRegression', 'fit_s': 4.193770770999436, 'pred_s': 0.3169816930003435, 'proba_s': 0.2677114719999736, 'cpu_peak_mb_fit': 0.08984375, 'gpu_mem_mb_fit': 0.0, 'acc': 0.67195, 'auc': np.float64(0.7195245912402302), 'dataset': 'Synthetic'}
Loading Avazu ...
Avazu: X=(1000000, 23), y=(1000000,)
Training AdaptiveBayes on Avazu ...
{'model': 'AdaptiveBayes', 'fit_s': 0.2670999310003026, 'pred_s': 0.267579477000254, 'proba_s': 0.26604891099941597, 'cpu_peak_mb_fit': 6.98828125, 'gpu_mem_mb_fit': 0.0, 'acc': 0.29421, 'auc': np.float64(0.6061265301905324), 'dataset': 'Avazu'}
Training XGBoost on Avazu ...
{'model': 'XGBoost', 'fit_s': 2.400895048000166, 'pred_s': 0.27796304799994687, 'proba_s': 



{'model': 'LightGBM', 'fit_s': 22.02342617999966, 'pred_s': 3.332799677999901, 'proba_s': 3.3536499669999102, 'cpu_peak_mb_fit': 76.40234375, 'gpu_mem_mb_fit': 0.0, 'acc': 0.84408, 'auc': np.float64(0.7657825245402008), 'dataset': 'Avazu'}
Training LogisticRegression on Avazu ...
{'model': 'LogisticRegression', 'fit_s': 8.100331917999938, 'pred_s': 0.32583330899979046, 'proba_s': 0.3304923659998167, 'cpu_peak_mb_fit': 0.078125, 'gpu_mem_mb_fit': 0.0, 'acc': 0.83978, 'auc': np.float64(0.5020758517711775), 'dataset': 'Avazu'}
Saved results to benchmark_results_default_impr_method.csv


Unnamed: 0,model,fit_s,pred_s,proba_s,cpu_peak_mb_fit,gpu_mem_mb_fit,acc,auc,dataset
0,AdaptiveBayes,0.056354,0.049778,0.06038,0.03125,62.0,0.337541,0.920188,CreditCardFraud
1,XGBoost,0.980648,0.061299,0.069314,61.746094,0.0,0.999544,0.970226,CreditCardFraud
2,RandomForest,112.219557,0.236119,0.228661,245.550781,0.0,0.999614,0.951532,CreditCardFraud
3,MLP,34.072897,0.204689,0.238558,123.15625,0.0,0.99828,0.504502,CreditCardFraud
4,LightGBM,8.181893,0.338545,0.357106,226.242188,0.0,0.999614,0.96431,CreditCardFraud
5,LogisticRegression,35.422618,0.115211,0.116343,0.300781,0.0,0.99828,0.628772,CreditCardFraud
6,AdaptiveBayes,0.467855,0.323737,0.266505,512.152344,382.0,0.525188,0.518295,HIGGS
7,XGBoost,5.150579,0.268615,0.270355,12.691406,0.0,0.75044,0.833302,HIGGS
8,RandomForest,723.338462,9.797657,9.624547,9682.539062,0.0,0.743908,0.825254,HIGGS
9,MLP,492.200392,3.529795,3.580807,929.667969,0.0,0.756857,0.839227,HIGGS


# Hyperparameter optimization with Optuna

In [None]:
import optuna
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler
import pickle
import json

In [None]:

def optimize_adaptive_bayes_params(X, y,
          dataset_name,
          n_trials=50,
          cv_folds=3,
          use_gpu=False,
          verbose=False,
          timeout=3600):
    """
    Tune AdaptiveBayes hyperparameters for one dataset with Optuna.

    Features are standardized once, then every trial is scored by stratified
    k-fold cross-validation: ROC AUC when ``y`` has exactly two classes,
    accuracy otherwise. A fresh model is built for every fold so that nothing
    fitted on one fold can carry over into the next one.

    Args:
        X: feature matrix
        y: target labels
        dataset_name: dataset name (used in log messages and the study name)
        n_trials: maximum number of Optuna trials
        cv_folds: number of cross-validation folds
        use_gpu: build the model with device='gpu' instead of 'cpu'
        verbose: when False, Optuna logging is lowered to WARNING
            (this is a process-wide setting)
        timeout: maximum optimization time in seconds

    Returns:
        dict: best parameters found (base_lr, eps, epochs, batch_size)
    """
    if not verbose:
        optuna.logging.set_verbosity(optuna.logging.WARNING)

    # Fold indices are positional, so index a plain array rather than a
    # label-indexed container such as a pandas Series.
    y = np.asarray(y)
    n_classes = len(np.unique(y))

    print(f"Optimizing AdaptiveBayes parameters for {dataset_name}...")
    print(f"Dataset shape: {X.shape}, classes: {n_classes}")

    # Standardize the features (important for AdaptiveBayes).
    # NOTE(review): the scaler is fitted on all rows, validation folds included.
    X_scaled = StandardScaler().fit_transform(X)

    # Task type decides the metric: ROC AUC for binary, accuracy otherwise.
    is_binary = n_classes == 2

    # The split is seeded, so it is identical for every trial; compute the
    # fold indices once instead of once per trial.
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
    folds = list(cv.split(X_scaled, y))

    def _to_numpy(arr):
        # sklearn metrics need host arrays; a GPU model may hand back CuPy arrays.
        if type(arr).__module__.split(".")[0] == "cupy":
            return arr.get()
        return np.asarray(arr)

    def objective(trial):
        # Suggested hyperparameters
        base_lr = trial.suggest_float('base_lr', 1e-4, 1.0, log=True)
        eps = trial.suggest_float('eps', 1e-12, 1e-6, log=True)
        epochs = trial.suggest_int('epochs', 1, 10)
        batch_size = trial.suggest_categorical('batch_size', [1024, 2048, 4096, 8192, 16384, 32768, 65536])

        try:
            scores = []
            for train_idx, val_idx in folds:
                # New model per fold: re-fitting one shared instance could
                # keep state learned from rows that are now in the
                # validation fold.
                model = AdaptiveBayes(
                    base_lr=base_lr,
                    eps=eps,
                    device='gpu' if use_gpu else 'cpu'
                )

                X_val_fold = X_scaled[val_idx]
                y_val_fold = y[val_idx]

                # Training
                model.fit(X_scaled[train_idx], y[train_idx], epochs=epochs, batch_size=batch_size, shuffle=True)

                # Scoring
                if is_binary:
                    y_pred_proba = _to_numpy(model.predict_proba(X_val_fold))
                    score = roc_auc_score(y_val_fold, y_pred_proba)
                else:
                    y_pred = _to_numpy(model.predict(X_val_fold))
                    score = accuracy_score(y_val_fold, y_pred)

                scores.append(score)

            return np.mean(scores)

        except Exception as e:
            # Best effort: a failing configuration gets the lowest possible
            # score instead of aborting the whole study.
            print(f"Trial {trial.number} failed: {e}")
            return 0.0

    # Create the study (seeded sampler for reproducible suggestions)
    study = optuna.create_study(
        direction='maximize',
        study_name=f'adaptive_bayes_{dataset_name}',
        sampler=optuna.samplers.TPESampler(seed=42)
    )

    # Optimization stops at n_trials or timeout, whichever comes first
    study.optimize(objective, n_trials=n_trials, timeout=timeout, show_progress_bar=True)

    print(f"\nOptimization completed for {dataset_name}:")
    print(f"Best score: {study.best_value:.4f}")
    print(f"Best params: {study.best_params}")

    return study.best_params

def run_hyperparameter_optimization(datasets_config, use_gpu=False, n_trials=50, save_path="adaptive_bayes_params.json",
                                    optimization_size=100000, cv_folds=3, timeout=1800):
    """
    Run AdaptiveBayes hyperparameter optimization for every dataset.

    Each dataset is loaded, optionally subsampled, tuned with
    optimize_adaptive_bayes_params, and the accumulated results are written
    to ``save_path`` after every successful dataset and once more at the end.
    If anything fails for a dataset, default parameters are recorded for it
    and processing continues with the next one.

    Args:
        datasets_config: list of dataset descriptors (dicts with "name",
            "loader", "path" and optional "sample_n")
        use_gpu: use the GPU
        n_trials: number of Optuna trials per dataset
        save_path: JSON file the parameters are saved to
        optimization_size: maximum number of rows used for tuning; larger
            datasets are randomly subsampled to this size
        cv_folds: number of cross-validation folds per trial
        timeout: per-dataset optimization time limit in seconds
            (default 1800 = 30 minutes)

    Returns:
        dict: mapping dataset name -> parameter dict
    """
    # Fallback used when loading or tuning a dataset fails.
    default_params = {
        'base_lr': 0.01,
        'eps': 1e-10,
        'epochs': 1,
        'batch_size': 65536
    }

    optimized_params = {}

    for ds in datasets_config:
        name = ds["name"]
        loader = ds["loader"]
        path = ds["path"]
        sample_n = ds.get("sample_n")

        try:
            print(f"\n{'='*60}")
            print(f"Loading {name} for optimization...")

            # Load the data; Avazu and Synthetic have their own call signatures
            if name == "Avazu":
                X, y = load_avazu(path, sample_n=sample_n)
            elif name == "Synthetic":
                X, y = create_synthetic_hepmass()
            else:
                X, y = loader(path)
                if sample_n is not None and len(X) > sample_n:
                    ridx = np.random.RandomState(42).choice(len(X), size=sample_n, replace=False)
                    X = X[ridx]
                    y = y[ridx]

            # Shrink the data for tuning (speed-up)
            if len(X) > optimization_size:
                ridx = np.random.RandomState(42).choice(len(X), size=optimization_size, replace=False)
                X_opt = X[ridx]
                y_opt = y[ridx]
            else:
                X_opt = X
                y_opt = y

            print(f"Optimization subset: {X_opt.shape}")

            # Tune the parameters
            best_params = optimize_adaptive_bayes_params(
                X_opt, y_opt,
                dataset_name=name,
                n_trials=n_trials,
                cv_folds=cv_folds,
                use_gpu=use_gpu,
                timeout=timeout
            )

            optimized_params[name] = best_params

            # Intermediate save so finished datasets survive a later crash
            with open(save_path, 'w', encoding='utf-8') as f:
                json.dump(optimized_params, f, indent=2)

            print(f"Saved intermediate results to {save_path}")

        except Exception as e:
            print(f"Failed to optimize {name}: {e}")
            # Fall back to the defaults (copied so entries never share a dict)
            optimized_params[name] = dict(default_params)

    # Final save (also persists any fallback entries)
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(optimized_params, f, indent=2)

    print(f"\n{'='*60}")
    print("Hyperparameter optimization completed!")
    print(f"Results saved to: {save_path}")

    # Print a summary
    print("\nOptimized parameters summary:")
    for dataset, params in optimized_params.items():
        print(f"{dataset}: lr={params['base_lr']:.1e}, eps={params['eps']:.1e}, "
              f"epochs={params['epochs']}, batch_size={params['batch_size']}")

    return optimized_params

In [None]:
    # Stage 1: hyperparameter optimization.
    # n_trials is only an upper bound: run_hyperparameter_optimization also
    # applies a 30-minute timeout per dataset.
    # The tuned parameters are written to adaptive_bayes_params.json.
    print("Starting hyperparameter optimization...")
    optimized_params = run_hyperparameter_optimization(
        datasets,
        use_gpu=use_gpu,
        n_trials=1000,
        save_path="adaptive_bayes_params.json"
    )

Starting hyperparameter optimization...

Loading CreditCardFraud for optimization...
Optimization subset: (100000, 30)
Optimizing AdaptiveBayes parameters for CreditCardFraud...
Dataset shape: (100000, 30), classes: 2


  0%|          | 0/1000 [00:00<?, ?it/s]


Optimization completed for CreditCardFraud:
Best score: 0.9496
Best params: {'base_lr': 0.0035801347101830825, 'eps': 5.602651860822652e-08, 'epochs': 8, 'batch_size': 8192}
Saved intermediate results to adaptive_bayes_params.json

Loading HIGGS for optimization...
Optimization subset: (100000, 28)
Optimizing AdaptiveBayes parameters for HIGGS...
Dataset shape: (100000, 28), classes: 2


  0%|          | 0/1000 [00:00<?, ?it/s]


Optimization completed for HIGGS:
Best score: 0.5443
Best params: {'base_lr': 0.0055219579432839744, 'eps': 3.2468692859726945e-09, 'epochs': 8, 'batch_size': 4096}
Saved intermediate results to adaptive_bayes_params.json

Loading SUSY for optimization...
Optimization subset: (100000, 18)
Optimizing AdaptiveBayes parameters for SUSY...
Dataset shape: (100000, 18), classes: 2


  0%|          | 0/1000 [00:00<?, ?it/s]


Optimization completed for SUSY:
Best score: 0.7116
Best params: {'base_lr': 0.0013706424415083963, 'eps': 1.922300840428287e-09, 'epochs': 10, 'batch_size': 1024}
Saved intermediate results to adaptive_bayes_params.json

Loading KDDCup99 for optimization...
Optimization subset: (100000, 37)
Optimizing AdaptiveBayes parameters for KDDCup99...
Dataset shape: (100000, 37), classes: 2


  0%|          | 0/1000 [00:00<?, ?it/s]


Optimization completed for KDDCup99:
Best score: 0.9942
Best params: {'base_lr': 0.018859749269881067, 'eps': 9.343459559523171e-11, 'epochs': 1, 'batch_size': 8192}
Saved intermediate results to adaptive_bayes_params.json

Loading Covertype for optimization...
Optimization subset: (100000, 54)
Optimizing AdaptiveBayes parameters for Covertype...
Dataset shape: (100000, 54), classes: 7


  0%|          | 0/1000 [00:00<?, ?it/s]


Optimization completed for Covertype:
Best score: 0.5056
Best params: {'base_lr': 0.01527223129800899, 'eps': 5.1418064711309445e-12, 'epochs': 4, 'batch_size': 65536}
Saved intermediate results to adaptive_bayes_params.json

Loading Synthetic for optimization...
Creating synthetic HEPMASS-like dataset...
Synthetic dataset: X=(500000, 28), y=(500000,), class balance=0.615
Optimization subset: (100000, 28)
Optimizing AdaptiveBayes parameters for Synthetic...
Dataset shape: (100000, 28), classes: 2


  0%|          | 0/1000 [00:00<?, ?it/s]


Optimization completed for Synthetic:
Best score: 0.5558
Best params: {'base_lr': 0.19116879230883235, 'eps': 1.3887387911209016e-11, 'epochs': 3, 'batch_size': 1024}
Saved intermediate results to adaptive_bayes_params.json

Loading Avazu for optimization...
Optimization subset: (100000, 23)
Optimizing AdaptiveBayes parameters for Avazu...
Dataset shape: (100000, 23), classes: 2


  0%|          | 0/1000 [00:00<?, ?it/s]


Optimization completed for Avazu:
Best score: 0.6153
Best params: {'base_lr': 0.005782178469374114, 'eps': 4.808686204831486e-07, 'epochs': 7, 'batch_size': 2048}
Saved intermediate results to adaptive_bayes_params.json

Hyperparameter optimization completed!
Results saved to: adaptive_bayes_params.json

Optimized parameters summary:
CreditCardFraud: lr=3.6e-03, eps=5.6e-08, epochs=8, batch_size=8192
HIGGS: lr=5.5e-03, eps=3.2e-09, epochs=8, batch_size=4096
SUSY: lr=1.4e-03, eps=1.9e-09, epochs=10, batch_size=1024
KDDCup99: lr=1.9e-02, eps=9.3e-11, epochs=1, batch_size=8192
Covertype: lr=1.5e-02, eps=5.1e-12, epochs=4, batch_size=65536
Synthetic: lr=1.9e-01, eps=1.4e-11, epochs=3, batch_size=1024
Avazu: lr=5.8e-03, eps=4.8e-07, epochs=7, batch_size=2048


In [None]:
Optimized parameters summary: n_trials=100
CreditCardFraud: lr=1.4e-04, eps=1.8e-09, epochs=1, batch_size=2048
HIGGS: lr=7.6e-04, eps=5.3e-12, epochs=10, batch_size=2048
SUSY: lr=1.6e-04, eps=2.3e-09, epochs=5, batch_size=1024
KDDCup99: lr=1.0e-03, eps=9.4e-07, epochs=6, batch_size=8192
Covertype: lr=3.2e-03, eps=3.0e-10, epochs=1, batch_size=65536
Synthetic: lr=1.0e-04, eps=2.3e-09, epochs=3, batch_size=8192
Avazu: lr=1.3e-04, eps=2.3e-07, epochs=1, batch_size=2048

Optimized parameters summary: n_trials=500
CreditCardFraud: lr=1.3e-03, eps=4.8e-10, epochs=5, batch_size=4096 Best score: 0.9373
HIGGS: lr=9.5e-01, eps=1.1e-12, epochs=4, batch_size=1024 Best score: 0.6061
SUSY: lr=1.1e-04, eps=6.2e-09, epochs=9, batch_size=8192 Best score: 0.8080
KDDCup99: lr=1.2e-02, eps=3.9e-10, epochs=4, batch_size=2048 Best score: 0.9973
Covertype: lr=5.9e-03, eps=1.1e-10, epochs=1, batch_size=32768 Best score: 0.4172
Synthetic: lr=1.5e-04, eps=5.3e-12, epochs=2, batch_size=32768 Best score: 0.6970
Avazu: lr=1.2e-04, eps=4.1e-07, epochs=2, batch_size=4096 Best score: 0.6592

Optimized parameters summary: n_trials=1000
CreditCardFraud: lr=1.0e-04, eps=3.1e-12, epochs=1, batch_size=4096 Best score: 0.9599
HIGGS: lr=2.9e-01, eps=5.9e-09, epochs=6, batch_size=1024 Best score: 0.6058
SUSY: lr=1.7e-04, eps=4.4e-12, epochs=7, batch_size=16384 Best score: 0.8087
KDDCup99: lr=2.5e-04, eps=3.9e-10, epochs=6, batch_size=1024 Best score: 0.9965
Covertype: lr=9.4e-02, eps=1.0e-11, epochs=1, batch_size=32768 Best score: 0.4176
Synthetic: lr=1.6e-04, eps=5.6e-12, epochs=2, batch_size=32768 Best score: 0.6973
Avazu: lr=3.2e-04, eps=6.0e-08, epochs=1, batch_size=1024 Best score: 0.6604

Optimized parameters summary: n_trials=1000 (optimized method)
CreditCardFraud: lr=3.6e-03, eps=5.6e-08, epochs=8, batch_size=8192 Best score: 0.9496
HIGGS: lr=5.5e-03, eps=3.2e-09, epochs=8, batch_size=4096 Best score: 0.5443
SUSY: lr=1.4e-03, eps=1.9e-09, epochs=10, batch_size=1024 Best score: 0.7116
KDDCup99: lr=1.9e-02, eps=9.3e-11, epochs=1, batch_size=8192 Best score: 0.9942
Covertype: lr=1.5e-02, eps=5.1e-12, epochs=4, batch_size=65536 Best score: 0.5056
Synthetic: lr=1.9e-01, eps=1.4e-11, epochs=3, batch_size=1024 Best score: 0.5558
Avazu: lr=5.8e-03, eps=4.8e-07, epochs=7, batch_size=2048 Best score: 0.6153

# Main benchmark with optimized parameters

In [None]:
def load_optimized_params(save_path="adaptive_bayes_params.json"):
    """Load optimized parameters from a JSON file.

    Args:
        save_path: path to the JSON file holding the parameters

    Returns:
        dict: the parsed mapping; an empty dict when the file is missing,
        is not valid JSON (e.g. a write was interrupted), or does not hold a
        JSON object, so callers fall back to their defaults.
    """
    try:
        with open(save_path, 'r', encoding='utf-8') as f:
            params = json.load(f)
    except FileNotFoundError:
        print(f"Optimized parameters file {save_path} not found. Using defaults.")
        return {}
    except json.JSONDecodeError as e:
        print(f"Optimized parameters file {save_path} is not valid JSON ({e}). Using defaults.")
        return {}

    # Callers look entries up with .get(); anything but a mapping is unusable.
    if not isinstance(params, dict):
        print(f"Optimized parameters file {save_path} does not contain a JSON object. Using defaults.")
        return {}

    return params

def make_models_with_optimized_params(use_gpu, optimized_params, dataset_name):
    """
    Build the benchmark models, with AdaptiveBayes using tuned parameters.

    Every model is constructed inside its own try/except: a model that cannot
    be created is reported and skipped, the others are still returned.

    Args:
        use_gpu: use the GPU where the library supports it
        optimized_params: mapping dataset name -> AdaptiveBayes parameter dict
        dataset_name: name of the current dataset

    Returns:
        list: (model name, {"fit", "predict", "predict_proba"} callables) pairs
    """
    models = []

    # AdaptiveBayes with optimized parameters
    try:
        defaults = {
            'base_lr': 0.01,
            'eps': 1e-10,
            'epochs': 1,
            'batch_size': 65536
        }
        # Overlay the stored values on the defaults so that an entry with a
        # missing key does not silently drop the model from the benchmark.
        params = {**defaults, **(optimized_params.get(dataset_name) or {})}

        ab = AdaptiveBayes(
            base_lr=params['base_lr'],
            eps=params['eps'],
            device='gpu' if use_gpu else 'cpu'
        )

        # epochs/batch_size are fit-time arguments, so they are bound here.
        # NOTE(review): the Optuna objective fits with shuffle=True, while
        # this call relies on the model's own default - confirm they match.
        def fit_with_params(X, y):
            return ab.fit(X, y, epochs=params['epochs'], batch_size=params['batch_size'])

        models.append((
            "AdaptiveBayes",
            {
                "fit": fit_with_params,
                "predict": ab.predict,
                "predict_proba": ab.predict_proba
            }
        ))
        print("AdaptiveBayes model created successfully")
    except Exception as e:
        print(f"Failed to create AdaptiveBayes: {e}")

    # XGBoost
    try:
        if XGB_OK:
            params_xgb = {
                "n_estimators": 300,
                "max_depth": 8,
                "learning_rate": 0.1,
                "subsample": 0.8,
                "colsample_bytree": 0.8,
                "tree_method": "hist",
                "eval_metric": "auc",
            }
            # The GPU and CPU configurations differ only in the device.
            if use_gpu:
                params_xgb["device"] = "cuda"

            xgbc = xgb.XGBClassifier(**params_xgb)
            models.append((
                "XGBoost",
                {
                    "fit": xgbc.fit,
                    "predict": xgbc.predict,
                    "predict_proba": xgbc.predict_proba
                }
            ))
            print("XGBoost model created successfully")
        else:
            print("XGBoost not available (XGB_OK=False)")
    except Exception as e:
        print(f"Failed to create XGBoost: {e}")

    # Random Forest
    try:
        rf = RandomForestClassifier(n_estimators=300, n_jobs=-1, max_depth=None)
        models.append((
            "RandomForest",
            {
                "fit": rf.fit,
                "predict": rf.predict,
                "predict_proba": rf.predict_proba
            }
        ))
        print("RandomForest model created successfully")
    except Exception as e:
        print(f"Failed to create RandomForest: {e}")

    # Neural Net (sklearn MLP)
    try:
        mlp = MLPClassifier(hidden_layer_sizes=(256, 128), batch_size=512, max_iter=20, solver='adam', early_stopping=True, random_state=42)
        models.append((
            "MLP",
            {
                "fit": mlp.fit,
                "predict": mlp.predict,
                "predict_proba": mlp.predict_proba
            }
        ))
        print("MLP model created successfully")
    except Exception as e:
        print(f"Failed to create MLP: {e}")

    # LightGBM (the GPU and CPU branches use different hyperparameters)
    try:
        if LGB_OK:
            if use_gpu:
                lgbm = lgb.LGBMClassifier(
                    n_estimators=300,
                    num_leaves=511,
                    learning_rate=0.01,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    device_type='gpu',
                    max_bin=127,
                    min_data_in_leaf=100,
                    min_gain_to_split=0.01,
                    verbose=-1
                )
            else:
                lgbm = lgb.LGBMClassifier(
                    n_estimators=500,
                    num_leaves=255,
                    learning_rate=0.05,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    device_type='cpu',
                    n_jobs=-1,
                    verbose=-1
                )

            models.append((
                "LightGBM",
                {
                    "fit": lgbm.fit,
                    "predict": lgbm.predict,
                    "predict_proba": lgbm.predict_proba
                }
            ))
            print("LightGBM model created successfully")
        else:
            print("LightGBM not available (LGB_OK=False)")
    except Exception as e:
        print(f"Failed to create LightGBM: {e}")

    # CatBoost
    try:
        if CAT_OK:
            cat = CatBoostClassifier(
                iterations=500,
                depth=8,
                learning_rate=0.1,
                verbose=False,
                task_type="GPU" if use_gpu else "CPU"
            )
            models.append((
                "CatBoost",
                {
                    "fit": cat.fit,
                    "predict": cat.predict,
                    "predict_proba": cat.predict_proba
                }
            ))
            print("CatBoost model created successfully")
        else:
            print("CatBoost not available (CAT_OK=False)")
    except Exception as e:
        print(f"Failed to create CatBoost: {e}")

    # Logistic Regression
    try:
        lr = LogisticRegression(max_iter=200, solver='saga', n_jobs=-1)
        models.append((
            "LogisticRegression",
            {
                "fit": lr.fit,
                "predict": lr.predict,
                "predict_proba": lr.predict_proba
            }
        ))
        print("LogisticRegression model created successfully")
    except Exception as e:
        print(f"Failed to create LogisticRegression: {e}")

    print(f"Total models created: {len(models)}")
    return models

# Updated run_benchmark
def run_benchmark_optimized_new(datasets_config, use_gpu=False, test_size=0.2, optimized_params_path="adaptive_bayes_params.json", output_csv="results.csv"):
    """
    Run the benchmark using the optimized AdaptiveBayes parameters.

    For each dataset descriptor the data is loaded, split into train and test
    parts, and every model returned by make_models_with_optimized_params is
    trained and evaluated. All per-model statistics are collected into one
    DataFrame, written to ``output_csv`` and returned.
    """
    # Tuned parameters (an empty mapping makes the models use defaults)
    tuned_params = load_optimized_params(optimized_params_path)

    records = []
    for spec in datasets_config:
        name = spec["name"]
        loader = spec["loader"]
        path = spec["path"]
        is_multiclass = spec.get("multiclass", False)
        sample_n = spec.get("sample_n")

        print(f"Loading {name} ...")

        # Avazu and Synthetic have dedicated loaders; everything else goes
        # through the descriptor's loader and is subsampled here if needed.
        if name not in ("Avazu", "Synthetic"):
            X, y = loader(path)
            if sample_n is not None and len(X) > sample_n:
                rng = np.random.RandomState(42)
                keep = rng.choice(len(X), size=sample_n, replace=False)
                X, y = X[keep], y[keep]
        elif name == "Avazu":
            X, y = load_avazu(path, sample_n=sample_n)
        else:
            X, y = create_synthetic_hepmass()

        print(f"{name}: X={X.shape}, y={y.shape}")

        # Stratification is applied to the non-multiclass datasets only
        stratify_on = None if is_multiclass else y
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y,
            test_size=test_size,
            random_state=42,
            stratify=stratify_on,
        )

        # Models are rebuilt per dataset so that AdaptiveBayes gets the
        # parameters tuned for this dataset.
        candidates = make_models_with_optimized_params(
            use_gpu=use_gpu,
            optimized_params=tuned_params,
            dataset_name=name,
        )

        for mname, model in candidates:
            print(f"Training {mname} on {name} ...")
            stats = train_eval_one_new(
                mname, model, X_tr, y_tr, X_te, y_te,
                is_multiclass=is_multiclass, use_gpu=use_gpu,
            )
            stats["dataset"] = name
            records.append(stats)
            print(stats)

    df = pd.DataFrame(records)
    df.to_csv(output_csv, index=False)
    print(f"Saved results to {output_csv}")
    return df


def train_eval_one_new(model_name, model_ctor, X_train, y_train, X_test, y_test, is_multiclass=False, use_gpu=False):
    """
    Fit one model, time its predictions and compute accuracy / ROC AUC.

    For "AdaptiveBayes" the features are standardized first; the scaling
    itself happens outside the measured calls, so it is not part of the
    reported fit time. ROC AUC and the predict_proba timing are only produced
    for non-multiclass datasets when a "predict_proba" callable is supplied.

    Args:
        model_name: model label; "AdaptiveBayes" enables feature scaling
        model_ctor: dict with "fit", "predict" and optionally "predict_proba"
            callables
        X_train, y_train: training data
        X_test, y_test: evaluation data
        is_multiclass: skip the AUC computation when True
        use_gpu: accepted for interface compatibility; not used here

    Returns:
        dict: model, fit_s, pred_s, proba_s, cpu_peak_mb_fit, gpu_mem_mb_fit,
        acc, auc (proba_s and auc are None when AUC is not computed)
    """
    def _to_host(arr):
        # Detect CuPy arrays by their defining module. A bare
        # hasattr(arr, 'get') test also matches pandas Series/DataFrame
        # (and dicts), whose get() requires a key and would raise here.
        if type(arr).__module__.split(".")[0] == "cupy":
            return arr.get()
        return arr

    y_test_np = np.asarray(_to_host(y_test))

    # Standardization for AdaptiveBayes only; other models get raw features
    is_adaptive_bayes = model_name == "AdaptiveBayes"
    if is_adaptive_bayes:
        scaler = StandardScaler()
        X_fit = scaler.fit_transform(X_train)
        X_eval = scaler.transform(X_test)
    else:
        X_fit, X_eval = X_train, X_test

    fit_stats = _measure_run(model_ctor['fit'], X_fit, y_train)
    pred_stats = _measure_run(model_ctor['predict'], X_eval)
    y_pred = _to_host(pred_stats["ret"])

    proba_time = None
    auc = None
    if not is_multiclass and 'predict_proba' in model_ctor:
        proba_stats = _measure_run(model_ctor['predict_proba'], X_eval)
        raw_prob = proba_stats["ret"]
        if is_adaptive_bayes:
            # AdaptiveBayes output is passed to the metric as returned
            # (moved to the host first when it is a CuPy array).
            y_prob = _to_host(raw_prob)
        else:
            # Use the positive-class column when y_prob_shape says so.
            y_prob = raw_prob[:, 1] if y_prob_shape(raw_prob) else raw_prob
        auc = roc_auc_score(y_test_np, y_prob)
        proba_time = proba_stats["elapsed_s"]

    acc = accuracy_score(y_test_np, y_pred)

    return {
        "model": model_name,
        "fit_s": fit_stats["elapsed_s"],
        "pred_s": pred_stats["elapsed_s"],
        "proba_s": proba_time,
        "cpu_peak_mb_fit": fit_stats["cpu_peak_mb"],
        "gpu_mem_mb_fit": fit_stats["gpu_mem_delta_mb"],
        "acc": acc,
        "auc": auc
    }


In [None]:
# Benchmark dataset descriptors. Keys read by the runners:
#   name       - label used in logs/results; "Avazu" and "Synthetic" are
#                special-cased by name when loading
#   loader     - callable invoked as loader(path) for all other datasets
#   path       - location of the data file
#   multiclass - True skips AUC and disables the stratified train/test split
#   sample_n   - optional row cap; forwarded to load_avazu, applied by random
#                subsampling for the generic loaders
datasets = [
    {"name": "CreditCardFraud", "loader": load_creditcard_fraud, "path": "data/creditcard.csv", "multiclass": False},
    {"name": "HIGGS", "loader": load_higgs, "path": "data/HIGGS.csv.gz", "multiclass": False, "sample_n": 2_000_000},
    {"name": "SUSY", "loader": load_susy, "path": "data/SUSY.csv.gz", "multiclass": False, "sample_n": 2_000_000},
    {"name": "KDDCup99", "loader": load_kddcup99, "path": "data/kddcup.data_10_percent.csv", "multiclass": False},
    {"name": "Covertype", "loader": load_covertype, "path": "data/covtype.csv", "multiclass": True},
    # The runners call create_synthetic_hepmass() with no arguments, so
    # "path" and "sample_n" have no effect for this entry.
    {"name": "Synthetic", "loader": create_synthetic_hepmass, "path": "", "multiclass": False, "sample_n": 2_000_000},
    {"name": "Avazu", "loader": load_avazu, "path": "data/avazu-ctr-train.zip", "multiclass": False, "sample_n": 2_000_000},
]

# Use the GPU only when the optional CuPy import succeeded.
use_gpu = GPU_OK
print("\nStarting benchmark with optimized parameters...")
# Stage 2: benchmark all models, AdaptiveBayes using the tuned parameters
# read from adaptive_bayes_params.json.
results_df = run_benchmark_optimized_new(
    datasets,
    use_gpu=use_gpu,
    optimized_params_path="adaptive_bayes_params.json",
    output_csv="benchmark_results_optimized.csv"
)

print("Benchmark completed!")


Starting benchmark with optimized parameters...
Loading CreditCardFraud ...
CreditCardFraud: X=(284807, 30), y=(284807,)
AdaptiveBayes model created successfully
XGBoost model created successfully
RandomForest model created successfully
MLP model created successfully
LightGBM model created successfully
CatBoost not available (CAT_OK=False)
LogisticRegression model created successfully
Total models created: 6
Training AdaptiveBayes on CreditCardFraud ...
{'model': 'AdaptiveBayes', 'fit_s': 0.2469200900013675, 'pred_s': 0.21935137100081192, 'proba_s': 0.2202568060001795, 'cpu_peak_mb_fit': 0.05859375, 'gpu_mem_mb_fit': 0.0, 'acc': 0.40881640391840174, 'auc': np.float64(0.9228133290457432), 'dataset': 'CreditCardFraud'}
Training XGBoost on CreditCardFraud ...
{'model': 'XGBoost', 'fit_s': 1.181520260999605, 'pred_s': 0.22560092200001236, 'proba_s': 0.225835510998877, 'cpu_peak_mb_fit': 0.79296875, 'gpu_mem_mb_fit': 0.0, 'acc': 0.9995435553526912, 'auc': np.float64(0.9702255937546657), 'd



{'model': 'LightGBM', 'fit_s': 3.7661205639997206, 'pred_s': 0.5097568729997874, 'proba_s': 0.5361108019988023, 'cpu_peak_mb_fit': 76.40625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.9995962220427653, 'auc': np.float64(0.9647341526650053), 'dataset': 'CreditCardFraud'}
Training LogisticRegression on CreditCardFraud ...




{'model': 'LogisticRegression', 'fit_s': 37.526423876999615, 'pred_s': 0.24674821599910501, 'proba_s': 0.26229485600015323, 'cpu_peak_mb_fit': 0.078125, 'gpu_mem_mb_fit': 0.0, 'acc': 0.9982795547909132, 'auc': np.float64(0.6287714403431603), 'dataset': 'CreditCardFraud'}
Loading HIGGS ...
HIGGS: X=(2000000, 28), y=(2000000,)
AdaptiveBayes model created successfully
XGBoost model created successfully
RandomForest model created successfully
MLP model created successfully
LightGBM model created successfully
CatBoost not available (CAT_OK=False)
LogisticRegression model created successfully
Total models created: 6
Training AdaptiveBayes on HIGGS ...
{'model': 'AdaptiveBayes', 'fit_s': 0.5976915699993697, 'pred_s': 0.385136605000298, 'proba_s': 0.384697823001261, 'cpu_peak_mb_fit': 0.0, 'gpu_mem_mb_fit': 0.0, 'acc': 0.47949, 'auc': np.float64(0.4980634711957723), 'dataset': 'HIGGS'}
Training XGBoost on HIGGS ...
{'model': 'XGBoost', 'fit_s': 5.410127348999595, 'pred_s': 0.3837816709983599, 



{'model': 'MLP', 'fit_s': 494.6359210270002, 'pred_s': 3.637208427000587, 'proba_s': 3.9900994330000685, 'cpu_peak_mb_fit': 776.40625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.7568575, 'auc': np.float64(0.8392266119082334), 'dataset': 'HIGGS'}
Training LightGBM on HIGGS ...




{'model': 'LightGBM', 'fit_s': 32.70174450800005, 'pred_s': 6.0014687109996885, 'proba_s': 5.937091752000924, 'cpu_peak_mb_fit': 77.9140625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.738545, 'auc': np.float64(0.8198907346103645), 'dataset': 'HIGGS'}
Training LogisticRegression on HIGGS ...
{'model': 'LogisticRegression', 'fit_s': 19.973109348000435, 'pred_s': 0.40474446799998987, 'proba_s': 0.4104644199996983, 'cpu_peak_mb_fit': 0.0703125, 'gpu_mem_mb_fit': 0.0, 'acc': 0.64199, 'auc': np.float64(0.6850410677346743), 'dataset': 'HIGGS'}
Loading SUSY ...
SUSY: X=(2000000, 18), y=(2000000,)
AdaptiveBayes model created successfully
XGBoost model created successfully
RandomForest model created successfully
MLP model created successfully
LightGBM model created successfully
CatBoost not available (CAT_OK=False)
LogisticRegression model created successfully
Total models created: 6
Training AdaptiveBayes on SUSY ...
{'model': 'AdaptiveBayes', 'fit_s': 1.7281565590001264, 'pred_s': 0.36536286099908466, 'p



{'model': 'LightGBM', 'fit_s': 28.4404950320004, 'pred_s': 5.763161061000574, 'proba_s': 5.974724818999675, 'cpu_peak_mb_fit': 76.59765625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.80267, 'auc': np.float64(0.8752886438380781), 'dataset': 'SUSY'}
Training LogisticRegression on SUSY ...
{'model': 'LogisticRegression', 'fit_s': 43.01121556399994, 'pred_s': 0.4061513649994595, 'proba_s': 0.4335690780008008, 'cpu_peak_mb_fit': 0.0625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.7887375, 'auc': np.float64(0.8580859056565515), 'dataset': 'SUSY'}
Loading KDDCup99 ...
KDDCup99: X=(494021, 37), y=(494021,)
AdaptiveBayes model created successfully
XGBoost model created successfully
RandomForest model created successfully
MLP model created successfully
LightGBM model created successfully
CatBoost not available (CAT_OK=False)
LogisticRegression model created successfully
Total models created: 6
Training AdaptiveBayes on KDDCup99 ...
{'model': 'AdaptiveBayes', 'fit_s': 0.30553325500113715, 'pred_s': 0.3271617000009428, 



{'model': 'MLP', 'fit_s': 96.47967880000033, 'pred_s': 1.754981609999959, 'proba_s': 1.7602468989989575, 'cpu_peak_mb_fit': 0.0625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.9969839583017054, 'auc': np.float64(0.9985354687876211), 'dataset': 'KDDCup99'}
Training LightGBM on KDDCup99 ...




{'model': 'LightGBM', 'fit_s': 5.330835988999752, 'pred_s': 0.6704027059986402, 'proba_s': 0.6594466009992175, 'cpu_peak_mb_fit': 76.40234375, 'gpu_mem_mb_fit': 0.0, 'acc': 0.9995749202975558, 'auc': np.float64(0.9999978501298146), 'dataset': 'KDDCup99'}
Training LogisticRegression on KDDCup99 ...




{'model': 'LogisticRegression', 'fit_s': 75.62943199000074, 'pred_s': 0.3758281829996122, 'proba_s': 0.37654738099990936, 'cpu_peak_mb_fit': 0.06640625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.8891149233338393, 'auc': np.float64(0.8605965194473565), 'dataset': 'KDDCup99'}
Loading Covertype ...
Covertype: X=(581011, 54), y=(581011,)
AdaptiveBayes model created successfully
XGBoost model created successfully
RandomForest model created successfully
MLP model created successfully
LightGBM model created successfully
CatBoost not available (CAT_OK=False)
LogisticRegression model created successfully
Total models created: 6
Training AdaptiveBayes on Covertype ...
{'model': 'AdaptiveBayes', 'fit_s': 0.34974628499912797, 'pred_s': 0.3744307390006725, 'proba_s': None, 'cpu_peak_mb_fit': 0.0, 'gpu_mem_mb_fit': 0.0, 'acc': 0.43421426297083554, 'auc': None, 'dataset': 'Covertype'}
Training XGBoost on Covertype ...
{'model': 'XGBoost', 'fit_s': 11.489693601000909, 'pred_s': 0.44690461400023196, 'proba_s': N



{'model': 'LightGBM', 'fit_s': 81.84064850600043, 'pred_s': 10.531017110000903, 'proba_s': None, 'cpu_peak_mb_fit': 76.40234375, 'gpu_mem_mb_fit': 0.0, 'acc': 0.9346660585354939, 'auc': None, 'dataset': 'Covertype'}
Training LogisticRegression on Covertype ...




{'model': 'LogisticRegression', 'fit_s': 227.3571067140001, 'pred_s': 0.39441068899941456, 'proba_s': None, 'cpu_peak_mb_fit': 0.0546875, 'gpu_mem_mb_fit': 0.0, 'acc': 0.6920905656480469, 'auc': None, 'dataset': 'Covertype'}
Loading Synthetic ...
Creating synthetic HEPMASS-like dataset...
Synthetic dataset: X=(500000, 28), y=(500000,), class balance=0.615
Synthetic: X=(500000, 28), y=(500000,)
AdaptiveBayes model created successfully
XGBoost model created successfully
RandomForest model created successfully
MLP model created successfully
LightGBM model created successfully
CatBoost not available (CAT_OK=False)
LogisticRegression model created successfully
Total models created: 6
Training AdaptiveBayes on Synthetic ...
{'model': 'AdaptiveBayes', 'fit_s': 0.41583776099832903, 'pred_s': 0.329632037999545, 'proba_s': 0.3275539080004819, 'cpu_peak_mb_fit': 0.0, 'gpu_mem_mb_fit': 0.0, 'acc': 0.46689, 'auc': np.float64(0.46462116407482046), 'dataset': 'Synthetic'}
Training XGBoost on Syntheti



{'model': 'LightGBM', 'fit_s': 16.566909887000293, 'pred_s': 1.4031730289989355, 'proba_s': 1.4045511129988881, 'cpu_peak_mb_fit': 76.41015625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.67988, 'auc': np.float64(0.731728763262864), 'dataset': 'Synthetic'}
Training LogisticRegression on Synthetic ...
{'model': 'LogisticRegression', 'fit_s': 4.282861450999917, 'pred_s': 0.3596518609992927, 'proba_s': 0.36505795099947136, 'cpu_peak_mb_fit': 0.0703125, 'gpu_mem_mb_fit': 0.0, 'acc': 0.67195, 'auc': np.float64(0.7195245912402302), 'dataset': 'Synthetic'}
Loading Avazu ...
Avazu: X=(1000000, 23), y=(1000000,)
AdaptiveBayes model created successfully
XGBoost model created successfully
RandomForest model created successfully
MLP model created successfully
LightGBM model created successfully
CatBoost not available (CAT_OK=False)
LogisticRegression model created successfully
Total models created: 6
Training AdaptiveBayes on Avazu ...
{'model': 'AdaptiveBayes', 'fit_s': 0.5979644610015384, 'pred_s': 0.369103



{'model': 'LightGBM', 'fit_s': 21.676110870999764, 'pred_s': 3.6098453319991677, 'proba_s': 3.40433548999863, 'cpu_peak_mb_fit': 76.40234375, 'gpu_mem_mb_fit': 0.0, 'acc': 0.84404, 'auc': np.float64(0.7657017974967372), 'dataset': 'Avazu'}
Training LogisticRegression on Avazu ...
{'model': 'LogisticRegression', 'fit_s': 8.048615167999742, 'pred_s': 0.3574902239997755, 'proba_s': 0.3495863419993839, 'cpu_peak_mb_fit': 0.06640625, 'gpu_mem_mb_fit': 0.0, 'acc': 0.83978, 'auc': np.float64(0.5020758517711775), 'dataset': 'Avazu'}
Saved results to benchmark_results_optimized.csv
Benchmark completed!
