In [3]:
# XGBoost gblinear on fixed 6-folds with fold-wise StandardScaler and fixed n_estimators
import numpy as np, pandas as pd, json, time, sys, subprocess, warnings, os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import log_loss

SEED = 2025
np.random.seed(SEED)

# Ensure xgboost is available
try:
    import xgboost as xgb
except Exception:
    subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'xgboost>=1.7'], check=True)
    import xgboost as xgb

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

def save_probs_and_logits(prefix: str, oof: np.ndarray, test_pred: np.ndarray):
    np.save(f'oof_{prefix}.npy', oof.astype(np.float32))
    np.save(f'test_{prefix}.npy', test_pred.astype(np.float32))
    oof_log = np.log(np.clip(oof, 1e-15, 1.0))
    test_log = np.log(np.clip(test_pred, 1e-15, 1.0))
    np.save(f'oof_{prefix}_logits.npy', oof_log.astype(np.float32))
    np.save(f'test_{prefix}_logits.npy', test_log.astype(np.float32))
    print(f'Saved oof_{prefix}.npy, test_{prefix}.npy and *_logits.npy', flush=True)

# Load data and folds
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
id_col = 'id'; target_col = 'species'
feature_cols = [c for c in train.columns if c not in [id_col, target_col]]
X = train[feature_cols].values.astype(np.float64, copy=True)
X_test = test[feature_cols].values.astype(np.float64, copy=True)
le = LabelEncoder()
y = le.fit_transform(train[target_col].values)
K = len(le.classes_)
with open('folds_6.json', 'r') as f:
    folds = [(np.array(a, dtype=np.int64), np.array(b, dtype=np.int64)) for a,b in json.load(f)]
print('Data ready:', X.shape, X_test.shape, 'Classes:', K, 'Folds:', len(folds), flush=True)

def run_xgb_gblinear(lrs=(0.1,), lambdas=(100,), alphas=(0.1,), n_estimators=200):
    best = (None, 1e9)
    best_oof = None
    best_test = None
    grid = []
    for eta in lrs:
        for lam in lambdas:
            for alp in alphas:
                grid.append((eta, lam, alp))
    print('Grid size:', len(grid), flush=True)
    for gi, (eta, lam, alp) in enumerate(grid, 1):
        t0 = time.time()
        oof = np.zeros((len(X), K), dtype=np.float64)
        test_pred = np.zeros((len(X_test), K), dtype=np.float64)
        print(f'[{gi}/{len(grid)}] eta={eta}, lambda={lam}, alpha={alp}', flush=True)
        for fi, (trn_idx, val_idx) in enumerate(folds, 1):
            trn_idx = np.array(trn_idx, dtype=np.int64); val_idx = np.array(val_idx, dtype=np.int64)
            sc = StandardScaler(with_mean=True, with_std=True)
            X_tr = sc.fit_transform(X[trn_idx])
            X_va = sc.transform(X[val_idx])
            X_te = sc.transform(X_test)
            clf = xgb.XGBClassifier(
                booster='gblinear',
                objective='multi:softprob',
                num_class=K,
                n_estimators=n_estimators,
                learning_rate=eta,
                reg_lambda=lam,
                reg_alpha=alp,
                eval_metric='mlogloss',
                n_jobs=-1,
                random_state=SEED,
                verbosity=0
            )
            fstart = time.time()
            clf.fit(X_tr, y[trn_idx], eval_set=[(X_va, y[val_idx])], verbose=False)
            P_va = clf.predict_proba(X_va)
            P_te = clf.predict_proba(X_te)
            oof[val_idx] = P_va
            test_pred += P_te / len(folds)
            print(f'  [fold {fi}/{len(folds)}] time={time.time()-fstart:.2f}s', flush=True)
        ll = log_loss(y, clip_norm(oof), labels=list(range(K)))
        print(f'--> OOF={ll:.6f} | params: eta={eta}, lambda={lam}, alpha={alp} | time {time.time()-t0:.2f}s', flush=True)
        if ll < best[1]:
            best = ((eta, lam, alp), ll)
            best_oof = oof
            best_test = test_pred
    print('Best gblinear:', best, flush=True)
    if best_oof is not None:
        save_probs_and_logits('xgb_gblinear', best_oof, best_test)
    return best, best_oof, best_test

best_params, oof_xgb, test_xgb = run_xgb_gblinear()
print('Done. Best params:', best_params)

Data ready: (891, 192) (99, 192) Classes: 99 Folds: 6


Grid size: 1


[1/1] eta=0.1, lambda=100, alpha=0.1


  [fold 1/6] time=1.52s


  [fold 2/6] time=1.51s


  [fold 3/6] time=1.50s


  [fold 4/6] time=1.58s


  [fold 5/6] time=1.59s


  [fold 6/6] time=1.51s


--> OOF=4.612579 | params: eta=0.1, lambda=100, alpha=0.1 | time 9.23s


Best gblinear: ((0.1, 100, 0.1), 4.6125794410508)


Saved oof_xgb_gblinear.npy, test_xgb_gblinear.npy and *_logits.npy


Done. Best params: ((0.1, 100, 0.1), 4.6125794410508)
