# Plan to Win a Medal

- Objective: Leaf Classification (99 classes), metric = multiclass log loss.
- Strategy:
  1) Load data, sanity checks, target distribution, feature types.
  2) Baseline CV with strong tabular models: LightGBM/XGBoost + stratified KFold (seeded).
  3) Tune key hyperparameters with quick opt (Optuna or manual grid) under time budget.
  4) Feature scaling variants and model ensembling (LGBM + XGB + Ridge/LogReg).
  5) Calibrate probabilities if needed.
  6) Train on full train, predict test, create submission.
- Logging: print fold indices, elapsed times, interim scores.
- Checkpoints: After EDA and after first CV baseline, request expert review for optimization guidance.

Next: Load data and perform quick EDA.

In [1]:
import os, sys, time, json, math, random
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder

SEED = 2025
rng = np.random.RandomState(SEED)
pd.set_option('display.max_columns', 200)

t0 = time.time()
train_path = 'train.csv'
test_path = 'test.csv'
sample_path = 'sample_submission.csv'
print('Loading data...')
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sample_sub = pd.read_csv(sample_path)
print(f'train shape: {train.shape}, test shape: {test.shape}')
print('train columns:', list(train.columns))
print('test columns:', list(test.columns))

# Identify columns
assert 'species' in train.columns, 'Target column species not found'
assert 'id' in train.columns and 'id' in test.columns, 'id column missing'
target_col = 'species'
id_col = 'id'
feature_cols = [c for c in train.columns if c not in [id_col, target_col]]
print(f'Num features: {len(feature_cols)}')

# Missing values
missing_train = train[feature_cols].isna().sum().sum()
missing_test = test[feature_cols].isna().sum().sum() if set(feature_cols).issubset(test.columns) else test.isna().sum().sum()
print(f'Missing values - train features: {missing_train}, test features: {missing_test}')

# Basic stats
print('Head of features:')
print(train[feature_cols].head(3))

# Target analysis
y = train[target_col].values
le = LabelEncoder()
y_enc = le.fit_transform(y)
num_classes = len(le.classes_)
print(f'Num classes: {num_classes}')
vc = pd.Series(y).value_counts().sort_values(ascending=False)
print('Top-10 class counts:')
print(vc.head(10))
print('Min/Max class count:', vc.min(), vc.max())

# Constant / duplicate features check
nunique = train[feature_cols].nunique()
const_feats = list(nunique[nunique <= 1].index)
dup_feats = []
seen = {}
for c in feature_cols:
    sig = tuple(np.round(train[c].values, 10))
    if sig in seen:
        dup_feats.append((c, seen[sig]))
    else:
        seen[sig] = c
print(f'Constant features: {len(const_feats)} -> {const_feats[:10]}')
print(f'Duplicate feature pairs (first 5): {dup_feats[:5]}')

elapsed = time.time() - t0
print(f'EDA summary done in {elapsed:.2f}s')

Loading data...
train shape: (891, 194), test shape: (99, 193)
train columns: ['id', 'species', 'margin1', 'margin2', 'margin3', 'margin4', 'margin5', 'margin6', 'margin7', 'margin8', 'margin9', 'margin10', 'margin11', 'margin12', 'margin13', 'margin14', 'margin15', 'margin16', 'margin17', 'margin18', 'margin19', 'margin20', 'margin21', 'margin22', 'margin23', 'margin24', 'margin25', 'margin26', 'margin27', 'margin28', 'margin29', 'margin30', 'margin31', 'margin32', 'margin33', 'margin34', 'margin35', 'margin36', 'margin37', 'margin38', 'margin39', 'margin40', 'margin41', 'margin42', 'margin43', 'margin44', 'margin45', 'margin46', 'margin47', 'margin48', 'margin49', 'margin50', 'margin51', 'margin52', 'margin53', 'margin54', 'margin55', 'margin56', 'margin57', 'margin58', 'margin59', 'margin60', 'margin61', 'margin62', 'margin63', 'margin64', 'shape1', 'shape2', 'shape3', 'shape4', 'shape5', 'shape6', 'shape7', 'shape8', 'shape9', 'shape10', 'shape11', 'shape12', 'shape13', 'shape14', 

In [3]:
# Baseline CV: LDA with StandardScaler + PCA(whiten, retain ~99.7% var)
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import log_loss
import numpy as np

t0 = time.time()
X = train[feature_cols].values.astype(np.float64)
X_test = test[feature_cols].values.astype(np.float64)
y_idx = y_enc  # from previous cell's LabelEncoder
num_classes = len(le.classes_)

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

oof = np.zeros((len(X), num_classes), dtype=np.float64)
test_pred_accum = np.zeros((len(X_test), num_classes), dtype=np.float64)

fold_times = []
for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y_idx), 1):
    fstart = time.time()
    X_tr, X_va = X[trn_idx], X[val_idx]
    y_tr, y_va = y_idx[trn_idx], y_idx[val_idx]

    # Preprocessing inside fold to avoid leakage
    scaler = StandardScaler(with_mean=True, with_std=True)
    X_tr_s = scaler.fit_transform(X_tr)
    X_va_s = scaler.transform(X_va)
    X_test_s = scaler.transform(X_test)

    pca = PCA(n_components=0.997, whiten=True, random_state=SEED)
    X_tr_p = pca.fit_transform(X_tr_s)
    X_va_p = pca.transform(X_va_s)
    X_test_p = pca.transform(X_test_s)
    print(f'[Fold {fold}] PCA comps: {pca.n_components_}', flush=True)

    clf = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
    clf.fit(X_tr_p, y_tr)
    va_proba = clf.predict_proba(X_va_p)
    oof[val_idx] = va_proba
    test_pred_accum += clf.predict_proba(X_test_p) / n_splits

    fold_ll = log_loss(y_va, np.clip(va_proba, 1e-15, 1-1e-15), labels=list(range(num_classes)))
    ftime = time.time() - fstart
    fold_times.append(ftime)
    print(f'[Fold {fold}/{n_splits}] val_idx range=({val_idx.min()}..{val_idx.max()}), size={len(val_idx)}, logloss={fold_ll:.5f}, time={ftime:.2f}s', flush=True)

oof_ll = log_loss(y_idx, np.clip(oof, 1e-15, 1-1e-15), labels=list(range(num_classes)))
print(f'LDA OOF logloss: {oof_ll:.6f}')
print('Avg fold time: {:.2f}s (total {:.2f}s)'.format(np.mean(fold_times), time.time()-t0))

# Save intermediate predictions for reuse
np.save('oof_lda.npy', oof)
np.save('test_pred_lda.npy', test_pred_accum)

# Build submission from LDA as provisional baseline
proba = np.clip(test_pred_accum, 1e-15, 1-1e-15)
proba = proba / proba.sum(axis=1, keepdims=True)
pred_df = pd.DataFrame(proba, columns=list(le.classes_))

# Align columns to sample_submission
sub_cols = [c for c in sample_sub.columns if c != id_col]
if set(sub_cols) != set(le.classes_):
    raise ValueError('LabelEncoder classes do not match sample_submission columns')
pred_df = pred_df[sub_cols]
submission = pd.concat([test[[id_col]].reset_index(drop=True), pred_df.reset_index(drop=True)], axis=1)
submission.to_csv('submission.csv', index=False)
print('Saved submission.csv with shape', submission.shape)

[Fold 1] PCA comps: 133




[Fold 1/10] val_idx range=(26..869), size=90, logloss=0.38379, time=0.26s


[Fold 2] PCA comps: 133


[Fold 2/10] val_idx range=(0..887), size=89, logloss=0.48600, time=0.26s


[Fold 3] PCA comps: 133


[Fold 3/10] val_idx range=(2..889), size=89, logloss=0.38862, time=0.26s


[Fold 4] PCA comps: 133


[Fold 4/10] val_idx range=(1..884), size=89, logloss=0.44110, time=0.26s


[Fold 5] PCA comps: 133


[Fold 5/10] val_idx range=(13..888), size=89, logloss=0.80862, time=0.27s


[Fold 6] PCA comps: 133


[Fold 6/10] val_idx range=(19..874), size=89, logloss=0.00269, time=0.30s


[Fold 7] PCA comps: 133


[Fold 7/10] val_idx range=(3..883), size=89, logloss=0.74465, time=0.28s


[Fold 8] PCA comps: 133


[Fold 8/10] val_idx range=(4..882), size=89, logloss=0.00627, time=0.26s


[Fold 9] PCA comps: 133


[Fold 9/10] val_idx range=(6..876), size=89, logloss=0.26708, time=0.26s


[Fold 10] PCA comps: 133


[Fold 10/10] val_idx range=(12..890), size=89, logloss=0.00005, time=0.26s


LDA OOF logloss: 0.352922
Avg fold time: 0.27s (total 2.69s)
Saved submission.csv with shape (99, 100)


In [4]:
# Build stable 6-fold CV and train fast Logistic Regression baseline (with PCA branch)
import time, json
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import numpy as np

SEED = 2025
X = train[feature_cols].values.astype(np.float64)
X_test = test[feature_cols].values.astype(np.float64)
y_idx = y_enc
num_classes = len(le.classes_)

# Fix CV to 6 folds and freeze indices for reuse across models
n_splits = 6
skf6 = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
fold_indices = [(trn_idx.tolist(), val_idx.tolist()) for trn_idx, val_idx in skf6.split(X, y_idx)]
with open('folds_6.json', 'w') as f:
    json.dump(fold_indices, f)
print(f'Prepared {n_splits}-fold stratified splits. Example fold sizes:', [len(v) for _, v in fold_indices])

# Helper: evaluate a single LogReg setting on fixed folds
def run_logreg_cv(use_pca=True, pca_var=0.997, C=10.0):
    oof = np.zeros((len(X), num_classes), dtype=np.float64)
    test_pred = np.zeros((len(X_test), num_classes), dtype=np.float64)
    fold_lls = []
    t0 = time.time()
    for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
        fstart = time.time()
        trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
        X_tr, X_va = X[trn_idx], X[val_idx]
        y_tr, y_va = y_idx[trn_idx], y_idx[val_idx]

        scaler = StandardScaler()
        X_tr_s = scaler.fit_transform(X_tr)
        X_va_s = scaler.transform(X_va)
        X_te_s = scaler.transform(X_test)

        if use_pca:
            pca = PCA(n_components=pca_var, whiten=True, random_state=SEED)
            X_tr_s = pca.fit_transform(X_tr_s)
            X_va_s = pca.transform(X_va_s)
            X_te_s = pca.transform(X_te_s)
            if i == 1:
                print(f'LogReg PCA components: {pca.n_components_}')

        clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=2000, C=C, n_jobs=None, random_state=SEED)
        clf.fit(X_tr_s, y_tr)
        va_proba = clf.predict_proba(X_va_s)
        oof[val_idx] = va_proba
        test_pred += clf.predict_proba(X_te_s) / n_splits
        ll = log_loss(y_va, np.clip(va_proba, 1e-15, 1-1e-15), labels=list(range(num_classes)))
        fold_lls.append(ll)
        print(f'[LogReg fold {i}/{n_splits}] C={C}, use_pca={use_pca}, pca_var={pca_var}, logloss={ll:.5f}, time={time.time()-fstart:.2f}s', flush=True)

    oof_ll = log_loss(y_idx, np.clip(oof, 1e-15, 1-1e-15), labels=list(range(num_classes)))
    print(f'LogReg OOF logloss (C={C}, use_pca={use_pca}, pca_var={pca_var}): {oof_ll:.6f} | avg fold {np.mean(fold_lls):.6f} in {time.time()-t0:.2f}s')
    return oof, test_pred, oof_ll

# Small, fast grid for LogReg on PCA branch
grid_C = [3.0, 10.0, 30.0]
best = (None, 1e9)
best_oof = None
best_test = None
for C in grid_C:
    oof_lr, test_lr, oof_ll = run_logreg_cv(use_pca=True, pca_var=0.997, C=C)
    if oof_ll < best[1]:
        best = ((True, 0.997, C), oof_ll)
        best_oof = oof_lr
        best_test = test_lr

print('Best LogReg setting:', best)
np.save('oof_logreg.npy', best_oof)
np.save('test_pred_logreg.npy', best_test)

# Provisional submission from best LogReg (for quick LB read if needed)
proba = np.clip(best_test, 1e-15, 1-1e-15)
proba = proba / proba.sum(axis=1, keepdims=True)
pred_df = pd.DataFrame(proba, columns=list(le.classes_))
sub_cols = [c for c in sample_sub.columns if c != id_col]
pred_df = pred_df[sub_cols]
submission_lr = pd.concat([test[[id_col]].reset_index(drop=True), pred_df.reset_index(drop=True)], axis=1)
submission_lr.to_csv('submission_logreg.csv', index=False)
print('Saved submission_logreg.csv with shape', submission_lr.shape)

Prepared 6-fold stratified splits. Example fold sizes: [149, 149, 149, 148, 148, 148]
LogReg PCA components: 133




[LogReg fold 1/6] C=3.0, use_pca=True, pca_var=0.997, logloss=0.25548, time=0.41s


[LogReg fold 2/6] C=3.0, use_pca=True, pca_var=0.997, logloss=0.31293, time=0.22s




[LogReg fold 3/6] C=3.0, use_pca=True, pca_var=0.997, logloss=0.26569, time=0.21s




[LogReg fold 4/6] C=3.0, use_pca=True, pca_var=0.997, logloss=0.30053, time=0.31s




[LogReg fold 5/6] C=3.0, use_pca=True, pca_var=0.997, logloss=0.26575, time=0.26s




[LogReg fold 6/6] C=3.0, use_pca=True, pca_var=0.997, logloss=0.27298, time=0.34s


LogReg OOF logloss (C=3.0, use_pca=True, pca_var=0.997): 0.278890 | avg fold 0.278893 in 1.77s
LogReg PCA components: 133




[LogReg fold 1/6] C=10.0, use_pca=True, pca_var=0.997, logloss=0.18178, time=0.31s


[LogReg fold 2/6] C=10.0, use_pca=True, pca_var=0.997, logloss=0.25298, time=0.23s




[LogReg fold 3/6] C=10.0, use_pca=True, pca_var=0.997, logloss=0.22773, time=0.24s




[LogReg fold 4/6] C=10.0, use_pca=True, pca_var=0.997, logloss=0.22520, time=0.30s


[LogReg fold 5/6] C=10.0, use_pca=True, pca_var=0.997, logloss=0.21709, time=0.24s




[LogReg fold 6/6] C=10.0, use_pca=True, pca_var=0.997, logloss=0.19339, time=0.26s




LogReg OOF logloss (C=10.0, use_pca=True, pca_var=0.997): 0.216377 | avg fold 0.216362 in 1.62s
LogReg PCA components: 133


[LogReg fold 1/6] C=30.0, use_pca=True, pca_var=0.997, logloss=0.17666, time=0.26s






[LogReg fold 2/6] C=30.0, use_pca=True, pca_var=0.997, logloss=0.23997, time=0.36s




[LogReg fold 3/6] C=30.0, use_pca=True, pca_var=0.997, logloss=0.20855, time=0.32s




[LogReg fold 4/6] C=30.0, use_pca=True, pca_var=0.997, logloss=0.23015, time=0.37s






[LogReg fold 6/6] C=30.0, use_pca=True, pca_var=0.997, logloss=0.18585, time=0.34s


LogReg OOF logloss (C=30.0, use_pca=True, pca_var=0.997): 0.209021 | avg fold 0.209024 in 2.05s
Best LogReg setting: ((True, 0.997, 30.0), 0.2090214624173086)
Saved submission_logreg.csv with shape (99, 100)


In [5]:
# SVM RBF on fixed 6-folds: Branch B (Scaler->PCA(whiten)) and Branch A (Scaler only)
import time, json
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss
import numpy as np

SEED = 2025
X = train[feature_cols].values.astype(np.float64)
X_test = test[feature_cols].values.astype(np.float64)
y_idx = y_enc
num_classes = len(le.classes_)

# Load fixed 6-fold indices
with open('folds_6.json', 'r') as f:
    fold_indices = json.load(f)
n_splits = len(fold_indices)

def run_svm_cv(use_pca=True, pca_var=0.997, C=32.0, gamma='scale'):
    oof = np.zeros((len(X), num_classes), dtype=np.float64)
    test_pred = np.zeros((len(X_test), num_classes), dtype=np.float64)
    fold_lls = []
    t0 = time.time()
    for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
        fstart = time.time()
        trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
        X_tr, X_va = X[trn_idx], X[val_idx]
        y_tr, y_va = y_idx[trn_idx], y_idx[val_idx]

        scaler = StandardScaler()
        X_tr_s = scaler.fit_transform(X_tr)
        X_va_s = scaler.transform(X_va)
        X_te_s = scaler.transform(X_test)

        if use_pca:
            pca = PCA(n_components=pca_var, whiten=True, random_state=SEED)
            X_tr_s = pca.fit_transform(X_tr_s)
            X_va_s = pca.transform(X_va_s)
            X_te_s = pca.transform(X_te_s)
            if i == 1:
                print(f'SVM PCA components: {pca.n_components_}')

        clf = SVC(kernel='rbf', C=C, gamma=gamma, probability=True, tol=1e-3, cache_size=1000, random_state=SEED)
        clf.fit(X_tr_s, y_tr)
        va_proba = clf.predict_proba(X_va_s)
        oof[val_idx] = va_proba
        test_pred += clf.predict_proba(X_te_s) / n_splits
        ll = log_loss(y_va, np.clip(va_proba, 1e-15, 1-1e-15), labels=list(range(num_classes)))
        fold_lls.append(ll)
        print(f'[SVM fold {i}/{n_splits}] use_pca={use_pca}, pca_var={pca_var}, C={C}, gamma={gamma}, logloss={ll:.5f}, time={time.time()-fstart:.2f}s', flush=True)
    oof_ll = log_loss(y_idx, np.clip(oof, 1e-15, 1-1e-15), labels=list(range(num_classes)))
    print(f'SVM OOF logloss (use_pca={use_pca}, pca_var={pca_var}, C={C}, gamma={gamma}): {oof_ll:.6f} | avg fold {np.mean(fold_lls):.6f} in {time.time()-t0:.2f}s')
    return oof, test_pred, oof_ll

# Grid for Branch B (with PCA whiten)
pca_vars = [0.995, 0.997, 0.999]
Cs = [16.0, 32.0, 64.0]
gammas = ['scale', 0.01, 0.02]
best_b = (None, 1e9)
best_oof_b = None
best_test_b = None
grid_total = len(pca_vars) * len(Cs) * len(gammas)
gcnt = 0
t_grid = time.time()
for pv in pca_vars:
    for C in Cs:
        for gm in gammas:
            gcnt += 1
            print(f'\n[Grid B {gcnt}/{grid_total}] pv={pv}, C={C}, gamma={gm}', flush=True)
            oof_b, test_b, oof_ll_b = run_svm_cv(use_pca=True, pca_var=pv, C=C, gamma=gm)
            if oof_ll_b < best_b[1]:
                best_b = ((True, pv, C, gm), oof_ll_b)
                best_oof_b = oof_b
                best_test_b = test_b
print('Best SVM Branch B:', best_b, '| grid time {:.2f}s'.format(time.time()-t_grid))
if best_oof_b is not None:
    np.save('oof_svm_pca.npy', best_oof_b)
    np.save('test_pred_svm_pca.npy', best_test_b)
    with open('best_svm_pca.json', 'w') as f: json.dump({'use_pca': True, 'params': best_b[0], 'oof_ll': best_b[1]}, f)

# Compact Branch A (no PCA) grid
Cs_a = [16.0, 32.0, 64.0]
gammas_a = ['scale', 0.01, 0.02]
best_a = (None, 1e9)
best_oof_a = None
best_test_a = None
grid_total_a = len(Cs_a) * len(gammas_a)
gcnt = 0
t_grid_a = time.time()
for C in Cs_a:
    for gm in gammas_a:
        gcnt += 1
        print(f'\n[Grid A {gcnt}/{grid_total_a}] C={C}, gamma={gm}', flush=True)
        oof_a, test_a, oof_ll_a = run_svm_cv(use_pca=False, pca_var=None, C=C, gamma=gm)
        if oof_ll_a < best_a[1]:
            best_a = ((False, None, C, gm), oof_ll_a)
            best_oof_a = oof_a
            best_test_a = test_a
print('Best SVM Branch A:', best_a, '| grid time {:.2f}s'.format(time.time()-t_grid_a))
if best_oof_a is not None:
    np.save('oof_svm_nopca.npy', best_oof_a)
    np.save('test_pred_svm_nopca.npy', best_test_a)
    with open('best_svm_nopca.json', 'w') as f: json.dump({'use_pca': False, 'params': best_a[0], 'oof_ll': best_a[1]}, f)


[Grid B 1/27] pv=0.995, C=16.0, gamma=scale


SVM PCA components: 126


[SVM fold 1/6] use_pca=True, pca_var=0.995, C=16.0, gamma=scale, logloss=2.26388, time=1.11s


[SVM fold 2/6] use_pca=True, pca_var=0.995, C=16.0, gamma=scale, logloss=2.28455, time=1.13s


[SVM fold 3/6] use_pca=True, pca_var=0.995, C=16.0, gamma=scale, logloss=2.20680, time=1.12s


[SVM fold 4/6] use_pca=True, pca_var=0.995, C=16.0, gamma=scale, logloss=2.28633, time=1.16s


[SVM fold 6/6] use_pca=True, pca_var=0.995, C=16.0, gamma=scale, logloss=2.31393, time=1.10s


SVM OOF logloss (use_pca=True, pca_var=0.995, C=16.0, gamma=scale): 2.269320 | avg fold 2.269379 in 6.74s

[Grid B 2/27] pv=0.995, C=16.0, gamma=0.01


SVM PCA components: 126


[SVM fold 1/6] use_pca=True, pca_var=0.995, C=16.0, gamma=0.01, logloss=2.28232, time=1.08s


[SVM fold 2/6] use_pca=True, pca_var=0.995, C=16.0, gamma=0.01, logloss=2.29504, time=1.06s


[SVM fold 3/6] use_pca=True, pca_var=0.995, C=16.0, gamma=0.01, logloss=2.22342, time=1.12s


[SVM fold 4/6] use_pca=True, pca_var=0.995, C=16.0, gamma=0.01, logloss=2.30744, time=1.07s


[SVM fold 5/6] use_pca=True, pca_var=0.995, C=16.0, gamma=0.01, logloss=2.28183, time=1.07s


[SVM fold 6/6] use_pca=True, pca_var=0.995, C=16.0, gamma=0.01, logloss=2.32367, time=1.11s


SVM OOF logloss (use_pca=True, pca_var=0.995, C=16.0, gamma=0.01): 2.285556 | avg fold 2.285619 in 6.51s

[Grid B 3/27] pv=0.995, C=16.0, gamma=0.02


SVM PCA components: 126


[SVM fold 1/6] use_pca=True, pca_var=0.995, C=16.0, gamma=0.02, logloss=3.13674, time=1.05s


[SVM fold 2/6] use_pca=True, pca_var=0.995, C=16.0, gamma=0.02, logloss=3.16026, time=1.07s


[SVM fold 3/6] use_pca=True, pca_var=0.995, C=16.0, gamma=0.02, logloss=3.31272, time=1.08s


[SVM fold 4/6] use_pca=True, pca_var=0.995, C=16.0, gamma=0.02, logloss=3.27937, time=1.07s


[SVM fold 5/6] use_pca=True, pca_var=0.995, C=16.0, gamma=0.02, logloss=3.26055, time=1.08s


[SVM fold 6/6] use_pca=True, pca_var=0.995, C=16.0, gamma=0.02, logloss=3.21094, time=1.08s


SVM OOF logloss (use_pca=True, pca_var=0.995, C=16.0, gamma=0.02): 3.226684 | avg fold 3.226763 in 6.45s

[Grid B 4/27] pv=0.995, C=32.0, gamma=scale


SVM PCA components: 126


[SVM fold 1/6] use_pca=True, pca_var=0.995, C=32.0, gamma=scale, logloss=2.26388, time=1.11s


[SVM fold 2/6] use_pca=True, pca_var=0.995, C=32.0, gamma=scale, logloss=2.28455, time=1.09s


[SVM fold 3/6] use_pca=True, pca_var=0.995, C=32.0, gamma=scale, logloss=2.20680, time=1.12s


[SVM fold 4/6] use_pca=True, pca_var=0.995, C=32.0, gamma=scale, logloss=2.28633, time=1.13s


[SVM fold 5/6] use_pca=True, pca_var=0.995, C=32.0, gamma=scale, logloss=2.26078, time=1.10s


[SVM fold 6/6] use_pca=True, pca_var=0.995, C=32.0, gamma=scale, logloss=2.31393, time=1.08s


SVM OOF logloss (use_pca=True, pca_var=0.995, C=32.0, gamma=scale): 2.269320 | avg fold 2.269379 in 6.64s

[Grid B 5/27] pv=0.995, C=32.0, gamma=0.01


SVM PCA components: 126


[SVM fold 1/6] use_pca=True, pca_var=0.995, C=32.0, gamma=0.01, logloss=2.28232, time=1.07s


[SVM fold 2/6] use_pca=True, pca_var=0.995, C=32.0, gamma=0.01, logloss=2.29504, time=1.08s


[SVM fold 3/6] use_pca=True, pca_var=0.995, C=32.0, gamma=0.01, logloss=2.22342, time=1.07s


[SVM fold 4/6] use_pca=True, pca_var=0.995, C=32.0, gamma=0.01, logloss=2.30744, time=1.07s


[SVM fold 5/6] use_pca=True, pca_var=0.995, C=32.0, gamma=0.01, logloss=2.28183, time=1.07s


[SVM fold 6/6] use_pca=True, pca_var=0.995, C=32.0, gamma=0.01, logloss=2.32367, time=1.05s


SVM OOF logloss (use_pca=True, pca_var=0.995, C=32.0, gamma=0.01): 2.285556 | avg fold 2.285619 in 6.42s

[Grid B 6/27] pv=0.995, C=32.0, gamma=0.02


SVM PCA components: 126


[SVM fold 1/6] use_pca=True, pca_var=0.995, C=32.0, gamma=0.02, logloss=3.13674, time=1.05s


[SVM fold 2/6] use_pca=True, pca_var=0.995, C=32.0, gamma=0.02, logloss=3.16026, time=1.09s


[SVM fold 3/6] use_pca=True, pca_var=0.995, C=32.0, gamma=0.02, logloss=3.31272, time=1.06s


[SVM fold 4/6] use_pca=True, pca_var=0.995, C=32.0, gamma=0.02, logloss=3.27937, time=1.08s


[SVM fold 5/6] use_pca=True, pca_var=0.995, C=32.0, gamma=0.02, logloss=3.26055, time=1.06s


[SVM fold 6/6] use_pca=True, pca_var=0.995, C=32.0, gamma=0.02, logloss=3.21094, time=1.08s


SVM OOF logloss (use_pca=True, pca_var=0.995, C=32.0, gamma=0.02): 3.226684 | avg fold 3.226763 in 6.43s

[Grid B 7/27] pv=0.995, C=64.0, gamma=scale


SVM PCA components: 126


[SVM fold 1/6] use_pca=True, pca_var=0.995, C=64.0, gamma=scale, logloss=2.26388, time=1.08s


[SVM fold 2/6] use_pca=True, pca_var=0.995, C=64.0, gamma=scale, logloss=2.28455, time=1.07s


[SVM fold 3/6] use_pca=True, pca_var=0.995, C=64.0, gamma=scale, logloss=2.20680, time=1.09s


[SVM fold 4/6] use_pca=True, pca_var=0.995, C=64.0, gamma=scale, logloss=2.28633, time=1.10s


[SVM fold 5/6] use_pca=True, pca_var=0.995, C=64.0, gamma=scale, logloss=2.26078, time=1.10s


[SVM fold 6/6] use_pca=True, pca_var=0.995, C=64.0, gamma=scale, logloss=2.31393, time=1.10s


SVM OOF logloss (use_pca=True, pca_var=0.995, C=64.0, gamma=scale): 2.269320 | avg fold 2.269379 in 6.54s

[Grid B 8/27] pv=0.995, C=64.0, gamma=0.01


SVM PCA components: 126


[SVM fold 1/6] use_pca=True, pca_var=0.995, C=64.0, gamma=0.01, logloss=2.28232, time=1.06s


[SVM fold 2/6] use_pca=True, pca_var=0.995, C=64.0, gamma=0.01, logloss=2.29504, time=1.07s


[SVM fold 3/6] use_pca=True, pca_var=0.995, C=64.0, gamma=0.01, logloss=2.22342, time=1.07s


[SVM fold 4/6] use_pca=True, pca_var=0.995, C=64.0, gamma=0.01, logloss=2.30744, time=1.06s


[SVM fold 5/6] use_pca=True, pca_var=0.995, C=64.0, gamma=0.01, logloss=2.28183, time=1.07s


[SVM fold 6/6] use_pca=True, pca_var=0.995, C=64.0, gamma=0.01, logloss=2.32367, time=1.07s


SVM OOF logloss (use_pca=True, pca_var=0.995, C=64.0, gamma=0.01): 2.285556 | avg fold 2.285619 in 6.41s

[Grid B 9/27] pv=0.995, C=64.0, gamma=0.02


SVM PCA components: 126


[SVM fold 1/6] use_pca=True, pca_var=0.995, C=64.0, gamma=0.02, logloss=3.13674, time=1.05s


[SVM fold 2/6] use_pca=True, pca_var=0.995, C=64.0, gamma=0.02, logloss=3.16026, time=1.10s


[SVM fold 3/6] use_pca=True, pca_var=0.995, C=64.0, gamma=0.02, logloss=3.31272, time=1.11s


[SVM fold 4/6] use_pca=True, pca_var=0.995, C=64.0, gamma=0.02, logloss=3.27937, time=1.11s


[SVM fold 5/6] use_pca=True, pca_var=0.995, C=64.0, gamma=0.02, logloss=3.26055, time=1.07s


[SVM fold 6/6] use_pca=True, pca_var=0.995, C=64.0, gamma=0.02, logloss=3.21094, time=1.09s


SVM OOF logloss (use_pca=True, pca_var=0.995, C=64.0, gamma=0.02): 3.226684 | avg fold 3.226763 in 6.52s

[Grid B 10/27] pv=0.997, C=16.0, gamma=scale


SVM PCA components: 133


[SVM fold 1/6] use_pca=True, pca_var=0.997, C=16.0, gamma=scale, logloss=2.29183, time=1.11s


[SVM fold 2/6] use_pca=True, pca_var=0.997, C=16.0, gamma=scale, logloss=2.29615, time=1.13s


[SVM fold 3/6] use_pca=True, pca_var=0.997, C=16.0, gamma=scale, logloss=2.23696, time=1.12s


[SVM fold 4/6] use_pca=True, pca_var=0.997, C=16.0, gamma=scale, logloss=2.29198, time=1.11s


[SVM fold 5/6] use_pca=True, pca_var=0.997, C=16.0, gamma=scale, logloss=2.26871, time=1.18s


[SVM fold 6/6] use_pca=True, pca_var=0.997, C=16.0, gamma=scale, logloss=2.32356, time=1.19s


SVM OOF logloss (use_pca=True, pca_var=0.997, C=16.0, gamma=scale): 2.284834 | avg fold 2.284867 in 6.86s

[Grid B 11/27] pv=0.997, C=16.0, gamma=0.01


SVM PCA components: 133


[SVM fold 1/6] use_pca=True, pca_var=0.997, C=16.0, gamma=0.01, logloss=2.33710, time=1.12s


[SVM fold 2/6] use_pca=True, pca_var=0.997, C=16.0, gamma=0.01, logloss=2.32986, time=1.10s


[SVM fold 3/6] use_pca=True, pca_var=0.997, C=16.0, gamma=0.01, logloss=2.29055, time=1.12s


[SVM fold 4/6] use_pca=True, pca_var=0.997, C=16.0, gamma=0.01, logloss=2.33734, time=1.08s


[SVM fold 5/6] use_pca=True, pca_var=0.997, C=16.0, gamma=0.01, logloss=2.31985, time=1.10s


[SVM fold 6/6] use_pca=True, pca_var=0.997, C=16.0, gamma=0.01, logloss=2.36301, time=1.11s


SVM OOF logloss (use_pca=True, pca_var=0.997, C=16.0, gamma=0.01): 2.329584 | avg fold 2.329619 in 6.64s

[Grid B 12/27] pv=0.997, C=16.0, gamma=0.02


SVM PCA components: 133


[SVM fold 1/6] use_pca=True, pca_var=0.997, C=16.0, gamma=0.02, logloss=3.48328, time=1.10s


[SVM fold 2/6] use_pca=True, pca_var=0.997, C=16.0, gamma=0.02, logloss=3.47278, time=1.11s


[SVM fold 3/6] use_pca=True, pca_var=0.997, C=16.0, gamma=0.02, logloss=3.64244, time=1.11s


[SVM fold 4/6] use_pca=True, pca_var=0.997, C=16.0, gamma=0.02, logloss=3.56165, time=1.10s


[SVM fold 5/6] use_pca=True, pca_var=0.997, C=16.0, gamma=0.02, logloss=3.53969, time=1.10s


[SVM fold 6/6] use_pca=True, pca_var=0.997, C=16.0, gamma=0.02, logloss=3.55231, time=1.11s


SVM OOF logloss (use_pca=True, pca_var=0.997, C=16.0, gamma=0.02): 3.541995 | avg fold 3.542026 in 6.64s

[Grid B 13/27] pv=0.997, C=32.0, gamma=scale


SVM PCA components: 133


[SVM fold 1/6] use_pca=True, pca_var=0.997, C=32.0, gamma=scale, logloss=2.29183, time=1.11s


[SVM fold 2/6] use_pca=True, pca_var=0.997, C=32.0, gamma=scale, logloss=2.29615, time=1.10s


[SVM fold 3/6] use_pca=True, pca_var=0.997, C=32.0, gamma=scale, logloss=2.23696, time=1.11s


[SVM fold 4/6] use_pca=True, pca_var=0.997, C=32.0, gamma=scale, logloss=2.29198, time=1.10s


[SVM fold 5/6] use_pca=True, pca_var=0.997, C=32.0, gamma=scale, logloss=2.26871, time=1.13s


[SVM fold 6/6] use_pca=True, pca_var=0.997, C=32.0, gamma=scale, logloss=2.32356, time=1.12s


SVM OOF logloss (use_pca=True, pca_var=0.997, C=32.0, gamma=scale): 2.284834 | avg fold 2.284867 in 6.68s

[Grid B 14/27] pv=0.997, C=32.0, gamma=0.01


SVM PCA components: 133


[SVM fold 1/6] use_pca=True, pca_var=0.997, C=32.0, gamma=0.01, logloss=2.33710, time=1.12s


[SVM fold 2/6] use_pca=True, pca_var=0.997, C=32.0, gamma=0.01, logloss=2.32986, time=1.13s


[SVM fold 3/6] use_pca=True, pca_var=0.997, C=32.0, gamma=0.01, logloss=2.29055, time=1.10s


[SVM fold 4/6] use_pca=True, pca_var=0.997, C=32.0, gamma=0.01, logloss=2.33734, time=1.11s


[SVM fold 5/6] use_pca=True, pca_var=0.997, C=32.0, gamma=0.01, logloss=2.31985, time=1.09s


[SVM fold 6/6] use_pca=True, pca_var=0.997, C=32.0, gamma=0.01, logloss=2.36301, time=1.12s


SVM OOF logloss (use_pca=True, pca_var=0.997, C=32.0, gamma=0.01): 2.329584 | avg fold 2.329619 in 6.68s

[Grid B 15/27] pv=0.997, C=32.0, gamma=0.02


SVM PCA components: 133


[SVM fold 1/6] use_pca=True, pca_var=0.997, C=32.0, gamma=0.02, logloss=3.48328, time=1.10s


[SVM fold 2/6] use_pca=True, pca_var=0.997, C=32.0, gamma=0.02, logloss=3.47278, time=1.09s


[SVM fold 3/6] use_pca=True, pca_var=0.997, C=32.0, gamma=0.02, logloss=3.64244, time=1.12s


[SVM fold 4/6] use_pca=True, pca_var=0.997, C=32.0, gamma=0.02, logloss=3.56165, time=1.09s


[SVM fold 5/6] use_pca=True, pca_var=0.997, C=32.0, gamma=0.02, logloss=3.53969, time=1.08s


[SVM fold 6/6] use_pca=True, pca_var=0.997, C=32.0, gamma=0.02, logloss=3.55231, time=1.10s


SVM OOF logloss (use_pca=True, pca_var=0.997, C=32.0, gamma=0.02): 3.541995 | avg fold 3.542026 in 6.58s

[Grid B 16/27] pv=0.997, C=64.0, gamma=scale


SVM PCA components: 133


[SVM fold 1/6] use_pca=True, pca_var=0.997, C=64.0, gamma=scale, logloss=2.29183, time=1.11s


[SVM fold 2/6] use_pca=True, pca_var=0.997, C=64.0, gamma=scale, logloss=2.29615, time=1.11s


[SVM fold 3/6] use_pca=True, pca_var=0.997, C=64.0, gamma=scale, logloss=2.23696, time=1.09s


[SVM fold 4/6] use_pca=True, pca_var=0.997, C=64.0, gamma=scale, logloss=2.29198, time=1.10s


[SVM fold 5/6] use_pca=True, pca_var=0.997, C=64.0, gamma=scale, logloss=2.26871, time=1.08s


[SVM fold 6/6] use_pca=True, pca_var=0.997, C=64.0, gamma=scale, logloss=2.32356, time=1.12s


SVM OOF logloss (use_pca=True, pca_var=0.997, C=64.0, gamma=scale): 2.284834 | avg fold 2.284867 in 6.63s

[Grid B 17/27] pv=0.997, C=64.0, gamma=0.01


SVM PCA components: 133


[SVM fold 1/6] use_pca=True, pca_var=0.997, C=64.0, gamma=0.01, logloss=2.33710, time=1.13s


[SVM fold 2/6] use_pca=True, pca_var=0.997, C=64.0, gamma=0.01, logloss=2.32986, time=1.10s


[SVM fold 3/6] use_pca=True, pca_var=0.997, C=64.0, gamma=0.01, logloss=2.29055, time=1.10s


[SVM fold 4/6] use_pca=True, pca_var=0.997, C=64.0, gamma=0.01, logloss=2.33734, time=1.08s


[SVM fold 5/6] use_pca=True, pca_var=0.997, C=64.0, gamma=0.01, logloss=2.31985, time=1.12s


[SVM fold 6/6] use_pca=True, pca_var=0.997, C=64.0, gamma=0.01, logloss=2.36301, time=1.10s


SVM OOF logloss (use_pca=True, pca_var=0.997, C=64.0, gamma=0.01): 2.329584 | avg fold 2.329619 in 6.64s

[Grid B 18/27] pv=0.997, C=64.0, gamma=0.02


SVM PCA components: 133


[SVM fold 1/6] use_pca=True, pca_var=0.997, C=64.0, gamma=0.02, logloss=3.48328, time=1.11s


[SVM fold 2/6] use_pca=True, pca_var=0.997, C=64.0, gamma=0.02, logloss=3.47278, time=1.14s


[SVM fold 3/6] use_pca=True, pca_var=0.997, C=64.0, gamma=0.02, logloss=3.64244, time=1.13s


[SVM fold 4/6] use_pca=True, pca_var=0.997, C=64.0, gamma=0.02, logloss=3.56165, time=1.09s


[SVM fold 5/6] use_pca=True, pca_var=0.997, C=64.0, gamma=0.02, logloss=3.53969, time=1.07s


[SVM fold 6/6] use_pca=True, pca_var=0.997, C=64.0, gamma=0.02, logloss=3.55231, time=1.18s


SVM OOF logloss (use_pca=True, pca_var=0.997, C=64.0, gamma=0.02): 3.541995 | avg fold 3.542026 in 6.73s

[Grid B 19/27] pv=0.999, C=16.0, gamma=scale


SVM PCA components: 143


[SVM fold 1/6] use_pca=True, pca_var=0.999, C=16.0, gamma=scale, logloss=2.30949, time=1.24s


[SVM fold 2/6] use_pca=True, pca_var=0.999, C=16.0, gamma=scale, logloss=2.31558, time=1.23s


[SVM fold 3/6] use_pca=True, pca_var=0.999, C=16.0, gamma=scale, logloss=2.25579, time=1.20s


[SVM fold 4/6] use_pca=True, pca_var=0.999, C=16.0, gamma=scale, logloss=2.30617, time=1.22s


[SVM fold 5/6] use_pca=True, pca_var=0.999, C=16.0, gamma=scale, logloss=2.26556, time=1.25s


[SVM fold 6/6] use_pca=True, pca_var=0.999, C=16.0, gamma=scale, logloss=2.31293, time=1.23s


SVM OOF logloss (use_pca=True, pca_var=0.999, C=16.0, gamma=scale): 2.294248 | avg fold 2.294250 in 7.38s

[Grid B 20/27] pv=0.999, C=16.0, gamma=0.01


SVM PCA components: 143


[SVM fold 1/6] use_pca=True, pca_var=0.999, C=16.0, gamma=0.01, logloss=2.41862, time=1.20s


[SVM fold 2/6] use_pca=True, pca_var=0.999, C=16.0, gamma=0.01, logloss=2.41845, time=1.19s


[SVM fold 3/6] use_pca=True, pca_var=0.999, C=16.0, gamma=0.01, logloss=2.39396, time=1.22s


[SVM fold 4/6] use_pca=True, pca_var=0.999, C=16.0, gamma=0.01, logloss=2.43252, time=1.22s


[SVM fold 5/6] use_pca=True, pca_var=0.999, C=16.0, gamma=0.01, logloss=2.40448, time=1.20s


[SVM fold 6/6] use_pca=True, pca_var=0.999, C=16.0, gamma=0.01, logloss=2.42062, time=1.19s


SVM OOF logloss (use_pca=True, pca_var=0.999, C=16.0, gamma=0.01): 2.414758 | avg fold 2.414773 in 7.22s

[Grid B 21/27] pv=0.999, C=16.0, gamma=0.02


SVM PCA components: 143


[SVM fold 1/6] use_pca=True, pca_var=0.999, C=16.0, gamma=0.02, logloss=3.86347, time=1.20s


[SVM fold 2/6] use_pca=True, pca_var=0.999, C=16.0, gamma=0.02, logloss=3.84718, time=1.20s


[SVM fold 3/6] use_pca=True, pca_var=0.999, C=16.0, gamma=0.02, logloss=3.97599, time=1.18s


[SVM fold 4/6] use_pca=True, pca_var=0.999, C=16.0, gamma=0.02, logloss=3.95912, time=1.20s


[SVM fold 5/6] use_pca=True, pca_var=0.999, C=16.0, gamma=0.02, logloss=3.92593, time=1.16s


[SVM fold 6/6] use_pca=True, pca_var=0.999, C=16.0, gamma=0.02, logloss=3.90117, time=1.20s


SVM OOF logloss (use_pca=True, pca_var=0.999, C=16.0, gamma=0.02): 3.912087 | avg fold 3.912143 in 7.14s

[Grid B 22/27] pv=0.999, C=32.0, gamma=scale


SVM PCA components: 143


[SVM fold 1/6] use_pca=True, pca_var=0.999, C=32.0, gamma=scale, logloss=2.30949, time=1.21s


[SVM fold 2/6] use_pca=True, pca_var=0.999, C=32.0, gamma=scale, logloss=2.31558, time=1.20s


[SVM fold 3/6] use_pca=True, pca_var=0.999, C=32.0, gamma=scale, logloss=2.25579, time=1.22s


[SVM fold 4/6] use_pca=True, pca_var=0.999, C=32.0, gamma=scale, logloss=2.30617, time=1.21s


[SVM fold 5/6] use_pca=True, pca_var=0.999, C=32.0, gamma=scale, logloss=2.26556, time=1.22s


[SVM fold 6/6] use_pca=True, pca_var=0.999, C=32.0, gamma=scale, logloss=2.31293, time=1.17s


SVM OOF logloss (use_pca=True, pca_var=0.999, C=32.0, gamma=scale): 2.294248 | avg fold 2.294250 in 7.23s

[Grid B 23/27] pv=0.999, C=32.0, gamma=0.01


SVM PCA components: 143


[SVM fold 1/6] use_pca=True, pca_var=0.999, C=32.0, gamma=0.01, logloss=2.41862, time=1.20s


[SVM fold 2/6] use_pca=True, pca_var=0.999, C=32.0, gamma=0.01, logloss=2.41845, time=1.21s


[SVM fold 3/6] use_pca=True, pca_var=0.999, C=32.0, gamma=0.01, logloss=2.39396, time=1.19s


[SVM fold 4/6] use_pca=True, pca_var=0.999, C=32.0, gamma=0.01, logloss=2.43252, time=1.20s


[SVM fold 5/6] use_pca=True, pca_var=0.999, C=32.0, gamma=0.01, logloss=2.40448, time=1.17s


[SVM fold 6/6] use_pca=True, pca_var=0.999, C=32.0, gamma=0.01, logloss=2.42062, time=1.21s


SVM OOF logloss (use_pca=True, pca_var=0.999, C=32.0, gamma=0.01): 2.414758 | avg fold 2.414773 in 7.19s

[Grid B 24/27] pv=0.999, C=32.0, gamma=0.02


SVM PCA components: 143


[SVM fold 1/6] use_pca=True, pca_var=0.999, C=32.0, gamma=0.02, logloss=3.86347, time=1.20s


[SVM fold 2/6] use_pca=True, pca_var=0.999, C=32.0, gamma=0.02, logloss=3.84718, time=1.18s


[SVM fold 3/6] use_pca=True, pca_var=0.999, C=32.0, gamma=0.02, logloss=3.97599, time=1.19s


[SVM fold 4/6] use_pca=True, pca_var=0.999, C=32.0, gamma=0.02, logloss=3.95912, time=1.20s


[SVM fold 5/6] use_pca=True, pca_var=0.999, C=32.0, gamma=0.02, logloss=3.92593, time=1.21s


[SVM fold 6/6] use_pca=True, pca_var=0.999, C=32.0, gamma=0.02, logloss=3.90117, time=1.20s


SVM OOF logloss (use_pca=True, pca_var=0.999, C=32.0, gamma=0.02): 3.912087 | avg fold 3.912143 in 7.18s

[Grid B 25/27] pv=0.999, C=64.0, gamma=scale


SVM PCA components: 143


[SVM fold 1/6] use_pca=True, pca_var=0.999, C=64.0, gamma=scale, logloss=2.30949, time=1.18s


[SVM fold 2/6] use_pca=True, pca_var=0.999, C=64.0, gamma=scale, logloss=2.31558, time=1.22s


[SVM fold 3/6] use_pca=True, pca_var=0.999, C=64.0, gamma=scale, logloss=2.25579, time=1.20s


[SVM fold 4/6] use_pca=True, pca_var=0.999, C=64.0, gamma=scale, logloss=2.30617, time=1.20s


[SVM fold 5/6] use_pca=True, pca_var=0.999, C=64.0, gamma=scale, logloss=2.26556, time=1.16s


[SVM fold 6/6] use_pca=True, pca_var=0.999, C=64.0, gamma=scale, logloss=2.31293, time=1.19s


SVM OOF logloss (use_pca=True, pca_var=0.999, C=64.0, gamma=scale): 2.294248 | avg fold 2.294250 in 7.15s

[Grid B 26/27] pv=0.999, C=64.0, gamma=0.01


SVM PCA components: 143


[SVM fold 1/6] use_pca=True, pca_var=0.999, C=64.0, gamma=0.01, logloss=2.41862, time=1.20s


[SVM fold 2/6] use_pca=True, pca_var=0.999, C=64.0, gamma=0.01, logloss=2.41845, time=1.20s


[SVM fold 3/6] use_pca=True, pca_var=0.999, C=64.0, gamma=0.01, logloss=2.39396, time=1.20s


[SVM fold 4/6] use_pca=True, pca_var=0.999, C=64.0, gamma=0.01, logloss=2.43252, time=1.21s


[SVM fold 5/6] use_pca=True, pca_var=0.999, C=64.0, gamma=0.01, logloss=2.40448, time=1.17s


[SVM fold 6/6] use_pca=True, pca_var=0.999, C=64.0, gamma=0.01, logloss=2.42062, time=1.18s


SVM OOF logloss (use_pca=True, pca_var=0.999, C=64.0, gamma=0.01): 2.414758 | avg fold 2.414773 in 7.17s

[Grid B 27/27] pv=0.999, C=64.0, gamma=0.02


SVM PCA components: 143


[SVM fold 1/6] use_pca=True, pca_var=0.999, C=64.0, gamma=0.02, logloss=3.86347, time=1.19s


[SVM fold 2/6] use_pca=True, pca_var=0.999, C=64.0, gamma=0.02, logloss=3.84718, time=1.23s


[SVM fold 3/6] use_pca=True, pca_var=0.999, C=64.0, gamma=0.02, logloss=3.97599, time=1.22s


[SVM fold 4/6] use_pca=True, pca_var=0.999, C=64.0, gamma=0.02, logloss=3.95912, time=1.21s


[SVM fold 5/6] use_pca=True, pca_var=0.999, C=64.0, gamma=0.02, logloss=3.92593, time=1.18s


[SVM fold 6/6] use_pca=True, pca_var=0.999, C=64.0, gamma=0.02, logloss=3.90117, time=1.22s


SVM OOF logloss (use_pca=True, pca_var=0.999, C=64.0, gamma=0.02): 3.912087 | avg fold 3.912143 in 7.27s
Best SVM Branch B: ((True, 0.995, 16.0, 'scale'), 2.2693198809667927) | grid time 183.69s

[Grid A 1/9] C=16.0, gamma=scale


[SVM fold 1/6] use_pca=False, pca_var=None, C=16.0, gamma=scale, logloss=2.47650, time=1.09s


[SVM fold 2/6] use_pca=False, pca_var=None, C=16.0, gamma=scale, logloss=2.44972, time=1.04s


[SVM fold 3/6] use_pca=False, pca_var=None, C=16.0, gamma=scale, logloss=2.40816, time=1.08s


[SVM fold 4/6] use_pca=False, pca_var=None, C=16.0, gamma=scale, logloss=2.44405, time=1.07s


[SVM fold 5/6] use_pca=False, pca_var=None, C=16.0, gamma=scale, logloss=2.46969, time=1.01s


[SVM fold 6/6] use_pca=False, pca_var=None, C=16.0, gamma=scale, logloss=2.45645, time=1.04s


SVM OOF logloss (use_pca=False, pca_var=None, C=16.0, gamma=scale): 2.450742 | avg fold 2.450762 in 6.35s

[Grid A 2/9] C=16.0, gamma=0.01


[SVM fold 1/6] use_pca=False, pca_var=None, C=16.0, gamma=0.01, logloss=2.45812, time=1.13s


[SVM fold 2/6] use_pca=False, pca_var=None, C=16.0, gamma=0.01, logloss=2.42484, time=1.04s


[SVM fold 3/6] use_pca=False, pca_var=None, C=16.0, gamma=0.01, logloss=2.37503, time=1.07s


[SVM fold 4/6] use_pca=False, pca_var=None, C=16.0, gamma=0.01, logloss=2.43501, time=1.04s


[SVM fold 5/6] use_pca=False, pca_var=None, C=16.0, gamma=0.01, logloss=2.45819, time=1.04s


[SVM fold 6/6] use_pca=False, pca_var=None, C=16.0, gamma=0.01, logloss=2.43341, time=1.09s


SVM OOF logloss (use_pca=False, pca_var=None, C=16.0, gamma=0.01): 2.430729 | avg fold 2.430768 in 6.42s

[Grid A 3/9] C=16.0, gamma=0.02


[SVM fold 1/6] use_pca=False, pca_var=None, C=16.0, gamma=0.02, logloss=2.39767, time=1.06s


[SVM fold 2/6] use_pca=False, pca_var=None, C=16.0, gamma=0.02, logloss=2.36219, time=1.05s


[SVM fold 3/6] use_pca=False, pca_var=None, C=16.0, gamma=0.02, logloss=2.29489, time=1.03s


[SVM fold 4/6] use_pca=False, pca_var=None, C=16.0, gamma=0.02, logloss=2.39174, time=1.04s


[SVM fold 5/6] use_pca=False, pca_var=None, C=16.0, gamma=0.02, logloss=2.41681, time=1.06s


[SVM fold 6/6] use_pca=False, pca_var=None, C=16.0, gamma=0.02, logloss=2.36935, time=1.06s


SVM OOF logloss (use_pca=False, pca_var=None, C=16.0, gamma=0.02): 2.372040 | avg fold 2.372109 in 6.31s

[Grid A 4/9] C=32.0, gamma=scale


[SVM fold 1/6] use_pca=False, pca_var=None, C=32.0, gamma=scale, logloss=2.47650, time=1.08s


[SVM fold 2/6] use_pca=False, pca_var=None, C=32.0, gamma=scale, logloss=2.44972, time=1.03s


[SVM fold 3/6] use_pca=False, pca_var=None, C=32.0, gamma=scale, logloss=2.40816, time=1.02s


[SVM fold 4/6] use_pca=False, pca_var=None, C=32.0, gamma=scale, logloss=2.44405, time=1.04s


[SVM fold 5/6] use_pca=False, pca_var=None, C=32.0, gamma=scale, logloss=2.46969, time=1.07s


[SVM fold 6/6] use_pca=False, pca_var=None, C=32.0, gamma=scale, logloss=2.45645, time=1.07s


SVM OOF logloss (use_pca=False, pca_var=None, C=32.0, gamma=scale): 2.450742 | avg fold 2.450762 in 6.32s

[Grid A 5/9] C=32.0, gamma=0.01


[SVM fold 1/6] use_pca=False, pca_var=None, C=32.0, gamma=0.01, logloss=2.45812, time=1.05s


[SVM fold 2/6] use_pca=False, pca_var=None, C=32.0, gamma=0.01, logloss=2.42484, time=1.06s


[SVM fold 3/6] use_pca=False, pca_var=None, C=32.0, gamma=0.01, logloss=2.37503, time=1.06s


[SVM fold 4/6] use_pca=False, pca_var=None, C=32.0, gamma=0.01, logloss=2.43501, time=1.03s


[SVM fold 5/6] use_pca=False, pca_var=None, C=32.0, gamma=0.01, logloss=2.45819, time=1.09s


[SVM fold 6/6] use_pca=False, pca_var=None, C=32.0, gamma=0.01, logloss=2.43341, time=1.11s


SVM OOF logloss (use_pca=False, pca_var=None, C=32.0, gamma=0.01): 2.430729 | avg fold 2.430768 in 6.41s

[Grid A 6/9] C=32.0, gamma=0.02


[SVM fold 1/6] use_pca=False, pca_var=None, C=32.0, gamma=0.02, logloss=2.39767, time=1.05s


[SVM fold 2/6] use_pca=False, pca_var=None, C=32.0, gamma=0.02, logloss=2.36219, time=1.05s


[SVM fold 3/6] use_pca=False, pca_var=None, C=32.0, gamma=0.02, logloss=2.29489, time=1.05s


[SVM fold 4/6] use_pca=False, pca_var=None, C=32.0, gamma=0.02, logloss=2.39174, time=1.07s


[SVM fold 5/6] use_pca=False, pca_var=None, C=32.0, gamma=0.02, logloss=2.41681, time=1.09s


[SVM fold 6/6] use_pca=False, pca_var=None, C=32.0, gamma=0.02, logloss=2.36935, time=1.05s


SVM OOF logloss (use_pca=False, pca_var=None, C=32.0, gamma=0.02): 2.372040 | avg fold 2.372109 in 6.38s

[Grid A 7/9] C=64.0, gamma=scale


[SVM fold 1/6] use_pca=False, pca_var=None, C=64.0, gamma=scale, logloss=2.47650, time=1.05s


[SVM fold 2/6] use_pca=False, pca_var=None, C=64.0, gamma=scale, logloss=2.44972, time=1.02s


[SVM fold 3/6] use_pca=False, pca_var=None, C=64.0, gamma=scale, logloss=2.40816, time=1.01s


[SVM fold 4/6] use_pca=False, pca_var=None, C=64.0, gamma=scale, logloss=2.44405, time=1.07s


[SVM fold 5/6] use_pca=False, pca_var=None, C=64.0, gamma=scale, logloss=2.46969, time=1.07s


[SVM fold 6/6] use_pca=False, pca_var=None, C=64.0, gamma=scale, logloss=2.45645, time=1.05s


SVM OOF logloss (use_pca=False, pca_var=None, C=64.0, gamma=scale): 2.450742 | avg fold 2.450762 in 6.28s

[Grid A 8/9] C=64.0, gamma=0.01


[SVM fold 1/6] use_pca=False, pca_var=None, C=64.0, gamma=0.01, logloss=2.45812, time=1.07s


[SVM fold 2/6] use_pca=False, pca_var=None, C=64.0, gamma=0.01, logloss=2.42484, time=1.05s


[SVM fold 3/6] use_pca=False, pca_var=None, C=64.0, gamma=0.01, logloss=2.37503, time=1.07s


[SVM fold 4/6] use_pca=False, pca_var=None, C=64.0, gamma=0.01, logloss=2.43501, time=1.06s


[SVM fold 5/6] use_pca=False, pca_var=None, C=64.0, gamma=0.01, logloss=2.45819, time=1.05s


[SVM fold 6/6] use_pca=False, pca_var=None, C=64.0, gamma=0.01, logloss=2.43341, time=1.03s


SVM OOF logloss (use_pca=False, pca_var=None, C=64.0, gamma=0.01): 2.430729 | avg fold 2.430768 in 6.34s

[Grid A 9/9] C=64.0, gamma=0.02


[SVM fold 1/6] use_pca=False, pca_var=None, C=64.0, gamma=0.02, logloss=2.39767, time=1.08s


[SVM fold 2/6] use_pca=False, pca_var=None, C=64.0, gamma=0.02, logloss=2.36219, time=1.03s


[SVM fold 3/6] use_pca=False, pca_var=None, C=64.0, gamma=0.02, logloss=2.29489, time=1.03s


[SVM fold 4/6] use_pca=False, pca_var=None, C=64.0, gamma=0.02, logloss=2.39174, time=1.08s


[SVM fold 5/6] use_pca=False, pca_var=None, C=64.0, gamma=0.02, logloss=2.41681, time=1.06s


[SVM fold 6/6] use_pca=False, pca_var=None, C=64.0, gamma=0.02, logloss=2.36935, time=1.08s


SVM OOF logloss (use_pca=False, pca_var=None, C=64.0, gamma=0.02): 2.372040 | avg fold 2.372109 in 6.36s
Best SVM Branch A: ((False, None, 16.0, 0.02), 2.37203963720527) | grid time 57.18s


In [6]:
# KNN (distance-weighted) on fixed 6-folds with StandardScaler; optional PCA grid
import time, json
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss

SEED = 2025
X = train[feature_cols].values.astype(np.float64)
X_test = test[feature_cols].values.astype(np.float64)
y_idx = y_enc
num_classes = len(le.classes_)

# Load fixed 6-folds
with open('folds_6.json', 'r') as f:
    fold_indices = json.load(f)
n_splits = len(fold_indices)

def run_knn_cv(use_pca=False, pca_var=0.997, n_neighbors=15):
    oof = np.zeros((len(X), num_classes), dtype=np.float64)
    test_pred = np.zeros((len(X_test), num_classes), dtype=np.float64)
    fold_lls = []
    t0 = time.time()
    for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
        fstart = time.time()
        trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
        X_tr, X_va = X[trn_idx], X[val_idx]
        y_tr, y_va = y_idx[trn_idx], y_idx[val_idx]

        scaler = StandardScaler()
        X_tr_s = scaler.fit_transform(X_tr)
        X_va_s = scaler.transform(X_va)
        X_te_s = scaler.transform(X_test)

        if use_pca:
            pca = PCA(n_components=pca_var, whiten=False, random_state=SEED)
            X_tr_s = pca.fit_transform(X_tr_s)
            X_va_s = pca.transform(X_va_s)
            X_te_s = pca.transform(X_te_s)
            if i == 1:
                print(f'KNN PCA components: {pca.n_components_}')

        clf = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance', metric='minkowski', p=2, n_jobs=-1)
        clf.fit(X_tr_s, y_tr)
        va_proba = clf.predict_proba(X_va_s)
        oof[val_idx] = va_proba
        test_pred += clf.predict_proba(X_te_s) / n_splits
        ll = log_loss(y_va, np.clip(va_proba, 1e-15, 1-1e-15), labels=list(range(num_classes)))
        fold_lls.append(ll)
        print(f'[KNN fold {i}/{n_splits}] k={n_neighbors}, use_pca={use_pca}, pca_var={pca_var}, logloss={ll:.5f}, time={time.time()-fstart:.2f}s', flush=True)
    oof_ll = log_loss(y_idx, np.clip(oof, 1e-15, 1-1e-15), labels=list(range(num_classes)))
    print(f'KNN OOF logloss (k={n_neighbors}, use_pca={use_pca}, pca_var={pca_var}): {oof_ll:.6f} | avg fold {np.mean(fold_lls):.6f} in {time.time()-t0:.2f}s')
    return oof, test_pred, oof_ll

# Small grid for KNN without PCA (often strong here)
k_list = [5, 9, 15, 25]
best_knn = (None, 1e9)
best_oof = None
best_test = None
for k in k_list:
    oof_k, test_k, llk = run_knn_cv(use_pca=False, pca_var=None, n_neighbors=k)
    if llk < best_knn[1]:
        best_knn = ((False, None, k), llk)
        best_oof = oof_k
        best_test = test_k

print('Best KNN (no PCA):', best_knn)
np.save('oof_knn.npy', best_oof)
np.save('test_pred_knn.npy', best_test)

# Optional quick PCA variant for KNN for diversity
pca_try = [0.997]
for pv in pca_try:
    for k in [9, 15]:
        print(f'\n[Try KNN PCA] pv={pv}, k={k}')
        oof_kp, test_kp, llkp = run_knn_cv(use_pca=True, pca_var=pv, n_neighbors=k)
        np.save(f'oof_knn_pca_{pv}_{k}.npy', oof_kp)
        np.save(f'test_pred_knn_pca_{pv}_{k}.npy', test_kp)

[KNN fold 1/6] k=5, use_pca=False, pca_var=None, logloss=0.10828, time=0.06s


[KNN fold 2/6] k=5, use_pca=False, pca_var=None, logloss=0.31725, time=0.01s


[KNN fold 3/6] k=5, use_pca=False, pca_var=None, logloss=0.08952, time=0.01s


[KNN fold 4/6] k=5, use_pca=False, pca_var=None, logloss=0.11775, time=0.01s


[KNN fold 5/6] k=5, use_pca=False, pca_var=None, logloss=0.14734, time=0.01s


[KNN fold 6/6] k=5, use_pca=False, pca_var=None, logloss=0.12534, time=0.01s


KNN OOF logloss (k=5, use_pca=False, pca_var=None): 0.150983 | avg fold 0.150913 in 0.13s
[KNN fold 1/6] k=9, use_pca=False, pca_var=None, logloss=0.32598, time=0.01s


[KNN fold 2/6] k=9, use_pca=False, pca_var=None, logloss=0.53886, time=0.01s


[KNN fold 3/6] k=9, use_pca=False, pca_var=None, logloss=0.33501, time=0.01s


[KNN fold 4/6] k=9, use_pca=False, pca_var=None, logloss=0.34203, time=0.01s


[KNN fold 5/6] k=9, use_pca=False, pca_var=None, logloss=0.35200, time=0.01s


[KNN fold 6/6] k=9, use_pca=False, pca_var=None, logloss=0.33357, time=0.01s


KNN OOF logloss (k=9, use_pca=False, pca_var=None): 0.371339 | avg fold 0.371243 in 0.09s
[KNN fold 1/6] k=15, use_pca=False, pca_var=None, logloss=0.66115, time=0.01s


[KNN fold 2/6] k=15, use_pca=False, pca_var=None, logloss=0.88601, time=0.01s


[KNN fold 3/6] k=15, use_pca=False, pca_var=None, logloss=0.67115, time=0.01s


[KNN fold 4/6] k=15, use_pca=False, pca_var=None, logloss=0.68595, time=0.01s


[KNN fold 5/6] k=15, use_pca=False, pca_var=None, logloss=0.70447, time=0.01s


[KNN fold 6/6] k=15, use_pca=False, pca_var=None, logloss=0.68042, time=0.01s


KNN OOF logloss (k=15, use_pca=False, pca_var=None): 0.714942 | avg fold 0.714859 in 0.09s


[KNN fold 1/6] k=25, use_pca=False, pca_var=None, logloss=1.05120, time=0.01s


[KNN fold 2/6] k=25, use_pca=False, pca_var=None, logloss=1.27146, time=0.01s


[KNN fold 3/6] k=25, use_pca=False, pca_var=None, logloss=1.06281, time=0.01s


[KNN fold 4/6] k=25, use_pca=False, pca_var=None, logloss=1.07247, time=0.01s


[KNN fold 5/6] k=25, use_pca=False, pca_var=None, logloss=1.09212, time=0.01s


[KNN fold 6/6] k=25, use_pca=False, pca_var=None, logloss=1.06840, time=0.01s


KNN OOF logloss (k=25, use_pca=False, pca_var=None): 1.103162 | avg fold 1.103076 in 0.09s
Best KNN (no PCA): ((False, None, 5), 0.1509830363687263)

[Try KNN PCA] pv=0.997, k=9


KNN PCA components: 133


[KNN fold 1/6] k=9, use_pca=True, pca_var=0.997, logloss=0.32426, time=0.19s


[KNN fold 2/6] k=9, use_pca=True, pca_var=0.997, logloss=0.53954, time=0.23s


[KNN fold 3/6] k=9, use_pca=True, pca_var=0.997, logloss=0.33030, time=0.19s


[KNN fold 4/6] k=9, use_pca=True, pca_var=0.997, logloss=0.34314, time=0.23s


[KNN fold 5/6] k=9, use_pca=True, pca_var=0.997, logloss=0.35567, time=0.22s


[KNN fold 6/6] k=9, use_pca=True, pca_var=0.997, logloss=0.33186, time=0.20s


KNN OOF logloss (k=9, use_pca=True, pca_var=0.997): 0.370887 | avg fold 0.370795 in 1.27s

[Try KNN PCA] pv=0.997, k=15
KNN PCA components: 133
[KNN fold 1/6] k=15, use_pca=True, pca_var=0.997, logloss=0.65987, time=0.19s


[KNN fold 2/6] k=15, use_pca=True, pca_var=0.997, logloss=0.88477, time=0.21s


[KNN fold 3/6] k=15, use_pca=True, pca_var=0.997, logloss=0.66913, time=0.18s


[KNN fold 4/6] k=15, use_pca=True, pca_var=0.997, logloss=0.68473, time=0.22s


[KNN fold 5/6] k=15, use_pca=True, pca_var=0.997, logloss=0.70467, time=0.20s


[KNN fold 6/6] k=15, use_pca=True, pca_var=0.997, logloss=0.67655, time=0.20s


KNN OOF logloss (k=15, use_pca=True, pca_var=0.997): 0.713370 | avg fold 0.713287 in 1.20s


In [7]:
# Blend OOF/test predictions (KNN + LogReg) with weight optimization on OOF
import numpy as np, json, time
from sklearn.metrics import log_loss

SEED = 2025
num_classes = len(le.classes_)

# Load OOF/test preds
oof_knn = np.load('oof_knn.npy')
test_knn = np.load('test_pred_knn.npy')
oof_lr = np.load('oof_logreg.npy')
test_lr = np.load('test_pred_logreg.npy')

def clip_norm(p):
    p = np.clip(p, 1e-15, 1-1e-15)
    p = p / p.sum(axis=1, keepdims=True)
    return p

# Grid-search weights for two-model blend: P = w*KNN + (1-w)*LogReg
best = (None, 1e9)
grid = np.linspace(0.0, 1.0, 101)
t0 = time.time()
for w in grid:
    oof_blend = clip_norm(w * oof_knn + (1-w) * oof_lr)
    ll = log_loss(y_enc, oof_blend, labels=list(range(num_classes)))
    if ll < best[1]:
        best = (w, ll)
print(f'Best blend weight w(KNN): {best[0]:.4f}, OOF logloss: {best[1]:.6f}, searched {len(grid)} weights in {time.time()-t0:.2f}s')

# Build blended submission
w = best[0]
test_blend = clip_norm(w * test_knn + (1-w) * test_lr)
pred_df = pd.DataFrame(test_blend, columns=list(le.classes_))
sub_cols = [c for c in sample_sub.columns if c != id_col]
pred_df = pred_df[sub_cols]
submission_blend = pd.concat([test[[id_col]].reset_index(drop=True), pred_df.reset_index(drop=True)], axis=1)
submission_blend.to_csv('submission.csv', index=False)
submission_blend.to_csv('submission_blend_knn_logreg.csv', index=False)
print('Saved submission.csv and submission_blend_knn_logreg.csv with shape', submission_blend.shape)

Best blend weight w(KNN): 0.8900, OOF logloss: 0.115757, searched 101 weights in 0.25s
Saved submission.csv and submission_blend_knn_logreg.csv with shape (99, 100)


In [8]:
# Extend KNN (k in [1,3,5,7,11]), add LogReg (no PCA), add QDA (reg), then re-blend
import time, json, itertools
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import log_loss

SEED = 2025
X = train[feature_cols].values.astype(np.float64)
X_test = test[feature_cols].values.astype(np.float64)
y_idx = y_enc
num_classes = len(le.classes_)
with open('folds_6.json', 'r') as f:
    fold_indices = json.load(f)
n_splits = len(fold_indices)

def clip_norm(p):
    p = np.clip(p, 1e-15, 1-1e-15)
    p = p / p.sum(axis=1, keepdims=True)
    return p

def run_knn_cv(n_neighbors=5, weights='distance'):
    oof = np.zeros((len(X), num_classes), dtype=np.float64)
    test_pred = np.zeros((len(X_test), num_classes), dtype=np.float64)
    fold_lls = []
    for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
        trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
        X_tr, X_va = X[trn_idx], X[val_idx]
        y_tr, y_va = y_idx[trn_idx], y_idx[val_idx]
        scaler = StandardScaler()
        X_tr_s = scaler.fit_transform(X_tr)
        X_va_s = scaler.transform(X_va)
        X_te_s = scaler.transform(X_test)
        clf = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, metric='minkowski', p=2, n_jobs=-1)
        clf.fit(X_tr_s, y_tr)
        va_proba = clf.predict_proba(X_va_s)
        oof[val_idx] = va_proba
        test_pred += clf.predict_proba(X_te_s) / n_splits
        ll = log_loss(y_va, clip_norm(va_proba), labels=list(range(num_classes)))
        fold_lls.append(ll)
    oof_ll = log_loss(y_idx, clip_norm(oof), labels=list(range(num_classes)))
    print(f'KNN OOF (k={n_neighbors}, weights={weights}): {oof_ll:.6f}')
    return oof, test_pred, oof_ll

def run_logreg_cv(use_pca=False, pca_var=0.999, C=10.0):
    oof = np.zeros((len(X), num_classes), dtype=np.float64)
    test_pred = np.zeros((len(X_test), num_classes), dtype=np.float64)
    for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
        trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
        X_tr, X_va = X[trn_idx], X[val_idx]
        y_tr, y_va = y_idx[trn_idx], y_idx[val_idx]
        scaler = StandardScaler()
        X_tr_s = scaler.fit_transform(X_tr)
        X_va_s = scaler.transform(X_va)
        X_te_s = scaler.transform(X_test)
        if use_pca:
            pca = PCA(n_components=pca_var, whiten=True, random_state=SEED)
            X_tr_s = pca.fit_transform(X_tr_s)
            X_va_s = pca.transform(X_va_s)
            X_te_s = pca.transform(X_te_s)
        clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=2000, C=C, random_state=SEED)
        clf.fit(X_tr_s, y_tr)
        va_proba = clf.predict_proba(X_va_s)
        oof[val_idx] = va_proba
        test_pred += clf.predict_proba(X_te_s) / n_splits
    oof_ll = log_loss(y_idx, clip_norm(oof), labels=list(range(num_classes)))
    print(f'LogReg OOF (use_pca={use_pca}, C={C}): {oof_ll:.6f}')
    return oof, test_pred, oof_ll

def run_qda_cv(reg_param=0.1, use_pca=False, pca_var=0.999):
    oof = np.zeros((len(X), num_classes), dtype=np.float64)
    test_pred = np.zeros((len(X_test), num_classes), dtype=np.float64)
    for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
        trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
        X_tr, X_va = X[trn_idx], X[val_idx]
        y_tr, y_va = y_idx[trn_idx], y_idx[val_idx]
        scaler = StandardScaler()
        X_tr_s = scaler.fit_transform(X_tr)
        X_va_s = scaler.transform(X_va)
        X_te_s = scaler.transform(X_test)
        if use_pca:
            pca = PCA(n_components=pca_var, whiten=False, random_state=SEED)
            X_tr_s = pca.fit_transform(X_tr_s)
            X_va_s = pca.transform(X_va_s)
            X_te_s = pca.transform(X_te_s)
        clf = QuadraticDiscriminantAnalysis(reg_param=reg_param)
        clf.fit(X_tr_s, y_tr)
        va_proba = clf.predict_proba(X_va_s)
        oof[val_idx] = va_proba
        test_pred += clf.predict_proba(X_te_s) / n_splits
    oof_ll = log_loss(y_idx, clip_norm(oof), labels=list(range(num_classes)))
    print(f'QDA OOF (reg={reg_param}, use_pca={use_pca}): {oof_ll:.6f}')
    return oof, test_pred, oof_ll

# 1) Extend KNN search
knn_candidates = []
for k in [1,3,5,7,11]:
    for w in ['distance','uniform']:
        oof_k, test_k, llk = run_knn_cv(n_neighbors=k, weights=w)
        knn_candidates.append(((k,w), llk, oof_k, test_k))
knn_candidates.sort(key=lambda x: x[1])
best_knn = knn_candidates[0]
print('Best KNN:', best_knn[0], 'OOF:', best_knn[1])
np.save('oof_knn.npy', best_knn[2])
np.save('test_pred_knn.npy', best_knn[3])

# 2) LogReg no-PCA small grid
lr_candidates = []
for C in [1.0, 3.0, 10.0, 30.0, 100.0]:
    oof_lr_np, test_lr_np, llr_np = run_logreg_cv(use_pca=False, C=C)
    lr_candidates.append(((False, C), llr_np, oof_lr_np, test_lr_np))
lr_candidates.sort(key=lambda x: x[1])
best_lr_np = lr_candidates[0]
print('Best LogReg (no PCA):', best_lr_np[0], 'OOF:', best_lr_np[1])
np.save('oof_logreg_nopca.npy', best_lr_np[2])
np.save('test_pred_logreg_nopca.npy', best_lr_np[3])

# 3) QDA reg grid
qda_candidates = []
for reg in [0.0, 0.02, 0.05, 0.1, 0.2]:
    try:
        oof_q, test_q, llq = run_qda_cv(reg_param=reg, use_pca=False)
        qda_candidates.append(((reg, False), llq, oof_q, test_q))
    except Exception as e:
        print('QDA failed for reg', reg, '->', e)
qda_candidates.sort(key=lambda x: x[1])
best_qda = qda_candidates[0] if qda_candidates else None
if best_qda:
    print('Best QDA:', best_qda[0], 'OOF:', best_qda[1])
    np.save('oof_qda.npy', best_qda[2])
    np.save('test_pred_qda.npy', best_qda[3])

# 4) Blend: use best KNN + best LogReg (PCA one from earlier) + best LogReg no-PCA + optional QDA
oof_knn = np.load('oof_knn.npy')
test_knn = np.load('test_pred_knn.npy')
oof_lr_pca = np.load('oof_logreg.npy')
test_lr_pca = np.load('test_pred_logreg.npy')
oof_lr_np = np.load('oof_logreg_nopca.npy')
test_lr_np = np.load('test_pred_logreg_nopca.npy')
models = [
    ('knn', oof_knn, test_knn),
    ('lr_pca', oof_lr_pca, test_lr_pca),
    ('lr_np', oof_lr_np, test_lr_np)
]
if best_qda:
    models.append(('qda', best_qda[2], best_qda[3]))

# Coarse grid weights that sum to 1 in steps of 0.1 for up to 4 models
names = [m[0] for m in models]
oofs = [m[1] for m in models]
tests = [m[2] for m in models]
steps = [i/10.0 for i in range(11)]
best = (None, 1e9)
def evaluate_weights(w):
    P = np.zeros_like(oofs[0])
    for wi, Pi in zip(w, oofs):
        P += wi * Pi
    return log_loss(y_idx, clip_norm(P), labels=list(range(num_classes)))

if len(models) <= 3:
    for w1 in steps:
        for w2 in steps:
            w3 = 1.0 - w1 - w2
            if w3 < 0 or w3 > 1:
                continue
            w = [w1, w2, w3][:len(models)]
            ll = evaluate_weights(w)
            if ll < best[1]:
                best = (w, ll)
else:
    for w1 in steps:
        for w2 in steps:
            for w3 in steps:
                w4 = 1.0 - w1 - w2 - w3
                if w4 < 0 or w4 > 1:
                    continue
                w = [w1, w2, w3, w4]
                ll = evaluate_weights(w)
                if ll < best[1]:
                    best = (w, ll)

print('Best blend on OOF:', dict(zip(names, best[0])), 'OOF:', best[1])

# Build final blended submission.csv using best weights
w = best[0]
Ptest = np.zeros_like(tests[0])
for wi, Ti in zip(w, tests):
    Ptest += wi * Ti
Ptest = clip_norm(Ptest)
pred_df = pd.DataFrame(Ptest, columns=list(le.classes_))
sub_cols = [c for c in sample_sub.columns if c != id_col]
pred_df = pred_df[sub_cols]
submission_final = pd.concat([test[[id_col]].reset_index(drop=True), pred_df.reset_index(drop=True)], axis=1)
submission_final.to_csv('submission.csv', index=False)
print('Saved submission.csv (blended) with shape', submission_final.shape)

KNN OOF (k=1, weights=distance): 0.697753
KNN OOF (k=1, weights=uniform): 0.697753
KNN OOF (k=3, weights=distance): 0.211137


KNN OOF (k=3, weights=uniform): 0.216371
KNN OOF (k=5, weights=distance): 0.150983
KNN OOF (k=5, weights=uniform): 0.163544


KNN OOF (k=7, weights=distance): 0.237886
KNN OOF (k=7, weights=uniform): 0.267618
KNN OOF (k=11, weights=distance): 0.502239


KNN OOF (k=11, weights=uniform): 0.601040
Best KNN: (5, 'distance') OOF: 0.15098303636882396














LogReg OOF (use_pca=False, C=1.0): 0.115043














LogReg OOF (use_pca=False, C=3.0): 0.078270














LogReg OOF (use_pca=False, C=10.0): 0.058210












LogReg OOF (use_pca=False, C=30.0): 0.048535










LogReg OOF (use_pca=False, C=100.0): 0.052654
Best LogReg (no PCA): (False, 30.0) OOF: 0.048534560563282744






QDA OOF (reg=0.0, use_pca=False): 33.647203




QDA OOF (reg=0.02, use_pca=False): 9.651246




QDA OOF (reg=0.05, use_pca=False): 5.737604




QDA OOF (reg=0.1, use_pca=False): 4.217850




QDA OOF (reg=0.2, use_pca=False): 3.510059
Best QDA: (0.2, False) OOF: 3.5100591944994273


Best blend on OOF: {'knn': 0.0, 'lr_pca': 0.0, 'lr_np': 1.0, 'qda': 0.0} OOF: 0.048534560563282744
Saved submission.csv (blended) with shape (99, 100)


In [9]:
# Expanded Logistic Regression (no PCA) tuning: solvers, C sweep, scaler on/off
import time, json
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

SEED = 2025
X = train[feature_cols].values.astype(np.float64)
X_test = test[feature_cols].values.astype(np.float64)
y_idx = y_enc
num_classes = len(le.classes_)
with open('folds_6.json', 'r') as f:
    fold_indices = json.load(f)
n_splits = len(fold_indices)

def clip_norm(p):
    p = np.clip(p, 1e-15, 1-1e-15)
    p = p / p.sum(axis=1, keepdims=True)
    return p

def run_logreg_nopca_cv(C=100.0, solver='lbfgs', use_scaler=True):
    oof = np.zeros((len(X), num_classes), dtype=np.float64)
    test_pred = np.zeros((len(X_test), num_classes), dtype=np.float64)
    t0 = time.time()
    for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
        trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
        X_tr, X_va = X[trn_idx], X[val_idx]
        y_tr, y_va = y_idx[trn_idx], y_idx[val_idx]
        if use_scaler:
            scaler = StandardScaler()
            X_tr = scaler.fit_transform(X_tr)
            X_va = scaler.transform(X_va)
            X_te = scaler.transform(X_test)
        else:
            X_te = X_test
        clf = LogisticRegression(multi_class='multinomial', solver=solver, penalty='l2', C=C, max_iter=5000, random_state=SEED, n_jobs=None if solver!='saga' else None)
        clf.fit(X_tr, y_tr)
        va_proba = clf.predict_proba(X_va)
        oof[val_idx] = va_proba
        test_pred += clf.predict_proba(X_te) / n_splits
    oof_ll = log_loss(y_idx, clip_norm(oof), labels=list(range(num_classes)))
    print(f'LogReg no-PCA OOF: {oof_ll:.6f} | C={C}, solver={solver}, scaler={use_scaler} in {time.time()-t0:.2f}s')
    return oof, test_pred, oof_ll

Cs = [20, 30, 40, 50, 60, 80, 100, 120, 150, 200, 300, 500, 800, 1000, 1500, 2000]
solvers = ['lbfgs', 'saga']  # both support multinomial + l2
scaler_opts = [True, False]
best = (None, 1e9)
best_oof = None
best_test = None
grid_total = len(Cs)*len(solvers)*len(scaler_opts)
gcnt = 0
for C in Cs:
    for solver in solvers:
        for use_scaler in scaler_opts:
            gcnt += 1
            print(f'\n[LogReg grid {gcnt}/{grid_total}] C={C}, solver={solver}, scaler={use_scaler}')
            try:
                oof_lr, test_lr, oof_ll = run_logreg_nopca_cv(C=C, solver=solver, use_scaler=use_scaler)
                if oof_ll < best[1]:
                    best = ((C, solver, use_scaler), oof_ll)
                    best_oof = oof_lr
                    best_test = test_lr
            except Exception as e:
                print('Config failed:', C, solver, use_scaler, '->', e)

print('Best LogReg no-PCA:', best)
if best_oof is not None:
    np.save('oof_logreg_nopca.npy', best_oof)
    np.save('test_pred_logreg_nopca.npy', best_test)

# Update submission.csv using best LogReg no-PCA
Ptest = clip_norm(best_test)
pred_df = pd.DataFrame(Ptest, columns=list(le.classes_))
sub_cols = [c for c in sample_sub.columns if c != id_col]
pred_df = pred_df[sub_cols]
submission_lr_np = pd.concat([test[[id_col]].reset_index(drop=True), pred_df.reset_index(drop=True)], axis=1)
submission_lr_np.to_csv('submission.csv', index=False)
print('Saved submission.csv (best LogReg no-PCA) with shape', submission_lr_np.shape)


[LogReg grid 1/64] C=20, solver=lbfgs, scaler=True












LogReg no-PCA OOF: 0.048727 | C=20, solver=lbfgs, scaler=True in 1.56s

[LogReg grid 2/64] C=20, solver=lbfgs, scaler=False














LogReg no-PCA OOF: 1.814009 | C=20, solver=lbfgs, scaler=False in 2.80s

[LogReg grid 3/64] C=20, solver=saga, scaler=True














LogReg no-PCA OOF: 0.059831 | C=20, solver=saga, scaler=True in 238.44s

[LogReg grid 4/64] C=20, solver=saga, scaler=False














LogReg no-PCA OOF: 1.815281 | C=20, solver=saga, scaler=False in 21.96s

[LogReg grid 5/64] C=30, solver=lbfgs, scaler=True










LogReg no-PCA OOF: 0.048535 | C=30, solver=lbfgs, scaler=True in 1.47s

[LogReg grid 6/64] C=30, solver=lbfgs, scaler=False














LogReg no-PCA OOF: 1.472730 | C=30, solver=lbfgs, scaler=False in 3.34s

[LogReg grid 7/64] C=30, solver=saga, scaler=True






In [10]:
# Exact and near-duplicate overrides on top of best LR(no-PCA) predictions
import numpy as np, json, time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss

X = train[feature_cols].values.astype(np.float64)
X_test = test[feature_cols].values.astype(np.float64)
y_idx = y_enc
num_classes = len(le.classes_)

# Load fixed folds and base LR(no-PCA) preds
with open('folds_6.json', 'r') as f:
    fold_indices = json.load(f)
base_oof = np.load('oof_logreg_nopca.npy')
base_test = np.load('test_pred_logreg_nopca.npy')

def onehot(idx, K):
    v = np.zeros(K, dtype=np.float64)
    v[idx] = 1.0
    return v

def clip_norm(p):
    p = np.clip(p, 1e-15, 1-1e-15)
    p = p / p.sum(axis=1, keepdims=True)
    return p

# Parameters from expert guidance
HARD_SIM = 0.9995
SOFT_SIM = 0.997
ALPHA = 0.8  # blend weight for NN onehot in soft override

# 1) Exact duplicate overrides
def build_exact_map(X_tr, y_tr):
    m = {}
    for r, yi in zip(X_tr, y_tr):
        key = tuple(np.round(r, 12))
        if key not in m:
            m[key] = yi
    return m

# Apply to OOF via folds
adj_oof = base_oof.copy()
exact_oof_cnt = 0
soft_oof_cnt = 0
hard_oof_cnt = 0
t0 = time.time()
for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
    trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
    X_tr, X_va = X[trn_idx], X[val_idx]
    y_tr = y_idx[trn_idx]
    # Exact
    emap = build_exact_map(X_tr, y_tr)
    for j, r in zip(val_idx, X_va):
        key = tuple(np.round(r, 12))
        if key in emap:
            adj_oof[j] = onehot(emap[key], num_classes)
            exact_oof_cnt += 1
    # Near-duplicate via 1-NN cosine
    knn = KNeighborsClassifier(n_neighbors=1, metric='cosine')
    knn.fit(X_tr, y_tr)
    dists, nbrs = knn.kneighbors(X_va, n_neighbors=1, return_distance=True)
    sims = 1.0 - dists.ravel()
    nn_cls = y_tr[nbrs.ravel()]
    for idx_loc, j in enumerate(val_idx):
        if tuple(np.round(X_va[idx_loc],12)) in emap:
            continue  # already exact
        s = sims[idx_loc]
        if s >= HARD_SIM:
            adj_oof[j] = onehot(nn_cls[idx_loc], num_classes)
            hard_oof_cnt += 1
        elif s >= SOFT_SIM:
            oh = onehot(nn_cls[idx_loc], num_classes)
            adj_oof[j] = clip_norm(ALPHA*oh + (1-ALPHA)*adj_oof[j])
            soft_oof_cnt += 1
print(f'OOF overrides -> exact:{exact_oof_cnt}, hard:{hard_oof_cnt}, soft:{soft_oof_cnt} in {time.time()-t0:.2f}s')

oof_ll_before = log_loss(y_idx, clip_norm(base_oof), labels=list(range(num_classes)))
oof_ll_after = log_loss(y_idx, clip_norm(adj_oof), labels=list(range(num_classes)))
print(f'OOF logloss before: {oof_ll_before:.6f} | after overrides: {oof_ll_after:.6f}')

# 2) Apply to TEST using full train as reference
adj_test = base_test.copy()
emap_full = build_exact_map(X, y_idx)
exact_test_cnt = 0
soft_test_cnt = 0
hard_test_cnt = 0
# Exact matches
for i, r in enumerate(X_test):
    key = tuple(np.round(r, 12))
    if key in emap_full:
        adj_test[i] = onehot(emap_full[key], num_classes)
        exact_test_cnt += 1
# Near-duplicates
knn_full = KNeighborsClassifier(n_neighbors=1, metric='cosine')
knn_full.fit(X, y_idx)
dists_t, nbrs_t = knn_full.kneighbors(X_test, n_neighbors=1, return_distance=True)
sims_t = 1.0 - dists_t.ravel()
nn_cls_t = y_idx[nbrs_t.ravel()]
for i in range(len(X_test)):
    key = tuple(np.round(X_test[i],12))
    if key in emap_full:
        continue
    s = sims_t[i]
    if s >= HARD_SIM:
        adj_test[i] = onehot(nn_cls_t[i], num_classes)
        hard_test_cnt += 1
    elif s >= SOFT_SIM:
        oh = onehot(nn_cls_t[i], num_classes)
        adj_test[i] = clip_norm(ALPHA*oh + (1-ALPHA)*adj_test[i])
        soft_test_cnt += 1
print(f'TEST overrides -> exact:{exact_test_cnt}, hard:{hard_test_cnt}, soft:{soft_test_cnt}')

# 3) Save adjusted predictions and write submission
np.save('oof_lr_nopca_overridden.npy', adj_oof)
np.save('test_lr_nopca_overridden.npy', adj_test)
proba = clip_norm(adj_test)
pred_df = pd.DataFrame(proba, columns=list(le.classes_))
sub_cols = [c for c in sample_sub.columns if c != id_col]
pred_df = pred_df[sub_cols]
submission_override = pd.concat([test[[id_col]].reset_index(drop=True), pred_df.reset_index(drop=True)], axis=1)
submission_override.to_csv('submission.csv', index=False)
print('Saved submission.csv (overrides applied) with shape', submission_override.shape)

OOF overrides -> exact:0, hard:0, soft:0 in 0.51s
OOF logloss before: 0.048535 | after overrides: 0.048535
TEST overrides -> exact:0, hard:0, soft:0
Saved submission.csv (overrides applied) with shape (99, 100)


In [11]:
# Hellinger and Quantile transforms + LR; KNN cosine; then blend with best LR(no-PCA)
import numpy as np, json, time
import pandas as pd
from sklearn.preprocessing import normalize, QuantileTransformer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss

SEED = 2025
X = train[feature_cols].values.astype(np.float64)
X_test = test[feature_cols].values.astype(np.float64)
y_idx = y_enc
num_classes = len(le.classes_)
with open('folds_6.json', 'r') as f:
    fold_indices = json.load(f)
n_splits = len(fold_indices)

def clip_norm(p):
    p = np.clip(p, 1e-15, 1-1e-15)
    p = p / p.sum(axis=1, keepdims=True)
    return p

# 1) Hellinger transform branch: sqrt(clip) -> row L2 normalize -> LR
def hellinger_transform(A):
    A2 = np.sqrt(np.clip(A, 0, None))
    return normalize(A2, norm='l2', axis=1, copy=False)

def run_lr_hellinger(C=20.0):
    oof = np.zeros((len(X), num_classes), dtype=np.float64)
    test_pred = np.zeros((len(X_test), num_classes), dtype=np.float64)
    for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
        trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
        X_tr, X_va = X[trn_idx], X[val_idx]
        y_tr, y_va = y_idx[trn_idx], y_idx[val_idx]
        X_tr_h = hellinger_transform(X_tr.copy())
        X_va_h = hellinger_transform(X_va.copy())
        X_te_h = hellinger_transform(X_test.copy())
        clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=C, max_iter=3000, random_state=SEED)
        clf.fit(X_tr_h, y_tr)
        va_proba = clf.predict_proba(X_va_h)
        oof[val_idx] = va_proba
        test_pred += clf.predict_proba(X_te_h) / n_splits
    oof_ll = log_loss(y_idx, clip_norm(oof), labels=list(range(num_classes)))
    print(f'LR-Hellinger OOF: {oof_ll:.6f} | C={C}')
    return oof, test_pred, oof_ll

hell_Cs = [8.0, 12.0, 20.0, 30.0, 50.0]
best_hell = (None, 1e9); best_hell_oof=None; best_hell_test=None
for C in hell_Cs:
    oof_h, test_h, llh = run_lr_hellinger(C=C)
    if llh < best_hell[1]:
        best_hell = ((C,), llh); best_hell_oof=oof_h; best_hell_test=test_h
print('Best LR-Hellinger:', best_hell)
np.save('oof_lr_hell.npy', best_hell_oof)
np.save('test_pred_lr_hell.npy', best_hell_test)

# 2) QuantileTransformer to normal -> LR
def run_lr_quantile(C=30.0):
    oof = np.zeros((len(X), num_classes), dtype=np.float64)
    test_pred = np.zeros((len(X_test), num_classes), dtype=np.float64)
    for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
        trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
        X_tr, X_va = X[trn_idx], X[val_idx]
        y_tr, y_va = y_idx[trn_idx], y_idx[val_idx]
        qt = QuantileTransformer(n_quantiles=min(1000, X_tr.shape[0]), output_distribution='normal', random_state=SEED, subsample=1_000_000, copy=True)
        X_tr_q = qt.fit_transform(X_tr)
        X_va_q = qt.transform(X_va)
        X_te_q = qt.transform(X_test)
        # Scale after quantile to stabilize LR
        scaler = StandardScaler()
        X_tr_q = scaler.fit_transform(X_tr_q)
        X_va_q = scaler.transform(X_va_q)
        X_te_q = scaler.transform(X_te_q)
        clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=C, max_iter=3000, random_state=SEED)
        clf.fit(X_tr_q, y_tr)
        va_proba = clf.predict_proba(X_va_q)
        oof[val_idx] = va_proba
        test_pred += clf.predict_proba(X_te_q) / n_splits
    oof_ll = log_loss(y_idx, clip_norm(oof), labels=list(range(num_classes)))
    print(f'LR-Quantile OOF: {oof_ll:.6f} | C={C}')
    return oof, test_pred, oof_ll

q_Cs = [10.0, 30.0, 60.0, 100.0]
best_q = (None, 1e9); best_q_oof=None; best_q_test=None
for C in q_Cs:
    oof_q, test_q, llq = run_lr_quantile(C=C)
    if llq < best_q[1]:
        best_q = ((C,), llq); best_q_oof=oof_q; best_q_test=test_q
print('Best LR-Quantile:', best_q)
np.save('oof_lr_quant.npy', best_q_oof)
np.save('test_pred_lr_quant.npy', best_q_test)

# 3) KNN cosine (as a model for diversity)
def run_knn_cosine(k=3, weights='distance'):
    oof = np.zeros((len(X), num_classes), dtype=np.float64)
    test_pred = np.zeros((len(X_test), num_classes), dtype=np.float64)
    for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
        trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
        X_tr, X_va = X[trn_idx], X[val_idx]
        y_tr, y_va = y_idx[trn_idx], y_idx[val_idx]
        clf = KNeighborsClassifier(n_neighbors=k, metric='cosine', weights=weights)
        clf.fit(X_tr, y_tr)
        va_proba = clf.predict_proba(X_va)
        oof[val_idx] = va_proba
        test_pred += clf.predict_proba(X_test) / n_splits
    oof_ll = log_loss(y_idx, clip_norm(oof), labels=list(range(num_classes)))
    print(f'KNN-cosine OOF: {oof_ll:.6f} | k={k}, weights={weights}')
    return oof, test_pred, oof_ll

best_knn_c = (None, 1e9); best_knn_c_oof=None; best_knn_c_test=None
for k in [1,3,5,7]:
    for w in ['distance','uniform']:
        oof_c, test_c, llc = run_knn_cosine(k=k, weights=w)
        if llc < best_knn_c[1]:
            best_knn_c=((k,w), llc); best_knn_c_oof=oof_c; best_knn_c_test=test_c
print('Best KNN-cosine:', best_knn_c)
np.save('oof_knn_cos.npy', best_knn_c_oof)
np.save('test_pred_knn_cos.npy', best_knn_c_test)

# 4) Blend top models: LR(no-PCA best), LR-Hellinger, LR-Quantile, KNN-cosine
base_lr_oof = np.load('oof_logreg_nopca.npy')
base_lr_test = np.load('test_pred_logreg_nopca.npy')
oofs = [base_lr_oof, best_hell_oof, best_q_oof, best_knn_c_oof]
tests = [base_lr_test, best_hell_test, best_q_test, best_knn_c_test]
names = ['lr_base','lr_hell','lr_quant','knn_cos']

# Coarse grid weights (step 0.1) summing to 1 for 4 models
steps = [i/10.0 for i in range(11)]
best = (None, 1e9)
cnt=0
for w1 in steps:
    for w2 in steps:
        for w3 in steps:
            w4 = 1.0 - w1 - w2 - w3
            if w4 < 0 or w4 > 1:
                continue
            cnt += 1
            w = np.array([w1,w2,w3,w4])
            P = np.zeros_like(oofs[0])
            for wi, Pi in zip(w, oofs):
                P += wi * Pi
            ll = log_loss(y_idx, clip_norm(P), labels=list(range(num_classes)))
            if ll < best[1]:
                best = (w, ll)
print(f'Blend search tried {cnt} combos. Best OOF {best[1]:.6f} with weights:', dict(zip(names, best[0])))

# Build final blended submission
w = best[0]
Ptest = np.zeros_like(tests[0])
for wi, Ti in zip(w, tests):
    Ptest += wi * Ti
Ptest = clip_norm(Ptest)
pred_df = pd.DataFrame(Ptest, columns=list(le.classes_))
sub_cols = [c for c in sample_sub.columns if c != id_col]
pred_df = pred_df[sub_cols]
submission_final2 = pd.concat([test[[id_col]].reset_index(drop=True), pred_df.reset_index(drop=True)], axis=1)
submission_final2.to_csv('submission.csv', index=False)
print('Saved submission.csv (enhanced blend) with shape', submission_final2.shape)













LR-Hellinger OOF: 1.100540 | C=8.0














LR-Hellinger OOF: 0.870645 | C=12.0














LR-Hellinger OOF: 0.655639 | C=20.0














LR-Hellinger OOF: 0.526478 | C=30.0














LR-Hellinger OOF: 0.410351 | C=50.0
Best LR-Hellinger: ((50.0,), 0.4103505254965633)














LR-Quantile OOF: 0.111609 | C=10.0














LR-Quantile OOF: 0.096319 | C=30.0














LR-Quantile OOF: 0.092994 | C=60.0














LR-Quantile OOF: 0.092544 | C=100.0
Best LR-Quantile: ((100.0,), 0.09254423017942547)
KNN-cosine OOF: 3.333709 | k=1, weights=distance


KNN-cosine OOF: 3.333709 | k=1, weights=uniform
KNN-cosine OOF: 1.741756 | k=3, weights=distance
KNN-cosine OOF: 1.772478 | k=3, weights=uniform


KNN-cosine OOF: 1.404837 | k=5, weights=distance
KNN-cosine OOF: 1.476497 | k=5, weights=uniform
KNN-cosine OOF: 1.364209 | k=7, weights=distance


KNN-cosine OOF: 1.480876 | k=7, weights=uniform
Best KNN-cosine: ((7, 'distance'), 1.3642091332998065)


Blend search tried 269 combos. Best OOF 0.048535 with weights: {'lr_base': 1.0, 'lr_hell': 0.0, 'lr_quant': 0.0, 'knn_cos': 0.0}
Saved submission.csv (enhanced blend) with shape (99, 100)


In [13]:
# LightGBM multiclass (regularized) with 6-fold CV, then blend with LR(no-PCA)
import time, json, sys, subprocess, importlib
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss

def ensure_lightgbm():
    try:
        import lightgbm as lgb
        return lgb
    except Exception:
        print('Installing lightgbm...')
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'lightgbm'], check=True)
        import lightgbm as lgb
        return lgb

lgb = ensure_lightgbm()

SEED = 2025
X = train[feature_cols].values.astype(np.float32)
X_test = test[feature_cols].values.astype(np.float32)
y_idx = y_enc.astype(np.int32)
num_classes = len(le.classes_)
with open('folds_6.json', 'r') as f:
    fold_indices = json.load(f)
n_splits = len(fold_indices)

params = {
    'objective': 'multiclass',
    'num_class': num_classes,
    'metric': 'multi_logloss',
    'learning_rate': 0.05,
    'num_leaves': 12,
    'max_depth': 4,
    'min_data_in_leaf': 50,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l2': 3.0,
    'verbosity': -1,
    'seed': SEED,
}

oof = np.zeros((len(X), num_classes), dtype=np.float32)
test_pred = np.zeros((len(X_test), num_classes), dtype=np.float32)
fold_times = []
t0 = time.time()
for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
    fstart = time.time()
    trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
    X_tr, X_va = X[trn_idx], X[val_idx]
    y_tr, y_va = y_idx[trn_idx], y_idx[val_idx]
    dtrain = lgb.Dataset(X_tr, label=y_tr, free_raw_data=True)
    dvalid = lgb.Dataset(X_va, label=y_va, free_raw_data=True)
    clf = lgb.train(
        params,
        dtrain,
        num_boost_round=4000,
        valid_sets=[dvalid],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(period=100)
        ]
    )
    va_proba = clf.predict(X_va, num_iteration=clf.best_iteration)
    oof[val_idx] = va_proba
    test_pred += clf.predict(X_test, num_iteration=clf.best_iteration) / n_splits
    ll = log_loss(y_va, np.clip(va_proba, 1e-15, 1-1e-15), labels=list(range(num_classes)))
    et = time.time()-fstart; fold_times.append(et)
    print(f'[LGBM fold {i}/{n_splits}] best_iter={clf.best_iteration}, val logloss={ll:.6f}, time={et:.2f}s', flush=True)

oof_ll = log_loss(y_idx, np.clip(oof, 1e-15, 1-1e-15), labels=list(range(num_classes)))
print(f'LGBM OOF logloss: {oof_ll:.6f}; avg fold time {np.mean(fold_times):.2f}s; total {time.time()-t0:.2f}s')
np.save('oof_lgbm.npy', oof)
np.save('test_pred_lgbm.npy', test_pred)

# Blend LR(no-PCA) + LGBM
base_lr_oof = np.load('oof_logreg_nopca.npy')
base_lr_test = np.load('test_pred_logreg_nopca.npy')

def clip_norm(p):
    p = np.clip(p, 1e-15, 1-1e-15)
    p = p / p.sum(axis=1, keepdims=True)
    return p

best = (None, 1e9)
for w in np.linspace(0.0, 1.0, 101):
    P = clip_norm(w*oof + (1-w)*base_lr_oof)
    ll = log_loss(y_idx, P, labels=list(range(num_classes)))
    if ll < best[1]:
        best = (w, ll)
print(f'Best LR/LGBM blend w(LGBM)={best[0]:.2f}, OOF={best[1]:.6f}')

w = best[0]
Ptest = clip_norm(w*test_pred + (1-w)*base_lr_test)
pred_df = pd.DataFrame(Ptest, columns=list(le.classes_))
sub_cols = [c for c in sample_sub.columns if c != id_col]
pred_df = pred_df[sub_cols]
submission_lgb_blend = pd.concat([test[[id_col]].reset_index(drop=True), pred_df.reset_index(drop=True)], axis=1)
submission_lgb_blend.to_csv('submission.csv', index=False)
print('Saved submission.csv (LR/LGBM blend) with shape', submission_lgb_blend.shape)

[100]	valid's multi_logloss: 1.21609


[200]	valid's multi_logloss: 0.801358


[300]	valid's multi_logloss: 0.670667


[400]	valid's multi_logloss: 0.601268


[500]	valid's multi_logloss: 0.559564


[600]	valid's multi_logloss: 0.529244


[700]	valid's multi_logloss: 0.506776


[800]	valid's multi_logloss: 0.488946


[900]	valid's multi_logloss: 0.473848


[1000]	valid's multi_logloss: 0.461656


[1100]	valid's multi_logloss: 0.452044


[1200]	valid's multi_logloss: 0.442789


[1300]	valid's multi_logloss: 0.434797


[1400]	valid's multi_logloss: 0.427845


[1500]	valid's multi_logloss: 0.421372


[1600]	valid's multi_logloss: 0.415434


[1700]	valid's multi_logloss: 0.410062


[1800]	valid's multi_logloss: 0.405123


[1900]	valid's multi_logloss: 0.400388


[2000]	valid's multi_logloss: 0.396088


[2100]	valid's multi_logloss: 0.392144


[2200]	valid's multi_logloss: 0.388636


[2300]	valid's multi_logloss: 0.38532


[2400]	valid's multi_logloss: 0.38199


[2500]	valid's multi_logloss: 0.378917


[2600]	valid's multi_logloss: 0.376092


[2700]	valid's multi_logloss: 0.373402


[2800]	valid's multi_logloss: 0.370764


[2900]	valid's multi_logloss: 0.368411


[3000]	valid's multi_logloss: 0.366098


[3100]	valid's multi_logloss: 0.363995


[3200]	valid's multi_logloss: 0.36185


[3300]	valid's multi_logloss: 0.35975


[3400]	valid's multi_logloss: 0.357711


[3500]	valid's multi_logloss: 0.355862


[3600]	valid's multi_logloss: 0.354061


[3700]	valid's multi_logloss: 0.352326


[3800]	valid's multi_logloss: 0.350745


[3900]	valid's multi_logloss: 0.349163


[4000]	valid's multi_logloss: 0.347698


[LGBM fold 1/6] best_iter=4000, val logloss=0.347698, time=98.88s


[100]	valid's multi_logloss: 1.14827


[200]	valid's multi_logloss: 0.716298


[300]	valid's multi_logloss: 0.58098


[400]	valid's multi_logloss: 0.509943


[500]	valid's multi_logloss: 0.465258


[600]	valid's multi_logloss: 0.433126


[700]	valid's multi_logloss: 0.409655


[800]	valid's multi_logloss: 0.391581


[900]	valid's multi_logloss: 0.376572


[1000]	valid's multi_logloss: 0.3638


[1100]	valid's multi_logloss: 0.352584


[1200]	valid's multi_logloss: 0.342951


[1300]	valid's multi_logloss: 0.334788


[1400]	valid's multi_logloss: 0.327118


[1500]	valid's multi_logloss: 0.320596


[1600]	valid's multi_logloss: 0.314665


[1700]	valid's multi_logloss: 0.309208


[1800]	valid's multi_logloss: 0.304119


[1900]	valid's multi_logloss: 0.299458


[2000]	valid's multi_logloss: 0.295145


[2100]	valid's multi_logloss: 0.291122


[2200]	valid's multi_logloss: 0.287328


[2300]	valid's multi_logloss: 0.283812


[2400]	valid's multi_logloss: 0.280413


[2500]	valid's multi_logloss: 0.277331


[2600]	valid's multi_logloss: 0.274425


[2700]	valid's multi_logloss: 0.271634


[2800]	valid's multi_logloss: 0.269012


[2900]	valid's multi_logloss: 0.266499


[3000]	valid's multi_logloss: 0.264164


[3100]	valid's multi_logloss: 0.261873


[3200]	valid's multi_logloss: 0.259677


[3300]	valid's multi_logloss: 0.257541


[3400]	valid's multi_logloss: 0.255571


[3500]	valid's multi_logloss: 0.253628


[3600]	valid's multi_logloss: 0.251786


[3700]	valid's multi_logloss: 0.250052


[3800]	valid's multi_logloss: 0.248423


[3900]	valid's multi_logloss: 0.246806


[4000]	valid's multi_logloss: 0.245272


[LGBM fold 2/6] best_iter=4000, val logloss=0.245272, time=95.48s


[100]	valid's multi_logloss: 1.10158


[200]	valid's multi_logloss: 0.684183


[300]	valid's multi_logloss: 0.555501


[400]	valid's multi_logloss: 0.486956


[500]	valid's multi_logloss: 0.446293


[600]	valid's multi_logloss: 0.417326


[700]	valid's multi_logloss: 0.395126


[800]	valid's multi_logloss: 0.377282


[900]	valid's multi_logloss: 0.362949


[1000]	valid's multi_logloss: 0.351115


[1100]	valid's multi_logloss: 0.340995


[1200]	valid's multi_logloss: 0.33245


[1300]	valid's multi_logloss: 0.325045


[1400]	valid's multi_logloss: 0.317979


[1500]	valid's multi_logloss: 0.311878


[1600]	valid's multi_logloss: 0.306247


[1700]	valid's multi_logloss: 0.30127


[1800]	valid's multi_logloss: 0.296726


[1900]	valid's multi_logloss: 0.292552


[2000]	valid's multi_logloss: 0.288671


[2100]	valid's multi_logloss: 0.284986


[2200]	valid's multi_logloss: 0.281553


[2300]	valid's multi_logloss: 0.278249


[2400]	valid's multi_logloss: 0.275178


[2500]	valid's multi_logloss: 0.272471


[2600]	valid's multi_logloss: 0.269799


[2700]	valid's multi_logloss: 0.267115


[2800]	valid's multi_logloss: 0.264637


[2900]	valid's multi_logloss: 0.26238


[3000]	valid's multi_logloss: 0.260268


In [14]:
# Block-wise L1 -> sqrt (Hellinger) + LR, and AdditiveChi2Sampler + LinearSVC (calibrated) / LR
import time, json, numpy as np, pandas as pd
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.kernel_approximation import AdditiveChi2Sampler

SEED = 2025
X_full = train[feature_cols].values.astype(np.float64)
X_test_full = test[feature_cols].values.astype(np.float64)
y_idx = y_enc.astype(int)
num_classes = len(le.classes_)
with open('folds_6.json', 'r') as f:
    fold_indices = json.load(f)
n_splits = len(fold_indices)

# Define block indices (64 each): margin, shape, texture
def get_blocks(cols):
    margin_cols = [c for c in cols if c.startswith('margin')]
    shape_cols = [c for c in cols if c.startswith('shape')]
    texture_cols = [c for c in cols if c.startswith('texture')]
    assert len(margin_cols)==64 and len(shape_cols)==64 and len(texture_cols)==64, 'Expected 64 cols per block'
    m_idx = np.array([cols.index(c) for c in margin_cols])
    s_idx = np.array([cols.index(c) for c in shape_cols])
    t_idx = np.array([cols.index(c) for c in texture_cols])
    return m_idx, s_idx, t_idx

m_idx, s_idx, t_idx = get_blocks(feature_cols)

def block_l1_sqrt(A):
    A = np.clip(A, 0, None)
    out = np.empty_like(A)
    for idx in (m_idx, s_idx, t_idx):
        B = A[:, idx]
        denom = B.sum(axis=1, keepdims=True) + 1e-12
        Bn = B / denom
        out[:, idx] = np.sqrt(Bn)
    return out

def block_l1_only(A):
    A = np.clip(A, 0, None)
    out = np.empty_like(A)
    for idx in (m_idx, s_idx, t_idx):
        B = A[:, idx]
        denom = B.sum(axis=1, keepdims=True) + 1e-12
        out[:, idx] = B / denom
    return out

def clip_norm(p):
    p = np.clip(p, 1e-15, 1-1e-15)
    return p / p.sum(axis=1, keepdims=True)

# A) Block-wise L1->sqrt (Hellinger) + optional per-block StandardScaler -> LogisticRegression
def run_block_hell_lr(Cs=(10.0, 30.0, 100.0), use_block_scaler=False):
    Xh = block_l1_sqrt(X_full)
    Xh_test = block_l1_sqrt(X_test_full)
    # If block scaler, standardize each block separately within folds
    best = (None, 1e9); best_oof=None; best_test=None
    for C in Cs:
        oof = np.zeros((len(Xh), num_classes), dtype=np.float64)
        test_pred = np.zeros((len(Xh_test), num_classes), dtype=np.float64)
        t0 = time.time()
        for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
            trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
            X_tr, X_va = Xh[trn_idx], Xh[val_idx]
            X_te = Xh_test.copy()
            if use_block_scaler:
                for idx in (m_idx, s_idx, t_idx):
                    sc = StandardScaler(with_mean=True, with_std=True)
                    X_tr[:, idx] = sc.fit_transform(X_tr[:, idx])
                    X_va[:, idx] = sc.transform(X_va[:, idx])
                    X_te[:, idx] = sc.transform(X_te[:, idx])
            clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=C, max_iter=4000, random_state=SEED)
            clf.fit(X_tr, y_idx[trn_idx])
            va_proba = clf.predict_proba(X_va)
            oof[val_idx] = va_proba
            test_pred += clf.predict_proba(X_te) / n_splits
            print(f'[BlkHell LR fold {i}/{n_splits}] C={C}, scaler={use_block_scaler}', flush=True)
        oof_ll = log_loss(y_idx, clip_norm(oof), labels=list(range(num_classes)))
        print(f'BlkHell LR OOF: {oof_ll:.6f} | C={C}, scaler={use_block_scaler} in {time.time()-t0:.2f}s', flush=True)
        if oof_ll < best[1]:
            best = ((C, use_block_scaler), oof_ll); best_oof=oof; best_test=test_pred
    np.save('oof_block_hell_lr.npy', best_oof)
    np.save('test_block_hell_lr.npy', best_test)
    print('Best Block-Hellinger LR:', best)

# B) AdditiveChi2Sampler pipeline -> StandardScaler(with_mean=False) -> LinearSVC + Platt scaling (sigmoid) with stronger optimization
def run_chi2_linsvc(sample_steps=2, Cs=(0.5,1,2,4,8)):
    Xn = block_l1_only(X_full)  # ensure histograms per block
    Xn_test = block_l1_only(X_test_full)
    Xn = np.clip(Xn, 0, None); Xn_test = np.clip(Xn_test, 0, None)
    best = (None, 1e9); best_oof=None; best_test=None
    for C in Cs:
        oof = np.zeros((len(Xn), num_classes), dtype=np.float64)
        test_pred = np.zeros((len(Xn_test), num_classes), dtype=np.float64)
        t0 = time.time()
        for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
            trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
            X_tr, X_va = Xn[trn_idx], Xn[val_idx]
            chi2_fold = AdditiveChi2Sampler(sample_steps=sample_steps)
            X_tr_c = chi2_fold.fit_transform(X_tr)
            X_va_c = chi2_fold.transform(X_va)
            X_te_c = chi2_fold.transform(Xn_test)
            sc = StandardScaler(with_mean=False)
            X_tr_c = sc.fit_transform(X_tr_c)
            X_va_c = sc.transform(X_va_c)
            X_te_c = sc.transform(X_te_c)
            base = LinearSVC(C=C, dual='auto', max_iter=20000, tol=1e-4, random_state=SEED)
            clf = CalibratedClassifierCV(base, method='sigmoid', cv=5)
            clf.fit(X_tr_c, y_idx[trn_idx])
            va_proba = clf.predict_proba(X_va_c)
            oof[val_idx] = va_proba
            test_pred += clf.predict_proba(X_te_c) / n_splits
            print(f'[Chi2-LinSVC fold {i}/{n_splits}] C={C}, steps={sample_steps}', flush=True)
        oof_ll = log_loss(y_idx, clip_norm(oof), labels=list(range(num_classes)))
        print(f'Chi2-LinSVC OOF: {oof_ll:.6f} | C={C}, steps={sample_steps} in {time.time()-t0:.2f}s', flush=True)
        if oof_ll < best[1]:
            best = ((C, sample_steps), oof_ll); best_oof=oof; best_test=test_pred
    np.save('oof_chi2_linsvc_cal.npy', best_oof)
    np.save('test_chi2_linsvc_cal.npy', best_test)
    print('Best Chi2-LinSVC:', best)

# C) AdditiveChi2 features + LogisticRegression for diversity
def run_chi2_lr(sample_steps=2, Cs=(50,100,200,500)):
    Xn = block_l1_only(X_full)
    Xn_test = block_l1_only(X_test_full)
    Xn = np.clip(Xn, 0, None); Xn_test = np.clip(Xn_test, 0, None)
    best = (None, 1e9); best_oof=None; best_test=None
    for C in Cs:
        oof = np.zeros((len(Xn), num_classes), dtype=np.float64)
        test_pred = np.zeros((len(Xn_test), num_classes), dtype=np.float64)
        t0 = time.time()
        for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
            trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
            X_tr, X_va = Xn[trn_idx], Xn[val_idx]
            chi2_fold = AdditiveChi2Sampler(sample_steps=sample_steps)
            X_tr_c = chi2_fold.fit_transform(X_tr)
            X_va_c = chi2_fold.transform(X_va)
            X_te_c = chi2_fold.transform(Xn_test)
            sc = StandardScaler(with_mean=False)
            X_tr_c = sc.fit_transform(X_tr_c)
            X_va_c = sc.transform(X_va_c)
            X_te_c = sc.transform(X_te_c)
            clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=C, max_iter=4000, random_state=SEED)
            clf.fit(X_tr_c, y_idx[trn_idx])
            va_proba = clf.predict_proba(X_va_c)
            oof[val_idx] = va_proba
            test_pred += clf.predict_proba(X_te_c) / n_splits
            print(f'[Chi2-LR fold {i}/{n_splits}] C={C}, steps={sample_steps}', flush=True)
        oof_ll = log_loss(y_idx, clip_norm(oof), labels=list(range(num_classes)))
        print(f'Chi2-LR OOF: {oof_ll:.6f} | C={C}, steps={sample_steps} in {time.time()-t0:.2f}s', flush=True)
        if oof_ll < best[1]:
            best = ((C, sample_steps), oof_ll); best_oof=oof; best_test=test_pred
    np.save('oof_chi2_lr.npy', best_oof)
    np.save('test_chi2_lr.npy', best_test)
    print('Best Chi2-LR:', best)

print('Prepared block-wise and Chi2 pipelines. Execute:')
print('- run_block_hell_lr(Cs=(10.0,30.0,100.0), use_block_scaler=False)')
print('- run_block_hell_lr(Cs=(10.0,30.0,100.0), use_block_scaler=True)')
print('- run_chi2_linsvc(sample_steps=2, Cs=(0.5,1,2,4,8))')
print('- run_chi2_linsvc(sample_steps=3, Cs=(0.5,1,2,4,8))')
print('- run_chi2_lr(sample_steps=2, Cs=(50,100,200,500))')

Prepared block-wise and Chi2 pipelines. Execute:
- run_block_hell_lr(Cs=(10.0,30.0,100.0), use_block_scaler=False)
- run_block_hell_lr(Cs=(10.0,30.0,100.0), use_block_scaler=True)
- run_chi2_linsvc(sample_steps=2, Cs=(0.5,1,2,4,8))
- run_chi2_linsvc(sample_steps=3, Cs=(0.5,1,2,4,8))
- run_chi2_lr(sample_steps=2, Cs=(50,100,200,500))


In [15]:
# Execute histogram-aware models: Block Hellinger LR, Chi2 + LinearSVC (cal), Chi2 + LR
import time, numpy as np

t0 = time.time()
print('Running Block-wise Hellinger LR (no scaler)...', flush=True)
run_block_hell_lr(Cs=(10.0,30.0,100.0), use_block_scaler=False)
print('Running Block-wise Hellinger LR (with per-block scaler)...', flush=True)
run_block_hell_lr(Cs=(10.0,30.0,100.0), use_block_scaler=True)

print('Running AdditiveChi2 + LinearSVC (isotonic calib), sample_steps=2 ...', flush=True)
run_chi2_linsvc(sample_steps=2, Cs=(0.5,1,2,4,8))
print('Running AdditiveChi2 + LinearSVC (isotonic calib), sample_steps=3 ...', flush=True)
run_chi2_linsvc(sample_steps=3, Cs=(0.5,1,2,4,8))

print('Running AdditiveChi2 + LogisticRegression, sample_steps=2 ...', flush=True)
run_chi2_lr(sample_steps=2, Cs=(50,100,200,500))

print(f'All histogram-aware runs finished in {time.time()-t0:.2f}s', flush=True)

Running Block-wise Hellinger LR (no scaler)...




[BlkHell LR fold 1/6] C=10.0, scaler=False




[BlkHell LR fold 2/6] C=10.0, scaler=False




[BlkHell LR fold 3/6] C=10.0, scaler=False




[BlkHell LR fold 4/6] C=10.0, scaler=False




[BlkHell LR fold 5/6] C=10.0, scaler=False




[BlkHell LR fold 6/6] C=10.0, scaler=False


BlkHell LR OOF: 0.613177 | C=10.0, scaler=False in 4.35s




[BlkHell LR fold 1/6] C=30.0, scaler=False


[BlkHell LR fold 2/6] C=30.0, scaler=False




[BlkHell LR fold 3/6] C=30.0, scaler=False




[BlkHell LR fold 4/6] C=30.0, scaler=False




[BlkHell LR fold 5/6] C=30.0, scaler=False




[BlkHell LR fold 6/6] C=30.0, scaler=False


BlkHell LR OOF: 0.346896 | C=30.0, scaler=False in 4.61s




[BlkHell LR fold 1/6] C=100.0, scaler=False




[BlkHell LR fold 2/6] C=100.0, scaler=False




[BlkHell LR fold 3/6] C=100.0, scaler=False




[BlkHell LR fold 4/6] C=100.0, scaler=False




[BlkHell LR fold 5/6] C=100.0, scaler=False




[BlkHell LR fold 6/6] C=100.0, scaler=False


BlkHell LR OOF: 0.202787 | C=100.0, scaler=False in 4.20s


Best Block-Hellinger LR: ((100.0, False), 0.20278724684007093)
Running Block-wise Hellinger LR (with per-block scaler)...




[BlkHell LR fold 1/6] C=10.0, scaler=True




[BlkHell LR fold 2/6] C=10.0, scaler=True




[BlkHell LR fold 3/6] C=10.0, scaler=True




[BlkHell LR fold 4/6] C=10.0, scaler=True




[BlkHell LR fold 5/6] C=10.0, scaler=True




[BlkHell LR fold 6/6] C=10.0, scaler=True


BlkHell LR OOF: 0.081535 | C=10.0, scaler=True in 2.41s




[BlkHell LR fold 1/6] C=30.0, scaler=True




[BlkHell LR fold 2/6] C=30.0, scaler=True




[BlkHell LR fold 3/6] C=30.0, scaler=True




[BlkHell LR fold 4/6] C=30.0, scaler=True




[BlkHell LR fold 5/6] C=30.0, scaler=True




[BlkHell LR fold 6/6] C=30.0, scaler=True


BlkHell LR OOF: 0.074199 | C=30.0, scaler=True in 2.03s




[BlkHell LR fold 1/6] C=100.0, scaler=True




[BlkHell LR fold 2/6] C=100.0, scaler=True




[BlkHell LR fold 3/6] C=100.0, scaler=True




[BlkHell LR fold 4/6] C=100.0, scaler=True




[BlkHell LR fold 5/6] C=100.0, scaler=True




[BlkHell LR fold 6/6] C=100.0, scaler=True


BlkHell LR OOF: 0.072853 | C=100.0, scaler=True in 2.01s


Best Block-Hellinger LR: ((100.0, True), 0.07285271495962396)
Running AdditiveChi2 + LinearSVC (isotonic calib), sample_steps=2 ...


In [16]:
# Streamlined run: AdditiveChi2 + LinearSVC (isotonic calib) with tight grid
print('Running streamlined Chi2-LinSVC (isotonic calib), steps=2, Cs=(1,2,4)...', flush=True)
run_chi2_linsvc(sample_steps=2, Cs=(1,2,4))

Running streamlined Chi2-LinSVC (isotonic calib), steps=2, Cs=(1,2,4)...








[Chi2-LinSVC fold 1/6] C=1, steps=2








[Chi2-LinSVC fold 2/6] C=1, steps=2








[Chi2-LinSVC fold 3/6] C=1, steps=2






[Chi2-LinSVC fold 4/6] C=1, steps=2






[Chi2-LinSVC fold 5/6] C=1, steps=2








[Chi2-LinSVC fold 6/6] C=1, steps=2


Chi2-LinSVC OOF: 0.155785 | C=1, steps=2 in 156.16s






[Chi2-LinSVC fold 1/6] C=2, steps=2








[Chi2-LinSVC fold 2/6] C=2, steps=2








[Chi2-LinSVC fold 3/6] C=2, steps=2








[Chi2-LinSVC fold 4/6] C=2, steps=2








[Chi2-LinSVC fold 5/6] C=2, steps=2






[Chi2-LinSVC fold 6/6] C=2, steps=2


Chi2-LinSVC OOF: 0.155897 | C=2, steps=2 in 156.56s








[Chi2-LinSVC fold 1/6] C=4, steps=2








[Chi2-LinSVC fold 2/6] C=4, steps=2




In [17]:
# Temperature scaling of LR(no-PCA) probabilities + Hungarian assignment post-process
import numpy as np, time, sys, subprocess
import pandas as pd
from sklearn.metrics import log_loss

def ensure_scipy():
    try:
        from scipy.optimize import linear_sum_assignment  # noqa: F401
        import scipy  # noqa: F401
        return True
    except Exception:
        print('Installing scipy for Hungarian assignment...')
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'scipy'], check=True)
        return True

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

# Load base LR(no-PCA)
oof_lr = np.load('oof_logreg_nopca.npy')
test_lr = np.load('test_pred_logreg_nopca.npy')
K = oof_lr.shape[1]

# Temperature scaling via alpha on probabilities: p' = normalize(p**alpha)
def temp_scale(P, alpha):
    Ps = np.power(np.clip(P, 1e-15, 1-1e-15), float(alpha))
    return Ps / Ps.sum(axis=1, keepdims=True)

# Grid search alpha on OOF
alphas = np.linspace(0.5, 3.0, 251)
best = (1.0, 1e9)
t0 = time.time()
for a in alphas:
    Po = temp_scale(oof_lr, a)
    ll = log_loss(y_enc, clip_norm(Po), labels=list(range(K)))
    if ll < best[1]:
        best = (float(a), float(ll))
print(f'Temperature scaling: best alpha={best[0]:.4f}, OOF logloss={best[1]:.6f} (grid {len(alphas)} in {time.time()-t0:.2f}s)')

alpha_opt = best[0]
oof_cal = temp_scale(oof_lr, alpha_opt)
test_cal = temp_scale(test_lr, alpha_opt)
np.save('oof_lr_nopca_calibrated.npy', oof_cal)
np.save('test_lr_nopca_calibrated.npy', test_cal)

# Build calibrated submission (soft)
pred_df_soft = pd.DataFrame(clip_norm(test_cal), columns=list(le.classes_))
sub_cols = [c for c in sample_sub.columns if c != id_col]
pred_df_soft = pred_df_soft[sub_cols]
submission_soft = pd.concat([test[[id_col]].reset_index(drop=True), pred_df_soft.reset_index(drop=True)], axis=1)
submission_soft.to_csv('submission_soft_calibrated.csv', index=False)
print('Wrote submission_soft_calibrated.csv', submission_soft.shape)

# Hungarian assignment to enforce 1-per-class in test
ensure_scipy()
from scipy.optimize import linear_sum_assignment
cost = -np.log(np.clip(test_cal, 1e-15, 1-1e-15))
row_ind, col_ind = linear_sum_assignment(cost)
assign = np.zeros_like(test_cal)
assign[row_ind, col_ind] = 1.0

# Build hard-assigned submission
pred_df_hard = pd.DataFrame(assign, columns=list(le.classes_))
pred_df_hard = pred_df_hard[sub_cols]
submission_hard = pd.concat([test[[id_col]].reset_index(drop=True), pred_df_hard.reset_index(drop=True)], axis=1)
submission_hard.to_csv('submission.csv', index=False)
print('Saved submission.csv (Hungarian-assigned, calibrated) with shape', submission_hard.shape)

Temperature scaling: best alpha=1.4900, OOF logloss=0.039986 (grid 251 in 1.11s)
Wrote submission_soft_calibrated.csv (99, 100)
Saved submission.csv (Hungarian-assigned, calibrated) with shape (99, 100)


In [18]:
# Run AdditiveChi2Sampler + LogisticRegression with specified grids
print('Running Chi2 + LogisticRegression: steps=2, Cs=(100,200,300,500,1000)...', flush=True)
run_chi2_lr(sample_steps=2, Cs=(100,200,300,500,1000))

Running Chi2 + LogisticRegression: steps=2, Cs=(100,200,300,500,1000)...




[Chi2-LR fold 1/6] C=100, steps=2




[Chi2-LR fold 2/6] C=100, steps=2




[Chi2-LR fold 3/6] C=100, steps=2




[Chi2-LR fold 4/6] C=100, steps=2




[Chi2-LR fold 5/6] C=100, steps=2




[Chi2-LR fold 6/6] C=100, steps=2


Chi2-LR OOF: 0.109878 | C=100, steps=2 in 13.01s




[Chi2-LR fold 1/6] C=200, steps=2




[Chi2-LR fold 2/6] C=200, steps=2




[Chi2-LR fold 3/6] C=200, steps=2




[Chi2-LR fold 4/6] C=200, steps=2




[Chi2-LR fold 5/6] C=200, steps=2




[Chi2-LR fold 6/6] C=200, steps=2


Chi2-LR OOF: 0.134782 | C=200, steps=2 in 11.19s




[Chi2-LR fold 1/6] C=300, steps=2




[Chi2-LR fold 2/6] C=300, steps=2




[Chi2-LR fold 3/6] C=300, steps=2




[Chi2-LR fold 4/6] C=300, steps=2




[Chi2-LR fold 5/6] C=300, steps=2




[Chi2-LR fold 6/6] C=300, steps=2


Chi2-LR OOF: 0.152987 | C=300, steps=2 in 10.02s




[Chi2-LR fold 1/6] C=500, steps=2




[Chi2-LR fold 2/6] C=500, steps=2




[Chi2-LR fold 3/6] C=500, steps=2




[Chi2-LR fold 4/6] C=500, steps=2




[Chi2-LR fold 5/6] C=500, steps=2




[Chi2-LR fold 6/6] C=500, steps=2


Chi2-LR OOF: 0.172753 | C=500, steps=2 in 8.31s




[Chi2-LR fold 1/6] C=1000, steps=2




[Chi2-LR fold 2/6] C=1000, steps=2




[Chi2-LR fold 3/6] C=1000, steps=2




[Chi2-LR fold 4/6] C=1000, steps=2




[Chi2-LR fold 5/6] C=1000, steps=2




[Chi2-LR fold 6/6] C=1000, steps=2


Chi2-LR OOF: 0.237121 | C=1000, steps=2 in 7.56s


Best Chi2-LR: ((100, 2), 0.10987776938474948)


In [19]:
# SLSQP blend: lr_base, chi2_lr, block_hell_lr, lgbm(if exists) -> temp scale -> Hungarian
import numpy as np, pandas as pd, sys, subprocess, time, os
from sklearn.metrics import log_loss

def ensure_scipy():
    try:
        import scipy  # noqa: F401
        from scipy.optimize import minimize, linear_sum_assignment  # noqa: F401
        return True
    except Exception:
        print('Installing scipy...')
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'scipy'], check=True)
        return True

ensure_scipy()
from scipy.optimize import minimize
from scipy.optimize import linear_sum_assignment

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

# Load candidates
cands = []  # (name, oof, test)
try:
    oof_lr = np.load('oof_logreg_nopca.npy'); test_lr = np.load('test_pred_logreg_nopca.npy')
    cands.append(('lr_base', oof_lr, test_lr))
except Exception as e:
    print('Missing lr_base:', e)
try:
    oof_c2 = np.load('oof_chi2_lr.npy'); test_c2 = np.load('test_chi2_lr.npy')
    cands.append(('chi2_lr', oof_c2, test_c2))
except Exception as e:
    print('Missing chi2_lr:', e)
try:
    oof_bh = np.load('oof_block_hell_lr.npy'); test_bh = np.load('test_block_hell_lr.npy')
    cands.append(('block_hell_lr', oof_bh, test_bh))
except Exception as e:
    print('Missing block_hell_lr:', e)
try:
    oof_lgb = np.load('oof_lgbm.npy'); test_lgb = np.load('test_pred_lgbm.npy')
    cands.append(('lgbm', oof_lgb, test_lgb))
except Exception as e:
    print('Missing lgbm (skipping):', e)

assert len(cands) >= 2, 'Need at least two candidates for a meaningful blend'
names = [n for n,_,_ in cands]
oofs = [o for _,o,_ in cands]
tests = [t for _,_,t in cands]
K = oofs[0].shape[1]

# Build SLSQP optimization with constraints: w>=0, sum(w)=1
def blend_from_w(weights, mats):
    P = np.zeros_like(mats[0])
    for wi, Pi in zip(weights, mats):
        P += wi * Pi
    return clip_norm(P)

def objective(w):
    P = blend_from_w(w, oofs)
    return log_loss(y_enc, P, labels=list(range(K)))

m = len(cands)
w0 = np.ones(m, dtype=np.float64) / m
bounds = [(0.0, 1.0)] * m
cons = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1.0},)
t0 = time.time()
res = minimize(objective, w0, method='SLSQP', bounds=bounds, constraints=cons, options={'maxiter': 500, 'ftol': 1e-9, 'disp': False})
print('SLSQP success:', res.success, 'status:', res.status, 'message:', res.message, 'time:', f'{time.time()-t0:.2f}s')
w_opt = res.x if res.success else w0
w_opt = np.maximum(w_opt, 0); w_opt = w_opt / w_opt.sum()
print('Weights:', dict(zip(names, np.round(w_opt, 4))))
oof_blend = blend_from_w(w_opt, oofs)
test_blend = blend_from_w(w_opt, tests)
oof_ll = log_loss(y_enc, oof_blend, labels=list(range(K)))
print(f'Blended OOF logloss: {oof_ll:.6f}')

# Temperature scaling on blended probs: p' = normalize(p**alpha)
def temp_scale(P, alpha):
    Ps = np.power(np.clip(P, 1e-15, 1-1e-15), float(alpha))
    return Ps / Ps.sum(axis=1, keepdims=True)

alphas = np.linspace(0.6, 2.2, 161)
best = (1.0, 1e9)
for a in alphas:
    ll = log_loss(y_enc, temp_scale(oof_blend, a), labels=list(range(K)))
    if ll < best[1]:
        best = (float(a), float(ll))
print(f'Temp scaling on blend: alpha={best[0]:.4f}, OOF={best[1]:.6f}')
alpha_opt = best[0]
oof_cal = temp_scale(oof_blend, alpha_opt)
test_cal = temp_scale(test_blend, alpha_opt)

# Soft calibrated submission (for sanity check if needed)
pred_df_soft = pd.DataFrame(test_cal, columns=list(le.classes_))
sub_cols = [c for c in sample_sub.columns if c != id_col]
pred_df_soft = pred_df_soft[sub_cols]
submission_soft = pd.concat([test[[id_col]].reset_index(drop=True), pred_df_soft.reset_index(drop=True)], axis=1)
submission_soft.to_csv('submission_soft_blend_calibrated.csv', index=False)
print('Wrote submission_soft_blend_calibrated.csv', submission_soft.shape)

# Hungarian assignment on calibrated blend
cost = -np.log(np.clip(test_cal, 1e-15, 1-1e-15))
row_ind, col_ind = linear_sum_assignment(cost)
assign = np.zeros_like(test_cal)
assign[row_ind, col_ind] = 1.0
pred_df_hard = pd.DataFrame(assign, columns=list(le.classes_))
pred_df_hard = pred_df_hard[sub_cols]
submission_hard = pd.concat([test[[id_col]].reset_index(drop=True), pred_df_hard.reset_index(drop=True)], axis=1)
submission_hard.to_csv('submission.csv', index=False)
print('Saved submission.csv (Hungarian on calibrated blend). Shape:', submission_hard.shape)

Missing lgbm (skipping): [Errno 2] No such file or directory: 'oof_lgbm.npy'
SLSQP success: True status: 0 message: Optimization terminated successfully time: 0.14s
Weights: {'lr_base': 0.5687, 'chi2_lr': 0.4313, 'block_hell_lr': 0.0}
Blended OOF logloss: 0.043800


Temp scaling on blend: alpha=1.8500, OOF=0.031028
Wrote submission_soft_blend_calibrated.csv (99, 100)
Saved submission.csv (Hungarian on calibrated blend). Shape: (99, 100)


In [20]:
# Run improved Chi2 + LinearSVC (Platt) with tight grid per expert advice
print('Running Chi2-LinSVC (Platt), steps=2, Cs=(1,2,4)...', flush=True)
run_chi2_linsvc(sample_steps=2, Cs=(1,2,4))

Running Chi2-LinSVC (Platt), steps=2, Cs=(1,2,4)...








[Chi2-LinSVC fold 1/6] C=1, steps=2






[Chi2-LinSVC fold 2/6] C=1, steps=2








[Chi2-LinSVC fold 3/6] C=1, steps=2








[Chi2-LinSVC fold 4/6] C=1, steps=2






[Chi2-LinSVC fold 5/6] C=1, steps=2






[Chi2-LinSVC fold 6/6] C=1, steps=2


Chi2-LinSVC OOF: 0.155785 | C=1, steps=2 in 155.42s








[Chi2-LinSVC fold 1/6] C=2, steps=2






[Chi2-LinSVC fold 2/6] C=2, steps=2








[Chi2-LinSVC fold 3/6] C=2, steps=2








[Chi2-LinSVC fold 4/6] C=2, steps=2








[Chi2-LinSVC fold 5/6] C=2, steps=2








[Chi2-LinSVC fold 6/6] C=2, steps=2


Chi2-LinSVC OOF: 0.155897 | C=2, steps=2 in 155.99s








[Chi2-LinSVC fold 1/6] C=4, steps=2








[Chi2-LinSVC fold 2/6] C=4, steps=2








[Chi2-LinSVC fold 3/6] C=4, steps=2






In [21]:
# Overwrite submission.csv with soft calibrated blend (no Hungarian)
import pandas as pd
soft_path = 'submission_soft_blend_calibrated.csv'
df = pd.read_csv(soft_path)
df.to_csv('submission.csv', index=False)
print('submission.csv overwritten with soft calibrated blend:', df.shape)

submission.csv overwritten with soft calibrated blend: (99, 100)


In [22]:
# Stacking meta-learner on OOF probabilities -> temp scale -> write soft and Hungarian submissions
import numpy as np, pandas as pd, json, time, sys, subprocess
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

def ensure_scipy():
    try:
        from scipy.optimize import linear_sum_assignment  # noqa: F401
        import scipy  # noqa: F401
        return True
    except Exception:
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'scipy'], check=True)
        return True

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

# Load candidates (ensure at least LR base + Chi2 LR)
cands = []
names = []
try:
    oof_lr = np.load('oof_logreg_nopca.npy'); test_lr = np.load('test_pred_logreg_nopca.npy')
    cands.append((oof_lr, test_lr)); names.append('lr_base')
except Exception as e:
    print('Missing lr_base:', e)
try:
    oof_c2 = np.load('oof_chi2_lr.npy'); test_c2 = np.load('test_chi2_lr.npy')
    cands.append((oof_c2, test_c2)); names.append('chi2_lr')
except Exception as e:
    print('Missing chi2_lr:', e)
try:
    oof_bh = np.load('oof_block_hell_lr.npy'); test_bh = np.load('test_block_hell_lr.npy')
    cands.append((oof_bh, test_bh)); names.append('block_hell_lr')
except Exception as e:
    print('Missing block_hell_lr:', e)
try:
    oof_lgb = np.load('oof_lgbm.npy'); test_lgb = np.load('test_pred_lgbm.npy')
    cands.append((oof_lgb, test_lgb)); names.append('lgbm')
except Exception as e:
    print('Missing lgbm:', e)

assert len(cands) >= 2, 'Need at least two base models for stacking'
num_classes = cands[0][0].shape[1]
n_train = cands[0][0].shape[0]
n_test = cands[0][1].shape[0]

# Build stacked features by concatenating probabilities
X_meta_oof = np.concatenate([o for o, _ in cands], axis=1)
X_meta_test_all = np.concatenate([t for _, t in cands], axis=1)

# Use same 6-fold indices to avoid leakage
with open('folds_6.json', 'r') as f:
    fold_indices = json.load(f)
y_idx = y_enc.astype(int)

oof_stack = np.zeros((n_train, num_classes), dtype=np.float64)
test_stack = np.zeros((n_test, num_classes), dtype=np.float64)

t0 = time.time()
for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
    trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
    X_tr = X_meta_oof[trn_idx]
    X_va = X_meta_oof[val_idx]
    # Train multinomial LR as meta-learner
    clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=1.0, max_iter=2000, random_state=2025)
    clf.fit(X_tr, y_idx[trn_idx])
    P_va = clf.predict_proba(X_va)
    oof_stack[val_idx] = P_va
    # Predict on test per fold and average
    test_stack += clf.predict_proba(X_meta_test_all) / len(fold_indices)
    ll = log_loss(y_idx[val_idx], clip_norm(P_va), labels=list(range(num_classes)))
    print(f'[Stack fold {i}/{len(fold_indices)}] val logloss={ll:.6f}', flush=True)

oof_ll = log_loss(y_idx, clip_norm(oof_stack), labels=list(range(num_classes)))
print(f'Stack meta OOF logloss: {oof_ll:.6f} in {time.time()-t0:.2f}s')

# Temperature scaling on stacked probs
def temp_scale(P, alpha):
    Ps = np.power(np.clip(P, 1e-15, 1-1e-15), float(alpha))
    return Ps / Ps.sum(axis=1, keepdims=True)

alphas = np.linspace(0.6, 2.2, 161)
best = (1.0, 1e9)
for a in alphas:
    ll = log_loss(y_idx, temp_scale(oof_stack, a), labels=list(range(num_classes)))
    if ll < best[1]:
        best = (float(a), float(ll))
print(f'Stack temp scaling: alpha={best[0]:.4f}, OOF={best[1]:.6f}')
alpha_opt = best[0]
oof_cal = temp_scale(oof_stack, alpha_opt)
test_cal = temp_scale(test_stack, alpha_opt)

# Save for reuse
np.save('oof_stack.npy', oof_stack)
np.save('test_stack.npy', test_stack)
np.save('oof_stack_calibrated.npy', oof_cal)
np.save('test_stack_calibrated.npy', test_cal)

# Soft submission from stacked calibrated
pred_df_soft = pd.DataFrame(clip_norm(test_cal), columns=list(le.classes_))
sub_cols = [c for c in sample_sub.columns if c != id_col]
pred_df_soft = pred_df_soft[sub_cols]
submission_soft = pd.concat([test[[id_col]].reset_index(drop=True), pred_df_soft.reset_index(drop=True)], axis=1)
submission_soft.to_csv('submission_soft_stacked_calibrated.csv', index=False)
print('Wrote submission_soft_stacked_calibrated.csv', submission_soft.shape)

# Hungarian assignment on stacked calibrated
ensure_scipy()
from scipy.optimize import linear_sum_assignment
cost = -np.log(np.clip(test_cal, 1e-15, 1-1e-15))
row_ind, col_ind = linear_sum_assignment(cost)
assign = np.zeros_like(test_cal)
assign[row_ind, col_ind] = 1.0
pred_df_hard = pd.DataFrame(assign, columns=list(le.classes_))
pred_df_hard = pred_df_hard[sub_cols]
submission_hard = pd.concat([test[[id_col]].reset_index(drop=True), pred_df_hard.reset_index(drop=True)], axis=1)
submission_hard.to_csv('submission.csv', index=False)
print('Saved submission.csv (Hungarian on stacked calibrated). Shape:', submission_hard.shape)

Missing lgbm: [Errno 2] No such file or directory: 'oof_lgbm.npy'




[Stack fold 1/6] val logloss=0.374331




[Stack fold 2/6] val logloss=0.401676




[Stack fold 3/6] val logloss=0.331341




[Stack fold 4/6] val logloss=0.388734




[Stack fold 5/6] val logloss=0.360118




[Stack fold 6/6] val logloss=0.361753


Stack meta OOF logloss: 0.369657 in 2.16s


Stack temp scaling: alpha=2.1600, OOF=0.041972
Wrote submission_soft_stacked_calibrated.csv (99, 100)
Saved submission.csv (Hungarian on stacked calibrated). Shape: (99, 100)


In [23]:
# Try Chi2 + LogisticRegression with sample_steps=3 as per expert fallback
print('Running Chi2 + LogisticRegression: steps=3, Cs=(200,400,800)...', flush=True)
run_chi2_lr(sample_steps=3, Cs=(200,400,800))

Running Chi2 + LogisticRegression: steps=3, Cs=(200,400,800)...




[Chi2-LR fold 1/6] C=200, steps=3




[Chi2-LR fold 2/6] C=200, steps=3




[Chi2-LR fold 3/6] C=200, steps=3






[Chi2-LR fold 5/6] C=200, steps=3




[Chi2-LR fold 6/6] C=200, steps=3


Chi2-LR OOF: 0.154042 | C=200, steps=3 in 8.41s




[Chi2-LR fold 1/6] C=400, steps=3




[Chi2-LR fold 2/6] C=400, steps=3




[Chi2-LR fold 3/6] C=400, steps=3




[Chi2-LR fold 4/6] C=400, steps=3




[Chi2-LR fold 5/6] C=400, steps=3




[Chi2-LR fold 6/6] C=400, steps=3


Chi2-LR OOF: 0.166020 | C=400, steps=3 in 7.56s




[Chi2-LR fold 1/6] C=800, steps=3




[Chi2-LR fold 2/6] C=800, steps=3




[Chi2-LR fold 3/6] C=800, steps=3




[Chi2-LR fold 4/6] C=800, steps=3




[Chi2-LR fold 5/6] C=800, steps=3




[Chi2-LR fold 6/6] C=800, steps=3


Chi2-LR OOF: 0.173514 | C=800, steps=3 in 7.76s


Best Chi2-LR: ((200, 3), 0.1540423627481536)


In [24]:
# Block-wise L1-only + per-block StandardScaler + LogisticRegression (no sqrt); save under block_hell filenames for blending reuse
import numpy as np, time, json
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

SEED = 2025
X_full = train[feature_cols].values.astype(np.float64)
X_test_full = test[feature_cols].values.astype(np.float64)
y_idx = y_enc.astype(int)
num_classes = len(le.classes_)
with open('folds_6.json', 'r') as f:
    fold_indices = json.load(f)

# Reuse block indices from cell 12 if present; otherwise define
def get_blocks(cols):
    margin_cols = [c for c in cols if c.startswith('margin')]
    shape_cols = [c for c in cols if c.startswith('shape')]
    texture_cols = [c for c in cols if c.startswith('texture')]
    m_idx = np.array([cols.index(c) for c in margin_cols])
    s_idx = np.array([cols.index(c) for c in shape_cols])
    t_idx = np.array([cols.index(c) for c in texture_cols])
    return m_idx, s_idx, t_idx
try:
    m_idx, s_idx, t_idx
except NameError:
    m_idx, s_idx, t_idx = get_blocks(feature_cols)

def block_l1_only(A):
    A = np.clip(A, 0, None)
    out = np.empty_like(A)
    for idx in (m_idx, s_idx, t_idx):
        B = A[:, idx]
        denom = B.sum(axis=1, keepdims=True) + 1e-12
        out[:, idx] = B / denom
    return out

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

Xn = block_l1_only(X_full)
Xn_test = block_l1_only(X_test_full)

best = (None, 1e9)
best_oof = None
best_test = None
for C in (10.0, 20.0, 30.0, 60.0, 100.0):
    oof = np.zeros((len(Xn), num_classes), dtype=np.float64)
    test_pred = np.zeros((len(Xn_test), num_classes), dtype=np.float64)
    t0 = time.time()
    for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
        trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
        X_tr, X_va = Xn[trn_idx].copy(), Xn[val_idx].copy()
        X_te = Xn_test.copy()
        # Per-block StandardScaler
        for idx in (m_idx, s_idx, t_idx):
            sc = StandardScaler(with_mean=True, with_std=True)
            X_tr[:, idx] = sc.fit_transform(X_tr[:, idx])
            X_va[:, idx] = sc.transform(X_va[:, idx])
            X_te[:, idx] = sc.transform(X_te[:, idx])
        clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=C, max_iter=4000, random_state=SEED)
        clf.fit(X_tr, y_idx[trn_idx])
        va_proba = clf.predict_proba(X_va)
        oof[val_idx] = va_proba
        test_pred += clf.predict_proba(X_te) / len(fold_indices)
        print(f'[BlkL1 LR fold {i}/{len(fold_indices)}] C={C}', flush=True)
    oof_ll = log_loss(y_idx, clip_norm(oof), labels=list(range(num_classes)))
    print(f'BlkL1 LR OOF: {oof_ll:.6f} | C={C} in {time.time()-t0:.2f}s', flush=True)
    if oof_ll < best[1]:
        best = (C, oof_ll); best_oof = oof; best_test = test_pred

# Save using block_hell filenames so existing blend cell picks it up
np.save('oof_block_hell_lr.npy', best_oof)
np.save('test_block_hell_lr.npy', best_test)
print('Best Block-L1 LR (saved as block_hell):', best)

[BlkL1 LR fold 1/6] C=10.0




[BlkL1 LR fold 2/6] C=10.0




[BlkL1 LR fold 3/6] C=10.0




[BlkL1 LR fold 4/6] C=10.0




[BlkL1 LR fold 5/6] C=10.0




[BlkL1 LR fold 6/6] C=10.0


BlkL1 LR OOF: 0.093609 | C=10.0 in 1.78s




[BlkL1 LR fold 1/6] C=20.0




[BlkL1 LR fold 2/6] C=20.0




[BlkL1 LR fold 3/6] C=20.0




[BlkL1 LR fold 4/6] C=20.0




[BlkL1 LR fold 5/6] C=20.0




[BlkL1 LR fold 6/6] C=20.0


BlkL1 LR OOF: 0.089698 | C=20.0 in 1.98s




[BlkL1 LR fold 1/6] C=30.0




[BlkL1 LR fold 2/6] C=30.0




[BlkL1 LR fold 3/6] C=30.0






[BlkL1 LR fold 5/6] C=30.0




[BlkL1 LR fold 6/6] C=30.0


BlkL1 LR OOF: 0.088921 | C=30.0 in 2.14s




[BlkL1 LR fold 1/6] C=60.0




[BlkL1 LR fold 2/6] C=60.0




[BlkL1 LR fold 3/6] C=60.0




[BlkL1 LR fold 4/6] C=60.0




[BlkL1 LR fold 5/6] C=60.0




[BlkL1 LR fold 6/6] C=60.0


BlkL1 LR OOF: 0.089946 | C=60.0 in 2.39s




[BlkL1 LR fold 1/6] C=100.0




[BlkL1 LR fold 2/6] C=100.0




[BlkL1 LR fold 3/6] C=100.0




[BlkL1 LR fold 4/6] C=100.0




[BlkL1 LR fold 5/6] C=100.0




[BlkL1 LR fold 6/6] C=100.0


BlkL1 LR OOF: 0.090940 | C=100.0 in 2.23s


Best Block-L1 LR (saved as block_hell): (30.0, 0.08892056812827395)


In [25]:
# Canonical Block-Hellinger (per-block L1 -> sqrt) + Global row L2 -> LR (no scaler)
import numpy as np, json, time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

SEED = 2025
X_raw = train[feature_cols].values.astype(np.float64)
X_test_raw = test[feature_cols].values.astype(np.float64)
y_idx = y_enc.astype(int)
num_classes = len(le.classes_)

# Build block indices (64 each for margin/shape/texture)
def get_blocks(cols):
    margin_cols = [c for c in cols if c.startswith('margin')]
    shape_cols = [c for c in cols if c.startswith('shape')]
    texture_cols = [c for c in cols if c.startswith('texture')]
    assert len(margin_cols)==64 and len(shape_cols)==64 and len(texture_cols)==64, 'Expected 64 per block'
    m_idx = np.array([cols.index(c) for c in margin_cols])
    s_idx = np.array([cols.index(c) for c in shape_cols])
    t_idx = np.array([cols.index(c) for c in texture_cols])
    return m_idx, s_idx, t_idx

m_idx, s_idx, t_idx = get_blocks(feature_cols)

def block_hellinger_global_l2(A, m_idx, s_idx, t_idx):
    A = np.clip(A, 0, None).astype(np.float64, copy=False)
    out = np.empty_like(A)
    for idx in (m_idx, s_idx, t_idx):
        B = A[:, idx]
        B = B / (B.sum(axis=1, keepdims=True) + 1e-12)
        out[:, idx] = np.sqrt(B)
    out = out / (np.linalg.norm(out, axis=1, keepdims=True) + 1e-12)
    return out

# Fixed 6-fold splits
with open('folds_6.json', 'r') as f:
    fold_indices = json.load(f)
n_splits = len(fold_indices)

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

# Precompute transformed features (row-wise ops only; no leakage)
X_h = block_hellinger_global_l2(X_raw, m_idx, s_idx, t_idx)
X_test_h = block_hellinger_global_l2(X_test_raw, m_idx, s_idx, t_idx)

Cs = [30.0, 50.0, 80.0, 100.0, 150.0]
best = (None, 1e9)
best_oof = None
best_test = None
t_all = time.time()
for C in Cs:
    oof = np.zeros((len(X_h), num_classes), dtype=np.float64)
    test_pred = np.zeros((len(X_test_h), num_classes), dtype=np.float64)
    t0 = time.time()
    for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
        trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
        X_tr, X_va = X_h[trn_idx], X_h[val_idx]
        clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=C, max_iter=5000, random_state=SEED)
        fstart = time.time()
        clf.fit(X_tr, y_idx[trn_idx])
        va_proba = clf.predict_proba(X_va)
        oof[val_idx] = va_proba
        test_pred += clf.predict_proba(X_test_h) / n_splits
        print(f'[Hellinger+L2 LR fold {i}/{n_splits}] C={C}, time={time.time()-fstart:.2f}s', flush=True)
    oof_ll = log_loss(y_idx, clip_norm(oof), labels=list(range(num_classes)))
    print(f'Hellinger+L2 LR OOF: {oof_ll:.6f} | C={C} in {time.time()-t0:.2f}s', flush=True)
    if oof_ll < best[1]:
        best = (C, oof_ll); best_oof = oof; best_test = test_pred

print('Best Hellinger+L2 LR:', best, '| total {:.2f}s'.format(time.time()-t_all))

# Save under dedicated and blend-compatible filenames
np.save('oof_hellinger_lr.npy', best_oof)
np.save('test_hellinger_lr.npy', best_test)
np.save('oof_block_hell_lr.npy', best_oof)  # for existing blend cell compatibility
np.save('test_block_hell_lr.npy', best_test)
print('Saved oof_hellinger_lr.npy/test_hellinger_lr.npy and updated block_hell files')



[Hellinger+L2 LR fold 1/6] C=30.0, time=0.46s




[Hellinger+L2 LR fold 2/6] C=30.0, time=0.67s




[Hellinger+L2 LR fold 3/6] C=30.0, time=0.63s




[Hellinger+L2 LR fold 4/6] C=30.0, time=0.62s




[Hellinger+L2 LR fold 5/6] C=30.0, time=0.62s




[Hellinger+L2 LR fold 6/6] C=30.0, time=0.47s


Hellinger+L2 LR OOF: 0.613632 | C=30.0 in 3.51s




[Hellinger+L2 LR fold 1/6] C=50.0, time=0.73s




[Hellinger+L2 LR fold 2/6] C=50.0, time=0.90s


[Hellinger+L2 LR fold 3/6] C=50.0, time=0.67s




[Hellinger+L2 LR fold 4/6] C=50.0, time=0.71s




[Hellinger+L2 LR fold 5/6] C=50.0, time=0.83s




[Hellinger+L2 LR fold 6/6] C=50.0, time=0.96s


Hellinger+L2 LR OOF: 0.464014 | C=50.0 in 4.84s




[Hellinger+L2 LR fold 1/6] C=80.0, time=1.03s




[Hellinger+L2 LR fold 2/6] C=80.0, time=1.11s




[Hellinger+L2 LR fold 3/6] C=80.0, time=0.99s




[Hellinger+L2 LR fold 4/6] C=80.0, time=0.93s




[Hellinger+L2 LR fold 5/6] C=80.0, time=0.97s




[Hellinger+L2 LR fold 6/6] C=80.0, time=0.96s


Hellinger+L2 LR OOF: 0.366182 | C=80.0 in 6.04s




[Hellinger+L2 LR fold 1/6] C=100.0, time=0.95s




[Hellinger+L2 LR fold 2/6] C=100.0, time=1.05s




[Hellinger+L2 LR fold 3/6] C=100.0, time=0.98s




[Hellinger+L2 LR fold 4/6] C=100.0, time=0.91s




[Hellinger+L2 LR fold 5/6] C=100.0, time=0.80s




[Hellinger+L2 LR fold 6/6] C=100.0, time=0.75s


Hellinger+L2 LR OOF: 0.328459 | C=100.0 in 5.47s




[Hellinger+L2 LR fold 1/6] C=150.0, time=0.59s




[Hellinger+L2 LR fold 2/6] C=150.0, time=0.47s




[Hellinger+L2 LR fold 3/6] C=150.0, time=0.44s




[Hellinger+L2 LR fold 4/6] C=150.0, time=0.45s




[Hellinger+L2 LR fold 5/6] C=150.0, time=0.53s




[Hellinger+L2 LR fold 6/6] C=150.0, time=0.50s


Hellinger+L2 LR OOF: 0.272277 | C=150.0 in 3.01s


Best Hellinger+L2 LR: (150.0, 0.2722767605852533) | total 22.87s
Saved oof_hellinger_lr.npy/test_hellinger_lr.npy and updated block_hell files


In [26]:
# LightGBM (regularized) and NearestCentroid on corrected Hellinger features
import numpy as np, pandas as pd, json, time, sys, subprocess, importlib
from sklearn.metrics import log_loss

SEED = 2025
X = train[feature_cols].values.astype(np.float32)
X_test = test[feature_cols].values.astype(np.float32)
y_idx = y_enc.astype(int)
num_classes = len(le.classes_)
with open('folds_6.json', 'r') as f:
    fold_indices = json.load(f)
n_splits = len(fold_indices)

def ensure_lightgbm():
    try:
        import lightgbm as lgb
        return lgb
    except Exception:
        print('Installing lightgbm...')
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'lightgbm'], check=True)
        import lightgbm as lgb
        return lgb

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

# 1) LightGBM with provided regularized params
print('Running LightGBM (regularized) 6-fold CV...', flush=True)
lgb = ensure_lightgbm()
params = {
    'objective': 'multiclass',
    'num_class': num_classes,
    'metric': 'multi_logloss',
    'learning_rate': 0.03,
    'num_leaves': 10,
    'max_depth': 4,
    'min_data_in_leaf': 50,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'lambda_l2': 5.0,
    'verbosity': -1,
    'seed': SEED,
}
oof_lgb = np.zeros((len(X), num_classes), dtype=np.float32)
test_lgb = np.zeros((len(X_test), num_classes), dtype=np.float32)
t0 = time.time()
for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
    trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
    dtrain = lgb.Dataset(X[trn_idx], label=y_idx[trn_idx], free_raw_data=True)
    dvalid = lgb.Dataset(X[val_idx], label=y_idx[val_idx], free_raw_data=True)
    start = time.time()
    model = lgb.train(
        params,
        dtrain,
        num_boost_round=4000,
        valid_sets=[dvalid],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(period=200)
        ]
    )
    P_va = model.predict(X[val_idx], num_iteration=model.best_iteration)
    oof_lgb[val_idx] = P_va
    test_lgb += model.predict(X_test, num_iteration=model.best_iteration) / n_splits
    ll = log_loss(y_idx[val_idx], clip_norm(P_va), labels=list(range(num_classes)))
    print(f'[LGBM fold {i}/{n_splits}] best_iter={model.best_iteration}, val logloss={ll:.6f}, time={time.time()-start:.1f}s', flush=True)
oof_ll = log_loss(y_idx, clip_norm(oof_lgb), labels=list(range(num_classes)))
print(f'LGBM OOF logloss: {oof_ll:.6f} | total {time.time()-t0:.1f}s', flush=True)
np.save('oof_lgbm.npy', oof_lgb)
np.save('test_pred_lgbm.npy', test_lgb)

# 2) NearestCentroid in corrected Hellinger space with cosine-softmax scoring
print('Running NearestCentroid (Hellinger + global L2) ...', flush=True)
# Reuse the canonical transform from cell 23 if defined; else define here
try:
    block_hellinger_global_l2
except NameError:
    def get_blocks(cols):
        margin_cols = [c for c in cols if c.startswith('margin')]
        shape_cols = [c for c in cols if c.startswith('shape')]
        texture_cols = [c for c in cols if c.startswith('texture')]
        m_idx = np.array([cols.index(c) for c in margin_cols])
        s_idx = np.array([cols.index(c) for c in shape_cols])
        t_idx = np.array([cols.index(c) for c in texture_cols])
        return m_idx, s_idx, t_idx
    m_idx, s_idx, t_idx = get_blocks(feature_cols)
    def block_hellinger_global_l2(A, m_idx, s_idx, t_idx):
        A = np.clip(A, 0, None).astype(np.float64, copy=False)
        out = np.empty_like(A, dtype=np.float64)
        for idx in (m_idx, s_idx, t_idx):
            B = A[:, idx]
            B = B / (B.sum(axis=1, keepdims=True) + 1e-12)
            out[:, idx] = np.sqrt(B)
        out = out / (np.linalg.norm(out, axis=1, keepdims=True) + 1e-12)
        return out
X_h = block_hellinger_global_l2(train[feature_cols].values, m_idx, s_idx, t_idx).astype(np.float32)
X_test_h = block_hellinger_global_l2(test[feature_cols].values, m_idx, s_idx, t_idx).astype(np.float32)

def softmax_rowwise(Z):
    Z = Z - Z.max(axis=1, keepdims=True)
    P = np.exp(Z)
    return P / P.sum(axis=1, keepdims=True)

taus = [20.0, 50.0, 80.0]
# Pick first fold for tau selection
trn_idx_sel, val_idx_sel = np.array(fold_indices[0][0]), np.array(fold_indices[0][1])
best_tau = None; best_ll = 1e9
for tau in taus:
    # Compute centroids on train part
    Cmeans = np.zeros((num_classes, X_h.shape[1]), dtype=np.float32)
    for c in range(num_classes):
        Cmeans[c] = X_h[trn_idx_sel][y_idx[trn_idx_sel]==c].mean(axis=0)
    # Normalize centroids to unit length
    Cmeans /= (np.linalg.norm(Cmeans, axis=1, keepdims=True) + 1e-12)
    sims = X_h[val_idx_sel] @ Cmeans.T  # cosine in L2-normalized space
    P = softmax_rowwise(sims * tau)
    ll = log_loss(y_idx[val_idx_sel], clip_norm(P), labels=list(range(num_classes)))
    print(f'[NC tau search] tau={tau}, fold1 val logloss={ll:.6f}')
    if ll < best_ll:
        best_ll = ll; best_tau = tau
print('NearestCentroid selected tau:', best_tau)

# Full 6-fold OOF/Test with chosen tau
oof_nc = np.zeros((len(X_h), num_classes), dtype=np.float32)
test_nc = np.zeros((len(X_test_h), num_classes), dtype=np.float32)
for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
    trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
    Cmeans = np.zeros((num_classes, X_h.shape[1]), dtype=np.float32)
    for c in range(num_classes):
        Cmeans[c] = X_h[trn_idx][y_idx[trn_idx]==c].mean(axis=0)
    Cmeans /= (np.linalg.norm(Cmeans, axis=1, keepdims=True) + 1e-12)
    sims_val = X_h[val_idx] @ Cmeans.T
    oof_nc[val_idx] = softmax_rowwise(sims_val * best_tau)
    sims_test = X_test_h @ Cmeans.T
    test_nc += softmax_rowwise(sims_test * best_tau) / n_splits
    ll = log_loss(y_idx[val_idx], clip_norm(oof_nc[val_idx]), labels=list(range(num_classes)))
    print(f'[NC fold {i}/{n_splits}] tau={best_tau}, val logloss={ll:.6f}', flush=True)
oof_ll_nc = log_loss(y_idx, clip_norm(oof_nc), labels=list(range(num_classes)))
print(f'NearestCentroid OOF logloss: {oof_ll_nc:.6f}')
np.save('oof_nc_hell.npy', oof_nc)
np.save('test_nc_hell.npy', test_nc)
print('Saved oof_nc_hell.npy and test_nc_hell.npy')

Running LightGBM (regularized) 6-fold CV...


[200]	valid's multi_logloss: 1.4887


[400]	valid's multi_logloss: 0.94224


[600]	valid's multi_logloss: 0.766715


[800]	valid's multi_logloss: 0.678883


[1000]	valid's multi_logloss: 0.622631


[1200]	valid's multi_logloss: 0.584721


[1400]	valid's multi_logloss: 0.555998


[1600]	valid's multi_logloss: 0.533277


[1800]	valid's multi_logloss: 0.514901


[2000]	valid's multi_logloss: 0.499235


[2200]	valid's multi_logloss: 0.485937


[2400]	valid's multi_logloss: 0.474692


[2600]	valid's multi_logloss: 0.464547


[2800]	valid's multi_logloss: 0.455674


[3000]	valid's multi_logloss: 0.448144


[3200]	valid's multi_logloss: 0.441088


[3400]	valid's multi_logloss: 0.434512


[3600]	valid's multi_logloss: 0.428373


[3800]	valid's multi_logloss: 0.422992


[4000]	valid's multi_logloss: 0.417916


[LGBM fold 1/6] best_iter=4000, val logloss=0.417916, time=92.2s


[200]	valid's multi_logloss: 1.40441


[400]	valid's multi_logloss: 0.834817


[600]	valid's multi_logloss: 0.654397


[800]	valid's multi_logloss: 0.564319


[1000]	valid's multi_logloss: 0.509355


[1200]	valid's multi_logloss: 0.471447


[1400]	valid's multi_logloss: 0.442914


[1600]	valid's multi_logloss: 0.420887


[1800]	valid's multi_logloss: 0.40281


[2000]	valid's multi_logloss: 0.38712


[2200]	valid's multi_logloss: 0.374391


[2400]	valid's multi_logloss: 0.362784


[2600]	valid's multi_logloss: 0.352999


[2800]	valid's multi_logloss: 0.344034


[3000]	valid's multi_logloss: 0.336016


[3200]	valid's multi_logloss: 0.329071


[3400]	valid's multi_logloss: 0.322511


[3600]	valid's multi_logloss: 0.316594


[3800]	valid's multi_logloss: 0.311256


[4000]	valid's multi_logloss: 0.306367


[LGBM fold 2/6] best_iter=4000, val logloss=0.306367, time=92.6s


[200]	valid's multi_logloss: 1.37654


[400]	valid's multi_logloss: 0.813199


[600]	valid's multi_logloss: 0.638713


[800]	valid's multi_logloss: 0.552334


[1000]	valid's multi_logloss: 0.500231


[1200]	valid's multi_logloss: 0.463989


[1400]	valid's multi_logloss: 0.437159


[1600]	valid's multi_logloss: 0.415951


[1800]	valid's multi_logloss: 0.398593


[2000]	valid's multi_logloss: 0.384883


[2200]	valid's multi_logloss: 0.372694


[2400]	valid's multi_logloss: 0.362415


[2600]	valid's multi_logloss: 0.353298


[2800]	valid's multi_logloss: 0.345191


[3000]	valid's multi_logloss: 0.337896


[3200]	valid's multi_logloss: 0.331394


[3400]	valid's multi_logloss: 0.325505


[3600]	valid's multi_logloss: 0.32015


[3800]	valid's multi_logloss: 0.315084


[4000]	valid's multi_logloss: 0.310566


[LGBM fold 3/6] best_iter=4000, val logloss=0.310566, time=91.8s


[200]	valid's multi_logloss: 1.50034


[400]	valid's multi_logloss: 0.956768


[600]	valid's multi_logloss: 0.788039


[800]	valid's multi_logloss: 0.700572


[1000]	valid's multi_logloss: 0.649272


[1200]	valid's multi_logloss: 0.611201


[1400]	valid's multi_logloss: 0.583912


[1600]	valid's multi_logloss: 0.562915


[1800]	valid's multi_logloss: 0.544832


[2000]	valid's multi_logloss: 0.52989


[2200]	valid's multi_logloss: 0.517432


[2400]	valid's multi_logloss: 0.506255


[2600]	valid's multi_logloss: 0.496326


[2800]	valid's multi_logloss: 0.487666


[3000]	valid's multi_logloss: 0.479794


[3200]	valid's multi_logloss: 0.472743


[3400]	valid's multi_logloss: 0.466276


[3600]	valid's multi_logloss: 0.460309


[3800]	valid's multi_logloss: 0.454844


[4000]	valid's multi_logloss: 0.449809


[LGBM fold 4/6] best_iter=4000, val logloss=0.449809, time=95.6s


[200]	valid's multi_logloss: 1.36315


[400]	valid's multi_logloss: 0.795145


[600]	valid's multi_logloss: 0.624357


[800]	valid's multi_logloss: 0.53861


[1000]	valid's multi_logloss: 0.487209


[1200]	valid's multi_logloss: 0.451141


[1400]	valid's multi_logloss: 0.424653


[1600]	valid's multi_logloss: 0.403339


[1800]	valid's multi_logloss: 0.386437


[2000]	valid's multi_logloss: 0.372527


[2200]	valid's multi_logloss: 0.360417


[2400]	valid's multi_logloss: 0.350206


[2600]	valid's multi_logloss: 0.341021


[2800]	valid's multi_logloss: 0.33332


[3000]	valid's multi_logloss: 0.326058


[3200]	valid's multi_logloss: 0.319995


[3400]	valid's multi_logloss: 0.314318


[3600]	valid's multi_logloss: 0.309061


[3800]	valid's multi_logloss: 0.304143


[4000]	valid's multi_logloss: 0.29978


[LGBM fold 5/6] best_iter=4000, val logloss=0.299780, time=91.6s


[200]	valid's multi_logloss: 1.41695


[400]	valid's multi_logloss: 0.861829


[600]	valid's multi_logloss: 0.692759


[800]	valid's multi_logloss: 0.605658


[1000]	valid's multi_logloss: 0.552552


[1200]	valid's multi_logloss: 0.515327


[1400]	valid's multi_logloss: 0.487793


[1600]	valid's multi_logloss: 0.466015


[1800]	valid's multi_logloss: 0.448195


[2000]	valid's multi_logloss: 0.433157


[2200]	valid's multi_logloss: 0.420257


[2400]	valid's multi_logloss: 0.409449


[2600]	valid's multi_logloss: 0.40005


[2800]	valid's multi_logloss: 0.391441


[3000]	valid's multi_logloss: 0.383675


[3200]	valid's multi_logloss: 0.377006


[3400]	valid's multi_logloss: 0.37073


[3600]	valid's multi_logloss: 0.364899


[3800]	valid's multi_logloss: 0.359596


[4000]	valid's multi_logloss: 0.354726


[LGBM fold 6/6] best_iter=4000, val logloss=0.354726, time=92.4s


LGBM OOF logloss: 0.356488 | total 556.3s


Running NearestCentroid (Hellinger + global L2) ...


[NC tau search] tau=20.0, fold1 val logloss=1.733510
[NC tau search] tau=50.0, fold1 val logloss=0.556484
[NC tau search] tau=80.0, fold1 val logloss=0.350640
NearestCentroid selected tau: 80.0
[NC fold 1/6] tau=80.0, val logloss=0.350640


[NC fold 2/6] tau=80.0, val logloss=0.243507


[NC fold 3/6] tau=80.0, val logloss=0.309216


[NC fold 4/6] tau=80.0, val logloss=0.403777


[NC fold 5/6] tau=80.0, val logloss=0.437795


[NC fold 6/6] tau=80.0, val logloss=0.397755


NearestCentroid OOF logloss: 0.356926
Saved oof_nc_hell.npy and test_nc_hell.npy


In [27]:
# SLSQP blend v2: lr_base + chi2_lr + lgbm + nearest_centroid -> temperature scaling -> soft submission
import numpy as np, pandas as pd, time, sys, subprocess
from sklearn.metrics import log_loss

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

def ensure_scipy():
    try:
        import scipy  # noqa: F401
        from scipy.optimize import minimize  # noqa: F401
        return True
    except Exception:
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'scipy'], check=True)
        return True

ensure_scipy()
from scipy.optimize import minimize

# Load candidates (require lr_base + chi2 + lgbm; nc optional if exists)
cands = []
names = []
try:
    oof_lr = np.load('oof_logreg_nopca.npy'); test_lr = np.load('test_pred_logreg_nopca.npy')
    cands.append((oof_lr, test_lr)); names.append('lr_base')
except Exception as e:
    raise RuntimeError('Missing lr_base predictions')
try:
    oof_c2 = np.load('oof_chi2_lr.npy'); test_c2 = np.load('test_chi2_lr.npy')
    cands.append((oof_c2, test_c2)); names.append('chi2_lr')
except Exception as e:
    print('Missing chi2_lr, continuing without it:', e)
try:
    oof_lgb = np.load('oof_lgbm.npy'); test_lgb = np.load('test_pred_lgbm.npy')
    cands.append((oof_lgb, test_lgb)); names.append('lgbm')
except Exception as e:
    print('Missing lgbm (run cell 24 first), continuing without it:', e)
try:
    oof_nc = np.load('oof_nc_hell.npy'); test_nc = np.load('test_nc_hell.npy')
    cands.append((oof_nc, test_nc)); names.append('nc_hell')
except Exception as e:
    print('Missing nearest centroid (will be available after cell 24), continuing without it:', e)

assert len(cands) >= 2, 'Need at least two models to blend'
num_classes = cands[0][0].shape[1]
y_idx = y_enc.astype(int)

oofs = [o for o, _ in cands]
tests = [t for _, t in cands]

def blend_from_w(weights, mats):
    P = np.zeros_like(mats[0])
    for wi, Pi in zip(weights, mats):
        P += wi * Pi
    return clip_norm(P)

def objective(w):
    P = blend_from_w(w, oofs)
    return log_loss(y_idx, P, labels=list(range(num_classes)))

m = len(cands)
w0 = np.ones(m, dtype=np.float64) / m
bounds = [(0.0, 1.0)] * m
cons = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1.0},)
t0 = time.time()
res = minimize(objective, w0, method='SLSQP', bounds=bounds, constraints=cons, options={'maxiter': 500, 'ftol': 1e-9, 'disp': False})
w_opt = res.x if res.success else w0
w_opt = np.maximum(w_opt, 0); w_opt = w_opt / w_opt.sum()
print('SLSQP success:', res.success, '| OOF:', objective(w_opt), '| Weights:', dict(zip(names, np.round(w_opt, 4))), '| time {:.2f}s'.format(time.time()-t0))

oof_blend = blend_from_w(w_opt, oofs)
test_blend = blend_from_w(w_opt, tests)
oof_ll = log_loss(y_idx, oof_blend, labels=list(range(num_classes)))
print(f'Blended OOF logloss (pre-calibration): {oof_ll:.6f}')

# Temperature scaling on blended probs
def temp_scale(P, alpha):
    Ps = np.power(np.clip(P, 1e-15, 1-1e-15), float(alpha))
    return Ps / Ps.sum(axis=1, keepdims=True)

alphas = np.linspace(0.6, 2.2, 161)
best = (1.0, 1e9)
for a in alphas:
    ll = log_loss(y_idx, temp_scale(oof_blend, a), labels=list(range(num_classes)))
    if ll < best[1]:
        best = (float(a), float(ll))
print(f'Temp scaling on blend: alpha={best[0]:.4f}, OOF={best[1]:.6f}')
alpha_opt = best[0]
test_cal = temp_scale(test_blend, alpha_opt)

# Build soft submission only (no Hungarian)
pred_df = pd.DataFrame(test_cal, columns=list(le.classes_))
sub_cols = [c for c in sample_sub.columns if c != id_col]
pred_df = pred_df[sub_cols]
submission = pd.concat([test[[id_col]].reset_index(drop=True), pred_df.reset_index(drop=True)], axis=1)
submission.to_csv('submission_soft_blend_calibrated_v2.csv', index=False)
submission.to_csv('submission.csv', index=False)
print('Saved submission_soft_blend_calibrated_v2.csv and submission.csv:', submission.shape)

SLSQP success: True | OOF: 0.04631740493480847 | Weights: {'lr_base': 0.751, 'chi2_lr': 0.249, 'lgbm': 0.0, 'nc_hell': 0.0} | time 0.12s
Blended OOF logloss (pre-calibration): 0.046317


Temp scaling on blend: alpha=1.8000, OOF=0.033411
Saved submission_soft_blend_calibrated_v2.csv and submission.csv: (99, 100)


In [28]:
# Concatenate Raw(StandardScaler) + Block-Hellinger(global L2) -> LR (diversity model)
import numpy as np, json, time
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

SEED = 2025
X_raw = train[feature_cols].values.astype(np.float64)
X_test_raw = test[feature_cols].values.astype(np.float64)
y_idx = y_enc.astype(int)
num_classes = len(le.classes_)

# Reuse block_hellinger_global_l2 from cell 23; define if missing
try:
    block_hellinger_global_l2
except NameError:
    def get_blocks(cols):
        margin_cols = [c for c in cols if c.startswith('margin')]
        shape_cols = [c for c in cols if c.startswith('shape')]
        texture_cols = [c for c in cols if c.startswith('texture')]
        m_idx = np.array([cols.index(c) for c in margin_cols])
        s_idx = np.array([cols.index(c) for c in shape_cols])
        t_idx = np.array([cols.index(c) for c in texture_cols])
        return m_idx, s_idx, t_idx
    m_idx, s_idx, t_idx = get_blocks(feature_cols)
    def block_hellinger_global_l2(A, m_idx, s_idx, t_idx):
        A = np.clip(A, 0, None).astype(np.float64, copy=False)
        out = np.empty_like(A)
        for idx in (m_idx, s_idx, t_idx):
            B = A[:, idx]
            B = B / (B.sum(axis=1, keepdims=True) + 1e-12)
            out[:, idx] = np.sqrt(B)
        out = out / (np.linalg.norm(out, axis=1, keepdims=True) + 1e-12)
        return out

X_hell = block_hellinger_global_l2(X_raw, m_idx, s_idx, t_idx)
X_test_hell = block_hellinger_global_l2(X_test_raw, m_idx, s_idx, t_idx)

with open('folds_6.json', 'r') as f:
    fold_indices = json.load(f)
n_splits = len(fold_indices)

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

Cs = [10.0, 30.0, 60.0, 100.0, 150.0, 200.0]
best = (None, 1e9)
best_oof = None
best_test = None
t_all = time.time()
for C in Cs:
    oof = np.zeros((len(X_raw), num_classes), dtype=np.float64)
    test_pred = np.zeros((len(X_test_raw), num_classes), dtype=np.float64)
    t0 = time.time()
    for i, (trn_idx, val_idx) in enumerate(fold_indices, 1):
        trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
        # Standardize raw features within fold
        sc = StandardScaler()
        X_tr_raw = sc.fit_transform(X_raw[trn_idx])
        X_va_raw = sc.transform(X_raw[val_idx])
        X_te_raw = sc.transform(X_test_raw)
        # Hellinger part is precomputed (fitless, row-ops only)
        X_tr = np.hstack([X_tr_raw, X_hell[trn_idx]])
        X_va = np.hstack([X_va_raw, X_hell[val_idx]])
        X_te = np.hstack([X_te_raw, X_test_hell])
        clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=C, max_iter=5000, random_state=SEED)
        fstart = time.time()
        clf.fit(X_tr, y_idx[trn_idx])
        va_proba = clf.predict_proba(X_va)
        oof[val_idx] = va_proba
        test_pred += clf.predict_proba(X_te) / n_splits
        print(f'[Concat Raw+Hell LR fold {i}/{n_splits}] C={C}, time={time.time()-fstart:.2f}s', flush=True)
    oof_ll = log_loss(y_idx, clip_norm(oof), labels=list(range(num_classes)))
    print(f'Concat Raw+Hell LR OOF: {oof_ll:.6f} | C={C} in {time.time()-t0:.2f}s', flush=True)
    if oof_ll < best[1]:
        best = (C, oof_ll); best_oof = oof; best_test = test_pred

print('Best Concat Raw+Hell LR:', best, '| total {:.2f}s'.format(time.time()-t_all))
np.save('oof_concat_lr.npy', best_oof)
np.save('test_concat_lr.npy', best_test)
print('Saved oof_concat_lr.npy and test_concat_lr.npy')



[Concat Raw+Hell LR fold 1/6] C=10.0, time=0.81s




[Concat Raw+Hell LR fold 2/6] C=10.0, time=0.42s




[Concat Raw+Hell LR fold 3/6] C=10.0, time=0.52s




[Concat Raw+Hell LR fold 4/6] C=10.0, time=0.61s




[Concat Raw+Hell LR fold 5/6] C=10.0, time=0.59s




[Concat Raw+Hell LR fold 6/6] C=10.0, time=0.40s


Concat Raw+Hell LR OOF: 0.057777 | C=10.0 in 3.44s




[Concat Raw+Hell LR fold 1/6] C=30.0, time=0.36s




[Concat Raw+Hell LR fold 2/6] C=30.0, time=0.53s




[Concat Raw+Hell LR fold 3/6] C=30.0, time=0.47s




[Concat Raw+Hell LR fold 4/6] C=30.0, time=0.66s




[Concat Raw+Hell LR fold 5/6] C=30.0, time=0.71s




[Concat Raw+Hell LR fold 6/6] C=30.0, time=0.71s


Concat Raw+Hell LR OOF: 0.051296 | C=30.0 in 3.60s




[Concat Raw+Hell LR fold 1/6] C=60.0, time=0.41s




[Concat Raw+Hell LR fold 2/6] C=60.0, time=0.52s




[Concat Raw+Hell LR fold 3/6] C=60.0, time=0.60s




[Concat Raw+Hell LR fold 4/6] C=60.0, time=0.50s




[Concat Raw+Hell LR fold 5/6] C=60.0, time=0.45s




[Concat Raw+Hell LR fold 6/6] C=60.0, time=0.48s


Concat Raw+Hell LR OOF: 0.052601 | C=60.0 in 3.08s




[Concat Raw+Hell LR fold 1/6] C=100.0, time=0.42s




[Concat Raw+Hell LR fold 2/6] C=100.0, time=0.49s




[Concat Raw+Hell LR fold 3/6] C=100.0, time=0.43s




[Concat Raw+Hell LR fold 4/6] C=100.0, time=0.54s




[Concat Raw+Hell LR fold 5/6] C=100.0, time=0.51s




[Concat Raw+Hell LR fold 6/6] C=100.0, time=0.36s


Concat Raw+Hell LR OOF: 0.051889 | C=100.0 in 2.89s




[Concat Raw+Hell LR fold 1/6] C=150.0, time=0.32s




[Concat Raw+Hell LR fold 2/6] C=150.0, time=0.49s




[Concat Raw+Hell LR fold 3/6] C=150.0, time=0.45s




[Concat Raw+Hell LR fold 4/6] C=150.0, time=0.43s




[Concat Raw+Hell LR fold 5/6] C=150.0, time=0.44s




[Concat Raw+Hell LR fold 6/6] C=150.0, time=0.38s


Concat Raw+Hell LR OOF: 0.051803 | C=150.0 in 2.60s




[Concat Raw+Hell LR fold 1/6] C=200.0, time=0.32s




[Concat Raw+Hell LR fold 2/6] C=200.0, time=0.38s




[Concat Raw+Hell LR fold 3/6] C=200.0, time=0.41s




[Concat Raw+Hell LR fold 4/6] C=200.0, time=0.42s




[Concat Raw+Hell LR fold 5/6] C=200.0, time=0.35s




[Concat Raw+Hell LR fold 6/6] C=200.0, time=0.30s


Concat Raw+Hell LR OOF: 0.051748 | C=200.0 in 2.27s


Best Concat Raw+Hell LR: (30.0, 0.05129633843739897) | total 17.89s
Saved oof_concat_lr.npy and test_concat_lr.npy


In [30]:
# FIXED Hellinger pipeline: sorted blocks + proper eps + smaller C grid; no scaler/centering
import re, json, time, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

SEED = 2025
eps = 1e-9
X_raw = train[feature_cols].values.astype(np.float64)
X_test_raw = test[feature_cols].values.astype(np.float64)
y_idx = y_enc.astype(int)
K = len(le.classes_)

def get_blocks_sorted(cols):
    patt_cache = {}
    def sort_keys(prefix):
        patt = patt_cache.get(prefix)
        if patt is None:
            patt = re.compile(r'^'+re.escape(prefix)+r'(\d+)$')
            patt_cache[prefix] = patt
        items = []
        for c in cols:
            m = patt.match(c)
            if m:
                items.append((int(m.group(1)), c))
        items.sort(key=lambda x: x[0])
        return [c for _, c in items]
    margin = sort_keys('margin')
    shape = sort_keys('shape')
    texture = sort_keys('texture')
    assert len(margin)==len(shape)==len(texture)==64, (len(margin), len(shape), len(texture))
    idx = {c:i for i,c in enumerate(cols)}
    m_idx = np.array([idx[c] for c in margin])
    s_idx = np.array([idx[c] for c in shape])
    t_idx = np.array([idx[c] for c in texture])
    return m_idx, s_idx, t_idx, margin, shape, texture

m_idx_s, s_idx_s, t_idx_s, m_names, s_names, t_names = get_blocks_sorted(feature_cols)

# Sanity print of ordering
print('Margin first/last 5:', m_names[:5], m_names[-5:])
print('Shape  first/last 5:', s_names[:5], s_names[-5:])
print('Texture first/last 5:', t_names[:5], t_names[-5:])

def make_hell(A, m_idx, s_idx, t_idx, eps=1e-9, do_global_l2=True):
    X = np.clip(A.astype(np.float64, copy=False), 0, None).copy()
    # per-block L1 then sqrt; IMPORTANT: assign back since advanced indexing returns a copy
    for idx in (m_idx, s_idx, t_idx):
        B = X[:, idx].copy()
        B /= (B.sum(axis=1, keepdims=True) + eps)
        np.sqrt(B, out=B)
        X[:, idx] = B
    if do_global_l2:
        X /= (np.linalg.norm(X, axis=1, keepdims=True) + eps)
    return X

# Quick sanity on sums and norms
X_tmp = make_hell(X_raw[:5].copy(), m_idx_s, s_idx_s, t_idx_s, eps=eps, do_global_l2=False)
print('Block L1 sums before sqrt (raw sums):',
      X_raw[:3, m_idx_s].sum(axis=1), X_raw[:3, s_idx_s].sum(axis=1), X_raw[:3, t_idx_s].sum(axis=1))
print('After L1, sum of squares per block (should be ~1):',
      (X_tmp[:3, m_idx_s]**2).sum(axis=1), (X_tmp[:3, s_idx_s]**2).sum(axis=1), (X_tmp[:3, t_idx_s]**2).sum(axis=1))

# Precompute transforms (row-wise only, no fit) with and without global L2
X_h_l2 = make_hell(X_raw, m_idx_s, s_idx_s, t_idx_s, eps=eps, do_global_l2=True)
X_te_h_l2 = make_hell(X_test_raw, m_idx_s, s_idx_s, t_idx_s, eps=eps, do_global_l2=True)
row_norms = np.linalg.norm(X_h_l2[:5], axis=1)
print('Row L2 norms after global L2 (should be ~1):', row_norms)

X_h_nol2 = make_hell(X_raw, m_idx_s, s_idx_s, t_idx_s, eps=eps, do_global_l2=False)
X_te_h_nol2 = make_hell(X_test_raw, m_idx_s, s_idx_s, t_idx_s, eps=eps, do_global_l2=False)

with open('folds_6.json', 'r') as f:
    folds = json.load(f)
n_splits = len(folds)

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

def run_lr_on_feats(Xf, Xtf, Cs, tag):
    best = (None, 1e9); best_oof=None; best_test=None
    t_all = time.time()
    for C in Cs:
        oof = np.zeros((len(Xf), K), dtype=np.float64)
        test_pred = np.zeros((len(Xtf), K), dtype=np.float64)
        t0 = time.time()
        for i, (trn_idx, val_idx) in enumerate(folds, 1):
            trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
            clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2', C=C, fit_intercept=True, max_iter=5000, random_state=SEED)
            fstart = time.time()
            clf.fit(Xf[trn_idx], y_idx[trn_idx])
            P_va = clf.predict_proba(Xf[val_idx])
            oof[val_idx] = P_va
            test_pred += clf.predict_proba(Xtf) / n_splits
            print(f'[{tag} fold {i}/{n_splits}] C={C}, time={time.time()-fstart:.2f}s', flush=True)
        ll = log_loss(y_idx, clip_norm(oof), labels=list(range(K)))
        print(f'{tag} OOF: {ll:.6f} | C={C} in {time.time()-t0:.2f}s', flush=True)
        if ll < best[1]:
            best = (C, ll); best_oof=oof; best_test=test_pred
    print('Best', tag, ':', best, '| total {:.2f}s'.format(time.time()-t_all))
    return best, best_oof, best_test

# 1) With global L2
Cs_main = [1.0, 3.0, 5.0, 8.0, 12.0]
best_l2, oof_l2, test_l2 = run_lr_on_feats(X_h_l2, X_te_h_l2, Cs_main, tag='Hell+L2 LR')
np.save('oof_hellinger_lr.npy', oof_l2); np.save('test_hellinger_lr.npy', test_l2)
print('Saved oof_hellinger_lr.npy/test_hellinger_lr.npy')

# 2) Without global L2 (variant) - smaller sweep
Cs_nol2 = [1.0, 3.0, 10.0, 30.0]
best_nol2, oof_nol2, test_nol2 = run_lr_on_feats(X_h_nol2, X_te_h_nol2, Cs_nol2, tag='Hell(noL2) LR')
np.save('oof_hellinger_nol2_lr.npy', oof_nol2); np.save('test_hellinger_nol2_lr.npy', test_nol2)
print('Saved oof_hellinger_nol2_lr.npy/test_hellinger_nol2_lr.npy')

# For blending convenience, keep best of the two under block_hell files
if best_l2[1] <= best_nol2[1]:
    np.save('oof_block_hell_lr.npy', oof_l2); np.save('test_block_hell_lr.npy', test_l2)
    print('Block-hell files updated from Hell+L2 LR:', best_l2)
else:
    np.save('oof_block_hell_lr.npy', oof_nol2); np.save('test_block_hell_lr.npy', test_nol2)
    print('Block-hell files updated from Hell(noL2) LR:', best_nol2)

Margin first/last 5: ['margin1', 'margin2', 'margin3', 'margin4', 'margin5'] ['margin60', 'margin61', 'margin62', 'margin63', 'margin64']
Shape  first/last 5: ['shape1', 'shape2', 'shape3', 'shape4', 'shape5'] ['shape60', 'shape61', 'shape62', 'shape63', 'shape64']
Texture first/last 5: ['texture1', 'texture2', 'texture3', 'texture4', 'texture5'] ['texture60', 'texture61', 'texture62', 'texture63', 'texture64']
Block L1 sums before sqrt (raw sums): [0.999996 0.999995 1.      ] [0.05032049 0.05670366 0.02403151] [1.000003 0.999998 0.999998]
After L1, sum of squares per block (should be ~1): [1. 1. 1.] [0.99999998 0.99999998 0.99999996] [1. 1. 1.]
Row L2 norms after global L2 (should be ~1): [1. 1. 1. 1. 1.]
[Hell+L2 LR fold 1/6] C=1.0, time=0.12s




[Hell+L2 LR fold 2/6] C=1.0, time=0.22s


[Hell+L2 LR fold 3/6] C=1.0, time=0.16s




[Hell+L2 LR fold 4/6] C=1.0, time=0.16s




[Hell+L2 LR fold 5/6] C=1.0, time=0.34s


[Hell+L2 LR fold 6/6] C=1.0, time=0.18s


Hell+L2 LR OOF: 3.463161 | C=1.0 in 1.19s




[Hell+L2 LR fold 1/6] C=3.0, time=0.39s




[Hell+L2 LR fold 2/6] C=3.0, time=0.47s




[Hell+L2 LR fold 3/6] C=3.0, time=0.42s




[Hell+L2 LR fold 4/6] C=3.0, time=0.38s




[Hell+L2 LR fold 5/6] C=3.0, time=0.46s




[Hell+L2 LR fold 6/6] C=3.0, time=0.40s


Hell+L2 LR OOF: 2.304122 | C=3.0 in 2.54s




[Hell+L2 LR fold 1/6] C=5.0, time=0.37s




[Hell+L2 LR fold 2/6] C=5.0, time=0.51s




[Hell+L2 LR fold 3/6] C=5.0, time=0.38s


[Hell+L2 LR fold 4/6] C=5.0, time=0.40s




[Hell+L2 LR fold 5/6] C=5.0, time=0.42s




[Hell+L2 LR fold 6/6] C=5.0, time=0.61s


Hell+L2 LR OOF: 1.755846 | C=5.0 in 2.72s




[Hell+L2 LR fold 1/6] C=8.0, time=0.49s




[Hell+L2 LR fold 2/6] C=8.0, time=0.57s




[Hell+L2 LR fold 3/6] C=8.0, time=0.55s




[Hell+L2 LR fold 4/6] C=8.0, time=0.42s




[Hell+L2 LR fold 5/6] C=8.0, time=0.68s




[Hell+L2 LR fold 6/6] C=8.0, time=0.60s


Hell+L2 LR OOF: 1.329043 | C=8.0 in 3.34s




[Hell+L2 LR fold 1/6] C=12.0, time=0.69s




[Hell+L2 LR fold 2/6] C=12.0, time=0.83s




[Hell+L2 LR fold 3/6] C=12.0, time=0.69s




[Hell+L2 LR fold 4/6] C=12.0, time=0.57s




[Hell+L2 LR fold 5/6] C=12.0, time=0.42s




[Hell+L2 LR fold 6/6] C=12.0, time=0.37s


Hell+L2 LR OOF: 1.044082 | C=12.0 in 3.60s


Best Hell+L2 LR : (12.0, 1.0440822959332978) | total 13.42s
Saved oof_hellinger_lr.npy/test_hellinger_lr.npy




[Hell(noL2) LR fold 1/6] C=1.0, time=0.34s




[Hell(noL2) LR fold 2/6] C=1.0, time=0.52s




[Hell(noL2) LR fold 3/6] C=1.0, time=0.48s




[Hell(noL2) LR fold 4/6] C=1.0, time=0.47s




[Hell(noL2) LR fold 5/6] C=1.0, time=0.47s




[Hell(noL2) LR fold 6/6] C=1.0, time=0.47s


Hell(noL2) LR OOF: 2.305619 | C=1.0 in 2.78s




[Hell(noL2) LR fold 1/6] C=3.0, time=0.57s




[Hell(noL2) LR fold 2/6] C=3.0, time=0.52s




[Hell(noL2) LR fold 3/6] C=3.0, time=0.75s




[Hell(noL2) LR fold 4/6] C=3.0, time=0.66s




[Hell(noL2) LR fold 5/6] C=3.0, time=0.59s




[Hell(noL2) LR fold 6/6] C=3.0, time=0.60s


Hell(noL2) LR OOF: 1.242183 | C=3.0 in 3.71s




[Hell(noL2) LR fold 1/6] C=10.0, time=0.64s




[Hell(noL2) LR fold 2/6] C=10.0, time=0.73s




[Hell(noL2) LR fold 3/6] C=10.0, time=0.59s




[Hell(noL2) LR fold 4/6] C=10.0, time=0.73s




[Hell(noL2) LR fold 5/6] C=10.0, time=0.59s




[Hell(noL2) LR fold 6/6] C=10.0, time=0.43s


Hell(noL2) LR OOF: 0.613177 | C=10.0 in 3.73s




[Hell(noL2) LR fold 1/6] C=30.0, time=0.60s




[Hell(noL2) LR fold 2/6] C=30.0, time=0.49s




[Hell(noL2) LR fold 3/6] C=30.0, time=0.65s




[Hell(noL2) LR fold 4/6] C=30.0, time=0.55s




[Hell(noL2) LR fold 5/6] C=30.0, time=0.68s




[Hell(noL2) LR fold 6/6] C=30.0, time=0.63s


Hell(noL2) LR OOF: 0.346896 | C=30.0 in 3.62s


Best Hell(noL2) LR : (30.0, 0.34689602431619043) | total 13.86s
Saved oof_hellinger_nol2_lr.npy/test_hellinger_nol2_lr.npy
Block-hell files updated from Hell(noL2) LR: (30.0, 0.34689602431619043)


In [31]:
# Hellinger+global L2 with fold-centering (with_mean=True, with_std=False) -> LR; wide C sweep
import numpy as np, time, json
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

SEED = 2025

# Reuse precomputed Hellinger+L2 from cell 27
try:
    X_h_l2, X_te_h_l2
except NameError:
    raise RuntimeError('Run cell 27 first to build X_h_l2/X_te_h_l2')

with open('folds_6.json', 'r') as f:
    folds = json.load(f)
y_idx = y_enc.astype(int)
K = len(le.classes_)

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

def pctiles_maxprob(P):
    mx = P.max(axis=1)
    qs = np.percentile(mx, [5,25,50,75,95])
    return np.round(qs, 4)

def run_hell_centered(Cs, solver='lbfgs', fit_intercept=True):
    best = (None, 1e9); best_oof=None; best_test=None
    t_all = time.time()
    for C in Cs:
        oof = np.zeros((len(X_h_l2), K), dtype=np.float64)
        test_pred = np.zeros((len(X_te_h_l2), K), dtype=np.float64)
        t0 = time.time()
        for i, (trn_idx, val_idx) in enumerate(folds, 1):
            trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
            sc = StandardScaler(with_mean=True, with_std=False)
            X_tr = sc.fit_transform(X_h_l2[trn_idx])
            X_va = sc.transform(X_h_l2[val_idx])
            X_te = sc.transform(X_te_h_l2)
            clf = LogisticRegression(multi_class='multinomial', solver=solver, penalty='l2', C=C, fit_intercept=fit_intercept, max_iter=5000, random_state=SEED)
            fstart = time.time()
            clf.fit(X_tr, y_idx[trn_idx])
            P_va = clf.predict_proba(X_va)
            oof[val_idx] = P_va
            test_pred += clf.predict_proba(X_te) / len(folds)
            print(f'[Hell+L2 Center fold {i}/{len(folds)}] C={C}, time={time.time()-fstart:.2f}s', flush=True)
        ll = log_loss(y_idx, clip_norm(oof), labels=list(range(K)))
        print(f'Hell+L2 Center OOF: {ll:.6f} | C={C} in {time.time()-t0:.2f}s | maxprob pctiles {pctiles_maxprob(oof)}', flush=True)
        if ll < best[1]:
            best = (C, ll); best_oof=oof; best_test=test_pred
    print('Best Hell+L2 Center:', best, '| total {:.2f}s'.format(time.time()-t_all))
    return best, best_oof, best_test

# Primary sweep (moderate C)
Cs_main = [20.0, 30.0, 50.0, 80.0, 100.0]
best_main, oof_main, test_main = run_hell_centered(Cs_main, solver='lbfgs', fit_intercept=True)

# Extended large-C sweep
Cs_big = [150.0, 300.0, 500.0, 1000.0, 2000.0]
best_big, oof_big, test_big = run_hell_centered(Cs_big, solver='lbfgs', fit_intercept=True)

# Select overall best and save
if best_main[1] <= best_big[1]:
    oof_best, test_best, best_desc = oof_main, test_main, best_main
else:
    oof_best, test_best, best_desc = oof_big, test_big, best_big
print('Selected Hell+L2 Center best:', best_desc)
np.save('oof_hellinger_lr.npy', oof_best)
np.save('test_hellinger_lr.npy', test_best)
np.save('oof_block_hell_lr.npy', oof_best)  # for existing blend compatibility
np.save('test_block_hell_lr.npy', test_best)
print('Saved oof_hellinger_lr.npy/test_hellinger_lr.npy and block_hell files')



[Hell+L2 Center fold 1/6] C=20.0, time=0.34s




[Hell+L2 Center fold 2/6] C=20.0, time=0.40s




[Hell+L2 Center fold 3/6] C=20.0, time=0.36s




[Hell+L2 Center fold 4/6] C=20.0, time=0.35s




[Hell+L2 Center fold 5/6] C=20.0, time=0.39s




[Hell+L2 Center fold 6/6] C=20.0, time=0.43s


Hell+L2 Center OOF: 0.774308 | C=20.0 in 2.32s | maxprob pctiles [0.2112 0.3784 0.5096 0.6363 0.7947]




[Hell+L2 Center fold 1/6] C=30.0, time=0.41s




[Hell+L2 Center fold 2/6] C=30.0, time=0.52s




[Hell+L2 Center fold 3/6] C=30.0, time=0.57s




[Hell+L2 Center fold 4/6] C=30.0, time=0.56s




[Hell+L2 Center fold 5/6] C=30.0, time=0.54s






[Hell+L2 Center fold 1/6] C=50.0, time=0.64s




[Hell+L2 Center fold 2/6] C=50.0, time=0.62s




[Hell+L2 Center fold 3/6] C=50.0, time=0.63s




[Hell+L2 Center fold 4/6] C=50.0, time=0.80s




[Hell+L2 Center fold 5/6] C=50.0, time=0.89s




[Hell+L2 Center fold 6/6] C=50.0, time=0.90s


Hell+L2 Center OOF: 0.466748 | C=50.0 in 4.55s | maxprob pctiles [0.3211 0.5608 0.7007 0.8084 0.9065]




[Hell+L2 Center fold 1/6] C=80.0, time=0.55s




[Hell+L2 Center fold 2/6] C=80.0, time=0.82s




[Hell+L2 Center fold 3/6] C=80.0, time=1.01s




[Hell+L2 Center fold 4/6] C=80.0, time=0.86s




[Hell+L2 Center fold 5/6] C=80.0, time=0.62s




[Hell+L2 Center fold 6/6] C=80.0, time=0.72s


Hell+L2 Center OOF: 0.365167 | C=80.0 in 4.66s | maxprob pctiles [0.377  0.6369 0.7751 0.8669 0.9393]




[Hell+L2 Center fold 1/6] C=100.0, time=0.60s




[Hell+L2 Center fold 2/6] C=100.0, time=0.67s




[Hell+L2 Center fold 3/6] C=100.0, time=0.76s




[Hell+L2 Center fold 4/6] C=100.0, time=0.74s




[Hell+L2 Center fold 5/6] C=100.0, time=0.77s




[Hell+L2 Center fold 6/6] C=100.0, time=0.90s


Hell+L2 Center OOF: 0.327276 | C=100.0 in 4.51s | maxprob pctiles [0.3999 0.6673 0.8055 0.8879 0.9492]


Best Hell+L2 Center: (100.0, 0.32727622600064116) | total 19.29s




[Hell+L2 Center fold 1/6] C=150.0, time=0.72s




[Hell+L2 Center fold 2/6] C=150.0, time=0.78s




[Hell+L2 Center fold 3/6] C=150.0, time=0.74s




[Hell+L2 Center fold 4/6] C=150.0, time=0.62s




[Hell+L2 Center fold 5/6] C=150.0, time=0.75s




[Hell+L2 Center fold 6/6] C=150.0, time=0.63s


Hell+L2 Center OOF: 0.271632 | C=150.0 in 4.30s | maxprob pctiles [0.4372 0.7222 0.8501 0.9172 0.965 ]




[Hell+L2 Center fold 1/6] C=300.0, time=0.60s




[Hell+L2 Center fold 2/6] C=300.0, time=0.64s




[Hell+L2 Center fold 3/6] C=300.0, time=0.47s




[Hell+L2 Center fold 4/6] C=300.0, time=0.61s




[Hell+L2 Center fold 5/6] C=300.0, time=0.49s




[Hell+L2 Center fold 6/6] C=300.0, time=0.73s


Hell+L2 Center OOF: 0.200695 | C=300.0 in 3.60s | maxprob pctiles [0.5029 0.7942 0.9069 0.9529 0.9813]




[Hell+L2 Center fold 1/6] C=500.0, time=0.57s




[Hell+L2 Center fold 2/6] C=500.0, time=0.68s




[Hell+L2 Center fold 3/6] C=500.0, time=0.79s




[Hell+L2 Center fold 4/6] C=500.0, time=0.69s




[Hell+L2 Center fold 5/6] C=500.0, time=0.66s




[Hell+L2 Center fold 6/6] C=500.0, time=0.44s


Hell+L2 Center OOF: 0.162227 | C=500.0 in 3.88s | maxprob pctiles [0.5526 0.8422 0.9371 0.9692 0.9878]




[Hell+L2 Center fold 1/6] C=1000.0, time=0.36s




[Hell+L2 Center fold 2/6] C=1000.0, time=0.41s




[Hell+L2 Center fold 3/6] C=1000.0, time=0.36s




[Hell+L2 Center fold 4/6] C=1000.0, time=0.39s




[Hell+L2 Center fold 5/6] C=1000.0, time=0.53s




[Hell+L2 Center fold 6/6] C=1000.0, time=0.41s


Hell+L2 Center OOF: 0.135602 | C=1000.0 in 2.49s | maxprob pctiles [0.5851 0.8884 0.9582 0.9823 0.9936]




[Hell+L2 Center fold 1/6] C=2000.0, time=0.39s




[Hell+L2 Center fold 2/6] C=2000.0, time=0.39s




[Hell+L2 Center fold 3/6] C=2000.0, time=0.38s




[Hell+L2 Center fold 4/6] C=2000.0, time=0.34s




[Hell+L2 Center fold 5/6] C=2000.0, time=0.63s




[Hell+L2 Center fold 6/6] C=2000.0, time=0.53s


Hell+L2 Center OOF: 0.116258 | C=2000.0 in 2.69s | maxprob pctiles [0.644  0.9155 0.9725 0.9892 0.997 ]


Best Hell+L2 Center: (2000.0, 0.11625779469622119) | total 16.97s
Selected Hell+L2 Center best: (2000.0, 0.11625779469622119)
Saved oof_hellinger_lr.npy/test_hellinger_lr.npy and block_hell files


In [32]:
# SLSQP blend v3: lr_base + concat_lr + chi2_lr (+ optional quantile, blockL1) -> temp scale -> soft submission
import numpy as np, pandas as pd, time, sys, subprocess
from sklearn.metrics import log_loss

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

def ensure_scipy():
    try:
        import scipy  # noqa: F401
        from scipy.optimize import minimize  # noqa: F401
        return True
    except Exception:
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'scipy'], check=True)
        return True

ensure_scipy()
from scipy.optimize import minimize

# Load candidates
cands = []
names = []
y_idx = y_enc.astype(int)
num_classes = len(le.classes_)

try:
    oof_lr = np.load('oof_logreg_nopca.npy'); test_lr = np.load('test_pred_logreg_nopca.npy')
    cands.append((oof_lr, test_lr)); names.append('lr_base')
except Exception as e:
    raise RuntimeError('Missing lr_base predictions')
try:
    oof_concat = np.load('oof_concat_lr.npy'); test_concat = np.load('test_concat_lr.npy')
    cands.append((oof_concat, test_concat)); names.append('concat_lr')
except Exception as e:
    print('Missing concat_lr, skipping:', e)
try:
    oof_c2 = np.load('oof_chi2_lr.npy'); test_c2 = np.load('test_chi2_lr.npy')
    cands.append((oof_c2, test_c2)); names.append('chi2_lr')
except Exception as e:
    print('Missing chi2_lr, skipping:', e)
try:
    oof_q = np.load('oof_lr_quant.npy'); test_q = np.load('test_pred_lr_quant.npy')
    cands.append((oof_q, test_q)); names.append('lr_quant')
except Exception as e:
    print('Missing lr_quant, skipping:', e)
try:
    oof_blkL1 = np.load('oof_block_hell_lr.npy'); test_blkL1 = np.load('test_block_hell_lr.npy')
    cands.append((oof_blkL1, test_blkL1)); names.append('blockL1_lr')
except Exception as e:
    print('Missing blockL1/hell files, skipping:', e)

assert len(cands) >= 2, 'Need at least two models to blend'

oofs = [o for o, _ in cands]
tests = [t for _, t in cands]

def blend_from_w(weights, mats):
    P = np.zeros_like(mats[0])
    for wi, Pi in zip(weights, mats):
        P += wi * Pi
    return clip_norm(P)

def objective(w):
    P = blend_from_w(w, oofs)
    return log_loss(y_idx, P, labels=list(range(num_classes)))

m = len(cands)
w0 = np.ones(m, dtype=np.float64) / m
bounds = [(0.0, 1.0)] * m
cons = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1.0},)
t0 = time.time()
res = minimize(objective, w0, method='SLSQP', bounds=bounds, constraints=cons, options={'maxiter': 800, 'ftol': 1e-10, 'disp': False})
w_opt = res.x if res.success else w0
w_opt = np.maximum(w_opt, 0); w_opt = w_opt / w_opt.sum()
print('SLSQP success:', res.success, '| OOF:', objective(w_opt), '| Weights:', dict(zip(names, np.round(w_opt, 4))), '| time {:.2f}s'.format(time.time()-t0))

oof_blend = blend_from_w(w_opt, oofs)
test_blend = blend_from_w(w_opt, tests)
oof_ll = log_loss(y_idx, oof_blend, labels=list(range(num_classes)))
print(f'Blended OOF logloss (pre-calibration): {oof_ll:.6f}')

# Temperature scaling
def temp_scale(P, alpha):
    Ps = np.power(np.clip(P, 1e-15, 1-1e-15), float(alpha))
    return Ps / Ps.sum(axis=1, keepdims=True)

alphas = np.linspace(0.6, 2.2, 161)
best = (1.0, 1e9)
for a in alphas:
    ll = log_loss(y_idx, temp_scale(oof_blend, a), labels=list(range(num_classes)))
    if ll < best[1]:
        best = (float(a), float(ll))
print(f'Temp scaling on blend: alpha={best[0]:.4f}, OOF={best[1]:.6f}')
alpha_opt = best[0]
test_cal = temp_scale(test_blend, alpha_opt)

# Write soft submission only
pred_df = pd.DataFrame(test_cal, columns=list(le.classes_))
sub_cols = [c for c in sample_sub.columns if c != id_col]
pred_df = pred_df[sub_cols]
submission = pd.concat([test[[id_col]].reset_index(drop=True), pred_df.reset_index(drop=True)], axis=1)
submission.to_csv('submission.csv', index=False)
submission.to_csv('submission_soft_blend_calibrated_v3.csv', index=False)
print('Saved submission.csv and submission_soft_blend_calibrated_v3.csv:', submission.shape)

SLSQP success: True | OOF: 0.04579940680457917 | Weights: {'lr_base': 0.1945, 'concat_lr': 0.524, 'chi2_lr': 0.2636, 'lr_quant': 0.0, 'blockL1_lr': 0.0179} | time 0.53s
Blended OOF logloss (pre-calibration): 0.045799


Temp scaling on blend: alpha=1.9800, OOF=0.030103
Saved submission.csv and submission_soft_blend_calibrated_v3.csv: (99, 100)


In [33]:
# Chi^2 1-NN (tau-softmax) and Naive Bayes (Multinomial/Complement) on block L1 histograms
import numpy as np, json, time
from sklearn.metrics import log_loss
from sklearn.naive_bayes import MultinomialNB, ComplementNB

SEED = 2025
eps = 1e-9
X_full = train[feature_cols].values.astype(np.float64)
X_test_full = test[feature_cols].values.astype(np.float64)
y_idx = y_enc.astype(int)
K = len(le.classes_)
with open('folds_6.json', 'r') as f:
    folds = json.load(f)

# Reuse sorted block indices from cell 27 if available; else define
try:
    m_idx_s, s_idx_s, t_idx_s
except NameError:
    import re
    def get_blocks_sorted(cols):
        def sort_keys(prefix):
            patt = re.compile(r'^'+re.escape(prefix)+r'(\d+)$')
            items = [(int(m.group(1)), c) for c in cols for m in [patt.match(c)] if m]
            items.sort(key=lambda x: x[0])
            return [c for _, c in items]
        margin = sort_keys('margin'); shape = sort_keys('shape'); texture = sort_keys('texture')
        idx = {c:i for i,c in enumerate(cols)}
        return (np.array([idx[c] for c in margin]),
                np.array([idx[c] for c in shape]),
                np.array([idx[c] for c in texture]))
    m_idx_s, s_idx_s, t_idx_s = get_blocks_sorted(feature_cols)

def block_l1_only(A, eps=1e-9):
    X = np.clip(A.astype(np.float64, copy=False), 0, None).copy()
    for idx in (m_idx_s, s_idx_s, t_idx_s):
        B = X[:, idx].copy()
        B /= (B.sum(axis=1, keepdims=True) + eps)
        X[:, idx] = B
    return X

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

# 1) Chi-square 1-NN with tau-softmax
def chi2_distance_matrix(A, B, eps=1e-9):
    # returns pairwise chi2 distances between rows of A and rows of B
    # D_ij = 0.5 * sum_k (a_k - b_k)^2 / (a_k + b_k + eps)
    # compute efficiently by broadcasting in chunks to limit memory
    nA, nB = A.shape[0], B.shape[0]
    D = np.empty((nA, nB), dtype=np.float64)
    chunk = 512
    for i0 in range(0, nA, chunk):
        i1 = min(i0+chunk, nA)
        Aa = A[i0:i1, :][:, None, :]
        Bb = B[None, :, :]
        num = (Aa - Bb)**2
        den = (Aa + Bb + eps)
        D[i0:i1] = 0.5 * (num / den).sum(axis=2)
    return D

def softmax_rowwise(Z):
    Z = Z - Z.max(axis=1, keepdims=True)
    P = np.exp(Z)
    return P / P.sum(axis=1, keepdims=True)

X_l1 = block_l1_only(X_full, eps=eps)
X_te_l1 = block_l1_only(X_test_full, eps=eps)

# Tau selection on first fold
taus = [10.0, 20.0, 30.0, 50.0, 80.0]
trn_sel, val_sel = np.array(folds[0][0]), np.array(folds[0][1])
D_val = chi2_distance_matrix(X_l1[val_sel], X_l1[trn_sel], eps=eps)
nbrs = D_val.argmin(axis=1)
dmin = D_val[np.arange(len(val_sel)), nbrs]
Y_trn_sel = y_idx[trn_sel]
best_tau = None; best_ll = 1e9
for tau in taus:
    # softmax over classes using nearest neighbor distance as similarity
    # Convert distances to similarities: sim = -d
    # We build a per-class score using the 1-NN class only
    S = np.full((len(val_sel), K), -1e9, dtype=np.float64)
    S[np.arange(len(val_sel)), Y_trn_sel[nbrs]] = -dmin * tau
    P = softmax_rowwise(S)
    ll = log_loss(y_idx[val_sel], clip_norm(P), labels=list(range(K)))
    print(f'[Chi2-1NN tau search] tau={tau}, fold1 val logloss={ll:.6f}')
    if ll < best_ll:
        best_ll = ll; best_tau = tau
print('Chi2-1NN selected tau:', best_tau)

# Full 6-fold OOF/Test
oof_chi1 = np.zeros((len(X_l1), K), dtype=np.float64)
test_chi1 = np.zeros((len(X_te_l1), K), dtype=np.float64)
for i, (trn_idx, val_idx) in enumerate(folds, 1):
    trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
    D = chi2_distance_matrix(X_l1[val_idx], X_l1[trn_idx], eps=eps)
    nbr = D.argmin(axis=1)
    dmin = D[np.arange(len(val_idx)), nbr]
    cls = y_idx[trn_idx][nbr]
    S = np.full((len(val_idx), K), -1e9, dtype=np.float64)
    S[np.arange(len(val_idx)), cls] = -dmin * best_tau
    oof_chi1[val_idx] = softmax_rowwise(S)
    # test: use full train as reference
    Dt = chi2_distance_matrix(X_te_l1, X_l1[trn_idx], eps=eps)
    nbr_t = Dt.argmin(axis=1)
    dmin_t = Dt[np.arange(len(X_te_l1)), nbr_t]
    cls_t = y_idx[trn_idx][nbr_t]
    St = np.full((len(X_te_l1), K), -1e9, dtype=np.float64)
    St[np.arange(len(X_te_l1)), cls_t] = -dmin_t * best_tau
    test_chi1 += softmax_rowwise(St) / len(folds)
    ll = log_loss(y_idx[val_idx], clip_norm(oof_chi1[val_idx]), labels=list(range(K)))
    print(f'[Chi2-1NN fold {i}/{len(folds)}] val logloss={ll:.6f}', flush=True)
oof_ll_chi1 = log_loss(y_idx, clip_norm(oof_chi1), labels=list(range(K)))
print(f'Chi2-1NN OOF logloss: {oof_ll_chi1:.6f}')
np.save('oof_chi2_1nn.npy', oof_chi1)
np.save('test_chi2_1nn.npy', test_chi1)

# 2) Naive Bayes on block L1 histograms
def run_nb(model_name='mnb', alphas=(0.05, 0.1, 0.2, 0.5, 1.0), scale=1000.0):
    best = (None, 1e9); best_oof=None; best_test=None
    Xs = X_l1 * scale; Xts = X_te_l1 * scale
    for a in alphas:
        oof = np.zeros((len(Xs), K), dtype=np.float64)
        test_pred = np.zeros((len(Xts), K), dtype=np.float64)
        for i, (trn_idx, val_idx) in enumerate(folds, 1):
            trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
            if model_name == 'mnb':
                clf = MultinomialNB(alpha=a)
            else:
                clf = ComplementNB(alpha=a)
            clf.fit(Xs[trn_idx], y_idx[trn_idx])
            P_va = clf.predict_proba(Xs[val_idx])
            oof[val_idx] = P_va
            test_pred += clf.predict_proba(Xts) / len(folds)
        ll = log_loss(y_idx, clip_norm(oof), labels=list(range(K)))
        print(f'NB({model_name}) OOF: {ll:.6f} | alpha={a}')
        if ll < best[1]:
            best = (a, ll); best_oof=oof; best_test=test_pred
    print(f'Best NB({model_name}):', best)
    return best, best_oof, best_test

best_mnb, oof_mnb, test_mnb = run_nb('mnb', alphas=(0.05,0.1,0.2,0.5,1.0), scale=1000.0)
np.save('oof_mnb.npy', oof_mnb); np.save('test_mnb.npy', test_mnb)
best_cnb, oof_cnb, test_cnb = run_nb('cnb', alphas=(0.05,0.1,0.2,0.5,1.0), scale=1000.0)
np.save('oof_cnb.npy', oof_cnb); np.save('test_cnb.npy', test_cnb)
print('Saved NB predictions (MNB/CNB) and Chi2-1NN for blending.')

[Chi2-1NN tau search] tau=10.0, fold1 val logloss=1.159019
[Chi2-1NN tau search] tau=20.0, fold1 val logloss=1.159019
[Chi2-1NN tau search] tau=30.0, fold1 val logloss=1.159019
[Chi2-1NN tau search] tau=50.0, fold1 val logloss=1.159019
[Chi2-1NN tau search] tau=80.0, fold1 val logloss=1.159019
Chi2-1NN selected tau: 10.0


[Chi2-1NN fold 1/6] val logloss=1.159019


[Chi2-1NN fold 2/6] val logloss=0.463608


[Chi2-1NN fold 3/6] val logloss=1.854431


[Chi2-1NN fold 4/6] val logloss=1.633591


[Chi2-1NN fold 5/6] val logloss=2.100331


[Chi2-1NN fold 6/6] val logloss=1.400221


Chi2-1NN OOF logloss: 1.434270
NB(mnb) OOF: 2.304240 | alpha=0.05
NB(mnb) OOF: 2.305690 | alpha=0.1
NB(mnb) OOF: 2.309454 | alpha=0.2


NB(mnb) OOF: 2.337020 | alpha=0.5
NB(mnb) OOF: 2.417183 | alpha=1.0
Best NB(mnb): (0.05, 2.3042400378627694)
NB(cnb) OOF: 4.261315 | alpha=0.05
NB(cnb) OOF: 4.261317 | alpha=0.1


NB(cnb) OOF: 4.261323 | alpha=0.2
NB(cnb) OOF: 4.261337 | alpha=0.5
NB(cnb) OOF: 4.261355 | alpha=1.0
Best NB(cnb): (0.05, 4.261314818396908)
Saved NB predictions (MNB/CNB) and Chi2-1NN for blending.


In [34]:
# LR(no-PCA) ensemble over Cs and seeds -> average probs; save for blending
import numpy as np, json, time
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

SEED = 2025
X = train[feature_cols].values.astype(np.float64)
X_test = test[feature_cols].values.astype(np.float64)
y_idx = y_enc.astype(int)
K = len(le.classes_)
with open('folds_6.json', 'r') as f:
    folds = json.load(f)

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

Cs = [20.0, 30.0, 40.0, 60.0, 80.0, 100.0]
seeds = [2023, 2024, 2025, 2026]
oof_list = []
test_list = []
t_all = time.time()
grid_total = len(Cs)*len(seeds)
g = 0
for C in Cs:
    for sd in seeds:
        g += 1
        oof = np.zeros((len(X), K), dtype=np.float64)
        test_pred = np.zeros((len(X_test), K), dtype=np.float64)
        t0 = time.time()
        for i, (trn_idx, val_idx) in enumerate(folds, 1):
            trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
            sc = StandardScaler()
            X_tr = sc.fit_transform(X[trn_idx])
            X_va = sc.transform(X[val_idx])
            X_te = sc.transform(X_test)
            clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2', C=C, max_iter=5000, random_state=sd)
            clf.fit(X_tr, y_idx[trn_idx])
            P_va = clf.predict_proba(X_va)
            oof[val_idx] = P_va
            test_pred += clf.predict_proba(X_te) / len(folds)
        ll = log_loss(y_idx, clip_norm(oof), labels=list(range(K)))
        print(f'[LR ens {g}/{grid_total}] C={C}, seed={sd}, OOF={ll:.6f} in {time.time()-t0:.2f}s', flush=True)
        oof_list.append(oof); test_list.append(test_pred)

oof_ens = np.mean(np.stack(oof_list, axis=0), axis=0)
test_ens = np.mean(np.stack(test_list, axis=0), axis=0)
oof_ll = log_loss(y_idx, clip_norm(oof_ens), labels=list(range(K)))
print(f'LR ensemble OOF: {oof_ll:.6f} | built from {len(oof_list)} models in {time.time()-t_all:.2f}s')
np.save('oof_lr_ens.npy', oof_ens)
np.save('test_lr_ens.npy', test_ens)
print('Saved oof_lr_ens.npy and test_lr_ens.npy for blending')













[LR ens 1/24] C=20.0, seed=2023, OOF=0.048727 in 2.59s














[LR ens 2/24] C=20.0, seed=2024, OOF=0.048727 in 2.15s














[LR ens 3/24] C=20.0, seed=2025, OOF=0.048727 in 2.70s














[LR ens 4/24] C=20.0, seed=2026, OOF=0.048727 in 2.21s












[LR ens 5/24] C=30.0, seed=2023, OOF=0.048535 in 1.59s












[LR ens 6/24] C=30.0, seed=2024, OOF=0.048535 in 1.76s














[LR ens 7/24] C=30.0, seed=2025, OOF=0.048535 in 2.16s














[LR ens 8/24] C=30.0, seed=2026, OOF=0.048535 in 1.96s














[LR ens 9/24] C=40.0, seed=2023, OOF=0.045635 in 2.30s














[LR ens 10/24] C=40.0, seed=2024, OOF=0.045635 in 2.42s














[LR ens 11/24] C=40.0, seed=2025, OOF=0.045635 in 1.89s














[LR ens 12/24] C=40.0, seed=2026, OOF=0.045635 in 2.54s














[LR ens 13/24] C=60.0, seed=2023, OOF=0.052977 in 2.69s














[LR ens 14/24] C=60.0, seed=2024, OOF=0.052977 in 2.74s














[LR ens 15/24] C=60.0, seed=2025, OOF=0.052977 in 2.29s














[LR ens 16/24] C=60.0, seed=2026, OOF=0.052977 in 2.29s














[LR ens 17/24] C=80.0, seed=2023, OOF=0.053087 in 2.40s














[LR ens 18/24] C=80.0, seed=2024, OOF=0.053087 in 2.32s














[LR ens 19/24] C=80.0, seed=2025, OOF=0.053087 in 2.44s














[LR ens 20/24] C=80.0, seed=2026, OOF=0.053087 in 2.26s














[LR ens 21/24] C=100.0, seed=2023, OOF=0.052654 in 2.13s














[LR ens 22/24] C=100.0, seed=2024, OOF=0.052654 in 2.16s














[LR ens 23/24] C=100.0, seed=2025, OOF=0.052654 in 2.65s














[LR ens 24/24] C=100.0, seed=2026, OOF=0.052654 in 2.29s


LR ensemble OOF: 0.047906 | built from 24 models in 55.18s
Saved oof_lr_ens.npy and test_lr_ens.npy for blending


In [36]:
# Calibrated LinearSVC on raw StandardScaled features (per expert advice)
import numpy as np, json, time
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

SEED = 2025
X = train[feature_cols].values.astype(np.float64)
X_test = test[feature_cols].values.astype(np.float64)
y_idx = y_enc.astype(int)
K = len(le.classes_)
with open('folds_6.json', 'r') as f:
    folds = json.load(f)

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

def run_linsvc_cal(Cs=(0.5, 1.0, 2.0, 4.0)):
    best = (None, 1e9); best_oof=None; best_test=None
    for C in Cs:
        oof = np.zeros((len(X), K), dtype=np.float64)
        test_pred = np.zeros((len(X_test), K), dtype=np.float64)
        t0 = time.time()
        for i, (trn_idx, val_idx) in enumerate(folds, 1):
            trn_idx = np.array(trn_idx); val_idx = np.array(val_idx)
            sc = StandardScaler()
            X_tr = sc.fit_transform(X[trn_idx])
            X_va = sc.transform(X[val_idx])
            X_te = sc.transform(X_test)
            base = LinearSVC(dual='auto', C=C, max_iter=20000, tol=1e-4, random_state=SEED)
            # sklearn >=1.6 uses 'estimator' instead of 'base_estimator'
            clf = CalibratedClassifierCV(estimator=base, method='sigmoid', cv=5)
            fstart = time.time()
            clf.fit(X_tr, y_idx[trn_idx])
            P_va = clf.predict_proba(X_va)
            oof[val_idx] = P_va
            test_pred += clf.predict_proba(X_te) / len(folds)
            print(f'[LinSVC Cal fold {i}/{len(folds)}] C={C}, time={time.time()-fstart:.2f}s', flush=True)
        ll = log_loss(y_idx, clip_norm(oof), labels=list(range(K)))
        print(f'LinSVC Cal OOF: {ll:.6f} | C={C} in {time.time()-t0:.2f}s', flush=True)
        if ll < best[1]:
            best = (C, ll); best_oof=oof; best_test=test_pred
    print('Best LinSVC Cal:', best)
    return best, best_oof, best_test

best_lsvc, oof_lsvc, test_lsvc = run_linsvc_cal(Cs=(0.5, 1.0, 2.0, 4.0))
np.save('oof_linsvc_cal.npy', oof_lsvc)
np.save('test_linsvc_cal.npy', test_lsvc)
print('Saved oof_linsvc_cal.npy and test_linsvc_cal.npy')

[LinSVC Cal fold 1/6] C=0.5, time=21.44s


[LinSVC Cal fold 2/6] C=0.5, time=22.07s


[LinSVC Cal fold 3/6] C=0.5, time=21.50s


[LinSVC Cal fold 4/6] C=0.5, time=20.84s


[LinSVC Cal fold 5/6] C=0.5, time=21.06s


[LinSVC Cal fold 6/6] C=0.5, time=21.43s


LinSVC Cal OOF: 1.140908 | C=0.5 in 128.36s


[LinSVC Cal fold 1/6] C=1.0, time=48.78s


[LinSVC Cal fold 2/6] C=1.0, time=49.86s


[LinSVC Cal fold 3/6] C=1.0, time=50.63s


In [37]:
# SLSQP blend v4: USE lr_ens (replace lr_base) + concat_lr + chi2_lr + optional blockL1 -> temp scale -> soft submission
import numpy as np, pandas as pd, time, sys, subprocess
from sklearn.metrics import log_loss

def ensure_scipy():
    try:
        import scipy  # noqa: F401
        from scipy.optimize import minimize  # noqa: F401
        return True
    except Exception:
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'scipy'], check=True)
        return True

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

ensure_scipy()
from scipy.optimize import minimize

# Load candidates (replace lr_base with lr_ens)
cands = []
names = []
y_idx = y_enc.astype(int)
num_classes = len(le.classes_)

oof_lr = np.load('oof_lr_ens.npy'); test_lr = np.load('test_lr_ens.npy')
cands.append((oof_lr, test_lr)); names.append('lr_ens')
try:
    oof_concat = np.load('oof_concat_lr.npy'); test_concat = np.load('test_concat_lr.npy')
    cands.append((oof_concat, test_concat)); names.append('concat_lr')
except Exception as e:
    print('Missing concat_lr, continuing without it:', e)
try:
    oof_c2 = np.load('oof_chi2_lr.npy'); test_c2 = np.load('test_chi2_lr.npy')
    cands.append((oof_c2, test_c2)); names.append('chi2_lr')
except Exception as e:
    print('Missing chi2_lr, continuing without it:', e)
try:
    oof_blk = np.load('oof_block_hell_lr.npy'); test_blk = np.load('test_block_hell_lr.npy')
    cands.append((oof_blk, test_blk)); names.append('blockL1_lr')
except Exception as e:
    print('Missing blockL1_lr, continuing without it:', e)

assert len(cands) >= 2, 'Need at least two models to blend'
oofs = [o for o,_ in cands]
tests = [t for _,t in cands]

def blend_from_w(weights, mats):
    P = np.zeros_like(mats[0])
    for wi, Pi in zip(weights, mats):
        P += wi * Pi
    return clip_norm(P)

def objective(w):
    P = blend_from_w(w, oofs)
    return log_loss(y_idx, P, labels=list(range(num_classes)))

m = len(cands)
w0 = np.ones(m, dtype=np.float64) / m
bounds = [(0.0, 1.0)] * m
cons = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1.0},)
t0 = time.time()
res = minimize(objective, w0, method='SLSQP', bounds=bounds, constraints=cons, options={'maxiter': 800, 'ftol': 1e-10, 'disp': False})
w_opt = res.x if res.success else w0
w_opt = np.maximum(w_opt, 0); w_opt = w_opt / w_opt.sum()
oof_blend = blend_from_w(w_opt, oofs)
test_blend = blend_from_w(w_opt, tests)
oof_ll = log_loss(y_idx, oof_blend, labels=list(range(num_classes)))
print('SLSQP success:', res.success, '| OOF(pre-cal):', oof_ll, '| Weights:', dict(zip(names, np.round(w_opt, 4))), '| time {:.2f}s'.format(time.time()-t0))

# Single global temperature scaling on blended probabilities
def temp_scale(P, alpha):
    Ps = np.power(np.clip(P, 1e-15, 1-1e-15), float(alpha))
    return Ps / Ps.sum(axis=1, keepdims=True)

alphas = np.linspace(0.6, 2.4, 181)
best = (1.0, 1e9)
for a in alphas:
    ll = log_loss(y_idx, temp_scale(oof_blend, a), labels=list(range(num_classes)))
    if ll < best[1]:
        best = (float(a), float(ll))
print(f'Temp scaling on blend: alpha={best[0]:.4f}, OOF={best[1]:.6f}')
alpha_opt = best[0]
test_cal = temp_scale(test_blend, alpha_opt)

# Write soft submission only (per production guidance)
pred_df = pd.DataFrame(test_cal, columns=list(le.classes_))
sub_cols = [c for c in sample_sub.columns if c != id_col]
pred_df = pred_df[sub_cols]
submission = pd.concat([test[[id_col]].reset_index(drop=True), pred_df.reset_index(drop=True)], axis=1)
submission.to_csv('submission.csv', index=False)
submission.to_csv('submission_lrENS_soft_blend_calibrated.csv', index=False)
print('Saved submission.csv and submission_lrENS_soft_blend_calibrated.csv:', submission.shape)

SLSQP success: True | OOF(pre-cal): 0.04546574498885025 | Weights: {'lr_ens': 0.5194, 'concat_lr': 0.2301, 'chi2_lr': 0.2342, 'blockL1_lr': 0.0163} | time 0.34s


Temp scaling on blend: alpha=1.8500, OOF=0.031526
Saved submission.csv and submission_lrENS_soft_blend_calibrated.csv: (99, 100)
