In [10]:
# Immediate diagnostics for Hellinger pipeline BEFORE any model retraining
import numpy as np, pandas as pd, json, time, sys
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.metrics import log_loss
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors

np.set_printoptions(linewidth=140, suppress=True)
SEED = 2025
id_col = 'id'; target_col = 'species'

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
with open('ordered_cols.json', 'r') as f:
    ordered_cols = json.load(f)

# Assert column equality and order
assert list(train[ordered_cols].columns) == ordered_cols, 'Train columns do not match ordered_cols'
assert list(test[ordered_cols].columns) == ordered_cols, 'Test columns do not match ordered_cols'

feature_cols = ordered_cols
X_df = train[feature_cols].copy()
X_test_df = test[feature_cols].copy()
X = X_df.values.astype(np.float64, copy=True)
X_test = X_test_df.values.astype(np.float64, copy=True)
le = LabelEncoder()
y = le.fit_transform(train[target_col].values)
K = len(le.classes_)

# Build block indices by prefix
def idxs_for_prefix(cols, pref):
    return np.array([i for i,c in enumerate(cols) if c.startswith(pref)], dtype=np.int64)

blk_idx = {
    'margin': idxs_for_prefix(feature_cols, 'margin'),
    'shape': idxs_for_prefix(feature_cols, 'shape'),
    'texture': idxs_for_prefix(feature_cols, 'texture'),
}
print('Block lengths:', {k: len(v) for k,v in blk_idx.items()})
assert all(len(blk_idx[b])==64 for b in ['margin','shape','texture']), 'Expected 64 dims per block'

# Raw data checks
print(f'Raw X: min={X.min():.6f}, max={X.max():.6f}, <0: {(X<0).sum()}, =0: {(X==0).sum()}')
for b in ['margin','shape','texture']:
    Xi = X[:, blk_idx[b]]
    print(f'{b}: min={Xi.min():.6f}, max={Xi.max():.6f}, mean={Xi.mean():.6f}')

# Hellinger per-block builder with pre-L1/block-sum diagnostics
def hellinger_per_block(X_in, blk_idx, eps=0.0, do_gl2=False, verbose=True):
    Xh_parts = []
    for b in ['margin','shape','texture']:
        cols = blk_idx[b]
        Xi = X_in[:, cols].copy()
        pre_clip_neg = int((Xi < 0).sum())
        if eps > 0:
            Xi += eps
        np.maximum(Xi, 0.0, out=Xi)
        s_raw = Xi.sum(axis=1, keepdims=True)
        if verbose:
            print(f'Block {b}: pre-clip negatives={pre_clip_neg}, sum=0 rows: {int((s_raw[:,0]==0).sum())}, sum<0.1: {int((s_raw[:,0]<0.1).sum())}')
        # L1
        s = s_raw.copy()
        s[s==0] = 1.0
        Xi = Xi / s
        # sqrt
        np.sqrt(Xi, out=Xi)
        # each block now has L2 norm = 1 (by construction) if original s>0
        Xh_parts.append(Xi)
    Xh = np.concatenate(Xh_parts, axis=1)
    if do_gl2:
        # Global L2 over concatenated vector
        norms = np.linalg.norm(Xh, axis=1, keepdims=True)
        norms[norms==0] = 1.0
        Xh = Xh / norms
    return Xh

# Build GL2 features for KNN parity sanity
Xh_gl2 = hellinger_per_block(X, blk_idx, eps=0.0, do_gl2=True, verbose=True)

# KNN parity sanity: cosine vs euclidean on unit-norm data should tie
def knn_oof_logloss(Xmat, y, metric='cosine', n_neighbors=5, weights='distance', folds_path='folds_6.json'):
    with open(folds_path, 'r') as f:
        folds = [(np.array(a, dtype=np.int64), np.array(b, dtype=np.int64)) for a,b in json.load(f)]
    oof = np.zeros((len(Xmat), K), dtype=np.float64)
    for i, (trn_idx, val_idx) in enumerate(folds, 1):
        clf = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric, weights=weights, algorithm='brute')
        t0 = time.time()
        clf.fit(Xmat[trn_idx], y[trn_idx])
        P = clf.predict_proba(Xmat[val_idx])
        oof[val_idx] = P
        print(f'[KNN {metric} fold {i}/{len(folds)}] time={time.time()-t0:.2f}s', flush=True)
    oof = np.clip(oof, 1e-15, 1-1e-15); oof /= oof.sum(axis=1, keepdims=True)
    return log_loss(y, oof, labels=list(range(K)))

print('Running KNN parity sanity on GL2 features (should tie):')
ll_cos = knn_oof_logloss(Xh_gl2, y, metric='cosine', n_neighbors=5, weights='distance')
ll_euc = knn_oof_logloss(Xh_gl2, y, metric='euclidean', n_neighbors=5, weights='distance')
print(f'KNN OOF logloss | cosine={ll_cos:.6f} | euclidean={ll_euc:.6f}')

# Self-neighbor sanity on 200-row subset (cosine), expect high same-class rate
n_check = min(200, Xh_gl2.shape[0])
rng = np.random.default_rng(SEED)
idx_sub = rng.choice(Xh_gl2.shape[0], size=n_check, replace=False)
nn = NearestNeighbors(n_neighbors=2, metric='cosine', algorithm='brute')
nn.fit(Xh_gl2)
dists, inds = nn.kneighbors(Xh_gl2[idx_sub], n_neighbors=2, return_distance=True)
nbrs = inds[:,1]  # exclude self at position 0
match = (y[idx_sub] == y[nbrs]).mean()
print(f'Self-neighbor sanity (cosine, GL2): match rate over {n_check} subs = {match:.4f} (random ~0.0101)')

print('Diagnostics complete.')

Block lengths: {'margin': 64, 'shape': 64, 'texture': 64}
Raw X: min=0.000000, max=0.853520, <0: 0, =0: 35108
margin: min=0.000000, max=0.388670, mean=0.015625
shape: min=0.000022, max=0.003007, mean=0.000607
texture: min=0.000000, max=0.853520, mean=0.015625
Block margin: pre-clip negatives=0, sum=0 rows: 0, sum<0.1: 0
Block shape: pre-clip negatives=0, sum=0 rows: 0, sum<0.1: 886
Block texture: pre-clip negatives=0, sum=0 rows: 0, sum<0.1: 0
Running KNN parity sanity on GL2 features (should tie):
[KNN cosine fold 1/6] time=0.01s


[KNN cosine fold 2/6] time=0.01s


[KNN cosine fold 3/6] time=0.01s


[KNN cosine fold 4/6] time=0.01s


[KNN cosine fold 5/6] time=0.01s


[KNN cosine fold 6/6] time=0.01s


[KNN euclidean fold 1/6] time=0.11s


[KNN euclidean fold 2/6] time=0.00s


[KNN euclidean fold 3/6] time=0.01s


[KNN euclidean fold 4/6] time=0.00s


[KNN euclidean fold 5/6] time=0.01s


[KNN euclidean fold 6/6] time=0.01s


KNN OOF logloss | cosine=0.624010 | euclidean=0.641695
Self-neighbor sanity (cosine, GL2): match rate over 200 subs = 0.9700 (random ~0.0101)
Diagnostics complete.


In [11]:
# Variant A (FIXED): no global L2, per-fold StandardScaler(mean+std), LR lbfgs multinomial, intercept=True
import time, json
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

def clip_norm(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)

def save_probs_and_logits(prefix: str, oof: np.ndarray, test_pred: np.ndarray):
    np.save(f'oof_{prefix}.npy', oof.astype(np.float32))
    np.save(f'test_{prefix}.npy', test_pred.astype(np.float32))
    oof_log = np.log(np.clip(oof, 1e-15, 1.0))
    test_log = np.log(np.clip(test_pred, 1e-15, 1.0))
    np.save(f'oof_{prefix}_logits.npy', oof_log.astype(np.float32))
    np.save(f'test_{prefix}_logits.npy', test_log.astype(np.float32))
    print(f'Saved oof_{prefix}.npy, test_{prefix}.npy and *_logits.npy', flush=True)

# Build Hellinger features WITHOUT global L2
Xh_nol2 = hellinger_per_block(X, blk_idx, eps=0.0, do_gl2=False, verbose=False)
Xh_te_nol2 = hellinger_per_block(X_test, blk_idx, eps=0.0, do_gl2=False, verbose=False)
print('Hellinger(no GL2) shapes:', Xh_nol2.shape, Xh_te_nol2.shape, flush=True)

# Load fixed 6-folds
with open('folds_6.json', 'r') as f:
    folds = [(np.array(a, dtype=np.int64), np.array(b, dtype=np.int64)) for a,b in json.load(f)]
print('Folds loaded:', len(folds), flush=True)

def run_variant_A_fixed(Cs=(20, 30, 50, 80, 120, 200, 300, 500, 1000, 2000, 5000, 10000, 20000), seed=2025):
    best = (None, 1e9); best_oof=None; best_test=None
    for C in Cs:
        t0 = time.time()
        oof = np.zeros((len(Xh_nol2), K), dtype=np.float64)
        test_pred = np.zeros((len(Xh_te_nol2), K), dtype=np.float64)
        for i, (trn_idx, val_idx) in enumerate(folds, 1):
            sc = StandardScaler(with_mean=True, with_std=True)
            X_tr = sc.fit_transform(Xh_nol2[trn_idx])
            X_va = sc.transform(Xh_nol2[val_idx])
            X_te = sc.transform(Xh_te_nol2)
            clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2', fit_intercept=True, C=C, max_iter=10000, tol=1e-8, random_state=seed)
            fstart = time.time()
            clf.fit(X_tr, y[trn_idx])
            oof[val_idx] = clf.predict_proba(X_va)
            test_pred += clf.predict_proba(X_te) / len(folds)
            print(f'[VarA-noGL2 fold {i}/{len(folds)}] C={C} time={time.time()-fstart:.2f}s', flush=True)
        ll = log_loss(y, clip_norm(oof), labels=list(range(K)))
        print(f'--> VarA-noGL2 OOF={ll:.6f} | C={C} | time {time.time()-t0:.2f}s', flush=True)
        if ll < best[1]:
            best = (C, ll); best_oof=oof; best_test=test_pred
    print('Best VarA-noGL2:', best, flush=True)
    if best_oof is not None:
        save_probs_and_logits('hell_varA_noGL2', best_oof, best_test)
    return best, best_oof, best_test

print('Variant A (fixed) ready. To run: bestA, oofA, testA = run_variant_A_fixed()')

Hellinger(no GL2) shapes: (891, 192) (99, 192)


Folds loaded: 6


Variant A (fixed) ready. To run: bestA, oofA, testA = run_variant_A_fixed()


In [12]:
# Execute Variant A (fixed) grid search and save outputs
bestA, oofA, testA = run_variant_A_fixed()
print('BestA:', bestA)



[VarA-noGL2 fold 1/6] C=20 time=3.78s




[VarA-noGL2 fold 2/6] C=20 time=3.64s




[VarA-noGL2 fold 3/6] C=20 time=4.62s




[VarA-noGL2 fold 4/6] C=20 time=4.88s




[VarA-noGL2 fold 5/6] C=20 time=4.25s




[VarA-noGL2 fold 6/6] C=20 time=3.39s


--> VarA-noGL2 OOF=0.069888 | C=20 | time 24.62s




[VarA-noGL2 fold 1/6] C=30 time=3.24s




[VarA-noGL2 fold 2/6] C=30 time=2.86s




[VarA-noGL2 fold 3/6] C=30 time=5.06s




[VarA-noGL2 fold 4/6] C=30 time=4.11s




[VarA-noGL2 fold 5/6] C=30 time=4.68s




[VarA-noGL2 fold 6/6] C=30 time=4.45s


--> VarA-noGL2 OOF=0.066339 | C=30 | time 24.46s




[VarA-noGL2 fold 1/6] C=50 time=3.48s




[VarA-noGL2 fold 2/6] C=50 time=3.50s




[VarA-noGL2 fold 3/6] C=50 time=3.94s




[VarA-noGL2 fold 4/6] C=50 time=5.13s




[VarA-noGL2 fold 5/6] C=50 time=4.64s




[VarA-noGL2 fold 6/6] C=50 time=4.97s


--> VarA-noGL2 OOF=0.062823 | C=50 | time 25.73s




[VarA-noGL2 fold 1/6] C=80 time=4.18s




[VarA-noGL2 fold 2/6] C=80 time=3.74s




[VarA-noGL2 fold 3/6] C=80 time=4.40s




[VarA-noGL2 fold 4/6] C=80 time=4.20s




[VarA-noGL2 fold 5/6] C=80 time=4.58s




[VarA-noGL2 fold 6/6] C=80 time=3.96s


--> VarA-noGL2 OOF=0.060320 | C=80 | time 25.12s




[VarA-noGL2 fold 1/6] C=120 time=4.14s




[VarA-noGL2 fold 2/6] C=120 time=4.19s




[VarA-noGL2 fold 3/6] C=120 time=4.49s




[VarA-noGL2 fold 4/6] C=120 time=4.63s




[VarA-noGL2 fold 5/6] C=120 time=3.34s




[VarA-noGL2 fold 6/6] C=120 time=4.12s


--> VarA-noGL2 OOF=0.058641 | C=120 | time 24.99s




[VarA-noGL2 fold 1/6] C=200 time=3.69s




[VarA-noGL2 fold 2/6] C=200 time=4.46s




[VarA-noGL2 fold 3/6] C=200 time=4.51s




[VarA-noGL2 fold 4/6] C=200 time=4.23s




[VarA-noGL2 fold 5/6] C=200 time=4.27s




[VarA-noGL2 fold 6/6] C=200 time=3.67s


--> VarA-noGL2 OOF=0.056990 | C=200 | time 24.89s




[VarA-noGL2 fold 1/6] C=300 time=4.10s




[VarA-noGL2 fold 2/6] C=300 time=3.31s




[VarA-noGL2 fold 3/6] C=300 time=3.81s




[VarA-noGL2 fold 4/6] C=300 time=3.87s




[VarA-noGL2 fold 5/6] C=300 time=3.01s




[VarA-noGL2 fold 6/6] C=300 time=2.43s


--> VarA-noGL2 OOF=0.055990 | C=300 | time 20.60s




[VarA-noGL2 fold 1/6] C=500 time=3.39s




[VarA-noGL2 fold 2/6] C=500 time=3.85s




[VarA-noGL2 fold 3/6] C=500 time=3.52s




[VarA-noGL2 fold 4/6] C=500 time=3.77s




[VarA-noGL2 fold 5/6] C=500 time=3.68s




[VarA-noGL2 fold 6/6] C=500 time=2.86s


--> VarA-noGL2 OOF=0.055161 | C=500 | time 21.13s




[VarA-noGL2 fold 1/6] C=1000 time=2.44s




[VarA-noGL2 fold 2/6] C=1000 time=3.18s




[VarA-noGL2 fold 3/6] C=1000 time=3.16s




[VarA-noGL2 fold 4/6] C=1000 time=3.57s




[VarA-noGL2 fold 5/6] C=1000 time=2.67s




[VarA-noGL2 fold 6/6] C=1000 time=2.47s


--> VarA-noGL2 OOF=0.054402 | C=1000 | time 17.55s




[VarA-noGL2 fold 1/6] C=2000 time=2.67s




[VarA-noGL2 fold 2/6] C=2000 time=2.79s




[VarA-noGL2 fold 3/6] C=2000 time=2.60s




[VarA-noGL2 fold 4/6] C=2000 time=2.04s




[VarA-noGL2 fold 5/6] C=2000 time=3.02s




[VarA-noGL2 fold 6/6] C=2000 time=2.84s


--> VarA-noGL2 OOF=0.053803 | C=2000 | time 16.01s




[VarA-noGL2 fold 1/6] C=5000 time=1.83s




[VarA-noGL2 fold 2/6] C=5000 time=2.18s




[VarA-noGL2 fold 3/6] C=5000 time=2.70s




[VarA-noGL2 fold 4/6] C=5000 time=1.96s




[VarA-noGL2 fold 5/6] C=5000 time=2.20s




[VarA-noGL2 fold 6/6] C=5000 time=2.57s


--> VarA-noGL2 OOF=0.053459 | C=5000 | time 13.52s




[VarA-noGL2 fold 1/6] C=10000 time=1.88s




[VarA-noGL2 fold 2/6] C=10000 time=2.60s




[VarA-noGL2 fold 3/6] C=10000 time=1.90s




[VarA-noGL2 fold 4/6] C=10000 time=1.84s




[VarA-noGL2 fold 5/6] C=10000 time=1.91s




[VarA-noGL2 fold 6/6] C=10000 time=1.86s


--> VarA-noGL2 OOF=0.053533 | C=10000 | time 12.07s




[VarA-noGL2 fold 1/6] C=20000 time=1.53s




[VarA-noGL2 fold 2/6] C=20000 time=1.55s




[VarA-noGL2 fold 3/6] C=20000 time=1.49s




[VarA-noGL2 fold 4/6] C=20000 time=1.62s




[VarA-noGL2 fold 5/6] C=20000 time=1.85s




[VarA-noGL2 fold 6/6] C=20000 time=1.51s


--> VarA-noGL2 OOF=0.054035 | C=20000 | time 9.61s


Best VarA-noGL2: (5000, 0.053458870299620306)


Saved oof_hell_varA_noGL2.npy, test_hell_varA_noGL2.npy and *_logits.npy


BestA: (5000, 0.053458870299620306)


# Canonical Hellinger Logistic Regression (Variants A & B)

Plan:
- Data: use train.csv/test.csv with float64; fixed 6-folds from folds_6.json; LabelEncoder target.
- Deterministic blocks: ['margin', 'shape', 'texture']; within each, lexicographically sort 64 cols; total 192; persist order for train/test.
- Shared per-block transform (applied inside each fold fit):
  1) clip to >=0
  2) L1 normalize per sample within block with eps=1e-12
  3) elementwise sqrt
- Variant A (global L2): concat blocks -> global L2 row norm with eps=1e-12; LR(mn, lbfgs, l2, fit_intercept=False, max_iter=5000, tol=1e-8, C in {5,10,20,30,50,80,120,200}).
- Variant B (center-only): concat blocks; NO global L2; fold-wise StandardScaler(mean-only) -> LR(fit_intercept=True, same solver, C in {50,100,200,300,500,800,1200}).
- Stability: no constants injected; eps only in denominators; assert sums/norms; verify class order.
- Outputs: save OOF/test probs and logits:
  * oof_hell_canon_gl2.npy / test_hell_canon_gl2.npy
  * oof_hell_center_only.npy / test_hell_center_only.npy
  * logits saved via np.log(np.clip(P,1e-15,1.0)) for blender.
- Expected single-model OOF ~0.035–0.065; adds 0.002–0.006 in blend.

Next cells:
1) Imports, IO, folds, label encoding, column ordering utils.
2) Hellinger block transformer (pure NumPy, no in-place on views).
3) CV trainers for Variant A and B with logging and assertions.
4) Execution: run grids, save artifacts.

In [7]:
# Setup: imports, IO, folds, label encoding, deterministic block ordering (NATURAL NUMERIC ORDER)
import numpy as np, pandas as pd, json, time, os, sys
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

SEED = 2025
np.random.seed(SEED)

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
id_col = 'id'; target_col = 'species'

# Label encode target
le = LabelEncoder()
y = le.fit_transform(train[target_col].values)
K = len(le.classes_)
print(f'Classes: {K}', flush=True)

# Build deterministic block order with NATURAL numeric order within each block
def natural(cols, prefix):
    keep = [c for c in cols if c.startswith(prefix)]
    def keyfunc(c):
        s = c[len(prefix):]
        num = ''.join(ch for ch in s if ch.isdigit())
        return int(num) if num.isdigit() else 0
    return sorted(keep, key=keyfunc)

feature_cols_all = [c for c in train.columns if c not in [id_col, target_col]]
ordered_cols = natural(feature_cols_all, 'margin') + natural(feature_cols_all, 'shape') + natural(feature_cols_all, 'texture')
assert len(ordered_cols) == 192, f'Expected 192 features, got {len(ordered_cols)}'

# Sanity: 64 per block; print first/last 5 per block
margin_cols = natural(feature_cols_all, 'margin')
shape_cols = natural(feature_cols_all, 'shape')
texture_cols = natural(feature_cols_all, 'texture')
print('Blocks sizes:', len(margin_cols), len(shape_cols), len(texture_cols), flush=True)
print('margin head/tail:', margin_cols[:5], margin_cols[-5:], flush=True)
print('shape head/tail:', shape_cols[:5], shape_cols[-5:], flush=True)
print('texture head/tail:', texture_cols[:5], texture_cols[-5:], flush=True)

# Persist ordered columns for reuse (overwrite old if existed)
with open('ordered_cols.json', 'w') as f:
    json.dump(ordered_cols, f)

# Assemble matrices in float64
X = train[ordered_cols].values.astype(np.float64, copy=True)
X_test = test[ordered_cols].values.astype(np.float64, copy=True)
print('Shapes:', X.shape, X_test.shape, flush=True)

# Load fixed 6-folds
with open('folds_6.json', 'r') as f:
    folds = [(np.array(a, dtype=np.int64), np.array(b, dtype=np.int64)) for a, b in json.load(f)]
print('Loaded folds:', len(folds), flush=True)

Classes: 99


Blocks sizes: 64 64 64


margin head/tail: ['margin1', 'margin2', 'margin3', 'margin4', 'margin5'] ['margin60', 'margin61', 'margin62', 'margin63', 'margin64']


shape head/tail: ['shape1', 'shape2', 'shape3', 'shape4', 'shape5'] ['shape60', 'shape61', 'shape62', 'shape63', 'shape64']


texture head/tail: ['texture1', 'texture2', 'texture3', 'texture4', 'texture5'] ['texture60', 'texture61', 'texture62', 'texture63', 'texture64']


Shapes: (891, 192) (99, 192)


Loaded folds: 6


In [8]:
# Hellinger transforms and CV trainers for Variants A and B
from typing import List, Tuple, Dict

def get_block_indices(ordered_cols: List[str]) -> Dict[str, np.ndarray]:
    blocks = {'margin': [], 'shape': [], 'texture': []}
    for i, c in enumerate(ordered_cols):
        for b in blocks.keys():
            if c.startswith(b):
                blocks[b].append(i)
                break
    for b in blocks:
        if len(blocks[b]) == 0:
            raise ValueError(f'No columns for block {b}')
    # Ensure lexicographic order preserved (already sorted earlier)
    return {b: np.array(idx, dtype=np.int64) for b, idx in blocks.items()}

blk_idx = get_block_indices(ordered_cols)
print({k: len(v) for k,v in blk_idx.items()}, flush=True)

def hellinger_per_block(X_in: np.ndarray, blk_idx: Dict[str, np.ndarray], eps: float = 1e-12) -> np.ndarray:
    # Returns a new array; no in-place on views
    X = X_in.astype(np.float64, copy=True)
    out_blocks = []
    for b in ['margin', 'shape', 'texture']:
        cols = blk_idx[b]
        Xi = X[:, cols].copy()
        # clip negatives to zero
        np.maximum(Xi, 0.0, out=Xi)
        # L1 normalize per row
        s = Xi.sum(axis=1, keepdims=True)
        Xi = Xi / np.maximum(s, eps)
        # sqrt
        Xi = np.sqrt(Xi, dtype=np.float64)
        # Optional assert on sums ~1 before sqrt (not exact after sqrt)
        ok = np.all((s.squeeze() >= 0.0))
        if not ok:
            raise ValueError('Negative sums encountered pre-normalization')
        out_blocks.append(Xi)
    return np.concatenate(out_blocks, axis=1)

def row_l2_normalize(X: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    X = X.astype(np.float64, copy=True)
    n = np.sqrt((X * X).sum(axis=1, keepdims=True))
    X = X / np.maximum(n, eps)
    return X

def clip_norm(P: np.ndarray) -> np.ndarray:
    P = np.clip(P, 1e-15, 1 - 1e-15)
    P = P / P.sum(axis=1, keepdims=True)
    return P

def save_probs_and_logits(prefix: str, oof: np.ndarray, test_pred: np.ndarray):
    np.save(f'oof_{prefix}.npy', oof.astype(np.float32))
    np.save(f'test_{prefix}.npy', test_pred.astype(np.float32))
    oof_log = np.log(np.clip(oof, 1e-15, 1.0))
    test_log = np.log(np.clip(test_pred, 1e-15, 1.0))
    np.save(f'oof_{prefix}_logits.npy', oof_log.astype(np.float32))
    np.save(f'test_{prefix}_logits.npy', test_log.astype(np.float32))
    print(f'Saved oof_{prefix}.npy, test_{prefix}.npy and *_logits.npy', flush=True)

def run_variant_A(Cs=(5,10,20,30,50,80,120,200)) -> Tuple[Tuple[float, float], np.ndarray, np.ndarray]:
    print('Running Variant A (global L2, fit_intercept=False)', flush=True)
    best = (None, 1e9)
    best_oof = None
    best_test = None
    X_h = hellinger_per_block(X, blk_idx)
    Xh_test = hellinger_per_block(X_test, blk_idx)
    # Assertions on per-block L1 prior to sqrt are implicit; do global L2 now
    X_h_gl2 = row_l2_normalize(X_h)
    Xh_test_gl2 = row_l2_normalize(Xh_test)
    # Check norms ~1
    n_tr = np.sqrt((X_h_gl2 * X_h_gl2).sum(axis=1))
    n_te = np.sqrt((Xh_test_gl2 * Xh_test_gl2).sum(axis=1))
    assert np.all(np.isfinite(n_tr)) and np.all(np.isfinite(n_te))
    for C in Cs:
        t0 = time.time()
        oof = np.zeros((len(X_h_gl2), K), dtype=np.float64)
        test_pred = np.zeros((len(Xh_test_gl2), K), dtype=np.float64)
        for i, (trn_idx, val_idx) in enumerate(folds, 1):
            trn_idx = np.array(trn_idx, dtype=np.int64); val_idx = np.array(val_idx, dtype=np.int64)
            X_tr = X_h_gl2[trn_idx]
            X_va = X_h_gl2[val_idx]
            X_te = Xh_test_gl2
            clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2', fit_intercept=False, C=C, max_iter=5000, tol=1e-8, n_jobs=None, random_state=SEED)
            fstart = time.time()
            clf.fit(X_tr, y[trn_idx])
            assert np.array_equal(clf.classes_, np.arange(K))
            P_va = clf.predict_proba(X_va)
            oof[val_idx] = P_va
            test_pred += clf.predict_proba(X_te) / len(folds)
            print(f'[VarA fold {i}/{len(folds)}] C={C} time={time.time()-fstart:.2f}s', flush=True)
        ll = log_loss(y, clip_norm(oof), labels=list(range(K)))
        print(f'Variant A OOF: {ll:.6f} (C={C}) in {time.time()-t0:.2f}s', flush=True)
        if ll < best[1]:
            best = (C, ll); best_oof = oof; best_test = test_pred
    print('Best Variant A:', best, flush=True)
    if best_oof is not None:
        save_probs_and_logits('hell_canon_gl2', best_oof, best_test)
    return best, best_oof, best_test

def run_variant_B(Cs=(50,100,200,300,500,800,1200)) -> Tuple[Tuple[float, float], np.ndarray, np.ndarray]:
    print('Running Variant B (center-only, fit_intercept=True)', flush=True)
    best = (None, 1e9)
    best_oof = None
    best_test = None
    X_h = hellinger_per_block(X, blk_idx)
    Xh_test = hellinger_per_block(X_test, blk_idx)
    for C in Cs:
        t0 = time.time()
        oof = np.zeros((len(X_h), K), dtype=np.float64)
        test_pred_accum = np.zeros((len(Xh_test), K), dtype=np.float64)
        for i, (trn_idx, val_idx) in enumerate(folds, 1):
            trn_idx = np.array(trn_idx, dtype=np.int64); val_idx = np.array(val_idx, dtype=np.int64)
            X_tr = X_h[trn_idx]
            X_va = X_h[val_idx]
            X_te = Xh_test.copy()
            sc = StandardScaler(with_mean=True, with_std=False)
            X_tr_c = sc.fit_transform(X_tr)
            X_va_c = sc.transform(X_va)
            X_te_c = sc.transform(X_te)
            clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2', fit_intercept=True, C=C, max_iter=5000, tol=1e-8, n_jobs=None, random_state=SEED)
            fstart = time.time()
            clf.fit(X_tr_c, y[trn_idx])
            assert np.array_equal(clf.classes_, np.arange(K))
            P_va = clf.predict_proba(X_va_c)
            oof[val_idx] = P_va
            test_pred_accum += clf.predict_proba(X_te_c) / len(folds)
            print(f'[VarB fold {i}/{len(folds)}] C={C} time={time.time()-fstart:.2f}s', flush=True)
        ll = log_loss(y, clip_norm(oof), labels=list(range(K)))
        print(f'Variant B OOF: {ll:.6f} (C={C}) in {time.time()-t0:.2f}s', flush=True)
        if ll < best[1]:
            best = (C, ll); best_oof = oof; best_test = test_pred_accum
    print('Best Variant B:', best, flush=True)
    if best_oof is not None:
        save_probs_and_logits('hell_center_only', best_oof, best_test)
    return best, best_oof, best_test

print('Functions ready. Next: execute Variant A and B.', flush=True)

{'margin': 64, 'shape': 64, 'texture': 64}


Functions ready. Next: execute Variant A and B.


In [3]:
# Execute Variant A (global L2) grid
t0 = time.time()
bestA, oofA, testA = run_variant_A(Cs=(5,10,20,30,50,80,120,200))
print('Variant A done in {:.2f}s | Best:'.format(time.time()-t0), bestA, flush=True)

Running Variant A (global L2, fit_intercept=False)




[VarA fold 1/6] C=5 time=0.46s




[VarA fold 2/6] C=5 time=0.59s




[VarA fold 3/6] C=5 time=0.65s




[VarA fold 4/6] C=5 time=0.68s




[VarA fold 5/6] C=5 time=0.54s




[VarA fold 6/6] C=5 time=0.64s


Variant A OOF: 1.761470 (C=5) in 3.59s




[VarA fold 1/6] C=10 time=0.83s




[VarA fold 2/6] C=10 time=0.70s


[VarA fold 3/6] C=10 time=0.79s




[VarA fold 4/6] C=10 time=0.76s




[VarA fold 5/6] C=10 time=0.96s




[VarA fold 6/6] C=10 time=0.87s


Variant A OOF: 1.170059 (C=10) in 4.93s




[VarA fold 1/6] C=20 time=1.01s




[VarA fold 2/6] C=20 time=0.85s




[VarA fold 3/6] C=20 time=0.94s




[VarA fold 4/6] C=20 time=0.96s




[VarA fold 5/6] C=20 time=1.02s




[VarA fold 6/6] C=20 time=1.10s


Variant A OOF: 0.776551 (C=20) in 5.90s




[VarA fold 1/6] C=30 time=0.97s




[VarA fold 2/6] C=30 time=1.11s




[VarA fold 3/6] C=30 time=1.21s




[VarA fold 4/6] C=30 time=1.24s




[VarA fold 5/6] C=30 time=1.22s




[VarA fold 6/6] C=30 time=1.16s


Variant A OOF: 0.617304 (C=30) in 6.94s




[VarA fold 1/6] C=50 time=0.95s




[VarA fold 2/6] C=50 time=0.93s




[VarA fold 3/6] C=50 time=0.82s




[VarA fold 4/6] C=50 time=0.84s




[VarA fold 5/6] C=50 time=1.22s




[VarA fold 6/6] C=50 time=1.27s


Variant A OOF: 0.468983 (C=50) in 6.05s




[VarA fold 1/6] C=80 time=1.71s




[VarA fold 2/6] C=80 time=1.50s




[VarA fold 3/6] C=80 time=1.11s




[VarA fold 4/6] C=80 time=0.91s




[VarA fold 5/6] C=80 time=1.16s




[VarA fold 6/6] C=80 time=1.22s


Variant A OOF: 0.369650 (C=80) in 7.63s




[VarA fold 1/6] C=120 time=1.16s




[VarA fold 2/6] C=120 time=1.09s




[VarA fold 3/6] C=120 time=1.05s




In [4]:
# Diagnostic: Variant A with fit_intercept=True at C=80 + sanity checks
from numpy.linalg import norm
C_try = 80
print('Building Hellinger features and global L2...')
X_h = hellinger_per_block(X, blk_idx)
Xh_test = hellinger_per_block(X_test, blk_idx)

# Sanity diagnostics on block norms pre-global L2
for b in ['margin','shape','texture']:
    idx = blk_idx[b]
    l2s = norm(X_h[:, idx], axis=1)
    print(f'Block {b}: L2 min/med/max = {l2s.min():.6f}/{np.median(l2s):.6f}/{l2s.max():.6f}')
tot_l2 = norm(X_h, axis=1)
print('Concat pre-gl2 row L2 min/med/max:', tot_l2.min(), np.median(tot_l2), tot_l2.max())

X_h_gl2 = row_l2_normalize(X_h)
Xh_test_gl2 = row_l2_normalize(Xh_test)
row_l2 = norm(X_h_gl2, axis=1)
print('Post-gl2 row L2 min/med/max:', row_l2.min(), np.median(row_l2), row_l2.max())
assert np.all(np.isfinite(X_h_gl2)) and np.all(np.isfinite(Xh_test_gl2))

# Single trial: fit_intercept=True with C=80
oof = np.zeros((len(X_h_gl2), K), dtype=np.float64)
test_pred = np.zeros((len(Xh_test_gl2), K), dtype=np.float64)
t0 = time.time()
for i, (trn_idx, val_idx) in enumerate(folds, 1):
    trn_idx = np.array(trn_idx, dtype=np.int64); val_idx = np.array(val_idx, dtype=np.int64)
    X_tr = X_h_gl2[trn_idx]
    X_va = X_h_gl2[val_idx]
    clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2', fit_intercept=True, C=C_try, max_iter=5000, tol=1e-8, random_state=SEED)
    fstart = time.time()
    clf.fit(X_tr, y[trn_idx])
    assert np.array_equal(clf.classes_, np.arange(K))
    P_va = clf.predict_proba(X_va)
    oof[val_idx] = P_va
    test_pred += clf.predict_proba(Xh_test_gl2) / len(folds)
    print(f'[Diag VarA fold {i}/{len(folds)}] C={C_try} time={time.time()-fstart:.2f}s')
ll = log_loss(y, clip_norm(oof), labels=list(range(K)))
print(f'DIAG Variant A (fit_intercept=True) OOF: {ll:.6f} at C={C_try} in {time.time()-t0:.2f}s')

Building Hellinger features and global L2...
Block margin: L2 min/med/max = 1.000000/1.000000/1.000000
Block shape: L2 min/med/max = 1.000000/1.000000/1.000000
Block texture: L2 min/med/max = 1.000000/1.000000/1.000000
Concat pre-gl2 row L2 min/med/max: 1.732050807568877 1.7320508075688772 1.7320508075688776
Post-gl2 row L2 min/med/max: 0.9999999999999998 1.0 1.0000000000000002




[Diag VarA fold 1/6] C=80 time=2.02s




[Diag VarA fold 2/6] C=80 time=1.62s




[Diag VarA fold 4/6] C=80 time=2.63s




[Diag VarA fold 5/6] C=80 time=2.77s




[Diag VarA fold 6/6] C=80 time=3.20s
DIAG Variant A (fit_intercept=True) OOF: 0.367674 at C=80 in 14.14s


In [5]:
# Execute Variant B (center-only) with extended C grid
Cs_ext = (50,100,200,300,500,800,1200,2000,4000)
t0 = time.time()
bestB, oofB, testB = run_variant_B(Cs=Cs_ext)
print('Variant B done in {:.2f}s | Best:'.format(time.time()-t0), bestB, flush=True)

Running Variant B (center-only, fit_intercept=True)




[VarB fold 1/6] C=50 time=1.42s




[VarB fold 2/6] C=50 time=1.41s




[VarB fold 3/6] C=50 time=1.69s






[VarB fold 5/6] C=50 time=1.13s




[VarB fold 6/6] C=50 time=1.18s


Variant B OOF: 0.273386 (C=50) in 8.19s




[VarB fold 1/6] C=100 time=1.06s




[VarB fold 2/6] C=100 time=0.93s




[VarB fold 3/6] C=100 time=0.93s




[VarB fold 4/6] C=100 time=1.34s




[VarB fold 5/6] C=100 time=1.90s




[VarB fold 6/6] C=100 time=2.00s


Variant B OOF: 0.202988 (C=100) in 8.21s




[VarB fold 1/6] C=200 time=2.11s




[VarB fold 2/6] C=200 time=2.18s




[VarB fold 3/6] C=200 time=1.82s




[VarB fold 4/6] C=200 time=1.70s




[VarB fold 5/6] C=200 time=1.67s




[VarB fold 6/6] C=200 time=1.67s


Variant B OOF: 0.155447 (C=200) in 11.20s




[VarB fold 1/6] C=300 time=2.01s




[VarB fold 2/6] C=300 time=1.94s




[VarB fold 3/6] C=300 time=1.96s




[VarB fold 4/6] C=300 time=1.79s




[VarB fold 5/6] C=300 time=2.09s




[VarB fold 6/6] C=300 time=1.77s


Variant B OOF: 0.134960 (C=300) in 11.61s




[VarB fold 1/6] C=500 time=1.71s




[VarB fold 2/6] C=500 time=1.49s




[VarB fold 3/6] C=500 time=1.39s




[VarB fold 4/6] C=500 time=1.56s




[VarB fold 5/6] C=500 time=1.52s




[VarB fold 6/6] C=500 time=1.44s


Variant B OOF: 0.114764 (C=500) in 9.16s




[VarB fold 1/6] C=800 time=1.41s




[VarB fold 2/6] C=800 time=1.58s




[VarB fold 3/6] C=800 time=1.18s




[VarB fold 4/6] C=800 time=1.25s




[VarB fold 5/6] C=800 time=1.97s




[VarB fold 6/6] C=800 time=1.78s


Variant B OOF: 0.100450 (C=800) in 9.21s




[VarB fold 1/6] C=1200 time=2.04s




[VarB fold 2/6] C=1200 time=2.37s




[VarB fold 3/6] C=1200 time=1.59s




[VarB fold 4/6] C=1200 time=1.58s




[VarB fold 5/6] C=1200 time=1.83s




[VarB fold 6/6] C=1200 time=2.14s


Variant B OOF: 0.090649 (C=1200) in 11.60s




[VarB fold 1/6] C=2000 time=2.00s




[VarB fold 2/6] C=2000 time=1.66s




[VarB fold 3/6] C=2000 time=1.37s




[VarB fold 4/6] C=2000 time=1.57s




[VarB fold 5/6] C=2000 time=1.59s




[VarB fold 6/6] C=2000 time=1.56s


Variant B OOF: 0.080929 (C=2000) in 9.80s




[VarB fold 1/6] C=4000 time=1.86s




[VarB fold 2/6] C=4000 time=2.04s




[VarB fold 3/6] C=4000 time=2.30s




[VarB fold 4/6] C=4000 time=2.24s




[VarB fold 5/6] C=4000 time=1.85s




[VarB fold 6/6] C=4000 time=1.63s


Variant B OOF: 0.071293 (C=4000) in 11.98s


Best Variant B: (4000, 0.07129287335935905)


Saved oof_hell_center_only.npy, test_hell_center_only.npy and *_logits.npy


Variant B done in 91.01s | Best: (4000, 0.07129287335935905)


In [9]:
# Additional model: KNN (cosine) on Hellinger GL2 features
from sklearn.neighbors import KNeighborsClassifier

def run_knn_cosine_on_gl2(n_neighbors_list=(1, 3, 5), weights='distance'):
    print('Running KNN (cosine) on Hellinger GL2 features', flush=True)
    X_h = hellinger_per_block(X, blk_idx)
    Xh_test = hellinger_per_block(X_test, blk_idx)
    X_h_gl2 = row_l2_normalize(X_h)
    Xh_test_gl2 = row_l2_normalize(Xh_test)
    best = (None, 1e9); best_oof=None; best_test=None
    for k in n_neighbors_list:
        t0 = time.time()
        oof = np.zeros((len(X_h_gl2), K), dtype=np.float64)
        test_pred = np.zeros((len(Xh_test_gl2), K), dtype=np.float64)
        for i, (trn_idx, val_idx) in enumerate(folds, 1):
            trn_idx = np.array(trn_idx, dtype=np.int64); val_idx = np.array(val_idx, dtype=np.int64)
            X_tr = X_h_gl2[trn_idx]; y_tr = y[trn_idx]
            X_va = X_h_gl2[val_idx]
            clf = KNeighborsClassifier(n_neighbors=k, weights=weights, metric='cosine', algorithm='brute', n_jobs=-1)
            fstart = time.time()
            clf.fit(X_tr, y_tr)
            P_va = clf.predict_proba(X_va)
            oof[val_idx] = P_va
            test_pred += clf.predict_proba(Xh_test_gl2) / len(folds)
            print(f'  [KNN fold {i}/{len(folds)}] k={k} time={time.time()-fstart:.2f}s', flush=True)
        ll = log_loss(y, clip_norm(oof), labels=list(range(K)))
        print(f'--> KNN(cos) OOF={ll:.6f} | k={k} | time {time.time()-t0:.2f}s', flush=True)
        if ll < best[1]:
            best = (k, ll); best_oof=oof; best_test=test_pred
    print('Best KNN(cos) on GL2:', best, flush=True)
    if best_oof is not None:
        np.save('oof_hell_knn_cos_gl2.npy', best_oof.astype(np.float32))
        np.save('test_hell_knn_cos_gl2.npy', best_test.astype(np.float32))
        np.save('oof_hell_knn_cos_gl2_logits.npy', np.log(np.clip(best_oof, 1e-15, 1.0)).astype(np.float32))
        np.save('test_hell_knn_cos_gl2_logits.npy', np.log(np.clip(best_test, 1e-15, 1.0)).astype(np.float32))
        print('Saved KNN cosine GL2 preds/logits')
    return best, best_oof, best_test

# Execute quick KNN sweep
best_knn, oof_knn, test_knn = run_knn_cosine_on_gl2(n_neighbors_list=(1,3,5))
print('KNN done. Best:', best_knn)

Running KNN (cosine) on Hellinger GL2 features


  [KNN fold 1/6] k=1 time=0.26s


  [KNN fold 2/6] k=1 time=0.18s


  [KNN fold 3/6] k=1 time=0.18s


  [KNN fold 4/6] k=1 time=0.15s


  [KNN fold 5/6] k=1 time=0.15s


  [KNN fold 6/6] k=1 time=0.14s


--> KNN(cos) OOF=1.356742 | k=1 | time 1.09s


  [KNN fold 1/6] k=3 time=0.15s


  [KNN fold 2/6] k=3 time=0.15s


  [KNN fold 3/6] k=3 time=0.14s


  [KNN fold 4/6] k=3 time=0.14s


  [KNN fold 5/6] k=3 time=0.14s


  [KNN fold 6/6] k=3 time=0.14s


--> KNN(cos) OOF=0.700731 | k=3 | time 0.89s


  [KNN fold 1/6] k=5 time=0.14s


  [KNN fold 2/6] k=5 time=0.15s


  [KNN fold 3/6] k=5 time=0.15s


  [KNN fold 4/6] k=5 time=0.15s


  [KNN fold 5/6] k=5 time=0.15s


  [KNN fold 6/6] k=5 time=0.15s


--> KNN(cos) OOF=0.624010 | k=5 | time 0.91s


Best KNN(cos) on GL2: (5, 0.6240103177853032)


Saved KNN cosine GL2 preds/logits
KNN done. Best: (5, 0.6240103177853032)


In [None]:
# Re-run Variant A after natural-order fix with extended C grid
t0 = time.time()
Cs_ext_A = (20, 30, 50, 80, 120, 200, 300, 500, 1000, 2000, 5000)
bestA_fix, oofA_fix, testA_fix = run_variant_A(Cs=Cs_ext_A)
print('Variant A re-run done in {:.2f}s | Best:'.format(time.time()-t0), bestA_fix, flush=True)