# Leaf Classification - Plan

Objective: Build a high-performing classifier for 99 leaf species using pre-extracted tabular features (shape, margin, texture) from train.csv/test.csv. Optimize for multi-class log-loss. Deliver submission.csv.

Workflow:
- Data loading & sanity checks: read train/test, inspect shapes, missing values, feature names, target distribution.
- Label encoding of the target: LabelEncode species; keep the class-to-column mapping for the submission.
- Cross-validation: StratifiedKFold (10 folds, shuffle, fixed seed). Log fold progress and time.
- Baseline models (fast, strong for this task):
  1) Multinomial Logistic Regression (saga, class weights off, C tuned) with StandardScaler.
  2) Linear Discriminant Analysis (LDA) with shrinkage='auto' after StandardScaler.
  3) LightGBM multiclass (num_leaves, learning_rate tuned lightly).
  4) SVC (linear or rbf) with probability=True (limited C grid due to cost).
  5) QDA (with regularization) as a candidate if features suit it.
- Feature processing: Try PowerTransformer(Yeo-Johnson) or StandardScaler. Consider PCA with retained variance (e.g., 0.99) as an option and evaluate via CV.
- Ensembling: Average OOF and test probabilities from top 2-3 models based on CV log-loss. Optionally simple logistic blending on OOF if time.
- Evaluation: Multi-class log-loss via sklearn. Track per-fold and mean. Early stop poor configs.
- Submission: Average test probs per class; create columns matching sorted unique species labels from train; save to submission.csv.

Milestones for Expert Review:
- After initial data load + EDA summary.
- After first baseline CV results (LR/LDA).
- Before running heavier models (SVC/LGBM tuning).
- Before final ensembling and submission.

Time management:
- Start with LR + LDA with scaling; expect strong baseline quickly.
- Add LightGBM with modest tuning if needed.
- Only then consider SVC/PCA if CV indicates gains.

Next step: Implement data loading, preprocessing setup, and basic EDA logs.

In [None]:
# Setup: imports, data loading, CV utilities, and fast baselines (LDA, Logistic Regression)
import os, time, sys, gc, math, random
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

def load_data():
    """Read ``train.csv`` / ``test.csv`` and return model-ready arrays.

    The ``id`` column is removed from the features (test ids are kept for the
    submission), ``species`` is label-encoded, and missing feature values, if
    any, are filled with 0 after printing a warning.

    Returns:
        Tuple ``(X, y_enc, X_test, classes, test_ids, le)`` where ``X`` and
        ``X_test`` are the feature values as NumPy arrays, ``y_enc`` is the
        integer-encoded target, ``classes`` is ``le.classes_``, ``test_ids``
        holds the test ``id`` values and ``le`` is the fitted LabelEncoder.

    Raises:
        ValueError: If ``train.csv`` lacks ``species``/``id``, ``test.csv``
            lacks ``id``, or the train and test feature columns differ.
    """
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    print('Train shape:', train.shape, ' Test shape:', test.shape)
    # Basic checks: explicit raises so validation survives `python -O`,
    # which strips assert statements.
    if 'species' not in train.columns or 'id' not in train.columns:
        raise ValueError('Columns missing in train')
    if 'id' not in test.columns:
        raise ValueError('id missing in test')
    # Drop id from features; keep test ids for submission
    test_ids = test['id'].values
    X = train.drop(columns=['id', 'species'])
    y = train['species'].values
    X_test = test.drop(columns=['id'])
    # Train and test must expose the same features in the same order
    if list(X.columns) != list(X_test.columns):
        raise ValueError('Train/Test feature mismatch')
    # Encode target
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    classes = le.classes_  # alphabetical order
    print('Num classes:', len(classes))
    # Missing values check
    if X.isnull().any().any() or X_test.isnull().any().any():
        print('Warning: Missing values detected; filling with 0')
        X = X.fillna(0)
        X_test = X_test.fillna(0)
    return X.values, y_enc, X_test.values, classes, test_ids, le

def make_skf(n_splits=10, seed=SEED):
    """Return a shuffled StratifiedKFold with ``n_splits`` folds seeded by ``seed``."""
    splitter = StratifiedKFold(
        n_splits=n_splits,
        shuffle=True,
        random_state=seed,
    )
    return splitter

def train_oof(model, X, y, skf, X_test, n_classes, desc='model'):
    """Cross-validate ``model`` and collect out-of-fold and test probabilities.

    A fresh clone of ``model`` is fitted on each training split, so the
    estimator passed in is never fitted itself. Per-fold and overall log-loss
    are printed as the run progresses.

    Args:
        model: Unfitted estimator (or Pipeline) supporting ``fit`` and
            ``predict_proba``; it must be cloneable with ``sklearn.base.clone``.
        X: Training features, indexed with the integer arrays yielded by
            ``skf.split``.
        y: Training labels aligned with ``X``; log-loss is computed against
            the label set ``range(n_classes)``.
        skf: Splitter exposing ``split(X, y)`` and ``get_n_splits()``.
        X_test: Test features scored by every fold model.
        n_classes: Number of probability columns in the outputs.
        desc: Tag prepended to every log line.

    Returns:
        Tuple ``(oof, test_pred, oof_loss, fold_losses)``: the out-of-fold
        probability matrix, the fold-averaged test probabilities, the log-loss
        over all out-of-fold rows, and the list of per-fold log-losses.
    """
    # Local import: `clone` is not among the module-level imports.
    from sklearn.base import clone

    n_samples = X.shape[0]
    n_splits = skf.get_n_splits()
    labels = list(range(n_classes))
    oof = np.zeros((n_samples, n_classes), dtype=np.float64)
    test_pred = np.zeros((X_test.shape[0], n_classes), dtype=np.float64)
    fold_losses = []
    start_all = time.time()
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        t0 = time.time()
        X_tr, y_tr = X[tr_idx], y[tr_idx]
        X_va, y_va = X[va_idx], y[va_idx]
        # Clone so no fitted state carries over between folds
        clf = clone(model)
        clf.fit(X_tr, y_tr)
        va_proba = clf.predict_proba(X_va)
        loss = log_loss(y_va, va_proba, labels=labels)
        oof[va_idx] = va_proba
        fold_losses.append(loss)
        # Each fold contributes an equal share to the test prediction
        test_pred += clf.predict_proba(X_test) / n_splits
        elapsed = time.time() - t0
        print(f'[{desc}] Fold {fold}/{n_splits} logloss={loss:.6f} time={elapsed:.1f}s', flush=True)
    total_elapsed = time.time() - start_all
    oof_loss = log_loss(y, oof, labels=labels)
    print(f'[{desc}] OOF logloss={oof_loss:.6f} | mean_folds={np.mean(fold_losses):.6f} | time_total={total_elapsed/60:.1f}m')
    return oof, test_pred, oof_loss, fold_losses

# Load data
# The names bound here (X, y, X_test, classes, test_ids, le, n_classes) are
# notebook-level globals that later cells read directly.
X, y, X_test, classes, test_ids, le = load_data()
n_classes = len(classes)
skf = make_skf(n_splits=10, seed=SEED)

# Baseline 1: LDA with StandardScaler and shrinkage='auto' (solver='lsqr')
lda_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler(with_mean=True, with_std=True)),
    ('clf', LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto'))
])
lda_oof, lda_test, lda_oof_loss, _ = train_oof(lda_pipeline, X, y, skf, X_test, n_classes, desc='LDA')

# Baseline 2: Multinomial Logistic Regression (saga) with StandardScaler
lr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler(with_mean=True, with_std=True)),
    ('clf', LogisticRegression(multi_class='multinomial', solver='saga', C=1.0, penalty='l2', max_iter=5000, n_jobs=-1, random_state=SEED))
])
lr_oof, lr_test, lr_oof_loss, _ = train_oof(lr_pipeline, X, y, skf, X_test, n_classes, desc='LogReg')

# Simple ensemble of current baselines (equal weight). Will be superseded later by SVC/QDA.
# Both models were evaluated on the same `skf` splits, so their OOF rows align.
ens_oof = (lda_oof + lr_oof) / 2.0
ens_test = (lda_test + lr_test) / 2.0
ens_oof_loss = log_loss(y, ens_oof, labels=list(range(n_classes)))
print(f'[Ensemble LDA+LR] OOF logloss={ens_oof_loss:.6f}')

# Build a provisional submission from the ensemble
# Columns are the class names from the label encoder, with `id` inserted first.
# Note: this cell only builds the frame; it does not write it to disk.
sub = pd.DataFrame(ens_test, columns=classes)
sub.insert(0, 'id', test_ids)
print('Submission shape:', sub.shape)
sub.head()

In [None]:
# Re-run with 5-fold CV and multiple LDA variants; build provisional submission
from sklearn.preprocessing import PowerTransformer

def clip_and_renorm(probs, eps=1e-15):
    """Clip probabilities into ``[eps, 1 - eps]`` and rescale each row to sum to 1.

    ``probs`` is expected to be 2-D (rows are samples). The input is not
    modified; a new array is returned.
    """
    bounded = np.clip(probs, eps, 1 - eps)
    row_totals = bounded.sum(axis=1, keepdims=True)
    # Divide in place on the clipped copy, leaving the caller's array untouched
    np.divide(bounded, row_totals, out=bounded)
    return bounded

# Compare three LDA configurations on a shared 5-fold split, then write the
# best one (by OOF log-loss) to submission.csv.
skf5 = make_skf(n_splits=5, seed=SEED)

# LDA Variant A: raw features, lsqr+shrinkage
lda_raw = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
lda_raw_oof, lda_raw_test, lda_raw_loss, _ = train_oof(lda_raw, X, y, skf5, X_test, n_classes, desc='LDA_raw')

# LDA Variant B: PowerTransformer -> LDA(lsqr, shrinkage='auto')
lda_pt = Pipeline(steps=[
    ('pt', PowerTransformer(method='yeo-johnson', standardize=True)),
    ('clf', LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto'))
])
lda_pt_oof, lda_pt_test, lda_pt_loss, _ = train_oof(lda_pt, X, y, skf5, X_test, n_classes, desc='LDA_PT')

# LDA Variant C: StandardScaler -> LDA(eigen, shrinkage='auto')
lda_eigen_std = Pipeline(steps=[
    ('scaler', StandardScaler(with_mean=True, with_std=True)),
    ('clf', LinearDiscriminantAnalysis(solver='eigen', shrinkage='auto'))
])
lda_eig_oof, lda_eig_test, lda_eig_loss, _ = train_oof(lda_eigen_std, X, y, skf5, X_test, n_classes, desc='LDA_eigen_std')

# Map variant name -> OOF log-loss; the lowest loss wins.
losses = {
    'LDA_raw': lda_raw_loss,
    'LDA_PT': lda_pt_loss,
    'LDA_eigen_std': lda_eig_loss
}
print('LDA variant OOF losses:', losses)
best_name = min(losses, key=losses.get)
print('Best LDA variant:', best_name, 'OOF=', losses[best_name])

# Pick the test predictions of the winning variant and make each row a valid
# probability distribution (no exact 0/1, rows sum to 1).
best_test = {'LDA_raw': lda_raw_test, 'LDA_PT': lda_pt_test, 'LDA_eigen_std': lda_eig_test}[best_name]
best_test = clip_and_renorm(best_test)

# Build and save submission
sub = pd.DataFrame(best_test, columns=classes)
sub.insert(0, 'id', test_ids)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv with shape:', sub.shape)
sub.head()

In [3]:
# 5-fold: RBF-SVC (isotonic-calibrated), QDA (PT), and LR; ensemble best
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

def train_oof_model(model, desc):
    """Run ``train_oof`` for ``model`` on the notebook's shared 5-fold setup.

    Uses the global ``X``, ``y``, ``skf5``, ``X_test`` and ``n_classes`` and
    returns whatever ``train_oof`` returns.
    """
    result = train_oof(
        model,
        X,
        y,
        skf5,
        X_test,
        n_classes,
        desc=desc,
    )
    return result

# Recompute LR on 5-fold for consistent ensembling
# (all models averaged later in this cell are evaluated through train_oof_model,
# i.e. on the same skf5 splits, so their OOF rows are comparable).
lr5 = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(multi_class='multinomial', solver='saga', C=1.0, penalty='l2', max_iter=5000, n_jobs=-1, random_state=SEED))
])
lr5_oof, lr5_test, lr5_loss, _ = train_oof_model(lr5, 'LogReg_5fold')

# RBF-SVC: small grid, isotonic calibration; scaled features
# Each grid entry is evaluated with train_oof_model; the configuration with the
# lowest OOF log-loss is kept together with its OOF and test predictions.
svc_grid = [
    {'C': 4.0, 'gamma': 'scale'},
    {'C': 16.0, 'gamma': 'scale'},
    {'C': 64.0, 'gamma': 'scale'},
    {'C': 16.0, 'gamma': 0.01}
]
# (params, oof_loss) for every configuration tried; only appended to here.
svc_results = []
best_svc = None
best_svc_loss = np.inf
best_svc_oof = None
best_svc_test = None
for i, params in enumerate(svc_grid, 1):
    print(f'[SVC grid] {i}/{len(svc_grid)} params={params}', flush=True)
    base_svc = SVC(kernel='rbf', C=params['C'], gamma=params['gamma'])
    # The calibrator wraps the SVC so the pipeline exposes predict_proba,
    # which train_oof requires.
    svc_pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('cal', CalibratedClassifierCV(estimator=base_svc, method='isotonic', cv=3))
    ])
    oof, test_pred, oof_loss, _ = train_oof_model(svc_pipe, f'SVC_cal_C{params["C"]}_g{params["gamma"]}')
    svc_results.append((params, oof_loss))
    # Strict '<' keeps the earliest configuration when losses tie.
    if oof_loss < best_svc_loss:
        best_svc_loss = oof_loss
        best_svc = params
        best_svc_oof = oof
        best_svc_test = test_pred
print('Best SVC params:', best_svc, 'OOF:', best_svc_loss)

# QDA with PowerTransformer and reg_param sweep
# Same keep-the-best pattern as the SVC grid: lowest OOF log-loss wins, ties
# resolved in favour of the earlier reg_param.
qda_grid = [0.0, 0.1, 0.2, 0.5]
best_qda_r = None
best_qda_loss = np.inf
best_qda_oof = None
best_qda_test = None
for j, rp in enumerate(qda_grid, 1):
    print(f'[QDA grid] {j}/{len(qda_grid)} reg_param={rp}', flush=True)
    qda_pipe = Pipeline(steps=[
        ('pt', PowerTransformer(method='yeo-johnson', standardize=True)),
        ('clf', QuadraticDiscriminantAnalysis(reg_param=rp))
    ])
    oof, test_pred, oof_loss, _ = train_oof_model(qda_pipe, f'QDA_PT_r{rp}')
    if oof_loss < best_qda_loss:
        best_qda_loss = oof_loss
        best_qda_r = rp
        best_qda_oof = oof
        best_qda_test = test_pred
print('Best QDA reg_param:', best_qda_r, 'OOF:', best_qda_loss)

# Choose LDA from prior cell variants if helpful
# Each value is (present, oof_variable_name, test_variable_name, loss). The
# arrays are referenced by name and fetched through globals() so this cell
# still runs when the LDA-variant cell has not been executed in the current
# session; in that case `present` is False and the loss falls back to inf.
lda_candidates = {
    'LDA_raw': ('lda_raw_oof' in globals(), 'lda_raw_oof', 'lda_raw_test', lda_raw_loss if 'lda_raw_loss' in globals() else np.inf),
    'LDA_PT': ('lda_pt_oof' in globals(), 'lda_pt_oof', 'lda_pt_test', lda_pt_loss if 'lda_pt_loss' in globals() else np.inf),
    'LDA_eigen_std': ('lda_eig_oof' in globals(), 'lda_eig_oof', 'lda_eig_test', lda_eig_loss if 'lda_eig_loss' in globals() else np.inf)
}
best_lda_name = None
best_lda_loss = np.inf
best_lda_oof = None
best_lda_test = None
# Keep the available variant with the lowest OOF loss; if none is available
# best_lda_oof stays None and LDA is left out of the ensemble below.
for name, (present, oof_var, test_var, loss_val) in lda_candidates.items():
    if present and loss_val < best_lda_loss:
        best_lda_loss = loss_val
        best_lda_name = name
        best_lda_oof = globals()[oof_var]
        best_lda_test = globals()[test_var]
print('Selected LDA for ensemble:', best_lda_name, 'OOF:', best_lda_loss)

# Build ensembles (equal weights first); clip and renormalize
def ensemble_oof_test(models):
    """Equal-weight average of several models' OOF and test probabilities.

    ``models`` is a sequence whose items hold the OOF matrix at index 0 and
    the test matrix at index 1. Both averages are clipped and renormalised
    with ``clip_and_renorm`` before being returned as ``(oof, test)``.
    """
    mean_oof = np.mean([entry[0] for entry in models], axis=0)
    mean_test = np.mean([entry[1] for entry in models], axis=0)
    return clip_and_renorm(mean_oof), clip_and_renorm(mean_test)

# Collect (oof, test) pairs for the ensemble and a parallel list of
# (label, OOF loss) used only for logging.
# NOTE(review): every component that exists is averaged with equal weight,
# regardless of its OOF loss; a component with a much higher loss than the
# others is not filtered out here.
models_for_ens = []
labels_desc = []
models_for_ens.append((lr5_oof, lr5_test)); labels_desc.append(('LR5', lr5_loss))
if best_svc_oof is not None: models_for_ens.append((best_svc_oof, best_svc_test)); labels_desc.append((f'SVC{best_svc}', best_svc_loss))
if best_qda_oof is not None: models_for_ens.append((best_qda_oof, best_qda_test)); labels_desc.append((f'QDA_r{best_qda_r}', best_qda_loss))
if best_lda_oof is not None: models_for_ens.append((best_lda_oof, best_lda_test)); labels_desc.append((best_lda_name, best_lda_loss))

print('Component models and OOF:', labels_desc)
ens_oof, ens_test = ensemble_oof_test(models_for_ens)
ens_loss = log_loss(y, ens_oof, labels=list(range(n_classes)))
print(f'[Ensemble] OOF logloss={ens_loss:.6f} with {len(models_for_ens)} models')

# Save final submission
# This overwrites any submission.csv written by an earlier cell.
sub = pd.DataFrame(ens_test, columns=classes)
sub.insert(0, 'id', test_ids)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (ensemble) with shape:', sub.shape)



[LogReg_5fold] Fold 1/5 logloss=0.115621 time=16.5s




[LogReg_5fold] Fold 2/5 logloss=0.108472 time=18.3s




[LogReg_5fold] Fold 3/5 logloss=0.122760 time=18.2s




[LogReg_5fold] Fold 4/5 logloss=0.120443 time=19.0s




[LogReg_5fold] Fold 5/5 logloss=0.124398 time=19.0s


[LogReg_5fold] OOF logloss=0.118336 | mean_folds=0.118339 | total=1.5m


[SVC grid] 1/4 params={'C': 4.0, 'gamma': 'scale'}


[SVC_cal_C4.0_gscale] Fold 1/5 logloss=0.095791 time=1.8s


[SVC_cal_C4.0_gscale] Fold 2/5 logloss=0.238826 time=1.7s


[SVC_cal_C4.0_gscale] Fold 3/5 logloss=0.266782 time=1.6s


[SVC_cal_C4.0_gscale] Fold 4/5 logloss=0.088842 time=1.7s


[SVC_cal_C4.0_gscale] Fold 5/5 logloss=0.304439 time=1.7s


[SVC_cal_C4.0_gscale] OOF logloss=0.198820 | mean_folds=0.198936 | total=0.1m


[SVC grid] 2/4 params={'C': 16.0, 'gamma': 'scale'}


[SVC_cal_C16.0_gscale] Fold 1/5 logloss=0.096418 time=1.8s


[SVC_cal_C16.0_gscale] Fold 2/5 logloss=0.238770 time=1.6s


[SVC_cal_C16.0_gscale] Fold 3/5 logloss=0.266777 time=1.7s


[SVC_cal_C16.0_gscale] Fold 4/5 logloss=0.088196 time=1.7s


[SVC_cal_C16.0_gscale] Fold 5/5 logloss=0.298813 time=1.8s


[SVC_cal_C16.0_gscale] OOF logloss=0.197681 | mean_folds=0.197795 | total=0.1m


[SVC grid] 3/4 params={'C': 64.0, 'gamma': 'scale'}


[SVC_cal_C64.0_gscale] Fold 1/5 logloss=0.096418 time=1.7s


[SVC_cal_C64.0_gscale] Fold 2/5 logloss=0.238770 time=1.7s


[SVC_cal_C64.0_gscale] Fold 3/5 logloss=0.266777 time=1.9s


[SVC_cal_C64.0_gscale] Fold 4/5 logloss=0.088196 time=1.7s


[SVC_cal_C64.0_gscale] Fold 5/5 logloss=0.298813 time=1.7s


[SVC_cal_C64.0_gscale] OOF logloss=0.197681 | mean_folds=0.197795 | total=0.1m


[SVC grid] 4/4 params={'C': 16.0, 'gamma': 0.01}


[SVC_cal_C16.0_g0.01] Fold 1/5 logloss=0.153159 time=1.7s


[SVC_cal_C16.0_g0.01] Fold 2/5 logloss=0.277482 time=1.8s


[SVC_cal_C16.0_g0.01] Fold 3/5 logloss=0.114873 time=1.7s


[SVC_cal_C16.0_g0.01] Fold 4/5 logloss=0.098508 time=1.7s


[SVC_cal_C16.0_g0.01] Fold 5/5 logloss=0.323956 time=1.7s


[SVC_cal_C16.0_g0.01] OOF logloss=0.193550 | mean_folds=0.193596 | total=0.1m


Best SVC params: {'C': 16.0, 'gamma': 0.01} OOF: 0.19355032456303511
[QDA grid] 1/4 reg_param=0.0


[QDA_PT_r0.0] Fold 1/5 logloss=34.231403 time=0.4s




[QDA_PT_r0.0] Fold 2/5 logloss=34.626206 time=0.5s




[QDA_PT_r0.0] Fold 3/5 logloss=34.626206 time=0.4s




[QDA_PT_r0.0] Fold 4/5 logloss=34.423714 time=0.4s




[QDA_PT_r0.0] Fold 5/5 logloss=34.018729 time=0.4s


[QDA_PT_r0.0] OOF logloss=34.385079 | mean_folds=34.385252 | total=0.0m


[QDA grid] 2/4 reg_param=0.1




[QDA_PT_r0.1] Fold 1/5 logloss=3.345553 time=0.4s




[QDA_PT_r0.1] Fold 2/5 logloss=2.970902 time=0.5s




[QDA_PT_r0.1] Fold 3/5 logloss=3.348329 time=0.5s




[QDA_PT_r0.1] Fold 4/5 logloss=3.182071 time=0.5s




[QDA_PT_r0.1] Fold 5/5 logloss=3.071086 time=0.5s


[QDA_PT_r0.1] OOF logloss=3.183770 | mean_folds=3.183588 | total=0.0m


[QDA grid] 3/4 reg_param=0.2




[QDA_PT_r0.2] Fold 1/5 logloss=2.998231 time=0.4s




[QDA_PT_r0.2] Fold 2/5 logloss=2.811213 time=0.5s




[QDA_PT_r0.2] Fold 3/5 logloss=3.072648 time=0.5s




[QDA_PT_r0.2] Fold 4/5 logloss=2.901031 time=0.4s




[QDA_PT_r0.2] Fold 5/5 logloss=2.810997 time=0.4s


[QDA_PT_r0.2] OOF logloss=2.918913 | mean_folds=2.918824 | total=0.0m


[QDA grid] 4/4 reg_param=0.5




[QDA_PT_r0.5] Fold 1/5 logloss=2.875343 time=0.4s




[QDA_PT_r0.5] Fold 2/5 logloss=2.789656 time=0.4s




[QDA_PT_r0.5] Fold 3/5 logloss=2.985573 time=0.4s




[QDA_PT_r0.5] Fold 4/5 logloss=2.805897 time=0.4s




[QDA_PT_r0.5] Fold 5/5 logloss=2.687894 time=0.4s


[QDA_PT_r0.5] OOF logloss=2.828925 | mean_folds=2.828873 | total=0.0m


Best QDA reg_param: 0.5 OOF: 2.8289247998901272
Selected LDA for ensemble: None OOF: inf
Component models and OOF: [('LR5', 0.11833580435211666), ("SVC{'C': 16.0, 'gamma': 0.01}", 0.19355032456303511), ('QDA_r0.5', 2.8289247998901272)]
[Ensemble] OOF logloss=0.459933 with 3 models
Saved submission.csv (ensemble) with shape: (99, 100)




In [5]:
# Strong bases: expanded RBF-SVC (calibrated), Linear SVC (calibrated), kNN; build strong ensemble (skip LDA)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV

# 1) Expanded SVC grid with isotonic calibration (focus on smaller gamma)
# Full cross product of the C and gamma lists below (3 x 3 = 9 settings).
svc_grid_expanded = []
for C in [30.0, 50.0, 100.0]:
    for gamma in [0.001, 0.003, 0.01]:
        svc_grid_expanded.append({'C': C, 'gamma': gamma})
best_svc2 = None
best_svc2_loss = np.inf
best_svc2_oof = None
best_svc2_test = None
for i, params in enumerate(svc_grid_expanded, 1):
    print(f'[SVC expanded] {i}/{len(svc_grid_expanded)} params={params}', flush=True)
    base_svc = SVC(kernel='rbf', C=params['C'], gamma=params['gamma'])
    svc_pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('cal', CalibratedClassifierCV(estimator=base_svc, method='isotonic', cv=3))
    ])
    oof, test_pred, oof_loss, _ = train_oof(svc_pipe, X, y, skf5, X_test, n_classes, desc=f'SVC_cal_C{params["C"]}_g{params["gamma"]}')
    # Keep the setting with the lowest OOF log-loss (first one wins on ties).
    if oof_loss < best_svc2_loss:
        best_svc2_loss = oof_loss
        best_svc2 = params
        best_svc2_oof = oof
        best_svc2_test = test_pred
print('Best expanded SVC params:', best_svc2, 'OOF:', best_svc2_loss)

# 2) Linear SVC via calibrated SVC(kernel='linear') for probabilities
# Single fixed setting (C=0.1); no grid is searched for this model.
lin_svc = SVC(kernel='linear', C=0.1)
lin_svc_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('cal', CalibratedClassifierCV(estimator=lin_svc, method='isotonic', cv=3))
])
lin_oof, lin_test, lin_loss, _ = train_oof(lin_svc_pipe, X, y, skf5, X_test, n_classes, desc='LinSVC_cal_C0.1')

# 3) kNN strong baseline: StandardScaler -> KNN (distance, manhattan), small k grid
# Only n_neighbors varies; the metric is 'manhattan' for every entry.
knn_params = []
for k in [3,5,7,9,11]:
    knn_params.append({'n_neighbors': k, 'metric': 'manhattan'})
best_knn = None
best_knn_loss = np.inf
best_knn_oof = None
best_knn_test = None
for j, prm in enumerate(knn_params, 1):
    print(f'[kNN] {j}/{len(knn_params)} params={prm}', flush=True)
    knn_pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier(n_neighbors=prm['n_neighbors'], weights='distance', metric=prm['metric']))
    ])
    oof, test_pred, oof_loss, _ = train_oof(knn_pipe, X, y, skf5, X_test, n_classes, desc=f'kNN_k{prm["n_neighbors"]}_{prm["metric"]}')
    # Keep the k with the lowest OOF log-loss.
    if oof_loss < best_knn_loss:
        best_knn_loss = oof_loss
        best_knn = prm
        best_knn_oof = oof
        best_knn_test = test_pred
print('Best kNN params:', best_knn, 'OOF:', best_knn_loss)

# 4) Build ensemble with strong models only (LR5 from Cell 3 + best SVC RBF + LinSVC + kNN if decent)
# Each candidate is (name, OOF loss, OOF probabilities, test probabilities).
# lr5_* must already exist in the session (they are not computed in this cell).
candidates = [
    ('SVC_rbf', best_svc2_loss, best_svc2_oof, best_svc2_test),
    ('LinSVC', lin_loss, lin_oof, lin_test),
    ('kNN', best_knn_loss, best_knn_oof, best_knn_test),
    ('LR5', lr5_loss, lr5_oof, lr5_test)
]
# Drop candidates that never produced predictions or have a non-finite loss,
# then rank ascending by OOF loss.
candidates = [(n,l,o,t) for (n,l,o,t) in candidates if (o is not None) and np.isfinite(l)]
candidates.sort(key=lambda x: x[1])
print('Model leaderboard (by OOF):', [(n, round(l,6)) for n,l,_,_ in candidates])

# Use top 3 by OOF if available
top_k = 3 if len(candidates) >= 3 else len(candidates)
selected = candidates[:top_k]
print('Selected for ensemble:', [(n, round(l,6)) for n,l,_,_ in selected])
# Equal-weight average of the selected models, then clip/renormalise rows.
ens_oof = np.mean([m[2] for m in selected], axis=0)
ens_test = np.mean([m[3] for m in selected], axis=0)
ens_oof = clip_and_renorm(ens_oof)
ens_test = clip_and_renorm(ens_test)
ens_loss = log_loss(y, ens_oof, labels=list(range(n_classes)))
print(f'[Strong Ensemble] OOF logloss={ens_loss:.6f} with {top_k} models')

# Save submission
# This overwrites any submission.csv written by an earlier cell.
sub = pd.DataFrame(ens_test, columns=classes)
sub.insert(0, 'id', test_ids)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (strong ensemble) with shape:', sub.shape)

[SVC expanded] 1/9 params={'C': 30.0, 'gamma': 0.001}


[SVC_cal_C30.0_g0.001] Fold 1/5 logloss=0.073117 time=1.8s


[SVC_cal_C30.0_g0.001] Fold 2/5 logloss=0.106225 time=1.8s


[SVC_cal_C30.0_g0.001] Fold 3/5 logloss=0.260229 time=1.7s


[SVC_cal_C30.0_g0.001] Fold 4/5 logloss=0.106943 time=1.7s


[SVC_cal_C30.0_g0.001] Fold 5/5 logloss=0.308168 time=1.8s


[SVC_cal_C30.0_g0.001] OOF logloss=0.170827 | mean_folds=0.170937 | total=0.1m


[SVC expanded] 2/9 params={'C': 30.0, 'gamma': 0.003}


[SVC_cal_C30.0_g0.003] Fold 1/5 logloss=0.067919 time=1.7s


[SVC_cal_C30.0_g0.003] Fold 2/5 logloss=0.256361 time=1.7s


[SVC_cal_C30.0_g0.003] Fold 3/5 logloss=0.253711 time=1.7s


[SVC_cal_C30.0_g0.003] Fold 4/5 logloss=0.087095 time=1.8s


[SVC_cal_C30.0_g0.003] Fold 5/5 logloss=0.300317 time=1.7s


[SVC_cal_C30.0_g0.003] OOF logloss=0.192940 | mean_folds=0.193081 | total=0.1m


[SVC expanded] 3/9 params={'C': 30.0, 'gamma': 0.01}


[SVC_cal_C30.0_g0.01] Fold 1/5 logloss=0.153159 time=1.7s


[SVC_cal_C30.0_g0.01] Fold 2/5 logloss=0.277482 time=1.7s


[SVC_cal_C30.0_g0.01] Fold 3/5 logloss=0.114873 time=1.7s


[SVC_cal_C30.0_g0.01] Fold 4/5 logloss=0.098508 time=1.7s


[SVC_cal_C30.0_g0.01] Fold 5/5 logloss=0.323956 time=1.7s


[SVC_cal_C30.0_g0.01] OOF logloss=0.193550 | mean_folds=0.193596 | total=0.1m


[SVC expanded] 4/9 params={'C': 50.0, 'gamma': 0.001}


[SVC_cal_C50.0_g0.001] Fold 1/5 logloss=0.073117 time=1.7s


[SVC_cal_C50.0_g0.001] Fold 2/5 logloss=0.106225 time=1.7s


[SVC_cal_C50.0_g0.001] Fold 3/5 logloss=0.260229 time=1.8s


[SVC_cal_C50.0_g0.001] Fold 4/5 logloss=0.106943 time=1.8s


[SVC_cal_C50.0_g0.001] Fold 5/5 logloss=0.308168 time=1.7s


[SVC_cal_C50.0_g0.001] OOF logloss=0.170827 | mean_folds=0.170937 | total=0.1m


[SVC expanded] 5/9 params={'C': 50.0, 'gamma': 0.003}


[SVC_cal_C50.0_g0.003] Fold 1/5 logloss=0.067919 time=1.7s


[SVC_cal_C50.0_g0.003] Fold 2/5 logloss=0.256361 time=1.7s


[SVC_cal_C50.0_g0.003] Fold 3/5 logloss=0.253711 time=1.8s


[SVC_cal_C50.0_g0.003] Fold 4/5 logloss=0.087095 time=1.7s


[SVC_cal_C50.0_g0.003] Fold 5/5 logloss=0.300317 time=1.8s


[SVC_cal_C50.0_g0.003] OOF logloss=0.192940 | mean_folds=0.193081 | total=0.1m


[SVC expanded] 6/9 params={'C': 50.0, 'gamma': 0.01}


[SVC_cal_C50.0_g0.01] Fold 1/5 logloss=0.153159 time=1.7s


[SVC_cal_C50.0_g0.01] Fold 2/5 logloss=0.277482 time=1.8s


[SVC_cal_C50.0_g0.01] Fold 3/5 logloss=0.114873 time=1.8s


[SVC_cal_C50.0_g0.01] Fold 4/5 logloss=0.098508 time=1.8s


[SVC_cal_C50.0_g0.01] Fold 5/5 logloss=0.323956 time=1.8s


[SVC_cal_C50.0_g0.01] OOF logloss=0.193550 | mean_folds=0.193596 | total=0.1m


[SVC expanded] 7/9 params={'C': 100.0, 'gamma': 0.001}


[SVC_cal_C100.0_g0.001] Fold 1/5 logloss=0.073117 time=1.8s


[SVC_cal_C100.0_g0.001] Fold 2/5 logloss=0.106225 time=1.7s


[SVC_cal_C100.0_g0.001] Fold 3/5 logloss=0.260229 time=1.8s


[SVC_cal_C100.0_g0.001] Fold 4/5 logloss=0.106943 time=1.7s


[SVC_cal_C100.0_g0.001] Fold 5/5 logloss=0.308168 time=1.7s


[SVC_cal_C100.0_g0.001] OOF logloss=0.170827 | mean_folds=0.170937 | total=0.1m


[SVC expanded] 8/9 params={'C': 100.0, 'gamma': 0.003}


[SVC_cal_C100.0_g0.003] Fold 1/5 logloss=0.067919 time=1.7s


[SVC_cal_C100.0_g0.003] Fold 2/5 logloss=0.256361 time=1.7s


[SVC_cal_C100.0_g0.003] Fold 3/5 logloss=0.253711 time=1.7s


[SVC_cal_C100.0_g0.003] Fold 4/5 logloss=0.087095 time=1.7s


[SVC_cal_C100.0_g0.003] Fold 5/5 logloss=0.300317 time=1.7s


[SVC_cal_C100.0_g0.003] OOF logloss=0.192940 | mean_folds=0.193081 | total=0.1m


[SVC expanded] 9/9 params={'C': 100.0, 'gamma': 0.01}


[SVC_cal_C100.0_g0.01] Fold 1/5 logloss=0.153159 time=1.7s


[SVC_cal_C100.0_g0.01] Fold 2/5 logloss=0.277482 time=1.7s


[SVC_cal_C100.0_g0.01] Fold 3/5 logloss=0.114873 time=1.7s


[SVC_cal_C100.0_g0.01] Fold 4/5 logloss=0.098508 time=1.7s


[SVC_cal_C100.0_g0.01] Fold 5/5 logloss=0.323956 time=1.7s


[SVC_cal_C100.0_g0.01] OOF logloss=0.193550 | mean_folds=0.193596 | total=0.1m


Best expanded SVC params: {'C': 50.0, 'gamma': 0.001} OOF: 0.17082672625932951


[LinSVC_cal_C0.1] Fold 1/5 logloss=0.069862 time=1.7s


[LinSVC_cal_C0.1] Fold 2/5 logloss=0.067961 time=1.6s


[LinSVC_cal_C0.1] Fold 3/5 logloss=0.253501 time=1.7s


[LinSVC_cal_C0.1] Fold 4/5 logloss=0.117544 time=1.6s


[LinSVC_cal_C0.1] Fold 5/5 logloss=0.312877 time=1.6s


[LinSVC_cal_C0.1] OOF logloss=0.164243 | mean_folds=0.164349 | total=0.1m


[kNN] 1/5 params={'n_neighbors': 3, 'metric': 'manhattan'}


[kNN_k3_manhattan] Fold 1/5 logloss=0.236776 time=0.1s


[kNN_k3_manhattan] Fold 2/5 logloss=0.041485 time=0.0s


[kNN_k3_manhattan] Fold 3/5 logloss=0.231771 time=0.0s


[kNN_k3_manhattan] Fold 4/5 logloss=0.249245 time=0.0s


[kNN_k3_manhattan] Fold 5/5 logloss=0.244379 time=0.0s


[kNN_k3_manhattan] OOF logloss=0.200772 | mean_folds=0.200731 | total=0.0m


[kNN] 2/5 params={'n_neighbors': 5, 'metric': 'manhattan'}


[kNN_k5_manhattan] Fold 1/5 logloss=0.280554 time=0.0s


[kNN_k5_manhattan] Fold 2/5 logloss=0.084681 time=0.0s


[kNN_k5_manhattan] Fold 3/5 logloss=0.274205 time=0.0s


[kNN_k5_manhattan] Fold 4/5 logloss=0.288335 time=0.0s


[kNN_k5_manhattan] Fold 5/5 logloss=0.090640 time=0.0s


[kNN_k5_manhattan] OOF logloss=0.203770 | mean_folds=0.203683 | total=0.0m


[kNN] 3/5 params={'n_neighbors': 7, 'metric': 'manhattan'}


[kNN_k7_manhattan] Fold 1/5 logloss=0.157135 time=0.0s


[kNN_k7_manhattan] Fold 2/5 logloss=0.156453 time=0.0s


[kNN_k7_manhattan] Fold 3/5 logloss=0.333751 time=0.0s


[kNN_k7_manhattan] Fold 4/5 logloss=0.166546 time=0.0s


[kNN_k7_manhattan] Fold 5/5 logloss=0.153212 time=0.0s


[kNN_k7_manhattan] OOF logloss=0.193379 | mean_folds=0.193419 | total=0.0m


[kNN] 4/5 params={'n_neighbors': 9, 'metric': 'manhattan'}


[kNN_k9_manhattan] Fold 1/5 logloss=0.293210 time=0.0s


[kNN_k9_manhattan] Fold 2/5 logloss=0.289378 time=0.0s


[kNN_k9_manhattan] Fold 3/5 logloss=0.463124 time=0.0s


[kNN_k9_manhattan] Fold 4/5 logloss=0.294970 time=0.0s


[kNN_k9_manhattan] Fold 5/5 logloss=0.297520 time=0.0s


[kNN_k9_manhattan] OOF logloss=0.327602 | mean_folds=0.327641 | total=0.0m


[kNN] 5/5 params={'n_neighbors': 11, 'metric': 'manhattan'}


[kNN_k11_manhattan] Fold 1/5 logloss=0.425661 time=0.0s


[kNN_k11_manhattan] Fold 2/5 logloss=0.421483 time=0.0s


[kNN_k11_manhattan] Fold 3/5 logloss=0.597536 time=0.0s


[kNN_k11_manhattan] Fold 4/5 logloss=0.420640 time=0.0s


[kNN_k11_manhattan] Fold 5/5 logloss=0.426621 time=0.0s


[kNN_k11_manhattan] OOF logloss=0.458352 | mean_folds=0.458388 | total=0.0m


Best kNN params: {'n_neighbors': 7, 'metric': 'manhattan'} OOF: 0.19337877494945088
Model leaderboard (by OOF): [('LR5', 0.118336), ('LinSVC', 0.164243), ('SVC_rbf', 0.170827), ('kNN', 0.193379)]
Selected for ensemble: [('LR5', 0.118336), ('LinSVC', 0.164243), ('SVC_rbf', 0.170827)]
[Strong Ensemble] OOF logloss=0.092280 with 3 models
Saved submission.csv (strong ensemble) with shape: (99, 100)


In [None]:
# PCA+kNN, tuned LR, and refined SVC grid; ensemble strongest models
from sklearn.decomposition import PCA

def evaluate_pipeline(pipe, desc):
    """Evaluate ``pipe`` with ``train_oof`` on the shared 5-fold setup.

    Returns ``(oof, test_pred, oof_loss)``; the per-fold losses that
    ``train_oof`` also returns are discarded.
    """
    oof_probs, test_probs, loss_value, _fold_losses = train_oof(
        pipe,
        X,
        y,
        skf5,
        X_test,
        n_classes,
        desc=desc,
    )
    return oof_probs, test_probs, loss_value

# 1) PCA + kNN (distance) with euclidean/cosine, k grid
# NOTE: KNeighborsClassifier is not imported in this cell; it must already be
# in the session from an earlier cell's import.
best_knn2 = None
best_knn2_loss = np.inf
best_knn2_oof = None
best_knn2_test = None
for metric in ['euclidean', 'cosine']:
    for k in [3,5,7,9,11]:
        # Scale, reduce with PCA (n_components=0.995), then distance-weighted kNN.
        pipe = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=0.995, svd_solver='full', random_state=SEED)),
            ('knn', KNeighborsClassifier(n_neighbors=k, weights='distance', metric=metric))
        ])
        oof, test_pred, loss = evaluate_pipeline(pipe, f'kNN_PCA_k{k}_{metric}')
        # Keep the (k, metric) pair with the lowest OOF log-loss.
        if loss < best_knn2_loss:
            best_knn2_loss = loss
            best_knn2 = {'k': k, 'metric': metric}
            best_knn2_oof = oof
            best_knn2_test = test_pred
print('Best kNN_PCA:', best_knn2, 'OOF:', best_knn2_loss)

# 2) Tuned Logistic Regression (L2 and ElasticNet small l1_ratio)
# Sweeps C for two penalties; the best setting by OOF log-loss is kept.
best_lr = None
best_lr_loss = np.inf
best_lr_oof = None
best_lr_test = None
for penalty, params in [('l2', {'l1_ratio': None}), ('elasticnet', {'l1_ratio': 0.1})]:
    for C in [0.5, 1.0, 2.0, 3.0, 5.0, 10.0]:
        # l1_ratio is passed to LogisticRegression only when it is set
        # (the elasticnet case); for 'l2' the keyword is omitted entirely.
        clf = LogisticRegression(multi_class='multinomial', solver='saga', C=C, penalty=penalty, max_iter=5000, n_jobs=-1, random_state=SEED, **({} if params['l1_ratio'] is None else {'l1_ratio': params['l1_ratio']}))
        pipe = Pipeline(steps=[('scaler', StandardScaler()), ('clf', clf)])
        oof, test_pred, loss = evaluate_pipeline(pipe, f'LR_{penalty}_C{C}')
        if loss < best_lr_loss:
            best_lr_loss = loss
            # Record penalty, C and the l1_ratio entry of the winning setting.
            best_lr = {'penalty': penalty, 'C': C, **params}
            best_lr_oof = oof
            best_lr_test = test_pred
print('Best LR tuned:', best_lr, 'OOF:', best_lr_loss)

# 3) Refined SVC grid (RBF, isotonic-calibrated) focusing on smaller gamma
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
# Hand-picked (C, gamma) pairs rather than a full cross product.
svc_grid_refined = [
    {'C': 20.0, 'gamma': 0.001},
    {'C': 50.0, 'gamma': 0.001},
    {'C': 100.0, 'gamma': 0.001},
    {'C': 50.0, 'gamma': 0.0003},
    {'C': 100.0, 'gamma': 0.0003},
    {'C': 200.0, 'gamma': 0.0003}
]
best_svc3 = None
best_svc3_loss = np.inf
best_svc3_oof = None
best_svc3_test = None
for i, p in enumerate(svc_grid_refined, 1):
    print(f'[SVC refined] {i}/{len(svc_grid_refined)} params={p}', flush=True)
    base_svc = SVC(kernel='rbf', C=p['C'], gamma=p['gamma'])
    svc_pipe = Pipeline(steps=[('scaler', StandardScaler()), ('cal', CalibratedClassifierCV(estimator=base_svc, method='isotonic', cv=3))])
    oof, test_pred, loss = evaluate_pipeline(svc_pipe, f'SVC_cal_C{p["C"]}_g{p["gamma"]}')
    # Keep the setting with the lowest OOF log-loss (first one wins on ties).
    if loss < best_svc3_loss:
        best_svc3_loss = loss
        best_svc3 = p
        best_svc3_oof = oof
        best_svc3_test = test_pred
print('Best SVC refined:', best_svc3, 'OOF:', best_svc3_loss)

# 4) Assemble strongest models by OOF and average top-2/3
# Each candidate is (name, OOF loss, OOF probabilities, test probabilities).
cands = []
if best_lr_oof is not None: cands.append(('LR_tuned', best_lr_loss, best_lr_oof, best_lr_test))
if best_svc3_oof is not None: cands.append(('SVC_refined', best_svc3_loss, best_svc3_oof, best_svc3_test))
if best_knn2_oof is not None: cands.append(('kNN_PCA', best_knn2_loss, best_knn2_oof, best_knn2_test))
cands.sort(key=lambda x: x[1])
print('Candidates:', [(n, round(l,6)) for n,l,_,_ in cands])
# At most three candidates exist here, so this keeps all that are available.
sel = cands[:3] if len(cands) >= 3 else cands
# Equal-weight average, then clip/renormalise rows.
ens_oof = np.mean([m[2] for m in sel], axis=0)
ens_test = np.mean([m[3] for m in sel], axis=0)
ens_oof = clip_and_renorm(ens_oof)
ens_test = clip_and_renorm(ens_test)
ens_loss = log_loss(y, ens_oof, labels=list(range(n_classes)))
print(f'[PCA+kNN/LR/SVC Ensemble] OOF logloss={ens_loss:.6f} using {len(sel)} models')
# This overwrites any submission.csv written by an earlier cell.
sub = pd.DataFrame(ens_test, columns=classes)
sub.insert(0, 'id', test_ids)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (refined ensemble) with shape:', sub.shape)

In [None]:
# Group-wise scaling by feature blocks (shape/margin/texture) and re-evaluate LR/SVC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression

# Reload data as DataFrames to get column names
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# Every column except the identifier and the target is treated as a feature.
feat_cols = [c for c in train_df.columns if c not in ['id', 'species']]
X_df = train_df[feat_cols].copy()
y_df = le.transform(train_df['species'])  # use existing encoder
X_test_df = test_df[feat_cols].copy()

# Identify groups by prefix
# Matching is case-insensitive; anything without one of the three prefixes lands in other_cols.
shape_cols = [c for c in feat_cols if c.lower().startswith('shape')]
margin_cols = [c for c in feat_cols if c.lower().startswith('margin')]
texture_cols = [c for c in feat_cols if c.lower().startswith('texture')]
other_cols = [c for c in feat_cols if c not in shape_cols + margin_cols + texture_cols]
print('Groups sizes:', len(shape_cols), len(margin_cols), len(texture_cols), len(other_cols))

# One StandardScaler per non-empty feature group.
# NOTE(review): StandardScaler standardizes each column independently, so these per-group scalers
# should produce the same values as a single global StandardScaler (only the column order can
# differ) - confirm this cell is expected to change results at all.
transformers = []
if shape_cols: transformers.append(('shape_scaler', StandardScaler(), shape_cols))
if margin_cols: transformers.append(('margin_scaler', StandardScaler(), margin_cols))
if texture_cols: transformers.append(('texture_scaler', StandardScaler(), texture_cols))
if other_cols: transformers.append(('other_scaler', StandardScaler(), other_cols))
# The four groups together cover every entry of feat_cols, so remainder='drop' discards no feature here.
ct = ColumnTransformer(transformers=transformers, remainder='drop')

# Splitter used by train_oof_df below (read from module scope, not passed as an argument).
skf5 = make_skf(n_splits=5, seed=SEED)

def train_oof_df(pipe, desc):
    """Cross-validate `pipe` on the DataFrame features and return OOF/test probabilities.

    Reads the module-level `X_df`, `y_df`, `X_test_df`, `skf5` and `n_classes`.
    A fresh clone of `pipe` is fitted on each training fold; DataFrame slices are
    passed to the pipeline so column-name based transformers keep working.

    Args:
        pipe: unfitted estimator/pipeline exposing `fit` and `predict_proba`.
        desc: tag used as the prefix of every log line.

    Returns:
        Tuple `(oof, test_pred, oof_loss)`: the out-of-fold probability matrix,
        the test probabilities averaged over folds, and the log-loss of `oof`
        against `y_df`.
    """
    # Imported once per call instead of once per fold.
    from sklearn.base import clone

    n_splits = skf5.get_n_splits()
    # Passing the full label list keeps log_loss defined when a fold lacks a class.
    labels = list(range(n_classes))
    oof = np.zeros((X_df.shape[0], n_classes), dtype=np.float64)
    test_pred = np.zeros((X_test_df.shape[0], n_classes), dtype=np.float64)
    fold_losses = []
    # The splitter only needs row counts and labels, so plain arrays are handed to split().
    for fold, (tr_idx, va_idx) in enumerate(skf5.split(X_df.values, y_df), 1):
        t0 = time.time()
        X_tr, y_tr = X_df.iloc[tr_idx], y_df[tr_idx]
        X_va, y_va = X_df.iloc[va_idx], y_df[va_idx]
        model = clone(pipe)
        model.fit(X_tr, y_tr)
        proba_va = model.predict_proba(X_va)
        loss = log_loss(y_va, proba_va, labels=labels)
        oof[va_idx] = proba_va
        # Each fold model contributes an equal share of the final test prediction.
        test_pred += model.predict_proba(X_test_df) / n_splits
        fold_losses.append(loss)
        # Report the real fold count rather than a hard-coded 5.
        print(f'[{desc}] Fold {fold}/{n_splits} logloss={loss:.6f} time={time.time()-t0:.1f}s', flush=True)
    oof_loss = log_loss(y_df, oof, labels=labels)
    print(f'[{desc}] OOF logloss={oof_loss:.6f} | mean_folds={np.mean(fold_losses):.6f}')
    return oof, test_pred, oof_loss

# 1) Group-scaled Logistic Regression (tuned C small set)
# best_lr_g holds the winning C; the *_oof/*_test arrays belong to that same run.
best_lr_g = None; best_lr_g_loss = np.inf; best_lr_g_oof=None; best_lr_g_test=None
for C in [1.0, 2.0, 3.0, 5.0]:
    pipe = Pipeline([('ct', ct), ('clf', LogisticRegression(multi_class='multinomial', solver='saga', C=C, penalty='l2', max_iter=5000, n_jobs=-1, random_state=SEED))])
    oof, test_pred, loss = train_oof_df(pipe, f'LR_group_C{C}')
    if loss < best_lr_g_loss: best_lr_g_loss, best_lr_g, best_lr_g_oof, best_lr_g_test = loss, C, oof, test_pred
print('Best LR group C:', best_lr_g, 'OOF:', best_lr_g_loss)

# 2) Group-scaled SVC RBF with isotonic calibration (focused grid)
svc_params = [{'C': 10.0, 'gamma': 0.001}, {'C': 20.0, 'gamma': 0.001}, {'C': 50.0, 'gamma': 0.001}]
best_svc_g = None; best_svc_g_loss = np.inf; best_svc_g_oof=None; best_svc_g_test=None
for p in svc_params:
    # The SVC has no native probabilities here; CalibratedClassifierCV (3 folds) provides predict_proba.
    base = SVC(kernel='rbf', C=p['C'], gamma=p['gamma'])
    pipe = Pipeline([('ct', ct), ('cal', CalibratedClassifierCV(estimator=base, method='isotonic', cv=3))])
    oof, test_pred, loss = train_oof_df(pipe, f'SVC_group_C{p["C"]}_g{p["gamma"]}')
    if loss < best_svc_g_loss: best_svc_g_loss, best_svc_g, best_svc_g_oof, best_svc_g_test = loss, p, oof, test_pred
print('Best SVC group params:', best_svc_g, 'OOF:', best_svc_g_loss)

# Ensemble the group-wise models with equal weights.
# NOTE(review): nothing here compares against an earlier ensemble; whenever at least one
# candidate exists, ens_oof/ens_test/ens_loss are rebound and submission.csv is overwritten.
cands = []
if best_lr_g_oof is not None: cands.append(('LR_group', best_lr_g_loss, best_lr_g_oof, best_lr_g_test))
if best_svc_g_oof is not None: cands.append(('SVC_group', best_svc_g_loss, best_svc_g_oof, best_svc_g_test))
cands.sort(key=lambda x: x[1])
print('Group-wise candidates:', [(n, round(l,6)) for n,l,_,_ in cands])
if cands:
    ens_oof = np.mean([m[2] for m in cands], axis=0)
    ens_test = np.mean([m[3] for m in cands], axis=0)
    ens_oof = clip_and_renorm(ens_oof); ens_test = clip_and_renorm(ens_test)
    ens_loss = log_loss(y_df, ens_oof, labels=list(range(n_classes)))
    print(f'[Group Ensemble] OOF logloss={ens_loss:.6f} using {len(cands)} models')
    sub = pd.DataFrame(ens_test, columns=classes); sub.insert(0, 'id', test_ids); sub.to_csv('submission.csv', index=False)
    print('Saved submission.csv (group-wise ensemble) with shape:', sub.shape)

In [10]:
# LightGBM multiclass with 5-fold CV and early stopping; equal-weight blend with LR5 when available (always overwrites submission.csv)
import sys, subprocess, importlib, time
def ensure_pkg(pkg, pip_name=None):
    """Import and return module `pkg`, installing it with pip first if it is missing.

    Args:
        pkg: importable module name (e.g. 'lightgbm').
        pip_name: optional distribution name/requirement to hand to pip when it
            differs from the module name; defaults to `pkg`.

    Returns:
        The imported module object.

    Raises:
        subprocess.CalledProcessError: if the pip command exits with a non-zero status.
        ImportError: if the module still cannot be imported after installation.
    """
    try:
        return importlib.import_module(pkg)
    except ImportError:
        target = pkg if pip_name is None else pip_name
        print(f'Installing {target}...', flush=True)
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', target])
        # Import finders cache directory listings; refresh them so a package installed
        # after interpreter start-up is reliably found by the retry below.
        importlib.invalidate_caches()
        return importlib.import_module(pkg)

lgb = ensure_pkg('lightgbm')
from lightgbm import Dataset as lgbDataset, early_stopping as lgb_early_stopping, log_evaluation as lgb_log_evaluation

def lgbm_cv_oof(X_np, y_np, Xte_np, n_classes, skf, params=None, num_boost_round=5000, early_stopping_rounds=150, desc='LGBM'):
    """Cross-validate a LightGBM multiclass booster and collect OOF/test probabilities.

    Args:
        X_np, y_np: training features and integer-encoded labels (indexed with fold indices).
        Xte_np: test features scored by every fold model.
        n_classes: number of classes; sets the width of the probability matrices.
        skf: splitter providing `split` and `get_n_splits`.
        params: LightGBM parameter dict; a built-in multiclass default is used when None.
        num_boost_round: upper bound on boosting rounds per fold.
        early_stopping_rounds: patience passed to the early-stopping callback.
        desc: prefix for log lines.

    Returns:
        `(oof, test_pred, oof_loss, fold_losses)`.
    """
    if params is None:
        params = dict(
            objective='multiclass',
            num_class=n_classes,
            metric='multi_logloss',
            learning_rate=0.02,
            num_leaves=20,
            min_data_in_leaf=10,
            feature_fraction=0.7,
            bagging_fraction=0.8,
            bagging_freq=1,
            lambda_l2=3.0,
            seed=SEED,
            verbose=-1,
            num_threads=-1,
        )
    class_labels = list(range(n_classes))
    oof = np.zeros((X_np.shape[0], n_classes), dtype=np.float64)
    test_pred = np.zeros((Xte_np.shape[0], n_classes), dtype=np.float64)
    fold_losses = []
    t_all = time.time()
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_np, y_np), 1):
        t_fold = time.time()
        X_va, y_va = X_np[va_idx], y_np[va_idx]
        train_set = lgbDataset(X_np[tr_idx], label=y_np[tr_idx])
        valid_set = lgbDataset(X_va, label=y_va)
        # Silent early stopping plus disabled periodic evaluation logging.
        fold_callbacks = [
            lgb_early_stopping(stopping_rounds=early_stopping_rounds, verbose=False),
            lgb_log_evaluation(period=0),
        ]
        booster = lgb.train(
            params,
            train_set,
            num_boost_round=num_boost_round,
            valid_sets=[train_set, valid_set],
            valid_names=['train','valid'],
            callbacks=fold_callbacks,
        )
        best_it = booster.best_iteration
        # Score validation and test rows with the best iteration found on this fold.
        va_proba = booster.predict(X_va, num_iteration=best_it)
        loss = log_loss(y_va, va_proba, labels=class_labels)
        oof[va_idx] = va_proba
        test_pred += booster.predict(Xte_np, num_iteration=best_it) / skf.get_n_splits()
        print(f'[{desc}] Fold {fold}/{skf.get_n_splits()} best_iter={best_it} logloss={loss:.6f} time={time.time()-t_fold:.1f}s', flush=True)
        fold_losses.append(loss)
    oof_loss = log_loss(y_np, oof, labels=class_labels)
    print(f'[{desc}] OOF logloss={oof_loss:.6f} | mean_folds={np.mean(fold_losses):.6f} | total={((time.time()-t_all)/60):.1f}m', flush=True)
    return oof, test_pred, oof_loss, fold_losses

# Use 5-fold CV for LGBM (consistent with other models)
skf5 = make_skf(n_splits=5, seed=SEED)
# These values match the defaults built inside lgbm_cv_oof; they are passed explicitly here.
lgb_params = {
    'objective': 'multiclass',
    'num_class': n_classes,
    'metric': 'multi_logloss',
    'learning_rate': 0.02,
    'num_leaves': 20,
    'min_data_in_leaf': 10,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l2': 3.0,
    'seed': SEED,
    'verbose': -1,
    'num_threads': -1
}
lgb_oof, lgb_test, lgb_loss, _ = lgbm_cv_oof(X, y, X_test, n_classes, skf5, params=lgb_params, desc='LGBM5')

# Ensemble with best existing LR5 if available
# Entries are (name, oof, test, loss); LR5 is included only if a previous cell defined lr5_oof.
ens_models = []
if 'lr5_oof' in globals():
    ens_models.append(('LR5', lr5_oof, lr5_test, lr5_loss))
ens_models.append(('LGBM5', lgb_oof, lgb_test, lgb_loss))
print('Ensemble candidates:', [(n, round(l,6)) for n,_,_,l in ens_models])
oofs = [m[1] for m in ens_models]
tests = [m[2] for m in ens_models]
# Equal-weight mean when more than one model is present; otherwise the single model is used as-is.
ens_oof = clip_and_renorm(np.mean(oofs, axis=0)) if len(oofs) > 1 else clip_and_renorm(oofs[0])
ens_test = clip_and_renorm(np.mean(tests, axis=0)) if len(tests) > 1 else clip_and_renorm(tests[0])
ens_loss = log_loss(y, ens_oof, labels=list(range(n_classes)))
print(f'[LGBM Ensemble] OOF logloss={ens_loss:.6f} using {len(ens_models)} models')

# Save submission
# Written regardless of how ens_loss compares with the individual model losses.
sub = pd.DataFrame(ens_test, columns=classes)
sub.insert(0, 'id', test_ids)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (LGBM blend) with shape:', sub.shape)

In [11]:
# XGBoost multiclass OOF with 5-fold CV and early stopping; raw features; logs per fold
import importlib, subprocess, sys, time
def ensure_pkg(pkg, pip_name=None):
    """Import and return module `pkg`, installing it with pip first if it is missing.

    Args:
        pkg: importable module name (e.g. 'xgboost').
        pip_name: optional distribution name/requirement to hand to pip when it
            differs from the module name; defaults to `pkg`.

    Returns:
        The imported module object.

    Raises:
        subprocess.CalledProcessError: if the pip command exits with a non-zero status.
        ImportError: if the module still cannot be imported after installation.
    """
    try:
        return importlib.import_module(pkg)
    except ImportError:
        target = pkg if pip_name is None else pip_name
        print(f'Installing {target}...', flush=True)
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', target])
        # Import finders cache directory listings; refresh them so a package installed
        # after interpreter start-up is reliably found by the retry below.
        importlib.invalidate_caches()
        return importlib.import_module(pkg)

xgb = ensure_pkg('xgboost')
from xgboost import DMatrix, train as xgb_train

def xgb_cv_oof(X_np, y_np, Xte_np, n_classes, skf, params=None, num_boost_round=4000, early_stopping_rounds=100, desc='XGB'):
    """Cross-validate an XGBoost multiclass booster and collect OOF/test probabilities.

    Args:
        X_np, y_np: training features and integer-encoded labels (indexed with fold indices).
        Xte_np: test features scored by every fold model.
        n_classes: number of classes; sets the width of the probability matrices.
        skf: splitter providing `split` and `get_n_splits`.
        params: XGBoost parameter dict; a built-in `multi:softprob` default is used when None.
        num_boost_round: upper bound on boosting rounds per fold.
        early_stopping_rounds: patience for early stopping on the validation set.
        desc: prefix for log lines.

    Returns:
        `(oof, test_pred, oof_loss, fold_losses)`.
    """
    if params is None:
        params = {
            'objective': 'multi:softprob',
            'num_class': n_classes,
            'eval_metric': 'mlogloss',
            'eta': 0.03,
            'max_depth': 6,
            'min_child_weight': 1.0,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'lambda': 1.0,
            'alpha': 0.1,
            'tree_method': 'hist',
            'seed': SEED,
            'nthread': 0
        }
    n = X_np.shape[0]
    labels = list(range(n_classes))
    oof = np.zeros((n, n_classes), dtype=np.float64)
    test_pred = np.zeros((Xte_np.shape[0], n_classes), dtype=np.float64)
    fold_losses = []
    start = time.time()
    n_splits = skf.get_n_splits()
    # The test matrix is identical for every fold, so it is built once.
    dte = DMatrix(Xte_np)
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_np, y_np), 1):
        t0 = time.time()
        X_tr, y_tr = X_np[tr_idx], y_np[tr_idx]
        X_va, y_va = X_np[va_idx], y_np[va_idx]
        dtr = DMatrix(X_tr, label=y_tr)
        dva = DMatrix(X_va, label=y_va)
        # Early stopping monitors the last entry of `evals`, i.e. the validation fold.
        watchlist = [(dtr, 'train'), (dva, 'valid')]
        booster = xgb_train(params, dtr, num_boost_round=num_boost_round, evals=watchlist,
                            early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        # `ntree_limit` / `best_ntree_limit` are deprecated and absent from XGBoost 2.x;
        # `iteration_range` selects the same rounds: [0, best_iteration] inclusive.
        best_range = (0, booster.best_iteration + 1)
        va_proba = booster.predict(dva, iteration_range=best_range)
        loss = log_loss(y_va, va_proba, labels=labels)
        oof[va_idx] = va_proba
        test_pred += booster.predict(dte, iteration_range=best_range) / n_splits
        fold_losses.append(loss)
        print(f'[{desc}] Fold {fold}/{n_splits} best_iter={booster.best_iteration} logloss={loss:.6f} time={time.time()-t0:.1f}s', flush=True)
    oof_loss = log_loss(y_np, oof, labels=labels)
    print(f'[{desc}] OOF logloss={oof_loss:.6f} | mean_folds={np.mean(fold_losses):.6f} | total={(time.time()-start)/60:.1f}m', flush=True)
    return oof, test_pred, oof_loss, fold_losses

# Run 5-fold XGBoost
skf5 = make_skf(n_splits=5, seed=SEED)
# Differs from the defaults inside xgb_cv_oof in eta, max_depth, subsample and colsample_bytree.
xgb_params = {
    'objective': 'multi:softprob',
    'num_class': n_classes,
    'eval_metric': 'mlogloss',
    'eta': 0.02,
    'max_depth': 5,
    'min_child_weight': 1.0,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'lambda': 1.0,
    'alpha': 0.1,
    'tree_method': 'hist',
    'seed': SEED,
    'nthread': 0
}
xgb_oof, xgb_test, xgb_loss, _ = xgb_cv_oof(X, y, X_test, n_classes, skf5, params=xgb_params, num_boost_round=5000, early_stopping_rounds=100, desc='XGB5')

# Compare/quick blend with best LR5 if available
# Entries are (name, oof, test, loss); LR5 is included only if a previous cell defined lr5_oof.
blend_models = []
if 'lr5_oof' in globals():
    blend_models.append(('LR5', lr5_oof, lr5_test, lr5_loss))
blend_models.append(('XGB5', xgb_oof, xgb_test, xgb_loss))
print('Blend candidates:', [(n, round(l,6)) for n,_,_,l in blend_models])
oofs = [m[1] for m in blend_models]
tests = [m[2] for m in blend_models]
# Equal-weight mean when more than one model is present; otherwise the single model is used as-is.
blend_oof = clip_and_renorm(np.mean(oofs, axis=0)) if len(oofs) > 1 else clip_and_renorm(oofs[0])
blend_test = clip_and_renorm(np.mean(tests, axis=0)) if len(tests) > 1 else clip_and_renorm(tests[0])
blend_loss = log_loss(y, blend_oof, labels=list(range(n_classes)))
print(f'[LR+XGB Blend] OOF logloss={blend_loss:.6f} using {len(blend_models)} models')

# Save a provisional submission from the blend
# Written regardless of how blend_loss compares with the individual model losses.
sub = pd.DataFrame(blend_test, columns=classes)
sub.insert(0, 'id', test_ids)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (LR+XGB blend) with shape:', sub.shape)

In [7]:
# Fast tree booster via sklearn HistGradientBoosting + Stacking meta-learner
import time
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

def hgb_cv_oof(X_np, y_np, Xte_np, n_classes, skf, params=None, desc='HGB'):
    """Cross-validate a HistGradientBoostingClassifier and collect OOF/test probabilities.

    Args:
        X_np, y_np: training features and integer-encoded labels (indexed with fold indices).
        Xte_np: test features scored by every fold model.
        n_classes: number of classes; sets the width of the probability matrices.
        skf: splitter providing `split` and `get_n_splits`.
        params: dict of estimator settings; a built-in default is used when None.
        desc: prefix for log lines.

    Returns:
        `(oof, test_pred, oof_loss, fold_losses)`.
    """
    if params is None:
        params = dict(
            learning_rate=0.05,
            max_depth=6,
            max_leaf_nodes=31,
            min_samples_leaf=5,
            l2_regularization=1.0,
            max_iter=300,
            early_stopping=True,
            n_iter_no_change=20,
            tol=1e-7,
            validation_fraction=0.2,
            random_state=SEED,
        )
    # Only these entries of `params` are forwarded to the estimator; a missing one raises KeyError.
    forwarded_keys = (
        'learning_rate', 'max_depth', 'max_leaf_nodes', 'min_samples_leaf',
        'l2_regularization', 'max_iter', 'early_stopping', 'n_iter_no_change',
        'tol', 'validation_fraction', 'random_state',
    )
    class_labels = list(range(n_classes))
    oof = np.zeros((X_np.shape[0], n_classes), dtype=np.float64)
    test_pred = np.zeros((Xte_np.shape[0], n_classes), dtype=np.float64)
    fold_losses = []
    t_all = time.time()
    n_splits = skf.get_n_splits()
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_np, y_np), 1):
        t_fold = time.time()
        print(f'[{desc}] Starting fold {fold}/{n_splits} | train={len(tr_idx)} valid={len(va_idx)}', flush=True)
        X_va, y_va = X_np[va_idx], y_np[va_idx]
        estimator_kwargs = {key: params[key] for key in forwarded_keys}
        clf = HistGradientBoostingClassifier(loss='log_loss', **estimator_kwargs)
        clf.fit(X_np[tr_idx], y_np[tr_idx])
        va_proba = clf.predict_proba(X_va)
        loss = log_loss(y_va, va_proba, labels=class_labels)
        oof[va_idx] = va_proba
        # Each fold model contributes an equal share of the averaged test prediction.
        test_pred += clf.predict_proba(Xte_np) / n_splits
        fold_losses.append(loss)
        print(f'[{desc}] Fold {fold}/{n_splits} logloss={loss:.6f} time={time.time()-t_fold:.1f}s', flush=True)
    oof_loss = log_loss(y_np, oof, labels=class_labels)
    print(f'[{desc}] OOF logloss={oof_loss:.6f} | mean_folds={np.mean(fold_losses):.6f} | total={(time.time()-t_all)/60:.1f}m', flush=True)
    return oof, test_pred, oof_loss, fold_losses

# 5-fold CV for HGB on raw features
skf5 = make_skf(n_splits=5, seed=SEED)
# Same as the defaults inside hgb_cv_oof except max_depth (5 here, 6 there).
hgb_params = {
    'learning_rate': 0.05,
    'max_depth': 5,
    'max_leaf_nodes': 31,
    'min_samples_leaf': 5,
    'l2_regularization': 1.0,
    'max_iter': 300,
    'early_stopping': True,
    'n_iter_no_change': 20,
    'tol': 1e-7,
    'validation_fraction': 0.2,
    'random_state': SEED
}
hgb_oof, hgb_test, hgb_loss, _ = hgb_cv_oof(X, y, X_test, n_classes, skf5, params=hgb_params, desc='HGB5')

# Build stacking features from available strong base models (OOF probs) + HGB
# Entries are (name, oof, test, loss); optional bases are added only if earlier cells defined them.
base_models = []
if 'lr5_oof' in globals() and np.isfinite(lr5_loss):
    base_models.append(('LR5', lr5_oof, lr5_test, lr5_loss))
if 'best_svc2_oof' in globals() and 'best_svc2_test' in globals() and np.isfinite(best_svc2_loss):
    base_models.append(('SVC_rbf', best_svc2_oof, best_svc2_test, best_svc2_loss))
if 'lin_oof' in globals() and 'lin_test' in globals() and np.isfinite(lin_loss):
    base_models.append(('LinSVC', lin_oof, lin_test, lin_loss))
base_models.append(('HGB5', hgb_oof, hgb_test, hgb_loss))

# Keep only reasonably strong contributors
# The 0.2 OOF log-loss cut-off applies to HGB5 as well; in the recorded run (OOF 1.111) it is dropped.
base_models = [m for m in base_models if m[3] < 0.2]
print('Stacking bases:', [(n, round(l,6)) for n,_,_,l in base_models])
if len(base_models) == 0:
    # Fallback to HGB alone
    final_test = clip_and_renorm(hgb_test)
    sub = pd.DataFrame(final_test, columns=classes); sub.insert(0, 'id', test_ids); sub.to_csv('submission.csv', index=False)
    print('Saved submission.csv (HGB only) with shape:', sub.shape)
else:
    # Prepare level-2 data
    # Base-model probability matrices are concatenated column-wise: n_classes columns per base model.
    X_meta = np.concatenate([m[1] for m in base_models], axis=1)
    X_meta_test = np.concatenate([m[2] for m in base_models], axis=1)
    # Build OOF for meta model using the same skf5 to avoid leakage
    # NOTE(review): whether the base OOFs were produced with these exact folds is not visible here - confirm.
    meta_oof = np.zeros((X.shape[0], n_classes), dtype=np.float64)
    meta_test_accum = np.zeros((X_test.shape[0], n_classes), dtype=np.float64)
    for fold, (tr_idx, va_idx) in enumerate(skf5.split(X, y), 1):
        t0 = time.time()
        X_tr_m, X_va_m = X_meta[tr_idx], X_meta[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        meta = LogisticRegression(solver='lbfgs', multi_class='multinomial', C=1.0, max_iter=1000, n_jobs=-1, random_state=SEED)
        meta.fit(X_tr_m, y_tr)
        va_proba = meta.predict_proba(X_va_m)
        loss = log_loss(y_va, va_proba, labels=list(range(n_classes)))
        meta_oof[va_idx] = va_proba
        # Test meta-predictions are averaged over the per-fold meta models.
        meta_test_accum += meta.predict_proba(X_meta_test) / skf5.get_n_splits()
        print(f'[Stack LR] Fold {fold}/5 meta logloss={loss:.6f} time={time.time()-t0:.1f}s', flush=True)
    meta_oof = clip_and_renorm(meta_oof)
    meta_test = clip_and_renorm(meta_test_accum)
    meta_loss = log_loss(y, meta_oof, labels=list(range(n_classes)))
    print(f'[Stack LR] OOF logloss={meta_loss:.6f}')
    # NOTE(review): meta_loss is never compared with the base losses before saving; in the recorded
    # run the stack scores 0.412 OOF versus 0.118 for LR5 alone, yet it still replaces submission.csv.
    # Epsilon smoothing
    # Mixes in a uniform distribution with weight eps; rows still sum to 1 afterwards.
    eps = 0.001
    meta_test = meta_test * (1 - eps) + eps / n_classes
    # Save submission
    sub = pd.DataFrame(meta_test, columns=classes)
    sub.insert(0, 'id', test_ids)
    sub.to_csv('submission.csv', index=False)
    print('Saved submission.csv (Stacked LR on OOF probs) with shape:', sub.shape)

[HGB5] Starting fold 1/5 | train=712 valid=179


[HGB5] Fold 1/5 logloss=1.121763 time=19.9s


[HGB5] Starting fold 2/5 | train=713 valid=178


[HGB5] Fold 2/5 logloss=1.096867 time=19.9s


[HGB5] Starting fold 3/5 | train=713 valid=178


[HGB5] Fold 3/5 logloss=1.174128 time=20.0s


[HGB5] Starting fold 4/5 | train=713 valid=178


[HGB5] Fold 4/5 logloss=1.175451 time=20.2s


[HGB5] Starting fold 5/5 | train=713 valid=178


[HGB5] Fold 5/5 logloss=0.988615 time=20.0s


[HGB5] OOF logloss=1.111376 | mean_folds=1.111365 | total=1.7m


Stacking bases: [('LR5', 0.118336), ('SVC_rbf', 0.170827), ('LinSVC', 0.164243)]




[Stack LR] Fold 1/5 meta logloss=0.402552 time=1.7s




[Stack LR] Fold 2/5 meta logloss=0.406568 time=1.2s




[Stack LR] Fold 3/5 meta logloss=0.405242 time=1.1s




[Stack LR] Fold 4/5 meta logloss=0.429567 time=1.2s




[Stack LR] Fold 5/5 meta logloss=0.418111 time=1.2s


[Stack LR] OOF logloss=0.412397
Saved submission.csv (Stacked LR on OOF probs) with shape: (99, 100)


In [8]:
# Force-install precompiled wheels for LightGBM and XGBoost (avoid building from source)
import sys, subprocess, importlib
def pip_install(args):
    """Install the given requirement specifiers with pip, preferring binary wheels.

    Args:
        args: list of requirement strings appended to the pip command line.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    print('Installing packages:', ' '.join(args), flush=True)
    # Quiet install, no download cache, and wheels preferred over source builds.
    pip_cmd = [sys.executable, '-m', 'pip', 'install', '-q', '--no-cache-dir', '--prefer-binary']
    subprocess.check_call(pip_cmd + args)

# LightGBM gets a minimum version; XGBoost is pinned to exactly 1.7.6.
pkgs = ["lightgbm>=4.1.0", "xgboost==1.7.6"]
pip_install(pkgs)

# Import after installation and report the versions that were actually loaded.
import lightgbm as lgb
import xgboost as xgb
print('lightgbm version:', lgb.__version__)
print('xgboost version:', xgb.__version__)

Installing packages: lightgbm>=4.1.0 xgboost==1.7.6


lightgbm version: 4.6.0
xgboost version: 1.7.6




In [15]:
# Convex weight optimization on existing OOF predictions (no installs) and save submission
import numpy as np
from sklearn.metrics import log_loss
from math import isfinite
import time

# Always avoid SciPy to prevent stalls; use fast simplex search
SCIPY_AVAILABLE = False

def normalize_weights(w):
    """Project a weight vector onto the probability simplex.

    Negative entries are clamped to zero and the result is scaled to sum to 1.
    When nothing positive remains, equal weights are returned instead.
    """
    clamped = np.maximum(w, 0)
    total = clamped.sum()
    if total <= 0:
        # No positive mass left: fall back to a uniform weighting.
        return np.ones_like(clamped) / len(clamped)
    return clamped / total

def blend_probs(weights, prob_list):
    """Return the weighted average of several probability matrices.

    `weights` is normalized with `normalize_weights`, the matrices in
    `prob_list` are stacked along a new leading axis and contracted with the
    weights, and the blend is clipped and renormalized row-wise.
    """
    simplex_w = normalize_weights(np.array(weights, dtype=np.float64))
    stacked = np.stack(prob_list, axis=0)
    # Contract the model axis: sum_k w[k] * stacked[k].
    mixed = np.tensordot(simplex_w, stacked, axes=(0,0))
    return clip_and_renorm(mixed, eps=1e-15)

# Collect available strong bases
# A base is included only if earlier cells defined its arrays and its stored loss is finite.
bases = []  # (name, oof, test, loss)
if 'lr5_oof' in globals() and isfinite(lr5_loss):
    bases.append(('LR5', lr5_oof, lr5_test, lr5_loss))
if 'best_svc2_oof' in globals() and 'best_svc2_test' in globals() and isfinite(best_svc2_loss):
    bases.append(('SVC_rbf', best_svc2_oof, best_svc2_test, best_svc2_loss))
if 'lin_oof' in globals() and 'lin_test' in globals() and isfinite(lin_loss):
    bases.append(('LinSVC', lin_oof, lin_test, lin_loss))
# Drop bases whose OOF log-loss is 0.2 or worse.
bases = [b for b in bases if b[3] < 0.2]
print('Bases for optimization:', [(n, round(l,6)) for n,_,_,l in bases])
assert len(bases) >= 2, 'Need at least two base models to optimize weights.'

# Parallel lists, all in the same order as `bases`.
oof_list = [b[1] for b in bases]
test_list = [b[2] for b in bases]
loss_list = np.array([b[3] for b in bases], dtype=np.float64)
names = [b[0] for b in bases]
k = len(bases)

def obj(w):
    """Objective for the weight search: OOF log-loss of the blend built with weights `w`.

    Uses the module-level `oof_list`, `y` and `n_classes`.
    """
    blended_oof = blend_probs(w, oof_list)
    all_labels = list(range(n_classes))
    return log_loss(y, blended_oof, labels=all_labels)

# Baselines
# Two reference points: equal weights, and weights proportional to 1/loss (the search starts from the latter).
w_eq = np.ones(k) / k
eq_loss = obj(w_eq)
w0 = 1.0 / np.maximum(loss_list, 1e-9)
w0 = w0 / w0.sum()
w_best = w0.copy()
best = obj(w_best)
print(f'Init losses -> equal: {eq_loss:.6f} | inv-loss init: {best:.6f}', flush=True)

t_start = time.time()
rng = np.random.default_rng(SEED)

if k == 2:
    # Fine 1D grid for 2 models
    grid = np.linspace(0, 1, 1001)
    for i, a in enumerate(grid, 1):
        w = np.array([a, 1-a], dtype=np.float64)
        val = obj(w)
        if val < best:
            best, w_best = val, w
        if i % 200 == 0:
            print(f'Grid2 progress {i}/{len(grid)} best={best:.6f}', flush=True)
else:
    # Mixed strategy: coarse simplex grid for k=3 plus random simplex search
    if k == 3:
        # Coarse grid step 0.02
        grid = np.linspace(0, 1, 51)
        cnt = 0
        for a in grid:
            for b in grid:
                c = 1 - a - b
                # NOTE(review): floating-point rounding can make c marginally negative when a + b
                # should equal 1, in which case that boundary point is skipped - confirm acceptable.
                if c < 0: continue
                w = np.array([a, b, c], dtype=np.float64)
                val = obj(w)
                cnt += 1
                if val < best:
                    best, w_best = val, w
                # At most 51*52/2 = 1326 points pass the c >= 0 test, so this message never fires at step 0.02.
                if cnt % 2000 == 0:
                    print(f'Coarse grid3 progress {cnt} best={best:.6f}', flush=True)
    # Random simplex search
    # NOTE(review): normalizing uniform draws yields points on the simplex, but not uniformly distributed ones.
    samples = 4000 if k == 3 else 6000
    for i in range(1, samples + 1):
        w = rng.random(k)
        w = w / w.sum()
        val = obj(w)
        if val < best:
            best, w_best = val, w
        if i % 500 == 0:
            print(f'Random simplex {i}/{samples} best={best:.6f}', flush=True)

# Re-evaluate the winning weights and report against both baselines.
opt_oof = blend_probs(w_best, oof_list)
opt_loss = log_loss(y, opt_oof, labels=list(range(n_classes)))
elapsed = time.time() - t_start
print('Optimized weights:', dict(zip(names, np.round(w_best, 6))))
print(f'[Optimized Blend] OOF logloss={opt_loss:.6f} (equal={eq_loss:.6f}, invloss_init={obj(w0):.6f}) | time={elapsed:.1f}s', flush=True)

# Apply to test and save submission with tiny epsilon smoothing
opt_test = blend_probs(w_best, test_list)
# Mixes in a uniform distribution with weight eps; rows still sum to 1 afterwards.
eps = 0.001
opt_test = opt_test * (1 - eps) + eps / n_classes
sub = pd.DataFrame(opt_test, columns=classes)
sub.insert(0, 'id', test_ids)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (optimized weighted blend) with shape:', sub.shape)

Bases for optimization: [('LR5', 0.036933), ('SVC_rbf', 0.170827), ('LinSVC', 0.164243)]
Init losses -> equal: 0.063499 | inv-loss init: 0.047174


Random simplex 500/4000 best=0.036933


Random simplex 1000/4000 best=0.036933


Random simplex 1500/4000 best=0.036933


Random simplex 2000/4000 best=0.036933


Random simplex 2500/4000 best=0.036933


Random simplex 3000/4000 best=0.036933


Random simplex 3500/4000 best=0.036933


Random simplex 4000/4000 best=0.036933


Optimized weights: {'LR5': 1.0, 'SVC_rbf': 0.0, 'LinSVC': 0.0}
[Optimized Blend] OOF logloss=0.036933 (equal=0.063499, invloss_init=0.047174) | time=35.0s


Saved submission.csv (optimized weighted blend) with shape: (99, 100)


In [None]:
# Quick blend: equal vs inverse-loss weights over existing OOFs; pick best and save submission
import numpy as np
from math import isfinite
from sklearn.metrics import log_loss

# A base is included only if earlier cells defined its arrays and its stored loss is finite.
# No loss threshold is applied in this cell.
bases = []  # (name, oof, test, loss)
if 'lr5_oof' in globals() and isfinite(lr5_loss):
    bases.append(('LR5', lr5_oof, lr5_test, lr5_loss))
if 'best_svc2_oof' in globals() and 'best_svc2_test' in globals() and isfinite(best_svc2_loss):
    bases.append(('SVC_rbf', best_svc2_oof, best_svc2_test, best_svc2_loss))
if 'lin_oof' in globals() and 'lin_test' in globals() and isfinite(lin_loss):
    bases.append(('LinSVC', lin_oof, lin_test, lin_loss))
assert len(bases) >= 2, 'Need at least two base models available (LR5, SVC_rbf, LinSVC) to blend.'
print('Blend bases:', [(n, round(l,6)) for n,_,_,l in bases])

# Parallel lists, all in the same order as `bases`.
oof_list = [b[1] for b in bases]
test_list = [b[2] for b in bases]
loss_list = np.array([b[3] for b in bases], dtype=np.float64)
k = len(bases)

def blend(weights, plist):
    """Return the weighted average of the probability matrices in `plist`.

    Args:
        weights: one weight per matrix; negative values are clamped to zero and
            the remainder is rescaled to sum to 1.
        plist: sequence of equally-shaped probability matrices.

    Returns:
        The blended matrix after `clip_and_renorm` (rows sum to 1).
    """
    w = np.maximum(np.asarray(weights, dtype=np.float64), 0)
    total = w.sum()
    # With no positive weight the division would be 0/0 and poison every probability
    # with NaN; use equal weights in that case instead.
    w = w / total if total > 0 else np.full(len(w), 1.0 / len(w))
    P = np.tensordot(w, np.stack(plist, axis=0), axes=(0, 0))
    return clip_and_renorm(P, eps=1e-15)

# Equal-weight blend
w_eq = np.ones(k) / k
oof_eq = blend(w_eq, oof_list)
loss_eq = log_loss(y, oof_eq, labels=list(range(n_classes)))

# Inverse-loss weighted blend
# Weights are proportional to 1/loss; the floor of 1e-9 guards against division by zero.
w_il = 1.0 / np.maximum(loss_list, 1e-9); w_il = w_il / w_il.sum()
oof_il = blend(w_il, oof_list)
loss_il = log_loss(y, oof_il, labels=list(range(n_classes)))

print(f'Equal-weight OOF: {loss_eq:.6f} | Inv-loss-weight OOF: {loss_il:.6f}')
# Ties go to the equal-weight blend (strict '<').
w_best = w_il if loss_il < loss_eq else w_eq
print('Chosen weights:', dict(zip([b[0] for b in bases], np.round(w_best, 6))))

test_blend = blend(w_best, test_list)
# Mixes in a uniform distribution with weight eps; rows still sum to 1 afterwards.
eps = 0.001
test_blend = test_blend * (1 - eps) + eps / n_classes
sub = pd.DataFrame(test_blend, columns=classes)
sub.insert(0, 'id', test_ids)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (quick blend) with shape:', sub.shape)

In [1]:
# Bootstrap: imports, utilities, data load (no model training)
import os, time, sys, gc, math, random
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline

# Single seed shared by the notebook: applied to NumPy's and Python's global RNGs here
# and used as the default seed of make_skf below.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

def load_data_fast():
    """Load train.csv/test.csv from the working directory and prepare model inputs.

    The 'id' column is dropped from both frames and 'species' from train; the
    species labels are integer-encoded with a freshly fitted LabelEncoder.
    Missing feature values, if any, are replaced with 0 in both frames.

    Returns:
        Tuple `(X, y_enc, X_test, classes, test_ids, le)`: train features as an
        array, encoded labels, test features as an array, the encoder's class
        names, the test ids, and the fitted encoder.

    Raises:
        AssertionError: if train and test feature columns differ in name or order.
    """
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    print('Train shape:', train.shape, ' Test shape:', test.shape, flush=True)
    test_ids = test['id'].values
    X = train.drop(columns=['id', 'species'])
    y = train['species'].values
    X_test = test.drop(columns=['id'])
    assert list(X.columns) == list(X_test.columns), 'Train/Test feature mismatch'
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    # le.classes_ gives the label order behind the encoded integers (and the probability columns).
    classes = le.classes_
    if X.isnull().any().any() or X_test.isnull().any().any():
        X = X.fillna(0); X_test = X_test.fillna(0)
    return X.values, y_enc, X_test.values, classes, test_ids, le

def make_skf(n_splits=5, seed=SEED):
    """Build a shuffled StratifiedKFold with `n_splits` folds and a fixed random state."""
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    return splitter

def clip_and_renorm(probs, eps=1e-15):
    """Clip probabilities into [eps, 1 - eps] and rescale each row to sum to 1.

    The input is not modified; a new array is returned.
    """
    bounded = np.clip(probs, eps, 1 - eps)
    row_totals = bounded.sum(axis=1, keepdims=True)
    return bounded / row_totals

def train_oof(model, X, y, skf, X_test, n_classes, desc='model'):
    """Cross-validate `model` and return out-of-fold and fold-averaged test probabilities.

    Args:
        model: unfitted estimator/pipeline exposing `fit` and `predict_proba`;
            it is cloned for every fold and never fitted itself.
        X, y: training features and integer-encoded labels (indexed with fold indices).
        skf: splitter providing `split` and `get_n_splits`.
        X_test: test features scored by every fold model.
        n_classes: number of classes; sets the width of the probability matrices.
        desc: prefix for log lines.

    Returns:
        `(oof, test_pred, oof_loss, fold_losses)`.
    """
    # Loop-invariant work is done once: the import, the split count and the label list.
    from sklearn.base import clone

    n_splits = skf.get_n_splits()
    labels = list(range(n_classes))
    n_samples = X.shape[0]
    oof = np.zeros((n_samples, n_classes), dtype=np.float64)
    test_pred = np.zeros((X_test.shape[0], n_classes), dtype=np.float64)
    fold_losses = []
    start_all = time.time()
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        t0 = time.time()
        X_tr, y_tr = X[tr_idx], y[tr_idx]
        X_va, y_va = X[va_idx], y[va_idx]
        clf = clone(model)
        clf.fit(X_tr, y_tr)
        va_proba = clf.predict_proba(X_va)
        # Passing the full label list keeps log_loss defined when a fold lacks a class.
        loss = log_loss(y_va, va_proba, labels=labels)
        oof[va_idx] = va_proba
        fold_losses.append(loss)
        # Each fold model contributes an equal share of the averaged test prediction.
        test_pred += clf.predict_proba(X_test) / n_splits
        print(f'[{desc}] Fold {fold}/{n_splits} logloss={loss:.6f} time={time.time()-t0:.1f}s', flush=True)
    oof_loss = log_loss(y, oof, labels=labels)
    print(f'[{desc}] OOF logloss={oof_loss:.6f} | mean_folds={np.mean(fold_losses):.6f} | total={(time.time()-start_all)/60:.1f}m', flush=True)
    return oof, test_pred, oof_loss, fold_losses

# Load once
# These module-level names (X, y, X_test, classes, test_ids, le, n_classes, skf5) are read by other cells.
X, y, X_test, classes, test_ids, le = load_data_fast()
n_classes = len(classes)
skf5 = make_skf(n_splits=5, seed=SEED)
print('Bootstrap complete. n_classes:', n_classes, 'X shape:', X.shape, 'X_test shape:', X_test.shape, flush=True)

Train shape: (891, 194)  Test shape: (99, 193)


Bootstrap complete. n_classes: 99 X shape: (891, 192) X_test shape: (99, 192)


In [12]:
# Fast LR tuning (multinomial, L2) over C; replace lr5_* with the best run and list the blend candidates (no submission is written here)
from sklearn.linear_model import LogisticRegression

# Inverse regularization strengths to try (larger C = weaker L2 penalty).
# NOTE(review): in the recorded run the OOF loss keeps falling up to C=10, the edge of this grid.
Cs = [0.5, 1.0, 2.0, 3.0, 5.0, 10.0]
best_C = None
best_lr_oof = None
best_lr_test = None
best_lr_loss = np.inf
for C in Cs:
    pipe = Pipeline(steps=[('scaler', StandardScaler()), ('clf', LogisticRegression(multi_class='multinomial', solver='saga', C=C, penalty='l2', max_iter=5000, n_jobs=-1, random_state=SEED))])
    oof, test_pred, loss, _ = train_oof(pipe, X, y, skf5, X_test, n_classes, desc=f'LR_C{C}')
    # Strict '<' means the earliest C wins on ties.
    if loss < best_lr_loss:
        best_lr_loss = loss
        best_C = C
        best_lr_oof = oof
        best_lr_test = test_pred
print('Best LR C:', best_C, 'OOF:', best_lr_loss)

# Replace lr5_* aliases to use tuned LR going forward
lr5_oof, lr5_test, lr5_loss = best_lr_oof, best_lr_test, best_lr_loss

# List the bases currently available for blending; no blend is computed and nothing is saved here.
cands = [('LR5', lr5_loss)]
if 'lin_oof' in globals() and lin_oof is not None: cands.append(('LinSVC', lin_loss))
if 'best_svc2_oof' in globals() and best_svc2_oof is not None: cands.append(('SVC_rbf', best_svc2_loss))
print('Bases now:', [(n, round(l,6)) for n,l in cands])



[LR_C0.5] Fold 1/5 logloss=0.157535 time=13.3s




[LR_C0.5] Fold 2/5 logloss=0.147769 time=15.1s




[LR_C0.5] Fold 3/5 logloss=0.161110 time=15.5s




[LR_C0.5] Fold 4/5 logloss=0.159580 time=15.0s




[LR_C0.5] Fold 5/5 logloss=0.161023 time=15.2s


[LR_C0.5] OOF logloss=0.157403 | mean_folds=0.157403 | total=1.2m




[LR_C1.0] Fold 1/5 logloss=0.115621 time=16.4s




[LR_C1.0] Fold 2/5 logloss=0.108472 time=18.2s




[LR_C1.0] Fold 3/5 logloss=0.122760 time=18.1s




[LR_C1.0] Fold 4/5 logloss=0.120443 time=18.9s




[LR_C1.0] Fold 5/5 logloss=0.124398 time=19.0s


[LR_C1.0] OOF logloss=0.118336 | mean_folds=0.118339 | total=1.5m




[LR_C2.0] Fold 1/5 logloss=0.088162 time=17.9s




[LR_C2.0] Fold 2/5 logloss=0.082791 time=23.9s




[LR_C2.0] Fold 3/5 logloss=0.098677 time=21.8s




[LR_C2.0] Fold 4/5 logloss=0.095637 time=22.7s




[LR_C2.0] Fold 5/5 logloss=0.101679 time=22.3s


[LR_C2.0] OOF logloss=0.093383 | mean_folds=0.093389 | total=1.8m




[LR_C3.0] Fold 1/5 logloss=0.076840 time=20.2s




[LR_C3.0] Fold 2/5 logloss=0.072264 time=27.2s




[LR_C3.0] Fold 3/5 logloss=0.089373 time=24.1s




[LR_C3.0] Fold 4/5 logloss=0.085693 time=25.5s




[LR_C3.0] Fold 5/5 logloss=0.092630 time=25.6s


[LR_C3.0] OOF logloss=0.083353 | mean_folds=0.083360 | total=2.0m




[LR_C5.0] Fold 1/5 logloss=0.066230 time=23.9s




[LR_C5.0] Fold 2/5 logloss=0.062491 time=31.6s




[LR_C5.0] Fold 3/5 logloss=0.081113 time=26.9s




[LR_C5.0] Fold 4/5 logloss=0.076784 time=28.8s




[LR_C5.0] Fold 5/5 logloss=0.084534 time=29.7s


[LR_C5.0] OOF logloss=0.074221 | mean_folds=0.074230 | total=2.3m




[LR_C10.0] Fold 1/5 logloss=0.056808 time=28.6s




[LR_C10.0] Fold 2/5 logloss=0.053844 time=37.2s




[LR_C10.0] Fold 3/5 logloss=0.073693 time=31.8s




[LR_C10.0] Fold 4/5 logloss=0.069233 time=33.2s




[LR_C10.0] Fold 5/5 logloss=0.077701 time=35.0s


[LR_C10.0] OOF logloss=0.066245 | mean_folds=0.066256 | total=2.8m


Best LR C: 10.0 OOF: 0.06624531781361775
Bases now: [('LR5', 0.066245), ('LinSVC', 0.164243), ('SVC_rbf', 0.170827)]


In [14]:
# Extended LR tuning: very large C and PowerTransformer variants; update lr5_* if improved
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer

# Seed the running best from whatever lr5_* earlier cells left in the namespace.
# Absent names fall back to +inf (loss) / None (predictions), so the first
# finished candidate below always becomes the incumbent in that case.
best_tag = 'existing_lr5'
best_loss = globals().get('lr5_loss', float('inf'))
best_oof, best_test = globals().get('lr5_oof'), globals().get('lr5_test')

def run_lr_pipeline(pipe, tag):
    """Run ``train_oof`` for ``pipe`` and return ``(oof, test_pred, loss)``.

    The pipeline is passed to ``train_oof`` together with the notebook-level
    ``X``, ``y``, ``skf5``, ``X_test`` and ``n_classes``; ``tag`` is forwarded
    as ``desc`` (the label that prefixes the fold log lines). ``train_oof``
    returns four values; only the first three are handed back to the caller.
    """
    oof_pred, test_probs, cv_loss, _unused = train_oof(
        pipe, X, y, skf5, X_test, n_classes, desc=tag
    )
    return oof_pred, test_probs, cv_loss

# Estimator settings shared by every candidate in this cell; only C is swept.
_lr_l2_kw = dict(
    multi_class='multinomial', solver='saga', penalty='l2',
    max_iter=20000, tol=1e-4, n_jobs=-1, random_state=SEED,
)

# 1) Large-C sweep on standardized features: StandardScaler -> LR(saga, L2)
Cs_high = [20, 50, 100, 200, 500]
for C in Cs_high:
    tag = f'LR_highC_C{C}'
    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(C=C, **_lr_l2_kw)),
    ])
    oof, test_pred, loss = run_lr_pipeline(pipe, tag)
    if not (loss < best_loss):
        continue  # not a strict improvement: keep the incumbent
    best_loss, best_oof, best_test, best_tag = loss, oof, test_pred, tag
print('High-C LR best so far:', best_tag, 'OOF=', best_loss)

# 2) Same estimator behind a Yeo-Johnson PowerTransformer instead of the scaler
Cs_pt = [10, 50, 100]
for C in Cs_pt:
    tag = f'LR_PT_C{C}'
    pipe = Pipeline(steps=[
        ('pt', PowerTransformer(method='yeo-johnson', standardize=True)),
        ('clf', LogisticRegression(C=C, **_lr_l2_kw)),
    ])
    oof, test_pred, loss = run_lr_pipeline(pipe, tag)
    if not (loss < best_loss):
        continue
    best_loss, best_oof, best_test, best_tag = loss, oof, test_pred, tag
print('Overall LR best:', best_tag, 'OOF=', best_loss)

# Write the winner back to lr5_* only when it strictly beats the stored loss
# (a missing lr5_loss counts as +inf, so any real result replaces it).
_stored_loss = globals().get('lr5_loss', float('inf'))
if best_oof is None or not (best_loss < _stored_loss):
    print('Kept existing lr5_* with OOF=', globals().get('lr5_loss'))
else:
    lr5_oof, lr5_test, lr5_loss = best_oof, best_test, best_loss
    print('Updated lr5_* to', best_tag, 'with OOF=', lr5_loss)



[LR_highC_C20] Fold 1/5 logloss=0.051280 time=32.6s




[LR_highC_C20] Fold 2/5 logloss=0.048810 time=41.7s




[LR_highC_C20] Fold 3/5 logloss=0.068929 time=37.5s




[LR_highC_C20] Fold 4/5 logloss=0.064344 time=38.2s




[LR_highC_C20] Fold 5/5 logloss=0.073931 time=39.0s


[LR_highC_C20] OOF logloss=0.061447 | mean_folds=0.061459 | total=3.1m




[LR_highC_C50] Fold 1/5 logloss=0.046969 time=38.1s




[LR_highC_C50] Fold 2/5 logloss=0.045425 time=45.2s




[LR_highC_C50] Fold 3/5 logloss=0.065713 time=42.4s




[LR_highC_C50] Fold 4/5 logloss=0.060829 time=43.1s




[LR_highC_C50] Fold 5/5 logloss=0.071508 time=42.2s


[LR_highC_C50] OOF logloss=0.058076 | mean_folds=0.058089 | total=3.5m




[LR_highC_C100] Fold 1/5 logloss=0.045285 time=40.8s




[LR_highC_C100] Fold 2/5 logloss=0.044202 time=46.8s




[LR_highC_C100] Fold 3/5 logloss=0.064603 time=44.4s




[LR_highC_C100] Fold 4/5 logloss=0.059596 time=45.2s




[LR_highC_C100] Fold 5/5 logloss=0.070551 time=44.0s


[LR_highC_C100] OOF logloss=0.056834 | mean_folds=0.056847 | total=3.7m




[LR_highC_C200] Fold 1/5 logloss=0.044371 time=42.4s




[LR_highC_C200] Fold 2/5 logloss=0.043589 time=47.5s




[LR_highC_C200] Fold 3/5 logloss=0.064006 time=45.5s




[LR_highC_C200] Fold 4/5 logloss=0.058959 time=46.2s




[LR_highC_C200] Fold 5/5 logloss=0.070039 time=45.1s


[LR_highC_C200] OOF logloss=0.056180 | mean_folds=0.056193 | total=3.8m




[LR_highC_C500] Fold 1/5 logloss=0.043811 time=43.4s




[LR_highC_C500] Fold 2/5 logloss=0.043200 time=48.1s




[LR_highC_C500] Fold 3/5 logloss=0.063646 time=46.2s




[LR_highC_C500] Fold 4/5 logloss=0.058539 time=47.1s




[LR_highC_C500] Fold 5/5 logloss=0.069715 time=46.0s


[LR_highC_C500] OOF logloss=0.055769 | mean_folds=0.055782 | total=3.8m


High-C LR best so far: LR_highC_C500 OOF= 0.05576877307805801




[LR_PT_C10] Fold 1/5 logloss=0.050577 time=27.2s




[LR_PT_C10] Fold 2/5 logloss=0.049346 time=27.7s




[LR_PT_C10] Fold 3/5 logloss=0.044511 time=26.8s




[LR_PT_C10] Fold 4/5 logloss=0.055832 time=28.9s




[LR_PT_C10] Fold 5/5 logloss=0.045542 time=24.4s


[LR_PT_C10] OOF logloss=0.049163 | mean_folds=0.049161 | total=2.2m




[LR_PT_C50] Fold 1/5 logloss=0.037064 time=38.4s




[LR_PT_C50] Fold 2/5 logloss=0.039332 time=38.0s




[LR_PT_C50] Fold 3/5 logloss=0.035063 time=37.7s




[LR_PT_C50] Fold 4/5 logloss=0.046606 time=41.0s




[LR_PT_C50] Fold 5/5 logloss=0.035221 time=37.5s


[LR_PT_C50] OOF logloss=0.038655 | mean_folds=0.038657 | total=3.2m




[LR_PT_C100] Fold 1/5 logloss=0.034663 time=41.9s




[LR_PT_C100] Fold 2/5 logloss=0.037764 time=40.3s




[LR_PT_C100] Fold 3/5 logloss=0.033549 time=40.9s




[LR_PT_C100] Fold 4/5 logloss=0.045260 time=43.7s




[LR_PT_C100] Fold 5/5 logloss=0.033440 time=41.3s


[LR_PT_C100] OOF logloss=0.036933 | mean_folds=0.036935 | total=3.5m


Overall LR best: LR_PT_C100 OOF= 0.03693293302891193
Updated lr5_* to LR_PT_C100 with OOF= 0.03693293302891193


In [16]:
# PT-LogReg extension: larger C for L2 and ElasticNet with small l1_ratio; update lr5_* if improved
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer

# Start the comparison from the current lr5_* results when they exist;
# otherwise use +inf for the loss and None for both prediction arrays.
best_tag = 'existing_lr5'
best_loss = globals().get('lr5_loss', float('inf'))
best_oof = globals().get('lr5_oof')
best_test = globals().get('lr5_test')

def run_lr_pipeline(pipe, tag):
    """Evaluate ``pipe`` through ``train_oof`` and return its first three results.

    ``train_oof`` is called with the notebook-level ``X``, ``y``, ``skf5``,
    ``X_test`` and ``n_classes``, using ``tag`` as the ``desc`` label. Callers
    unpack the result as ``(oof, test_pred, loss)``; the fourth value that
    ``train_oof`` returns is discarded.
    """
    oof_pred, test_probs, cv_loss, _unused = train_oof(
        pipe, X, y, skf5, X_test, n_classes, desc=tag
    )
    return oof_pred, test_probs, cv_loss

# Estimator settings common to every PT + LR candidate in this cell;
# the penalty, C and (for elastic net) l1_ratio are supplied per candidate.
_pt_lr_kw = dict(
    multi_class='multinomial', solver='saga',
    max_iter=30000, tol=1e-4, n_jobs=-1, random_state=SEED,
)

# 1) PT(YJ) + L2 over a grid of much larger C values
Cs_pt_ext = [200, 500, 1000, 2000]
for C in Cs_pt_ext:
    tag = f'LR_PT_L2_C{C}'
    pipe = Pipeline(steps=[
        ('pt', PowerTransformer(method='yeo-johnson', standardize=True)),
        ('clf', LogisticRegression(C=C, penalty='l2', **_pt_lr_kw)),
    ])
    oof, test_pred, loss = run_lr_pipeline(pipe, tag)
    if not (loss < best_loss):
        continue  # not a strict improvement: keep the incumbent
    best_loss, best_oof, best_test, best_tag = loss, oof, test_pred, tag
print('PT L2 best so far:', best_tag, 'OOF=', best_loss)

# 2) PT(YJ) + ElasticNet with a small L1 share, crossed with a C grid.
#    The grid is flattened in the same order as a nested l1_ratio -> C loop.
l1_ratios = [0.01, 0.05, 0.1]
Cs_en = [50, 100, 200, 500]
for l1r, C in [(ratio, c_val) for ratio in l1_ratios for c_val in Cs_en]:
    tag = f'LR_PT_EN_l1{l1r}_C{C}'
    pipe = Pipeline(steps=[
        ('pt', PowerTransformer(method='yeo-johnson', standardize=True)),
        ('clf', LogisticRegression(C=C, penalty='elasticnet', l1_ratio=l1r, **_pt_lr_kw)),
    ])
    oof, test_pred, loss = run_lr_pipeline(pipe, tag)
    if not (loss < best_loss):
        continue
    best_loss, best_oof, best_test, best_tag = loss, oof, test_pred, tag
print('Overall PT-LogReg best:', best_tag, 'OOF=', best_loss)

# Write the winner back to lr5_* only when it strictly beats the stored loss
# (a missing lr5_loss counts as +inf, so any real result replaces it).
_stored_loss = globals().get('lr5_loss', float('inf'))
if best_oof is None or not (best_loss < _stored_loss):
    print('Kept existing lr5_* with OOF=', globals().get('lr5_loss'))
else:
    lr5_oof, lr5_test, lr5_loss = best_oof, best_test, best_loss
    print('Updated lr5_* to', best_tag, 'with OOF=', lr5_loss)

In [17]:
# Refit best PT+LR (L2, C=2000) cleanly; set lr5_* and save submission
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson PowerTransformer feeding a multinomial LR (saga, L2, C=2000).
best_pipe = Pipeline(steps=[
    ('pt', PowerTransformer(standardize=True, method='yeo-johnson')),
    ('clf', LogisticRegression(
        C=2000, penalty='l2', solver='saga', multi_class='multinomial',
        tol=1e-4, max_iter=30000, random_state=SEED, n_jobs=-1,
    )),
])

# Overwrites lr5_* unconditionally with this run's results (no comparison
# against the previous lr5_loss is made here).
lr5_oof, lr5_test, lr5_loss, _ = train_oof(
    best_pipe, X, y, skf5, X_test, n_classes, desc='LR_PT_L2_C2000_refit'
)
print('Set lr5_* from LR_PT_L2_C2000_refit: OOF=', lr5_loss)

# Write the submission straight from this model's test predictions
# (the blending cell 11 also picks up lr5_* if it is rerun).
_refit_probs = clip_and_renorm(lr5_test)
sub = pd.DataFrame(_refit_probs, columns=classes)
sub.insert(loc=0, column='id', value=test_ids)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (LR_PT_L2_C2000_refit) with shape:', sub.shape)

In [18]:
# Quick set best: PT(YJ) + LR L2 C=2000 single run to set lr5_* and save submission
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer

# Single cross-validated run of PT(Yeo-Johnson) -> LR (saga, L2, C=2000).
_single_clf = LogisticRegression(
    C=2000, penalty='l2', solver='saga', multi_class='multinomial',
    tol=1e-4, max_iter=30000, random_state=SEED, n_jobs=-1,
)
best_pipe = Pipeline(steps=[
    ('pt', PowerTransformer(standardize=True, method='yeo-johnson')),
    ('clf', _single_clf),
])

# lr5_* is replaced unconditionally by the results of this run.
lr5_oof, lr5_test, lr5_loss, _ = train_oof(
    best_pipe, X, y, skf5, X_test, n_classes, desc='LR_PT_L2_C2000_single'
)
print('lr5 updated: OOF=', lr5_loss)

# Submission: post-processed test predictions, one column per entry of
# `classes`, with the test ids inserted as the leading 'id' column.
_single_probs = clip_and_renorm(lr5_test)
sub = pd.DataFrame(_single_probs, columns=classes)
sub.insert(loc=0, column='id', value=test_ids)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv (PT+LR L2 C=2000) with shape:', sub.shape)

In [19]:
# Promote best from Cell 16 (if available) to lr5_* without retraining; save submission
# Promote the best_* results left behind by a sweep cell to lr5_* and write
# submission.csv, without retraining anything.
_promote_names = ('best_oof', 'best_test', 'best_loss')
if not all(_name in globals() for _name in _promote_names):
    print('No best_* globals available to promote; skip.')
elif any(globals()[_name] is None for _name in _promote_names):
    # The sweep cells seed best_oof/best_test with None when no lr5_* exists,
    # so the names can be present before any candidate has finished.
    # Promoting None would wipe lr5_* and then fail while building the frame.
    print('best_* globals are present but None (no finished sweep result); skip.')
else:
    print('Found best from prior sweep:', globals().get('best_tag', 'unknown'), 'OOF=', best_loss)
    # Convert and build the submission frame first: if best_loss or best_test
    # is unusable, this raises before lr5_* has been touched.
    _promoted_loss = float(best_loss)
    sub = pd.DataFrame(clip_and_renorm(best_test), columns=classes)
    sub.insert(0, 'id', test_ids)
    # Note: lr5_oof / lr5_test alias the best_* objects (no copy is made).
    lr5_oof, lr5_test, lr5_loss = best_oof, best_test, _promoted_loss
    sub.to_csv('submission.csv', index=False)
    print('lr5_* set from best sweep. OOF=', lr5_loss, '| submission.csv saved:', sub.shape)

In [20]:
# Minimal promotion of best_* -> lr5_* without retraining; fast save submission
import numpy as np, pandas as pd
# Promote best_* -> lr5_* (as independent copies) and write submission.csv.
# Best-effort: a failure is reported, not raised, and leaves lr5_* untouched
# unless the submission frame was already built successfully.
_promote_names = ('best_oof', 'best_test', 'best_loss')
if not all(_name in globals() for _name in _promote_names):
    print('[Promote] best_* globals not found; skip', flush=True)
elif any(globals()[_name] is None for _name in _promote_names):
    # np.array(None, copy=True) would quietly yield a 0-d object array, so a
    # None placeholder from the sweep cells must be rejected explicitly.
    print('[Promote] best_* globals are None (no finished sweep result); skip', flush=True)
else:
    try:
        # Stage everything in temporaries; lr5_* is only rebound once the
        # staged test predictions have produced a valid submission frame.
        _new_oof = np.array(best_oof, copy=True)
        _new_test = np.array(best_test, copy=True)
        _new_loss = float(best_loss)
        print('[Promote] Using best_tag:', globals().get('best_tag', 'unknown'), 'OOF=', _new_loss, flush=True)
        sub = pd.DataFrame(clip_and_renorm(_new_test), columns=classes)
        sub.insert(0, 'id', test_ids)
        lr5_oof, lr5_test, lr5_loss = _new_oof, _new_test, _new_loss
        sub.to_csv('submission.csv', index=False)
        print('[Promote] lr5_* set and submission.csv saved', sub.shape, flush=True)
    except Exception as e:
        # Deliberately broad: this cell must never abort the notebook run.
        print('[Promote] Failed:', e, flush=True)