In [17]:
# Image Embeddings Extraction (ConvNeXt-Tiny fb_in22k) -> save train/test .npy + missing flags
import os, sys, time, math, subprocess, gc, random, warnings
from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image, ImageOps

# Wall-clock start of this cell; used by the final runtime print.
t0 = time.time()
SEED = 1337
# Seed the stdlib and NumPy global RNGs with the same value.
random.seed(SEED); np.random.seed(SEED)

# Prepare writable caches BEFORE importing timm/hf-hub to avoid read-only default at /app/.cache
# Both directories are resolved to absolute paths from the current working directory.
HF_CACHE = Path('./.hf_cache').absolute()
TORCH_CACHE = Path('./.torch_cache').absolute()
HF_CACHE.mkdir(parents=True, exist_ok=True)
TORCH_CACHE.mkdir(parents=True, exist_ok=True)
# Point each cache-related environment variable at the directories created above.
os.environ['HF_HOME'] = str(HF_CACHE)
os.environ['HUGGINGFACE_HUB_CACHE'] = str(HF_CACHE)
os.environ['HF_HUB_CACHE'] = str(HF_CACHE)
os.environ['TRANSFORMERS_CACHE'] = str(HF_CACHE)
os.environ['TORCH_HOME'] = str(TORCH_CACHE)

# Install PyTorch (CUDA 12.1) and timm if missing
def ensure_pkg():
    """Make torch, torchvision and timm importable, installing them via pip if not.

    The three packages are import-probed first; when the probe succeeds the
    function returns without touching pip. Any exception raised by the probe
    triggers the install sequence: upgrade pip, install the torch stack from
    the cu121 wheel index, then install timm. Each pip call uses
    ``check=True``, so a failing install raises ``CalledProcessError``.
    """
    try:
        import torch, torchvision, timm  # noqa
    except Exception:
        # Probe failed for any reason: fall through to the install steps.
        pass
    else:
        return

    pip_install = [sys.executable, '-m', 'pip', 'install', '-q']
    install_steps = (
        ['--upgrade', 'pip'],
        ['--index-url', 'https://download.pytorch.org/whl/cu121', 'torch', 'torchvision', 'torchaudio'],
        ['timm>=0.9.12'],
    )
    print('Installing torch/torchvision/torchaudio (cu121) and timm...', flush=True)
    for extra_args in install_steps:
        subprocess.run(pip_install + extra_args, check=True)
    print('Install complete.', flush=True)
# Run the install check before the torch/timm imports below.
ensure_pkg()

import torch
import torch.backends.cudnn as cudnn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import timm

# Determinism and performance knobs
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
cudnn.deterministic = True
cudnn.benchmark = False
# Best-effort: any failure of this call is ignored and the current setting is kept.
try:
    torch.set_float32_matmul_precision('high')
except Exception:
    pass

# Use CUDA when torch reports it as available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device, flush=True)

# Inputs are read from the current directory: train.csv, test.csv and an images/ folder.
DATA_DIR = Path('.')
IMG_DIR = DATA_DIR / 'images'
train_df = pd.read_csv(DATA_DIR/'train.csv')
test_df = pd.read_csv(DATA_DIR/'test.csv')
id_col = 'id'; target_col = 'species'
# The id lists fix the row order of the embeddings and flags produced later in this cell.
train_ids = train_df[id_col].tolist()
test_ids = test_df[id_col].tolist()
print(f'train n={len(train_ids)}, test n={len(test_ids)}', flush=True)

# Map id -> image path
def id_to_path(x):
    """Return the path of the JPEG for sample id *x* (coerced to int) under IMG_DIR."""
    file_name = '{}.jpg'.format(int(x))
    return IMG_DIR / file_name

# Quick sanity of paths
# Each missing file contributes 1 to the sum, so these are counts of absent images.
missing_train = sum([not id_to_path(i).exists() for i in train_ids])
missing_test = sum([not id_to_path(i).exists() for i in test_ids])
print(f'Missing files - train: {missing_train}, test: {missing_test}', flush=True)

# Transforms: invert grayscale -> RGB, resize 224, normalize ImageNet
# NOTE: the inversion and RGB conversion are done in LeafDataset.__getitem__;
# the Compose below only resizes, converts to a tensor and normalizes.
img_size = 224
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]
to_tensor = T.Compose([
    T.Resize((img_size, img_size), interpolation=T.InterpolationMode.BILINEAR, antialias=True),
    T.ToTensor(),
    T.Normalize(mean=imagenet_mean, std=imagenet_std),
])

class LeafDataset(Dataset):
    """Dataset yielding ``(image_tensor, miss_flag)`` for a list of sample ids.

    Each id is mapped to a file with ``id_to_path``. The image is loaded as
    grayscale, inverted, converted to RGB and passed through ``to_tensor``.
    If anything in that chain raises, the item falls back to an all-zero
    ``(3, img_size, img_size)`` float32 tensor with ``miss_flag = 1``.
    """

    def __init__(self, ids):
        # Copy into a list so any iterable of ids supports indexing and len().
        self.ids = list(ids)

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        """Return ``(tensor, miss_flag)`` for position *idx*; ``miss_flag`` is 0 or 1."""
        path = id_to_path(self.ids[idx])
        try:
            # The context manager closes the file handle even when decoding
            # fails part-way, instead of leaving it open until garbage collection.
            with Image.open(path) as raw:
                gray = raw.convert('L')
            # Invert the grayscale image (original note: background -> white,
            # object -> black; this depends on the source images).
            img = ImageOps.invert(gray).convert('RGB')
            tensor = to_tensor(img)
        except Exception:
            # Deliberate best-effort: an unreadable or missing image becomes a
            # zero tensor and is reported to the caller through the flag.
            return torch.zeros(3, img_size, img_size, dtype=torch.float32), 1
        return tensor, 0

# Model: ConvNeXt-Tiny fb_in22k, feature extractor (768-d)
model_name = 'convnext_tiny.fb_in22k'
# num_classes=0 with global_pool='avg' is requested so the model acts as a feature
# extractor -- presumably returning pooled features; confirm against the timm docs.
model = timm.create_model(model_name, pretrained=True, num_classes=0, global_pool='avg', cache_dir=str(HF_CACHE))
model.eval().to(device)
# Fall back to 768 when the model object has no `num_features` attribute.
feat_dim = model.num_features if hasattr(model, 'num_features') else 768
print(f'Model {model_name} -> feature dim {feat_dim}', flush=True)

def extract_embeddings(ids, split_name='train', batch_size=128, num_workers=8):
    """Run the global ``model`` over the images of *ids* and return their embeddings.

    Args:
        ids: iterable of sample ids; output rows follow this order
            (the loader uses ``shuffle=False`` and ``drop_last=False``).
        split_name: label used only in progress messages.
        batch_size: DataLoader batch size.
        num_workers: DataLoader worker count.

    Returns:
        ``(embs, miss)``: ``embs`` is a float32 array of shape
        ``(len(ids), feat_dim)`` with each row L2-normalized; ``miss`` is a
        uint8 array of the per-sample flags produced by ``LeafDataset``.

    Raises:
        RuntimeError: re-raised unchanged from the forward pass. For an
            out-of-memory error a diagnostic line is printed first. This
            function never lowers the batch size itself; the caller retries.
    """
    on_cuda = device.type == 'cuda'
    ds = LeafDataset(ids)
    # Pinned host memory only helps host-to-GPU copies; asking for it on a
    # CPU-only machine merely triggers a warning.
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=num_workers,
                    pin_memory=on_cuda, drop_last=False)
    n = len(ds)
    n_batches = len(dl)
    embs = np.zeros((n, feat_dim), dtype=np.float32)
    miss = np.zeros((n,), dtype=np.uint8)
    seen = 0
    start = time.time()
    # Autocast settings do not change between batches, so resolve them once.
    amp_device = 'cuda' if on_cuda else 'cpu'
    amp_dtype = torch.float16 if on_cuda else torch.bfloat16
    with torch.inference_mode():
        for bi, (imgs, flags) in enumerate(dl):
            bt0 = time.time()
            try:
                with torch.autocast(device_type=amp_device, dtype=amp_dtype):
                    imgs = imgs.to(device, non_blocking=True)
                    feats = model(imgs)
                    feats = feats.float()
            except RuntimeError as e:
                if 'out of memory' in str(e).lower():
                    # Report what actually happens: this pass is abandoned and
                    # the error propagates so the caller can pick a smaller batch.
                    print(f'[{split_name}] OOM encountered at batch_size={batch_size}; aborting this pass', flush=True)
                raise
            bs = feats.shape[0]
            embs[seen:seen+bs] = feats.detach().cpu().numpy()
            miss[seen:seen+bs] = flags.numpy().astype(np.uint8)
            seen += bs
            if (bi % 10) == 0 or bi == n_batches-1:
                print(f'[{split_name}] batch {bi+1}/{n_batches} | seen {seen}/{n} | elapsed {(time.time()-start):.1f}s (batch {(time.time()-bt0):.2f}s)', flush=True)
    # L2 normalize row-wise; the epsilon guards against division by zero.
    norms = np.linalg.norm(embs, axis=1, keepdims=True) + 1e-12
    embs = embs / norms
    return embs, miss

# Try the larger batch size first; on an out-of-memory RuntimeError, free cached
# memory and rerun BOTH splits with batch_size=64. Other RuntimeErrors propagate.
bs = 128
try:
    tr_embs, tr_miss = extract_embeddings(train_ids, 'train', batch_size=bs, num_workers=8)
    te_embs, te_miss = extract_embeddings(test_ids, 'test', batch_size=bs, num_workers=8)
except RuntimeError as e:
    if 'out of memory' in str(e).lower():
        torch.cuda.empty_cache(); gc.collect()
        print('Retrying with batch_size=64 after OOM...', flush=True)
        tr_embs, tr_miss = extract_embeddings(train_ids, 'train', batch_size=64, num_workers=8)
        te_embs, te_miss = extract_embeddings(test_ids, 'test', batch_size=64, num_workers=8)
    else:
        raise

# Save artifacts with id alignment
# The .npy rows follow train_ids / test_ids order; the flag CSVs store those ids next to each flag.
np.save('train_img_emb.npy', tr_embs)
np.save('test_img_emb.npy', te_embs)
pd.DataFrame({id_col: train_ids, 'img_missing': tr_miss}).to_csv('train_img_flags.csv', index=False)
pd.DataFrame({id_col: test_ids, 'img_missing': te_miss}).to_csv('test_img_flags.csv', index=False)
print('Saved embeddings and flags:',
      'train_img_emb.npy', tr_embs.shape,
      'test_img_emb.npy', te_embs.shape, flush=True)

# Memory cleanup
del model; torch.cuda.empty_cache(); gc.collect()
print(f'Embedding extraction done in {(time.time()-t0)/60:.1f} min')

Device: cuda


train n=891, test n=99


Missing files - train: 0, test: 0


Model convnext_tiny.fb_in22k -> feature dim 768


[train] batch 1/7 | seen 128/891 | elapsed 3.0s (batch 0.65s)


[test] batch 1/1 | seen 99/99 | elapsed 1.4s (batch 0.13s)


Saved embeddings and flags: train_img_emb.npy (891, 768) test_img_emb.npy (99, 768)


Embedding extraction done in 0.1 min


In [13]:
# Leaf Classification - Fix pipelines, add PT-LDA eigen sweep, SVC(PT), stack with meta LR, per-fold TS later
import os, sys, time, random, subprocess, io, contextlib, warnings
import numpy as np
import pandas as pd
from pathlib import Path

# Seed the stdlib and NumPy global RNGs; SEED is also passed to the estimators and splitters below.
SEED = 1337
random.seed(SEED); np.random.seed(SEED)

# Wall-clock start of this cell; used by the runtime prints at the end.
t0 = time.time()
print('Starting run...')

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

# LightGBM import with auto-install if missing
# Any import failure triggers one pip install followed by a second import attempt.
try:
    import lightgbm as lgb
    from lightgbm import LGBMClassifier
except Exception as e:
    print('Installing lightgbm...', flush=True)
    subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'lightgbm'], check=True)
    import lightgbm as lgb
    from lightgbm import LGBMClassifier

# Silence LightGBM excessive warnings/logs
warnings.filterwarnings('ignore', message='.*No further splits with positive gain.*')

DATA_DIR = Path('.')
train_path = DATA_DIR/'train.csv'
test_path = DATA_DIR/'test.csv'
ss_path = DATA_DIR/'sample_submission.csv'

# Load data
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
ss = pd.read_csv(ss_path)
print(f'train shape: {train.shape}, test shape: {test.shape}', flush=True)

# Columns
# Every train column other than the id and the target is treated as a feature.
id_col = 'id'; target_col = 'species'
feature_cols = [c for c in train.columns if c not in [id_col, target_col]]
print(f'Number of features: {len(feature_cols)}')

# Target encoding
le = LabelEncoder()
y = le.fit_transform(train[target_col])
classes = list(le.classes_); n_classes = len(classes)
print(f'Number of classes: {n_classes}')

# Submission column order sanity
# Only set equality is checked; the sample-submission column ORDER is kept in
# submission_cols and applied when the submission frame is built.
ss_cols = [c for c in ss.columns if c != id_col]
if set(ss_cols) != set(classes):
    raise ValueError('Sample submission class columns do not match training classes')
submission_cols = ss_cols.copy()

# Matrices
X = train[feature_cols].values
X_test = test[feature_cols].values

# CV (min class count ~6)
n_splits = 6
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

def clip_and_normalize(P):
    """Clip every entry of *P* to [1e-15, 1 - 1e-15], then rescale each row to sum to 1."""
    eps = 1e-15
    clipped = np.clip(P, eps, 1 - eps)
    return clipped / clipped.sum(axis=1, keepdims=True)

def logloss_with_clip(y_true, y_pred):
    """Return the log loss of *y_pred* after clip-and-renormalize, over labels 0..n_classes-1."""
    label_ids = np.arange(n_classes)
    return log_loss(y_true, clip_and_normalize(y_pred), labels=label_ids)

def temp_scale_probs(P, T):
    """Temperature-scale probabilities: raise clipped *P* to the power 1/T and renormalize rows."""
    exponent = 1.0/float(T)
    sharpened = np.power(np.clip(P, 1e-15, 1-1e-15), exponent)
    row_totals = sharpened.sum(axis=1, keepdims=True)
    return sharpened / row_totals

def run_model_pipeline(name, pipe_factory, X, y, X_test, skf):
    """Cross-validate a freshly built pipeline and collect OOF and test probabilities.

    Args:
        name: label used in progress messages.
        pipe_factory: zero-argument callable returning an unfitted estimator
            that supports ``fit`` and ``predict_proba``.
        X, y: training matrix and integer-encoded labels.
        X_test: test matrix, predicted once per fold.
        skf: splitter providing ``split(X, y)`` and ``get_n_splits()``.

    Returns:
        ``(oof, tst, oof_loss)``: out-of-fold probabilities of shape
        ``(len(X), n_classes)``, fold-averaged test probabilities of shape
        ``(len(X_test), n_classes)``, and the log loss of ``oof`` against *y*.
    """
    # Take the fold count from the splitter actually iterated, so the test
    # average and progress text stay correct for any `skf`, not only the
    # module-level one that matches the global `n_splits`.
    k = skf.get_n_splits()
    print(f'\n=== Running {name} with {k} folds ===')
    oof = np.zeros((len(X), n_classes), dtype=np.float32)
    tst = np.zeros((len(X_test), n_classes), dtype=np.float32)
    fold_losses = []
    start = time.time()
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        f_t0 = time.time()
        X_tr, X_va = X[tr_idx], X[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        print(f'[{name}] Fold {fold}/{k} - train: {len(tr_idx)}, valid: {len(va_idx)}', flush=True)
        pipe = pipe_factory()
        pipe.fit(X_tr, y_tr)
        proba_va = pipe.predict_proba(X_va)
        loss = logloss_with_clip(y_va, proba_va)
        oof[va_idx] = proba_va
        # Each fold contributes 1/k of the final test prediction.
        tst += pipe.predict_proba(X_test) / k
        fold_losses.append(loss)
        print(f'[{name}] Fold {fold} logloss: {loss:.6f}; elapsed fold {(time.time()-f_t0):.1f}s; total {(time.time()-start):.1f}s', flush=True)
    oof_loss = logloss_with_clip(y, oof)
    print(f'[{name}] CV fold logloss: ' + ', '.join([f"{v:.6f}" for v in fold_losses]))
    print(f'[{name}] OOF CV logloss: {oof_loss:.6f}')
    return oof, tst, oof_loss

def run_model_pipeline_with_guard(name, pipe_factory, X, y, X_test, skf, abort_threshold=0.6):
    """Cross-validate like ``run_model_pipeline`` but stop early on a clearly bad model.

    After the second fold, if both of the first two fold losses exceed
    *abort_threshold*, the remaining folds are skipped.

    Args:
        name: label used in progress messages.
        pipe_factory: zero-argument callable returning an unfitted estimator
            that supports ``fit`` and ``predict_proba``.
        X, y: training matrix and integer-encoded labels.
        X_test: test matrix, predicted once per completed fold.
        skf: splitter providing ``split(X, y)`` and ``get_n_splits()``.
        abort_threshold: per-fold log loss above which the first two folds
            count as bad.

    Returns:
        ``(oof, tst, oof_loss)``. On early abort ``oof_loss`` is the sentinel
        ``1e3`` and ``oof`` / ``tst`` hold only the contribution of the two
        completed folds (unfilled OOF rows stay zero).
    """
    # Use the fold count of the splitter actually iterated rather than the
    # global `n_splits`, so averaging is right for any `skf` passed in.
    k = skf.get_n_splits()
    print(f'\n=== Running {name} (guarded) with {k} folds ===')
    oof = np.zeros((len(X), n_classes), dtype=np.float32)
    tst = np.zeros((len(X_test), n_classes), dtype=np.float32)
    fold_losses = []
    start = time.time()
    bad_first_two = False
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        f_t0 = time.time()
        X_tr, X_va = X[tr_idx], X[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]
        print(f'[{name}] Fold {fold}/{k} - train: {len(tr_idx)}, valid: {len(va_idx)}', flush=True)
        pipe = pipe_factory()
        pipe.fit(X_tr, y_tr)
        proba_va = pipe.predict_proba(X_va)
        loss = logloss_with_clip(y_va, proba_va)
        oof[va_idx] = proba_va
        tst += pipe.predict_proba(X_test) / k
        fold_losses.append(loss)
        print(f'[{name}] Fold {fold} logloss: {loss:.6f}; elapsed fold {(time.time()-f_t0):.1f}s; total {(time.time()-start):.1f}s', flush=True)
        # The guard is evaluated exactly once, right after the second fold.
        if fold == 2 and all(v > abort_threshold for v in fold_losses[:2]):
            bad_first_two = True
            print(f'[{name}] Early abort: first two folds > {abort_threshold}. Skipping remaining folds.')
            break
    if bad_first_two:
        return oof, tst, 1e3
    oof_loss = logloss_with_clip(y, oof)
    print(f'[{name}] CV fold logloss: ' + ', '.join([f"{v:.6f}" for v in fold_losses]))
    print(f'[{name}] OOF CV logloss: {oof_loss:.6f}')
    return oof, tst, oof_loss

# Pipelines
# 1) Logistic Regression (anchor, L2)
def make_pipeline_lr(C=1.0, max_iter=5000):
    """Build an unfitted StandardScaler -> lbfgs LogisticRegression pipeline."""
    classifier = LogisticRegression(
        solver='lbfgs',
        C=C,
        max_iter=max_iter,
        n_jobs=-1,
        random_state=SEED,
    )
    steps = [('scaler', StandardScaler()), ('clf', classifier)]
    return Pipeline(steps)

# 1b) Logistic Regression elastic-net (optional)
RUN_LR_EN = False  # disable slow elastic-net grid by default
def make_pipeline_lr_en(C=1.0, l1_ratio=0.25, max_iter=5000):
    """Build an unfitted StandardScaler -> elastic-net (saga) LogisticRegression pipeline."""
    classifier = LogisticRegression(
        solver='saga',
        penalty='elasticnet',
        l1_ratio=l1_ratio,
        C=C,
        max_iter=max_iter,
        n_jobs=-1,
        random_state=SEED,
    )
    steps = [('scaler', StandardScaler()), ('clf', classifier)]
    return Pipeline(steps)

# 2) LDA with PCA whiten: StandardScaler -> PCA -> LDA
def make_pipeline_lda_pca(n_comp):
    """Build an unfitted StandardScaler -> whitening PCA(n_comp) -> shrinkage LDA pipeline."""
    reducer = PCA(n_components=n_comp, whiten=True, svd_solver='full', random_state=SEED)
    classifier = LDA(solver='lsqr', shrinkage='auto')
    steps = [
        ('scaler', StandardScaler()),
        ('pca', reducer),
        ('clf', classifier),
    ]
    return Pipeline(steps)

# 3) LDA PT variants (PT already standardizes; no extra scaler)
def make_pipeline_lda_pt_lsqr():
    """Build an unfitted Yeo-Johnson PowerTransformer -> LDA(lsqr, auto shrinkage) pipeline."""
    transformer = PowerTransformer(method='yeo-johnson', standardize=True)
    classifier = LDA(solver='lsqr', shrinkage='auto')
    return Pipeline([('pt', transformer), ('clf', classifier)])

def make_pipeline_lda_pt_eigen(shrink=0.2):
    """Build an unfitted Yeo-Johnson PowerTransformer -> LDA(eigen, shrinkage=shrink) pipeline."""
    transformer = PowerTransformer(method='yeo-johnson', standardize=True)
    classifier = LDA(solver='eigen', shrinkage=shrink)
    return Pipeline([('pt', transformer), ('clf', classifier)])

# 4) SVM RBF with PowerTransformer (NO PCA)
def make_pipeline_svm_pt(C=1.0, gamma='scale', class_weight=None):
    """Build an unfitted Yeo-Johnson PowerTransformer -> RBF SVC (probability=True) pipeline."""
    transformer = PowerTransformer(method='yeo-johnson', standardize=True)
    classifier = SVC(
        kernel='rbf',
        C=C,
        gamma=gamma,
        class_weight=class_weight,
        probability=True,
        cache_size=2000,
        random_state=SEED,
    )
    return Pipeline([('pt', transformer), ('clf', classifier)])

# 5) (Dropped) GaussianNB after PT - harmful
def make_pipeline_gnb_pt():
    """Build an unfitted Yeo-Johnson PowerTransformer -> GaussianNB pipeline."""
    transformer = PowerTransformer(method='yeo-johnson', standardize=True)
    steps = [('pt', transformer), ('clf', GaussianNB())]
    return Pipeline(steps)

# RUN BASE MODELS
# Each section sweeps one model family, keeps the variant with the lowest OOF
# log loss, and appends it to base_models only if it passes that family's threshold.
base_models = []  # list of tuples (name, oof, tst, oof_loss)

# A) LR L2 grid
best_lr = None; best_lr_oof=None; best_lr_tst=None; best_lr_loss=1e9
for C in [0.5, 1.0, 2.0]:
    # The default argument binds the current C at definition time.
    def lr_factory(cc=C): return make_pipeline_lr(cc, max_iter=5000)
    oof_l, tst_l, loss_l = run_model_pipeline(f'LR_L2(C={C})', lr_factory, X, y, X_test, skf)
    if loss_l < best_lr_loss:
        best_lr_loss = loss_l; best_lr = C; best_lr_oof=oof_l; best_lr_tst=tst_l
print(f'Best LR_L2: C={best_lr}, OOF={best_lr_loss:.6f}')
if best_lr_loss <= 0.12: base_models.append((f'LR_L2_C{best_lr}', best_lr_oof, best_lr_tst, best_lr_loss))

# B) LR elastic-net (optional, only if enabled and competitive)
best_lren = (None, None, None, 1e9)
if RUN_LR_EN:
    for C in [0.5, 1.0, 2.0]:
        for l1r in [0.0, 0.25]:
            def lren_factory(cc=C, ll=l1r): return make_pipeline_lr_en(cc, ll, max_iter=5000)
            oof_e, tst_e, loss_e = run_model_pipeline(f'LR_EN(C={C},l1={l1r})', lren_factory, X, y, X_test, skf)
            if loss_e < best_lren[3]: best_lren = (f'LR_EN_C{C}_l1{l1r}', oof_e, tst_e, loss_e)
    print(f'Best LR_EN: {best_lren[0]} OOF={best_lren[3]:.6f}')
    if best_lren[3] <= 0.12: base_models.append(best_lren)

# C) LDA PT lsqr
oof_ptl, tst_ptl, loss_ptl = run_model_pipeline('LDA_PT_lsqr', make_pipeline_lda_pt_lsqr, X, y, X_test, skf)
if loss_ptl <= 0.20: base_models.append(('LDA_PT_lsqr', oof_ptl, tst_ptl, loss_ptl))

# D) LDA PT eigen shrink sweep (expanded grid)
best_pte = (None, None, None, 1e9)
for s in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4]:
    def pte_factory(sshr=s): return make_pipeline_lda_pt_eigen(sshr)
    oof_pe, tst_pe, loss_pe = run_model_pipeline(f'LDA_PT_eigen(sh={s})', pte_factory, X, y, X_test, skf)
    if loss_pe < best_pte[3]: best_pte = (f'LDA_PT_eigen_{s}', oof_pe, tst_pe, loss_pe)
print(f'Best LDA_PT_eigen: {best_pte[0]} OOF={best_pte[3]:.6f}')
if best_pte[3] <= 0.20: base_models.append(best_pte)

# E) LDA PCA (small sweep) - keep only if beats PT-LDA
best_ldap = (None, None, None, 1e9)
for nc in [64, 80, 96]:
    def ldap_factory(ncc=nc): return make_pipeline_lda_pca(ncc)
    oof_la, tst_la, loss_la = run_model_pipeline(f'LDA_PCA(n={nc})', ldap_factory, X, y, X_test, skf)
    if loss_la < best_ldap[3]: best_ldap = (f'LDA_PCA_{nc}', oof_la, tst_la, loss_la)
print(f'Best LDA_PCA: {best_ldap[0]} OOF={best_ldap[3]:.6f}')
# Kept only when no worse than both PT-LDA variants and the 0.20 cap.
if best_ldap[3] <= min(loss_ptl, best_pte[3], 0.20): base_models.append(best_ldap)

# F) SVM PT (no PCA): C x gamma grid; keep only if <=0.12
RUN_SVM_PT = False  # disable for now due to poor performance; re-enable if needed
best_svm = (None, None, None, 1e9)
if RUN_SVM_PT:
    svm_C_grid = [1.0, 3.0, 10.0, 30.0, 100.0]
    svm_gamma_grid = ['scale', 0.002, 0.004, 0.008, 0.016, 0.032]
    def run_svm_variant(Cval, gval, cw):
        def svm_factory(cc=Cval, gg=gval, cw_=cw): return make_pipeline_svm_pt(C=cc, gamma=gg, class_weight=cw_)
        tag = f'SVM_PT(C={Cval},g={gval},cw={cw})'
        # Guarded runner: returns the sentinel loss 1e3 when the first two folds are both bad.
        return run_model_pipeline_with_guard(tag, svm_factory, X, y, X_test, skf, abort_threshold=0.6)
    for Cval in svm_C_grid:
        for gval in svm_gamma_grid:
            oof_s, tst_s, loss_s = run_svm_variant(Cval, gval, None)
            if loss_s < best_svm[3]: best_svm = (f'SVM_PT_C{Cval}_g{gval}_cwNone', oof_s, tst_s, loss_s)
    print(f'Best SVM_PT: {best_svm[0]} OOF={best_svm[3]:.6f}')
    if best_svm[3] <= 0.12: base_models.append(best_svm)

# G) LightGBM (tree diversity) with early stopping; compact regularized grid
best_lgb = (None, None, None, 1e9)
# Full cartesian product: 2 x 2 x 2 = 8 parameter dicts.
lgb_param_grid = []
for num_leaves in [31, 63]:
    for max_depth in [4, 6]:
        for lambda_l2 in [5, 10]:
            lgb_param_grid.append({'num_leaves': num_leaves, 'max_depth': max_depth, 'lambda_l2': lambda_l2})

def run_lgbm_variant(params):
    """Cross-validate one LightGBM configuration with per-fold early stopping.

    *params* must provide ``num_leaves``, ``max_depth`` and ``lambda_l2``.
    Within every CV fold, 20% of the training part is held out (stratified)
    as the early-stopping set; predictions use the best iteration found.

    Returns ``(name, oof, tst, oof_loss)``: the run label, out-of-fold
    probabilities, fold-averaged test probabilities, and the OOF log loss.
    """
    name = f"LGBM(leaves={params['num_leaves']},depth={params['max_depth']},l2={params['lambda_l2']})"
    print(f"\n=== Running {name} with {n_splits} folds ===")
    oof_pred = np.zeros((len(X), n_classes), dtype=np.float32)
    test_pred = np.zeros((len(X_test), n_classes), dtype=np.float32)
    per_fold = []
    run_started = time.time()
    for fold, (fit_rows, hold_rows) in enumerate(skf.split(X, y), 1):
        fold_started = time.time()
        y_fold = y[fit_rows]
        # Carve the early-stopping subset out of this fold's training rows.
        X_fit, X_stop, y_fit, y_stop = train_test_split(
            X[fit_rows], y_fold,
            test_size=0.2, stratify=y_fold, random_state=SEED+fold,
        )
        booster = LGBMClassifier(
            objective='multiclass', num_class=n_classes,
            learning_rate=0.05, n_estimators=2000,
            num_leaves=params['num_leaves'], max_depth=params['max_depth'],
            min_child_samples=20,
            subsample=0.8, subsample_freq=1,
            colsample_bytree=0.8,
            reg_lambda=params['lambda_l2'], reg_alpha=0.0,
            random_state=SEED, n_jobs=-1, verbose=-1
        )
        stop_callbacks = [
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(period=0),
        ]
        # Swallow anything LightGBM writes to stdout/stderr while fitting.
        with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()):
            booster.fit(
                X_fit, y_fit,
                eval_set=[(X_stop, y_stop)],
                eval_metric='multi_logloss',
                callbacks=stop_callbacks,
            )
        best_iter = booster.best_iteration_
        hold_proba = booster.predict_proba(X[hold_rows], num_iteration=best_iter)
        fold_loss = logloss_with_clip(y[hold_rows], hold_proba)
        oof_pred[hold_rows] = hold_proba
        test_pred += booster.predict_proba(X_test, num_iteration=best_iter) / n_splits
        per_fold.append(fold_loss)
        print(f'[{name}] Fold {fold} logloss: {fold_loss:.6f}; best_iter={best_iter}; elapsed fold {(time.time()-fold_started):.1f}s; total {(time.time()-run_started):.1f}s', flush=True)
    oof_loss = logloss_with_clip(y, oof_pred)
    print(f'[{name}] CV fold logloss: ' + ', '.join([f"{v:.6f}" for v in per_fold]))
    print(f'[{name}] OOF CV logloss: {oof_loss:.6f}')
    return name, oof_pred, test_pred, oof_loss

# Sweep the LightGBM grid and keep the configuration with the lowest OOF log loss.
for i, params in enumerate(lgb_param_grid, 1):
    print(f'Grid {i}/{len(lgb_param_grid)}: {params}', flush=True)
    name, oof_lgb, tst_lgb, loss_lgb = run_lgbm_variant(params)
    if loss_lgb < best_lgb[3]: best_lgb = (name, oof_lgb, tst_lgb, loss_lgb)
print(f'Best LGBM: {best_lgb[0]} OOF={best_lgb[3]:.6f}')
if best_lgb[3] <= 0.12: base_models.append(best_lgb)

print('Selected base models:', [(m[0], round(m[3],6)) for m in base_models])
# Stacking needs at least two base models; otherwise write the best LR_L2 test
# predictions as the submission and stop the cell via SystemExit.
if len(base_models) < 2:
    print('Not enough competitive base models; falling back to best LR_L2 for submission.')
    final_tst = best_lr_tst
    sub = pd.DataFrame(test[id_col])
    # Columns are labelled with the decoded class names, then reordered to the sample-submission order.
    proba_df = pd.DataFrame(final_tst, columns=le.inverse_transform(np.arange(n_classes)))
    proba_df = proba_df[submission_cols]
    sub = pd.concat([sub, proba_df], axis=1)
    sub.to_csv('submission.csv', index=False)
    print('Saved submission.csv'); print(sub.head()); print(f'Total runtime: {(time.time()-t0)/60:.1f} min')
    raise SystemExit

# Clip-normalize base probabilities before stacking
base_models_clipped = []
for name, oof_m, tst_m, loss_m in base_models:
    base_models_clipped.append((name, clip_and_normalize(oof_m), clip_and_normalize(tst_m), loss_m))

# Build stacked features
# Each base model contributes its n_classes probability columns, concatenated side by side.
X_stack = np.hstack([m[1] for m in base_models_clipped])
X_test_stack = np.hstack([m[2] for m in base_models_clipped])
print(f'Stack features shape: {X_stack.shape}, test: {X_test_stack.shape}')

# Second-level CV for meta LR (L2), grid over C
def fit_meta_and_oof(C):
    """Return out-of-fold probabilities of an L2 LogisticRegression meta-model on X_stack.

    A new meta-model with inverse regularization strength *C* is fitted per
    fold of the global ``skf`` (split on ``X``/``y``) and used to predict the
    held-out rows of ``X_stack``.
    """
    meta_oof = np.zeros((len(X), n_classes), dtype=np.float32)
    started = time.time()
    for fold, (fit_rows, hold_rows) in enumerate(skf.split(X, y), 1):
        meta = LogisticRegression(
            solver='lbfgs', C=C, penalty='l2',
            max_iter=5000, n_jobs=-1, random_state=SEED,
        )
        meta.fit(X_stack[fit_rows], y[fit_rows])
        meta_oof[hold_rows] = meta.predict_proba(X_stack[hold_rows])
        # Progress is reported on even-numbered folds only.
        if fold % 2 == 0:
            print(f'[META C={C}] fold {fold} done; elapsed {(time.time()-started):.1f}s', flush=True)
    return meta_oof

# Grid over the meta-model C; best_meta holds (C, OOF log loss, OOF probabilities).
best_meta = (None, 1e9, None)
for Cmeta in [0.3, 1.0, 3.0, 10.0]:
    meta_oof = fit_meta_and_oof(Cmeta)
    loss_meta = logloss_with_clip(y, meta_oof)
    print(f'Meta LR(C={Cmeta}) OOF: {loss_meta:.6f}')
    if loss_meta < best_meta[1]: best_meta = (Cmeta, loss_meta, meta_oof)
best_Cmeta, best_meta_oof_loss, best_meta_oof = best_meta
print(f'Best Meta C={best_Cmeta} OOF={best_meta_oof_loss:.6f}')

# Refit meta on full stacked features and predict test
meta_final = LogisticRegression(solver='lbfgs', C=best_Cmeta, penalty='l2', max_iter=5000, n_jobs=-1, random_state=SEED)
meta_final.fit(X_stack, y)
meta_test = meta_final.predict_proba(X_test_stack)

# Temperature scaling on final stack only (global T for now; per-fold TS to be added later)
# T is chosen on the meta OOF predictions; T=1.0 (no scaling) is kept unless some T strictly lowers the OOF loss.
# NOTE(review): the loop variable is named T; in a shared notebook namespace this
# rebinds any other global called T -- confirm nothing later relies on it.
best_T = 1.0; best_ts_loss = best_meta_oof_loss
for T in np.arange(0.5, 5.01, 0.05):
    ts_oof = temp_scale_probs(best_meta_oof, T)
    loss_T = logloss_with_clip(y, ts_oof)
    if loss_T < best_ts_loss:
        best_ts_loss = loss_T; best_T = float(T)
print(f'Best temperature T={best_T:.2f} improved OOF from {best_meta_oof_loss:.6f} to {best_ts_loss:.6f}')
meta_test_ts = temp_scale_probs(meta_test, best_T)

# Build submission from temperature-scaled stacked predictions
sub = pd.DataFrame(test[id_col])
# Label columns with decoded class names, then reorder to the sample-submission column order.
proba_df = pd.DataFrame(meta_test_ts, columns=le.inverse_transform(np.arange(n_classes)))
proba_df = proba_df[submission_cols]
sub = pd.concat([sub, proba_df], axis=1)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv')
print(sub.head())
print(f'Total runtime: {(time.time()-t0)/60:.1f} min')

In [None]:
# Fusion modeling: tabular + image embeddings + missing flag -> LR only + global TS, submit
import os, sys, time, random, gc, warnings
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Seed the stdlib and NumPy RNGs; SEED is also passed to the splitter and estimators below.
SEED = 1337
random.seed(SEED); np.random.seed(SEED)
t0 = time.time()
print('Starting fusion modeling (LR + TS)...')

DATA_DIR = Path('.')
train = pd.read_csv(DATA_DIR/'train.csv')
test = pd.read_csv(DATA_DIR/'test.csv')
ss = pd.read_csv(DATA_DIR/'sample_submission.csv')
id_col = 'id'; target_col = 'species'
# Every train column other than the id and the target is a tabular feature.
feature_cols = [c for c in train.columns if c not in [id_col, target_col]]
print(f'Loaded train {train.shape}, test {test.shape}; tabular feats: {len(feature_cols)}')

# Label encoding
le = LabelEncoder()
y = le.fit_transform(train[target_col])
classes = list(le.classes_); n_classes = len(classes)
ss_cols = [c for c in ss.columns if c != id_col]
# Raise explicitly instead of asserting: asserts are stripped under `python -O`,
# which would let a mismatched submission layout through unnoticed.
if set(ss_cols) != set(classes):
    raise ValueError('Submission cols mismatch')
submission_cols = ss_cols.copy()

# Load embeddings and missing flags
tr_emb = np.load('train_img_emb.npy')
te_emb = np.load('test_img_emb.npy')
_tr_flag_df = pd.read_csv('train_img_flags.csv')
_te_flag_df = pd.read_csv('test_img_flags.csv')
# Embeddings and flags are joined to the tabular rows purely by position, so
# verify that the saved artifacts match the current CSVs row for row before
# fusing; stale artifacts would otherwise silently misalign features.
for _split, _frame, _emb, _flag_df in (
    ('train', train, tr_emb, _tr_flag_df),
    ('test', test, te_emb, _te_flag_df),
):
    if len(_emb) != len(_frame) or len(_flag_df) != len(_frame):
        raise ValueError(
            f'{_split}: row count mismatch (csv={len(_frame)}, emb={len(_emb)}, flags={len(_flag_df)})'
        )
    if not np.array_equal(_flag_df[id_col].values, _frame[id_col].values):
        raise ValueError(f'{_split}: ids in the image flag file are not aligned with {_split}.csv')
tr_flag = _tr_flag_df['img_missing'].values.astype(np.float32).reshape(-1,1)
te_flag = _te_flag_df['img_missing'].values.astype(np.float32).reshape(-1,1)
print('Embeddings:', tr_emb.shape, te_emb.shape, 'flags:', tr_flag.shape, te_flag.shape)

# Build fused matrices
# Column layout: tabular features, then image embedding, then the missing-image flag.
X_tab = train[feature_cols].values.astype(np.float32)
X_test_tab = test[feature_cols].values.astype(np.float32)
X_fused = np.hstack([X_tab, tr_emb.astype(np.float32), tr_flag])
X_test_fused = np.hstack([X_test_tab, te_emb.astype(np.float32), te_flag])
print('Fused shapes:', X_fused.shape, X_test_fused.shape)

n_splits = 6
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

def clip_and_normalize(P):
    """Bound all probabilities to [1e-15, 1 - 1e-15] and renormalize each row to sum to 1."""
    lower, upper = 1e-15, 1 - 1e-15
    bounded = np.clip(P, lower, upper)
    row_totals = bounded.sum(axis=1, keepdims=True)
    return bounded / row_totals

def logloss_with_clip(y_true, y_pred):
    """Compute log loss on clip-normalized *y_pred* using the label set 0..n_classes-1."""
    safe_probs = clip_and_normalize(y_pred)
    all_labels = np.arange(n_classes)
    return log_loss(y_true, safe_probs, labels=all_labels)

def temp_scale_probs(P, T):
    """Apply temperature *T* to probabilities: clip, raise to 1/T, renormalize each row."""
    inv_temp = 1.0/float(T)
    bounded = np.clip(P, 1e-15, 1-1e-15)
    powered = np.power(bounded, inv_temp)
    return powered / powered.sum(axis=1, keepdims=True)

# Base: LR on fused (StandardScaler on full fused)
def run_lr_fused(C=1.0):
    """Cross-validate a scaled logistic regression on the fused feature matrix.

    A StandardScaler -> LogisticRegression(lbfgs) pipeline is fitted per fold
    of the global ``skf`` on ``X_fused``; the scaler is fitted inside each
    fold, so validation rows never influence the scaling statistics.

    Args:
        C: inverse regularization strength of the logistic regression.

    Returns:
        ``(oof, tst, oof_loss)``: out-of-fold probabilities, fold-averaged
        test probabilities for ``X_test_fused``, and the OOF log loss.
    """
    name = f'LR_fused_C{C}'
    print(f'\n=== {name} ===')
    oof = np.zeros((len(X_fused), n_classes), dtype=np.float32)
    tst = np.zeros((len(X_test_fused), n_classes), dtype=np.float32)
    losses = []
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_fused, y), 1):
        f0 = time.time()
        Xtr, Xva = X_fused[tr_idx], X_fused[va_idx]
        ytr, yva = y[tr_idx], y[va_idx]
        pipe = Pipeline([
            ('scaler', StandardScaler(with_mean=True, with_std=True)),
            # `multi_class` is deliberately not passed: it is deprecated in
            # recent scikit-learn, and lbfgs already optimizes the multinomial
            # loss for multiclass targets, so the fitted model is unchanged.
            ('clf', LogisticRegression(solver='lbfgs', C=C, max_iter=5000, n_jobs=-1, random_state=SEED))
        ])
        pipe.fit(Xtr, ytr)
        proba_va = pipe.predict_proba(Xva)
        loss = logloss_with_clip(yva, proba_va)
        oof[va_idx] = proba_va
        # Each fold contributes 1/n_splits of the final test prediction.
        tst += pipe.predict_proba(X_test_fused) / n_splits
        losses.append(loss)
        print(f'[LR C={C}] fold {fold}/{n_splits} loss {loss:.6f} elapsed {(time.time()-f0):.1f}s', flush=True)
    oof_loss = logloss_with_clip(y, oof)
    print(f'[LR C={C}] OOF {oof_loss:.6f}; folds: ' + ', '.join([f'{v:.6f}' for v in losses]))
    return oof, tst, oof_loss

# Grid over C; best_lr holds (C, OOF probabilities, OOF log loss) and
# best_lr_tst the matching fold-averaged test probabilities.
best_lr = (None, None, 1e9)
best_lr_tst = None
for C in [0.5, 1.0, 2.0, 3.0]:
    oof_l, tst_l, loss_l = run_lr_fused(C)
    if loss_l < best_lr[2]:
        best_lr = (C, oof_l, loss_l); best_lr_tst = tst_l
print(f'Best LR_fused C={best_lr[0]} OOF={best_lr[2]:.6f}')

# Global temperature scaling on LR OOF
lr_oof = clip_and_normalize(best_lr[1])
lr_tst = clip_and_normalize(best_lr_tst)
base_oof_loss = logloss_with_clip(y, lr_oof)
print('LR base OOF (post-clip):', base_oof_loss)

# T=1.0 (no scaling) is kept unless some T in the grid strictly lowers the OOF loss.
best_T = 1.0; best_ts_loss = base_oof_loss
for T in np.arange(0.5, 5.01, 0.05):
    ts_oof = temp_scale_probs(lr_oof, T)
    loss_T = logloss_with_clip(y, ts_oof)
    if loss_T < best_ts_loss:
        best_ts_loss = loss_T; best_T = float(T)
print(f'Best temperature T={best_T:.2f} OOF {best_ts_loss:.6f} (from {base_oof_loss:.6f})')

# Apply the temperature selected on OOF predictions to the test predictions.
final_test = temp_scale_probs(lr_tst, best_T)

# Build submission
sub = pd.DataFrame(test[id_col])
# Label columns with decoded class names, then reorder to the sample-submission column order.
proba_df = pd.DataFrame(final_test, columns=le.inverse_transform(np.arange(n_classes)))
proba_df = proba_df[submission_cols]
sub = pd.concat([sub, proba_df], axis=1)
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv')
print(sub.head())
print(f'Total fusion runtime: {(time.time()-t0)/60:.1f} min')

Starting fusion modeling (LR + TS)...
Loaded train (891, 194), test (99, 193); tabular feats: 192
Embeddings: (891, 768) (99, 768) flags: (891, 1) (99, 1)
Fused shapes: (891, 961) (99, 961)

=== LR_fused_C0.5 ===




[LR C=0.5] fold 1/6 loss 0.051223 elapsed 2.9s




[LR C=0.5] fold 2/6 loss 0.042543 elapsed 1.8s


