In [1]:
# Config & imports for stable, deterministic training
import os, sys, gc, time, math, json, random, warnings
from pathlib import Path
import numpy as np
import pandas as pd

# Global config
SEEDS = [7, 42, 2025]
PRIMARY_SEED = 42
N_FOLDS = 8
N_THREADS = 6  # cap threads for stability
HEARTBEAT_PATH = 'heartbeat.log'

def seed_everything(seed: int = PRIMARY_SEED):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    try:
        import torch
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    except Exception:
        pass

def heartbeat(msg: str):
    try:
        with open(HEARTBEAT_PATH, 'a') as f:
            f.write(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] {msg}\n")
    except Exception:
        pass

# Silence warnings/logs
warnings.filterwarnings('ignore')
os.environ['PYTHONWARNINGS'] = 'ignore'
os.environ['TQDM_DISABLE'] = '1'
os.environ['PIP_DISABLE_PIP_VERSION_CHECK'] = '1'
os.environ['OMP_NUM_THREADS'] = str(N_THREADS)
os.environ['OPENBLAS_NUM_THREADS'] = str(N_THREADS)
os.environ['MKL_NUM_THREADS'] = str(N_THREADS)
os.environ['VECLIB_MAXIMUM_THREADS'] = str(N_THREADS)
os.environ['NUMEXPR_NUM_THREADS'] = str(N_THREADS)

# Pandas display
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 200)

seed_everything(PRIMARY_SEED)
print('Config ready: SEEDS', SEEDS, '| N_FOLDS', N_FOLDS, '| N_THREADS', N_THREADS)
heartbeat('CONFIG INITIALIZED')

Config ready: SEEDS [7, 42, 2025] | N_FOLDS 8 | N_THREADS 6


In [2]:
# Utils: feature engineering, grouping, mm-lite features, encoders
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

def cell_volume(a, b, c, alpha_deg, beta_deg, gamma_deg):
    alpha = np.deg2rad(alpha_deg); beta = np.deg2rad(beta_deg); gamma = np.deg2rad(gamma_deg)
    ca, cb, cg = np.cos(alpha), np.cos(beta), np.cos(gamma)
    term = 1 + 2*ca*cb*cg - ca**2 - cb**2 - cg**2
    term = np.clip(term, 0, None)
    return a * b * c * np.sqrt(term)

def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    a, b, c = df['lattice_vector_1_ang'], df['lattice_vector_2_ang'], df['lattice_vector_3_ang']
    alpha, beta, gamma = df['lattice_angle_alpha_degree'], df['lattice_angle_beta_degree'], df['lattice_angle_gamma_degree']
    vol = cell_volume(a, b, c, alpha, beta, gamma)
    df['cell_volume'] = vol
    df['volume_per_atom'] = vol / df['number_of_total_atoms']
    df['atoms_per_volume'] = df['number_of_total_atoms'] / vol.replace(0, np.nan)
    # angles info
    for ang, s in [('alpha', alpha), ('beta', beta), ('gamma', gamma)]:
        df[f'cos_{ang}'] = np.cos(np.deg2rad(s))
        df[f'abs_{ang}_dev90'] = np.abs(s - 90.0)
    df['orthorhombicity'] = df[['abs_alpha_dev90','abs_beta_dev90','abs_gamma_dev90']].sum(axis=1)
    # fractions
    for el in ['al','ga','in']:
        df[f'percent_atom_{el}'] = df[f'percent_atom_{el}'].astype(float)
        df[f'frac_{el}'] = df[f'percent_atom_{el}'] / 100.0
    df['percent_atom_o'] = 100.0 - (df['percent_atom_al'] + df['percent_atom_ga'] + df['percent_atom_in'])
    df['frac_o'] = df['percent_atom_o'] / 100.0
    # cation weights
    frac_cat = (df['frac_al'] + df['frac_ga'] + df['frac_in']).replace(0, np.nan)
    df['w_al'] = (df['frac_al']/frac_cat).fillna(0.0)
    df['w_ga'] = (df['frac_ga']/frac_cat).fillna(0.0)
    df['w_in'] = (df['frac_in']/frac_cat).fillna(0.0)
    # Vegard baseline and bowing
    df['vegard_bg'] = 8.8*df['w_al'] + 4.8*df['w_ga'] + 2.9*df['w_in']
    df['bow_in'] = df['w_in']*(1.0 - df['w_in'])
    df['bow_ga'] = df['w_ga']*(1.0 - df['w_ga'])
    # logs
    df['log_vpa'] = np.log1p(df['volume_per_atom'].clip(lower=0))
    df['log_apv'] = np.log1p(df['atoms_per_volume'].clip(lower=0))
    df['log_oc'] = np.log1p((df['frac_o']/(frac_cat+1e-9)).clip(lower=0))
    df['log_in_over_al'] = np.log1p(((df['frac_in']+1e-6)/(df['frac_al']+1e-6)).clip(lower=0))
    # reduced lattice
    l = df['cell_volume'].replace(0, np.nan).pow(1/3)
    df['a_red'] = df['lattice_vector_1_ang']/l
    df['b_red'] = df['lattice_vector_2_ang']/l
    df['c_red'] = df['lattice_vector_3_ang']/l
    df.replace([np.inf,-np.inf], np.nan, inplace=True)
    return df

def compute_stoich_groups(df: pd.DataFrame):
    N = np.rint(df['number_of_total_atoms']/5.0).astype(int)
    n_cat = 2 * N
    frac_al = df['percent_atom_al']/100.0
    frac_ga = df['percent_atom_ga']/100.0
    frac_in = df['percent_atom_in']/100.0
    frac_cations_total = (frac_al + frac_ga + frac_in).replace(0, np.nan)
    w_al = (frac_al / frac_cations_total).clip(0,1).fillna(0)
    w_ga = (frac_ga / frac_cations_total).clip(0,1).fillna(0)
    w_in = (1.0 - w_al - w_ga).clip(0,1)
    n_al = np.rint(n_cat * w_al).astype(int)
    n_ga = np.rint(n_cat * w_ga).astype(int)
    n_in = (n_cat - n_al - n_ga).astype(int)
    n_o = 3 * N
    key = pd.Series(list(zip(N, n_al, n_ga, n_in))).astype(str)
    return key, N, n_al, n_ga, n_in, n_o

# mm-lite and extra low-cost features
def add_mm_lite_and_extras(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # mm-lite stoichiometry norms
    fa, fg, fi, fo = df['frac_al'], df['frac_ga'], df['frac_in'], df['frac_o']
    arr = np.stack([fa, fg, fi, fo], axis=1)
    df['sto_s2'] = np.sqrt((arr**2).sum(axis=1))
    df['sto_s3'] = np.cbrt(np.clip((arr**3).sum(axis=1), a_min=0, a_max=None))
    df['sto_s5'] = np.clip((arr**5).sum(axis=1), a_min=0, a_max=None)**(1/5)
    df['frac_max'] = arr.max(axis=1); df['frac_min'] = arr.min(axis=1); df['frac_range'] = df['frac_max'] - df['frac_min']
    # cation mix stats
    w = np.stack([df['w_al'], df['w_ga'], df['w_in']], axis=1)
    df['w_max'] = w.max(axis=1); df['w_min'] = w.min(axis=1); df['w_range'] = df['w_max'] - df['w_min']
    df['hhi_cation2'] = (w**2).sum(axis=1)
    # valence-orbital proxies
    s_map = {'Al':2,'Ga':2,'In':2,'O':2}; p_map = {'Al':1,'Ga':1,'In':1,'O':4}
    s_cat = df['w_al']*s_map['Al'] + df['w_ga']*s_map['Ga'] + df['w_in']*s_map['In']
    p_cat = df['w_al']*p_map['Al'] + df['w_ga']*p_map['Ga'] + df['w_in']*p_map['In']
    df['vo_cat_s_mean'] = s_cat; df['vo_cat_p_mean'] = p_cat
    df['vo_cat_p_frac'] = p_cat / (s_cat + p_cat + 1e-9); df['vo_cat_p_minus_s'] = p_cat - s_cat
    s_tot = fa*s_map['Al'] + fg*s_map['Ga'] + fi*s_map['In'] + fo*s_map['O']
    p_tot = fa*p_map['Al'] + fg*p_map['Ga'] + fi*p_map['In'] + fo*p_map['O']
    df['vo_tot_s_mean'] = s_tot; df['vo_tot_p_mean'] = p_tot
    df['vo_tot_p_frac'] = p_tot / (s_tot + p_tot + 1e-9); df['vo_tot_p_minus_s'] = p_tot - s_tot
    # physics-driven contrasts
    props = {
        'chi_pauling': {'Al':1.61,'Ga':1.81,'In':1.78,'O':3.44},
        'ionic_radius': {'Al':0.535,'Ga':0.62,'In':0.80,'O':1.38},
        'Z': {'Al':13,'Ga':31,'In':49,'O':8},
        'period': {'Al':3,'Ga':4,'In':5,'O':2},
        'group': {'Al':13,'Ga':13,'In':13,'O':16},
        'first_ionization_energy': {'Al':5.986,'Ga':5.999,'In':5.786,'O':13.618}
    }
    for name, tbl in props.items():
        ca, cg, ci, co = tbl['Al'], tbl['Ga'], tbl['In'], tbl['O']
        wmean = df['w_al']*ca + df['w_ga']*cg + df['w_in']*ci
        df[f'catw_{name}_mean'] = wmean
        df[f'catw_{name}_var'] = (df['w_al']*(ca-wmean)**2 + df['w_ga']*(cg-wmean)**2 + df['w_in']*(ci-wmean)**2)
    df['o_minus_catw_chi_pauling'] = 3.44 - df['catw_chi_pauling_mean']
    df['o_minus_catw_ionic_radius'] = 1.38 - df['catw_ionic_radius_mean']
    # low-cost extras
    df['inv_vpa'] = 1.0 / (df['volume_per_atom'] + 1e-6)
    df['vegard_bg_sq'] = df['vegard_bg']**2
    df['sqrt_vegard'] = np.sqrt(np.clip(df['vegard_bg'], a_min=0, a_max=None))
    df['log1p_vegard'] = np.log1p(np.clip(df['vegard_bg'], a_min=0, a_max=None))
    # pairwise cation-property ratios
    df['ratio_Z_over_ir'] = df['catw_Z_mean'] / (df['catw_ionic_radius_mean'] + 1e-9)
    df['ratio_chi_over_ir'] = df['catw_chi_pauling_mean'] / (df['catw_ionic_radius_mean'] + 1e-9)
    df['ratio_fie_over_chi'] = df['catw_first_ionization_energy_mean'] / (df['catw_chi_pauling_mean'] + 1e-9)
    # N interactions (assumes N exists later; safe fill if missing)
    if 'N' in df.columns:
        for col in ['vegard_bg','w_in','catw_chi_pauling_mean','catw_ionic_radius_mean']:
            df[f'N_x_{col}'] = df['N'] * df[col]
    # cation weight ratios
    eps = 1e-6
    df['w_al_over_in'] = (df['w_al']+eps)/(df['w_in']+eps)
    df['w_ga_over_in'] = (df['w_ga']+eps)/(df['w_in']+eps)
    df['w_al_over_ga'] = (df['w_al']+eps)/(df['w_ga']+eps)
    df['log1p_w_al_over_in'] = np.log1p(df['w_al_over_in'])
    df['log1p_w_ga_over_in'] = np.log1p(df['w_ga_over_in'])
    df['log1p_w_al_over_ga'] = np.log1p(df['w_al_over_ga'])
    # diffs
    df['diff_Z_minus_period'] = df['catw_Z_mean'] - df['catw_period_mean']

    # === High-signal physics features (added) ===
    # Cation entropy and effective cation count
    eps = 1e-12
    w_al = df['w_al'].clip(0,1); w_ga = df['w_ga'].clip(0,1); w_in = df['w_in'].clip(0,1)
    H_cation = -(w_al*np.log(w_al+eps) + w_ga*np.log(w_ga+eps) + w_in*np.log(w_in+eps))
    df['H_cation'] = H_cation
    df['eff_cations'] = np.exp(H_cation)
    # Simplex geometry: distances to corners and center
    center = np.array([1/3, 1/3, 1/3])
    W = np.stack([w_al, w_ga, w_in], axis=1)
    df['dist_l2_center'] = np.linalg.norm(W - center, axis=1)
    df['dist_l1_center'] = np.abs(W - center).sum(axis=1)
    # distances to Al/Ga/In corners
    corners = {'al': np.array([1,0,0]), 'ga': np.array([0,1,0]), 'in': np.array([0,0,1])}
    for k, v in corners.items():
        diff = W - v
        df[f'dist_l2_{k}_corner'] = np.linalg.norm(diff, axis=1)
        df[f'dist_l1_{k}_corner'] = np.abs(diff).sum(axis=1)
    # weight polys/interactions
    df['w_al_sq'] = w_al**2; df['w_ga_sq'] = w_ga**2; df['w_in_sq'] = w_in**2
    df['w_al_ga'] = w_al*w_ga; df['w_al_in'] = w_al*w_in; df['w_ga_in'] = w_ga*w_in
    df['sum_w_cu'] = w_al**3 + w_ga**3 + w_in**3
    # Ionic radius / tolerance proxies
    r_al, r_ga, r_in, r_o = 0.535, 0.62, 0.80, 1.38
    rM = w_al*r_al + w_ga*r_ga + w_in*r_in
    df['rM'] = rM
    df['rM_var'] = (w_al*(r_al - rM)**2 + w_ga*(r_ga - rM)**2 + w_in*(r_in - rM)**2)
    df['t_ratio'] = rM / r_o
    df['t_dev'] = np.abs(df['t_ratio'] - 1.0)
    # Charge density proxies
    if 'N' in df.columns:
        vol = (df['cell_volume'].replace(0, np.nan)).astype(float)
        df['charge_density_6N'] = (6.0 * df['N']) / (vol + 1e-9)
        df['charge_density_3N'] = (3.0 * df['N']) / (vol + 1e-9)
    else:
        df['charge_density_6N'] = 0.0; df['charge_density_3N'] = 0.0
    # Vegard interactions and contrasts
    df['veg_w_al'] = df['vegard_bg'] * w_al
    df['veg_w_ga'] = df['vegard_bg'] * w_ga
    df['veg_w_in'] = df['vegard_bg'] * w_in
    df['veg_minus_catw_chi'] = df['vegard_bg'] - df['catw_chi_pauling_mean']

    df.replace([np.inf,-np.inf], np.nan, inplace=True)
    return df

def lattice_system_from_sgnum(sgnum: int) -> int:
    n = int(sgnum)
    if n <= 2: return 1
    if n <= 15: return 2
    if n <= 74: return 3
    if n <= 142: return 4
    if n <= 167: return 5
    if n <= 194: return 6
    return 7

def build_stratified_group_folds(train_df: pd.DataFrame, gkey: pd.Series, y: pd.Series, n_splits: int = 8, seed: int = 42) -> np.ndarray:
    gmean = y.groupby(gkey).mean()
    gbin = pd.qcut(gmean, q=10, labels=False, duplicates='drop')
    uniq = pd.DataFrame({'g': gmean.index, 'bin': gbin.values}).sample(frac=1.0, random_state=seed).reset_index(drop=True)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    group_to_fold = {}
    for k, (_, val_idx) in enumerate(skf.split(uniq['g'], uniq['bin'])):
        for g in uniq['g'].iloc[val_idx]:
            group_to_fold[g] = k
    fold_ids = gkey.map(group_to_fold).astype(int).values
    return fold_ids

def oof_m_estimate_te(series_cat: pd.Series, y_log: pd.Series, fold_ids: np.ndarray, m: float, return_full_map: bool = False):
    te = np.zeros(len(series_cat), dtype=float)
    global_mean = float(y_log.mean())
    s_cat = series_cat.astype(str)
    for k in np.unique(fold_ids):
        tr = np.where(fold_ids != k)[0]; va = np.where(fold_ids == k)[0]
        s_tr = s_cat.iloc[tr]
        counts = s_tr.groupby(s_tr).size()
        sums = pd.Series(y_log.iloc[tr].values, index=s_tr.index).groupby(s_tr).sum()
        enc = (sums + m*global_mean) / (counts + m)
        te[va] = s_cat.iloc[va].map(enc).fillna(global_mean).values
    if return_full_map:
        counts_all = s_cat.groupby(s_cat).size()
        sums_all = pd.Series(y_log.values, index=s_cat.index).groupby(s_cat).sum()
        enc_all = (sums_all + m*global_mean) / (counts_all + m)
        return te, enc_all.to_dict(), global_mean
    return te

print('Utils ready.')

Utils ready.


In [3]:
# Centralized Ordered Target Encoding (OOF LOO) + OOF Frequency Encodings
import numpy as np
import pandas as pd

def _safe_series(x):
    if isinstance(x, pd.Series):
        return x
    return pd.Series(x)

def oof_freq_encoding(train_cat: pd.Series, test_cat: pd.Series, fold_ids: np.ndarray):
    train_cat = train_cat.astype(str)
    test_cat = test_cat.astype(str)
    n = len(train_cat)
    fe_tr = np.zeros(n, dtype=float)
    uniq_folds = np.unique(fold_ids)
    for k in uniq_folds:
        tr_idx = np.where(fold_ids != k)[0]
        va_idx = np.where(fold_ids == k)[0]
        counts = train_cat.iloc[tr_idx].value_counts(normalize=True)
        fe_tr[va_idx] = train_cat.iloc[va_idx].map(counts).fillna(0.0).values
    full_counts = train_cat.value_counts(normalize=True)
    fe_te = test_cat.map(full_counts).fillna(0.0).values
    return fe_tr, fe_te

def ordered_te_oof(train_cat: pd.Series, test_cat: pd.Series, y_log: np.ndarray, fold_ids: np.ndarray, m: float, noise_std: float, min_support: int = 5, rng: np.random.RandomState | None = None):
    # STRICT OOF per-fold encodings for train; add noise only to training rows; keep validation OOF clean; full-train map for test.
    if rng is None:
        rng = np.random.RandomState(42)
    s_tr = train_cat.astype(str)
    s_te = test_cat.astype(str)
    y_log = pd.Series(y_log)
    n = len(s_tr)
    enc_tr = np.zeros(n, dtype=float)
    uniq = np.unique(fold_ids)
    for k in uniq:
        tr_idx = np.where(fold_ids != k)[0]
        va_idx = np.where(fold_ids == k)[0]
        s_tr_k = s_tr.iloc[tr_idx]
        y_k = y_log.iloc[tr_idx]
        mu = float(y_k.mean())
        stats = pd.DataFrame({'cat': s_tr_k, 'y': y_k}).groupby('cat')['y'].agg(['sum','count'])
        enc_map = (stats['sum'] + m * mu) / (stats['count'] + m)
        if min_support is not None and min_support > 0:
            rare = stats['count'] < min_support
            if rare.any():
                enc_map.loc[rare] = mu
        # Clean, noise-free OOF encodings for validation indices
        enc_tr[va_idx] = s_tr.iloc[va_idx].map(enc_map).fillna(mu).values
        # Apply noise ONLY to training rows within this fold; do not overwrite already-set validation entries
        enc_vals = s_tr.iloc[tr_idx].map(enc_map).fillna(mu).values
        if noise_std and noise_std > 0:
            enc_vals = enc_vals + rng.normal(0.0, noise_std, size=enc_vals.shape)
        # assign only where not yet set by validation (enc_tr initialized to 0.0); validation rows are non-zero now
        mask_not_set = (enc_tr[tr_idx] == 0.0)
        if mask_not_set.any():
            tmp = enc_tr.copy()
            tmp_tr = tmp[tr_idx]
            tmp_tr[mask_not_set] = enc_vals[mask_not_set]
            enc_tr[tr_idx] = tmp_tr
    # Test encodings from full map (no noise)
    mu_full = float(y_log.mean())
    stats_full = pd.DataFrame({'cat': s_tr, 'y': y_log}).groupby('cat')['y'].agg(['sum','count'])
    enc_map_full = (stats_full['sum'] + m * mu_full) / (stats_full['count'] + m)
    enc_te = s_te.map(enc_map_full).fillna(mu_full).values.astype(float)
    return enc_tr.astype(float), enc_te

def add_encoded_features(X_tr: pd.DataFrame, X_te: pd.DataFrame, tr_df: pd.DataFrame, te_df: pd.DataFrame, y_log: np.ndarray, fold_ids: np.ndarray, seed: int = 42):
    rng = np.random.RandomState(int(seed))
    # Categories
    sg_tr = tr_df['spacegroup'].astype(str)
    sg_te = te_df['spacegroup'].astype(str)
    ls_tr = tr_df['lattice_system'].astype(int).astype(str)  # treat as categorical
    ls_te = te_df['lattice_system'].astype(int).astype(str)
    g_tr = tr_df[['N','n_al','n_ga','n_in']].astype(int).astype(str).agg('_'.join, axis=1)
    g_te = te_df[['N','n_al','n_ga','n_in']].astype(int).astype(str).agg('_'.join, axis=1)
    # Nb buckets (use q=8 as per guidance)
    Nb_tr = pd.qcut(tr_df['N'].astype(float), q=8, labels=False, duplicates='drop')
    try:
        _, bins = pd.qcut(tr_df['N'].astype(float), q=8, duplicates='drop', retbins=True)
        bins = np.unique(bins)
        Nb_te_raw = np.digitize(te_df['N'].astype(float).values, bins[1:-1], right=True)
        Nb_te = pd.Series(Nb_te_raw, index=te_df.index)
    except Exception:
        Nb_te = pd.qcut(te_df['N'].astype(float), q=8, labels=False, duplicates='drop')
    Nb_tr = Nb_tr.astype('Int64').astype(str).fillna('-1')
    Nb_te = Nb_te.astype('Int64').astype(str).fillna('-1')
    # Encodings per spec
    # Use zero noise for low-cardinality features like lattice_system and Nb
    enc_cfg = [
        ('sg', sg_tr, sg_te, 30.0, 0.006),
        ('group', g_tr, g_te, 14.0, 0.004),
        ('ls', ls_tr, ls_te, 10.0, 0.0),
        ('Nb', Nb_tr, Nb_te, 10.0, 0.0),
    ]
    Xtr = X_tr.copy()
    Xte = X_te.copy()
    meta_oof = {}  # return for stacking if needed
    for name, cat_tr, cat_te, m, sigma in enc_cfg:
        te_tr, te_te = ordered_te_oof(cat_tr, cat_te, y_log, fold_ids, m=m, noise_std=sigma, min_support=5, rng=rng)
        fe_tr, fe_te = oof_freq_encoding(cat_tr, cat_te, fold_ids)
        # For base models: DROP group encodings (they are fold-constant due to group-disjoint CV)
        if name != 'group':
            Xtr[f'te_{name}'] = te_tr
            Xte[f'te_{name}'] = te_te
            Xtr[f'fe_{name}'] = fe_tr
            Xte[f'fe_{name}'] = fe_te
        # Always return in meta_oof for stacking diagnostics
        meta_oof[f'te_{name}'] = te_tr
        meta_oof[f'fe_{name}'] = fe_tr
    # Ensure numeric-only matrices and consistent fills
    med = Xtr.median(numeric_only=True)
    Xtr = Xtr.fillna(med)
    Xte = Xte.fillna(med)
    num_cols = list(Xtr.select_dtypes(include=[np.number]).columns)
    Xtr = Xtr[num_cols]
    Xte = Xte[num_cols]
    return Xtr, Xte, meta_oof

In [6]:
# Cache cleanup to prevent drift before rebuilding
import os, glob
from pathlib import Path

patterns = [
    'fold_ids.npy',
    'y.npy',
    'X.parquet',
    'X_test.parquet',
    'train_fe.parquet',
    'test_fe.parquet',
    'features.json',
    'stoich_groups.csv',
    'oof_*.npy',
    'pred_*.npy',
    'oof_*.npz',
    'pred_*.npz'
]
removed = []
for pat in patterns:
    for fp in glob.glob(pat):
        try:
            Path(fp).unlink(missing_ok=True)
            removed.append(fp)
        except Exception as e:
            print('Could not remove', fp, '|', e)
print('Removed files:', len(removed))
print(sorted(removed)[:25], '...')

Removed files: 18
['X.parquet', 'X_test.parquet', 'features.json', 'fold_ids.npy', 'oof_fe_Nb.npy', 'oof_fe_group.npy', 'oof_fe_ls.npy', 'oof_fe_sg.npy', 'oof_lgbm.npy', 'oof_te_Nb.npy', 'oof_te_group.npy', 'oof_te_ls.npy', 'oof_te_sg.npy', 'pred_lgbm_test.npy', 'stoich_groups.csv', 'test_fe.parquet', 'train_fe.parquet', 'y.npy'] ...


In [7]:
# Folds + feature build/cache: engineer, mm-lite, groups, lattice_system; persist fold_ids and X/y
import numpy as np, pandas as pd, json, gc, time
from pathlib import Path

t0 = time.time()
heartbeat('BUILD START')

# Paths
FOLD_PATH = Path('fold_ids.npy')
Y_PATH = Path('y.npy')
X_TR_PATH = Path('X.parquet')
X_TE_PATH = Path('X_test.parquet')
TRAIN_FE_PATH = Path('train_fe.parquet')
TEST_FE_PATH = Path('test_fe.parquet')
FEATS_JSON = Path('features.json')
GROUPS_CSV = Path('stoich_groups.csv')

# Load
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Base features
tr = engineer_features(train)
te = engineer_features(test)

# Stoichiometry counts and group key
gkey_tr, N_tr, al_tr, ga_tr, in_tr, o_tr = compute_stoich_groups(train)
gkey_te, N_te, al_te, ga_te, in_te, o_te = compute_stoich_groups(test)
tr['N'] = N_tr; tr['n_al'] = al_tr; tr['n_ga'] = ga_tr; tr['n_in'] = in_tr; tr['n_o'] = o_tr
te['N'] = N_te; te['n_al'] = al_te; te['n_ga'] = ga_te; te['n_in'] = in_te; te['n_o'] = o_te

# Lattice system from spacegroup
tr['sg_number'] = pd.to_numeric(tr['spacegroup'], errors='coerce').fillna(-1).astype(int)
te['sg_number'] = pd.to_numeric(te['spacegroup'], errors='coerce').fillna(-1).astype(int)
tr['lattice_system'] = tr['sg_number'].apply(lattice_system_from_sgnum).astype(int)
te['lattice_system'] = te['sg_number'].apply(lattice_system_from_sgnum).astype(int)

# mm-lite and extra features
tr = add_mm_lite_and_extras(tr)
te = add_mm_lite_and_extras(te)

# Debug: verify presence of newly added physics features
check_cols = ['H_cation','eff_cations','t_ratio','t_dev','rM_var','charge_density_6N','dist_l2_center','veg_w_al','veg_w_ga','veg_w_in']
present = [c for c in check_cols if c in tr.columns]
missing = [c for c in check_cols if c not in tr.columns]
print('Physics features present:', present)
print('Physics features missing:', missing)

# Persist engineered frames (for reuse by modeling cell)
tr.to_parquet(TRAIN_FE_PATH, index=False)
te.to_parquet(TEST_FE_PATH, index=False)

# Build 8-fold stratified group-disjoint folds (stratify by group mean target)
y = train['bandgap_energy_ev'].astype(float)
fold_ids = build_stratified_group_folds(tr, gkey_tr.astype(str), y, n_splits=N_FOLDS, seed=PRIMARY_SEED)
np.save(FOLD_PATH, fold_ids)
np.save(Y_PATH, np.log1p(y.clip(lower=0)).values)

# Save group mapping (for diagnostics/reuse)
pd.DataFrame({'id': train['id'], 'stoich_group': gkey_tr.astype(str)}).to_csv(GROUPS_CSV, index=False)

# Build numeric-only X caches (no encodings yet; encodings will be added in modeling using frozen folds)
drop_cols = ['id','bandgap_energy_ev']
common_cols = [c for c in tr.columns if c in te.columns]
feat_cols = [c for c in common_cols if c not in drop_cols]
X_tr = tr[feat_cols].copy()
X_te = te[feat_cols].copy()
med = X_tr.median(numeric_only=True)
X_tr = X_tr.fillna(med)
X_te = X_te.fillna(med)
num_cols = list(X_tr.select_dtypes(include=[np.number]).columns)
X_tr = X_tr[num_cols]
X_te = X_te[num_cols]
X_tr.to_parquet(X_TR_PATH, index=False)
X_te.to_parquet(X_TE_PATH, index=False)
with open(FEATS_JSON, 'w') as f:
    json.dump({'features': num_cols}, f)

print('Built & cached:',
      'fold_ids.npy', FOLD_PATH.exists(),
      '| X.parquet', X_TR_PATH.exists(),
      '| X_test.parquet', X_TE_PATH.exists(),
      '| y.npy', Y_PATH.exists(),
      '| feats', len(num_cols),
      '| elapsed', f'{time.time()-t0:.1f}s')
heartbeat('BUILD DONE')
gc.collect();

Physics features present: ['H_cation', 'eff_cations', 't_ratio', 't_dev', 'rM_var', 'charge_density_6N', 'dist_l2_center', 'veg_w_al', 'veg_w_ga', 'veg_w_in']
Physics features missing: []
Built & cached: fold_ids.npy True | X.parquet True | X_test.parquet True | y.npy True | feats 123 | elapsed 0.3s


In [12]:
# Models: load caches, add centralized OOF Ordered TEs using frozen folds, train multi-seed LightGBM, save OOF/preds
import numpy as np, pandas as pd, time, gc, json, os
from pathlib import Path
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

heartbeat('MODELS START')
t0_all = time.time()

# Paths
FOLD_PATH = Path('fold_ids.npy')
Y_PATH = Path('y.npy')
X_TR_PATH = Path('X.parquet')
X_TE_PATH = Path('X_test.parquet')
TRAIN_FE_PATH = Path('train_fe.parquet')
TEST_FE_PATH = Path('test_fe.parquet')
FEATS_JSON = Path('features.json')

# Load caches
fold_ids = np.load(FOLD_PATH)
y_log = np.load(Y_PATH)
X_tr = pd.read_parquet(X_TR_PATH)
X_te = pd.read_parquet(X_TE_PATH)
tr = pd.read_parquet(TRAIN_FE_PATH)
te = pd.read_parquet(TEST_FE_PATH)
with open(FEATS_JSON) as f: meta = json.load(f)

# Centralized encodings (OOF Ordered TE + OOF Frequency) using frozen folds and y_log
X_tr_enc, X_te_enc, meta_oof = add_encoded_features(X_tr, X_te, tr, te, y_log, fold_ids, seed=PRIMARY_SEED)
print('Feature matrix (centralized encodings):', X_tr_enc.shape, X_te_enc.shape)

# For LightGBM stability: drop all te_* columns (use physics + fe_* only)
drop_te_cols = [c for c in X_tr_enc.columns if c.startswith('te_')]
X_tr_lgb = X_tr_enc.drop(columns=drop_te_cols, errors='ignore')
X_te_lgb = X_te_enc.drop(columns=drop_te_cols, errors='ignore')
enc_cols_kept = [c for c in X_tr_lgb.columns if c.startswith('fe_')]

# Auto-drop zero-variance columns
std = X_tr_lgb.std(numeric_only=True)
const_cols = list(std[std == 0].index)
if const_cols:
    X_tr_lgb = X_tr_lgb.drop(columns=const_cols, errors='ignore')
    X_te_lgb = X_te_lgb.drop(columns=const_cols, errors='ignore')

print('LGB matrices (no te_*, const-dropped):', X_tr_lgb.shape, X_te_lgb.shape, '| kept fe_ cols:', len(enc_cols_kept))

# Quick diagnostics
try:
    base_n = len(meta.get('features', []))
    enc_cols_all = [c for c in X_tr_enc.columns if c.startswith('te_') or c.startswith('fe_')]
    low_uniq = {c: X_tr_lgb[c].nunique() for c in X_tr_lgb.columns if c.startswith('fe_')}
    print('Base feat count:', base_n, '| Enc cols added (all):', len(enc_cols_all), '| fe_* kept:', len(enc_cols_kept))
    print('Const cols dropped:', const_cols)
    print('fe_ nunique:', {k: int(v) for k, v in low_uniq.items()})
except Exception as e:
    print('Diagnostics warning:', e)

# Persist meta OOF encodings for later stacking
for k, v in meta_oof.items():
    np.save(f'oof_{k}.npy', np.asarray(v, dtype=float))

# LightGBM params (Variant B from expert guidance, with extra_trees for variance dampening)
base_params = {
    'objective': 'regression', 'metric': 'rmse',
    'learning_rate': 0.020, 'num_leaves': 32, 'max_depth': -1,
    'min_data_in_leaf': 240, 'feature_fraction': 0.58,
    'bagging_fraction': 0.75, 'bagging_freq': 1,
    'lambda_l2': 18.0, 'lambda_l1': 0.0,
    'extra_trees': True, 'extra_tree_threshold': 0.5,
    'verbosity': -1, 'num_threads': N_THREADS,
    'deterministic': True, 'force_col_wise': True
}

seeds = SEEDS
n_splits = len(np.unique(fold_ids))
oof_seeds = []; pred_seeds = []

for si, SEED in enumerate(seeds):
    params = dict(base_params)
    params['seed'] = int(SEED)
    oof = np.zeros(len(X_tr_lgb), dtype=float)
    pred = np.zeros(len(X_te_lgb), dtype=float)
    t0 = time.time()
    for k in range(n_splits):
        tr_idx = np.where(fold_ids != k)[0]; va_idx = np.where(fold_ids == k)[0]
        dtr = lgb.Dataset(X_tr_lgb.iloc[tr_idx], label=y_log[tr_idx], free_raw_data=False)
        dva = lgb.Dataset(X_tr_lgb.iloc[va_idx], label=y_log[va_idx], free_raw_data=False)
        model = lgb.train(params, dtr, num_boost_round=11000, valid_sets=[dtr, dva], valid_names=['train','valid'],
                          callbacks=[lgb.early_stopping(600), lgb.log_evaluation(0)])
        oof[va_idx] = model.predict(X_tr_lgb.iloc[va_idx], num_iteration=model.best_iteration)
        pred += model.predict(X_te_lgb, num_iteration=model.best_iteration) / n_splits
        print(f'LGB SEED {SEED} | fold {k} done | best_iter {model.best_iteration} | elapsed {time.time()-t0:.1f}s', flush=True)
        del model, dtr, dva; gc.collect()
    rmse = float(mean_squared_error(y_log, oof) ** 0.5)
    print(f'SEED {SEED}: OOF RMSLE {rmse:.6f} | elapsed {time.time()-t0:.1f}s')
    oof_seeds.append(oof); pred_seeds.append(pred)

# Average across seeds
oof_avg = np.mean(np.vstack(oof_seeds), axis=0)
pred_avg = np.mean(np.vstack(pred_seeds), axis=0)
cv = float(mean_squared_error(y_log, oof_avg) ** 0.5)
print(f'Blended seeds CV RMSLE: {cv:.6f} | total elapsed {time.time()-t0_all:.1f}s')

# Persist OOF/test preds and a quick submission (for sanity); final blend + calibration handled later
np.save('oof_lgbm.npy', oof_avg)
np.save('pred_lgbm_test.npy', pred_avg)
sub = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'bandgap_energy_ev': np.expm1(pred_avg).clip(0, 6.5)})
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv', sub.shape)
heartbeat('MODELS DONE')

Feature matrix (centralized encodings): (2160, 129) (240, 129)
LGB matrices (no te_*, const-dropped): (2160, 125) (240, 125) | kept fe_ cols: 3
Base feat count: 123 | Enc cols added (all): 6 | fe_* kept: 3
Const cols dropped: ['vo_tot_s_mean']
fe_ nunique: {'fe_sg': 48, 'fe_ls': 40, 'fe_Nb': 24}
Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[8110]	train's rmse: 0.0751505	valid's rmse: 0.0740106
LGB SEED 7 | fold 0 done | best_iter 8110 | elapsed 2.5s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[8215]	train's rmse: 0.0720042	valid's rmse: 0.0956388
LGB SEED 7 | fold 1 done | best_iter 8215 | elapsed 5.1s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[7699]	train's rmse: 0.0738746	valid's rmse: 0.0859531
LGB SEED 7 | fold 2 done | best_iter 7699 | elapsed 7.5s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[4843]	train's rmse: 0.0745762	valid's rmse: 0.104252
LGB SEED 7 | fold 3 done | best_iter 4843 | elapsed 9.2s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[6067]	train's rmse: 0.0762362	valid's rmse: 0.0792978
LGB SEED 7 | fold 4 done | best_iter 6067 | elapsed 11.3s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[4776]	train's rmse: 0.0785192	valid's rmse: 0.0712584
LGB SEED 7 | fold 5 done | best_iter 4776 | elapsed 12.9s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[6415]	train's rmse: 0.0729317	valid's rmse: 0.104816
LGB SEED 7 | fold 6 done | best_iter 6415 | elapsed 15.1s


Training until validation scores don't improve for 600 rounds


Did not meet early stopping. Best iteration is:
[10768]	train's rmse: 0.0734387	valid's rmse: 0.0718828
LGB SEED 7 | fold 7 done | best_iter 10768 | elapsed 18.4s


SEED 7: OOF RMSLE 0.086908 | elapsed 18.5s
Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[6565]	train's rmse: 0.0765407	valid's rmse: 0.0745378
LGB SEED 42 | fold 0 done | best_iter 6565 | elapsed 2.1s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[7564]	train's rmse: 0.0725704	valid's rmse: 0.0958433
LGB SEED 42 | fold 1 done | best_iter 7564 | elapsed 4.5s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[7121]	train's rmse: 0.0745311	valid's rmse: 0.0860765
LGB SEED 42 | fold 2 done | best_iter 7121 | elapsed 6.8s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[6113]	train's rmse: 0.0729129	valid's rmse: 0.103931
LGB SEED 42 | fold 3 done | best_iter 6113 | elapsed 8.9s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[5641]	train's rmse: 0.0767063	valid's rmse: 0.0790215
LGB SEED 42 | fold 4 done | best_iter 5641 | elapsed 10.8s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[5030]	train's rmse: 0.0786236	valid's rmse: 0.071913
LGB SEED 42 | fold 5 done | best_iter 5030 | elapsed 12.5s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[7260]	train's rmse: 0.0719473	valid's rmse: 0.104021
LGB SEED 42 | fold 6 done | best_iter 7260 | elapsed 14.9s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[8857]	train's rmse: 0.0746273	valid's rmse: 0.0716686
LGB SEED 42 | fold 7 done | best_iter 8857 | elapsed 17.8s


SEED 42: OOF RMSLE 0.086875 | elapsed 17.9s
Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[7271]	train's rmse: 0.0756971	valid's rmse: 0.0746322
LGB SEED 2025 | fold 0 done | best_iter 7271 | elapsed 2.2s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[6072]	train's rmse: 0.074075	valid's rmse: 0.0954372
LGB SEED 2025 | fold 1 done | best_iter 6072 | elapsed 4.3s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[6194]	train's rmse: 0.0754323	valid's rmse: 0.0856745
LGB SEED 2025 | fold 2 done | best_iter 6194 | elapsed 6.3s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[5132]	train's rmse: 0.0740051	valid's rmse: 0.103977
LGB SEED 2025 | fold 3 done | best_iter 5132 | elapsed 8.1s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[6230]	train's rmse: 0.0759505	valid's rmse: 0.079044
LGB SEED 2025 | fold 4 done | best_iter 6230 | elapsed 10.2s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[5628]	train's rmse: 0.0774817	valid's rmse: 0.0714186
LGB SEED 2025 | fold 5 done | best_iter 5628 | elapsed 12.1s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[7632]	train's rmse: 0.0718986	valid's rmse: 0.104137
LGB SEED 2025 | fold 6 done | best_iter 7632 | elapsed 14.6s


Training until validation scores don't improve for 600 rounds


Early stopping, best iteration is:
[7162]	train's rmse: 0.0761482	valid's rmse: 0.0733414
LGB SEED 2025 | fold 7 done | best_iter 7162 | elapsed 16.9s


SEED 2025: OOF RMSLE 0.086913 | elapsed 17.0s
Blended seeds CV RMSLE: 0.086744 | total elapsed 53.6s
Saved submission.csv (240, 2)


In [9]:
# CatBoost OOF with centralized encodings + stronger regularization; save OOF/test; optional NNLS blend
import numpy as np, pandas as pd, json, time, gc, os, sys, subprocess
from pathlib import Path
from sklearn.metrics import mean_squared_error
from scipy.optimize import nnls

t0_all = time.time()
print('CatBoost (centralized encodings) start')

# Ensure CatBoost is available
try:
    from catboost import CatBoostRegressor, Pool
except Exception:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--prefer-binary', '-q', 'catboost'])
    from catboost import CatBoostRegressor, Pool

# Paths
FOLD_PATH = Path('fold_ids.npy')
Y_PATH = Path('y.npy')
X_TR_PATH = Path('X.parquet')
X_TE_PATH = Path('X_test.parquet')
TRAIN_FE_PATH = Path('train_fe.parquet')
TEST_FE_PATH = Path('test_fe.parquet')

# Load caches
fold_ids = np.load(FOLD_PATH)
y_log = np.load(Y_PATH)
X_tr = pd.read_parquet(X_TR_PATH)
X_te = pd.read_parquet(X_TE_PATH)
tr = pd.read_parquet(TRAIN_FE_PATH)
te = pd.read_parquet(TEST_FE_PATH)

# Centralized encodings (strict OOF Ordered TE + OOF Frequency); drop te_* for base CB per guidance
X_tr_enc, X_te_enc, meta_oof = add_encoded_features(X_tr, X_te, tr, te, y_log, fold_ids, seed=PRIMARY_SEED)
drop_te_cols = [c for c in X_tr_enc.columns if c.startswith('te_')]
X_tr_cb = X_tr_enc.drop(columns=drop_te_cols, errors='ignore').copy()
X_te_cb = X_te_enc.drop(columns=drop_te_cols, errors='ignore').copy()
print('CB base matrices (no te_*):', X_tr_cb.shape, X_te_cb.shape)

# Add raw categoricals
X_tr_cb['spacegroup'] = tr['spacegroup'].astype(str).values
X_te_cb['spacegroup'] = te['spacegroup'].astype(str).values
X_tr_cb['lattice_system'] = tr['lattice_system'].astype(int).astype(str).values
X_te_cb['lattice_system'] = te['lattice_system'].astype(int).astype(str).values
# Nb categorical (qcut on train, digitize test by train bins) using q=8 as in encoders
try:
    _, bins = pd.qcut(tr['N'].astype(float), q=8, duplicates='drop', retbins=True)
    bins = np.unique(bins)
    Nb_tr_lab = pd.qcut(tr['N'].astype(float), q=8, labels=False, duplicates='drop').astype('Int64')
    Nb_te_raw = np.digitize(te['N'].astype(float).values, bins[1:-1], right=True)
    Nb_te_lab = pd.Series(Nb_te_raw, index=te.index).astype('Int64')
except Exception:
    Nb_tr_lab = pd.qcut(tr['N'].astype(float), q=8, labels=False, duplicates='drop').astype('Int64')
    Nb_te_lab = pd.qcut(te['N'].astype(float), q=8, labels=False, duplicates='drop').astype('Int64')
X_tr_cb['Nb_cat'] = Nb_tr_lab.astype(str).fillna('-1').values
X_te_cb['Nb_cat'] = Nb_te_lab.astype(str).fillna('-1').values

# Cat features indices
cat_cols = ['spacegroup','lattice_system','Nb_cat']
cat_idx = [X_tr_cb.columns.get_loc(c) for c in cat_cols]

# Fill NaNs for numeric columns only; leave categoricals as-is
num_cols = X_tr_cb.columns.difference(cat_cols)
med = X_tr_cb[num_cols].median(numeric_only=True)
X_tr_cb[num_cols] = X_tr_cb[num_cols].fillna(med)
X_te_cb[num_cols] = X_te_cb[num_cols].fillna(med)

seeds = SEEDS
n_splits = len(np.unique(fold_ids))
oof_cb_seeds = []; pred_cb_seeds = []

for SEED in seeds:
    params = dict(
        loss_function='RMSE', iterations=8000, learning_rate=0.028, depth=7,
        l2_leaf_reg=15.0, subsample=0.8, rsm=0.78, od_type='Iter', od_wait=400,
        random_seed=int(SEED), verbose=0, allow_writing_files=False, thread_count=N_THREADS
    )
    oof = np.zeros(len(X_tr_cb), dtype=float)
    pred = np.zeros(len(X_te_cb), dtype=float)
    t0 = time.time()
    for k in range(n_splits):
        tr_idx = np.where(fold_ids != k)[0]; va_idx = np.where(fold_ids == k)[0]
        pool_tr = Pool(X_tr_cb.iloc[tr_idx], y_log[tr_idx], cat_features=cat_idx)
        pool_va = Pool(X_tr_cb.iloc[va_idx], y_log[va_idx], cat_features=cat_idx)
        model = CatBoostRegressor(**params)
        model.fit(pool_tr, eval_set=pool_va, use_best_model=True)
        oof[va_idx] = model.predict(pool_va)
        pred += model.predict(Pool(X_te_cb, cat_features=cat_idx)) / n_splits
        del model, pool_tr, pool_va; gc.collect()
    rmse = float(mean_squared_error(y_log, oof) ** 0.5)
    print(f'CatBoost SEED {SEED}: OOF RMSLE {rmse:.6f} | elapsed {time.time()-t0:.1f}s')
    oof_cb_seeds.append(oof); pred_cb_seeds.append(pred)

# Average across seeds for CatBoost
oof_cb = np.mean(np.vstack(oof_cb_seeds), axis=0)
pred_cb = np.mean(np.vstack(pred_cb_seeds), axis=0)
cv_cb = float(mean_squared_error(y_log, oof_cb) ** 0.5)
print(f'CatBoost averaged CV RMSLE: {cv_cb:.6f}')
np.save('oof_catboost.npy', oof_cb)
np.save('pred_catboost_test.npy', pred_cb)

# Optional: blend with existing LGB OOF if available (for quick check); calibration handled in separate cell
if Path('oof_lgbm.npy').exists() and Path('pred_lgbm_test.npy').exists():
    oof_lgb = np.load('oof_lgbm.npy')
    pred_lgb = np.load('pred_lgbm_test.npy')
    P = np.vstack([oof_lgb, oof_cb]).T
    w, _ = nnls(P, y_log); w = w/(w.sum() if w.sum()>0 else 1.0)
    oof_blend = P @ w; cv_blend = float(mean_squared_error(y_log, oof_blend) ** 0.5)
    print('NNLS weights (LGB, CB):', w, '| Blended CV RMSLE:', f'{cv_blend:.6f}')
    Ptest = np.vstack([pred_lgb, pred_cb]).T
    pred_blend = Ptest @ w
    sub = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'bandgap_energy_ev': np.expm1(pred_blend).clip(0, 6.5)})
    sub.to_csv('submission.csv', index=False)
    print('Saved submission.csv (blend preview):', sub.shape)
else:
    sub = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'bandgap_energy_ev': np.expm1(pred_cb).clip(0, 6.5)})
    sub.to_csv('submission.csv', index=False)
    print('Saved submission.csv (CatBoost only):', sub.shape)

print('CatBoost (centralized encodings) done | total elapsed', f'{time.time()-t0_all:.1f}s')

CatBoost (centralized encodings) start


CB base matrices (no te_*): (2160, 126) (240, 126)


CatBoost SEED 7: OOF RMSLE 0.086514 | elapsed 93.0s


CatBoost SEED 42: OOF RMSLE 0.086495 | elapsed 93.7s


CatBoost SEED 2025: OOF RMSLE 0.086509 | elapsed 94.0s
CatBoost averaged CV RMSLE: 0.086165
NNLS weights (LGB, CB): [0.43589716 0.56410284] | Blended CV RMSLE: 0.085477
Saved submission.csv (blend preview): (240, 2)
CatBoost (centralized encodings) done | total elapsed 281.1s


In [16]:
# Post-processing: NNLS re-blend + Per-fold/Global Isotonic calibration (choose best on OOF)
import numpy as np, pandas as pd, time
from pathlib import Path
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import mean_squared_error
from scipy.optimize import nnls

t0 = time.time()
print('Calibration start (per-fold vs global)')

# Load OOF/logits and targets
y_log = np.load('y.npy')
fold_ids = np.load('fold_ids.npy')
oof_lgb = np.load('oof_lgbm.npy')
pred_lgb = np.load('pred_lgbm_test.npy')
oof_cb = np.load('oof_catboost.npy')
pred_cb = np.load('pred_catboost_test.npy')

# NNLS weights on OOF (log space)
P = np.vstack([oof_lgb, oof_cb]).T
w, _ = nnls(P, y_log)
w = w / (w.sum() if w.sum() > 0 else 1.0)
print('NNLS weights (LGB, CB):', w)
oof_blend = P @ w
cv_blend = float(mean_squared_error(y_log, oof_blend) ** 0.5)
print(f'Pre-calibration blended CV RMSLE: {cv_blend:.6f}')
Ptest = np.vstack([pred_lgb, pred_cb]).T
pred_blend = Ptest @ w

# Per-fold isotonic calibration
n_splits = len(np.unique(fold_ids))
oof_cal_fold = np.zeros_like(oof_blend)
pred_cal_fold_parts = []
for k in range(n_splits):
    tr_idx = np.where(fold_ids != k)[0]
    va_idx = np.where(fold_ids == k)[0]
    iso_k = IsotonicRegression(out_of_bounds='clip')
    iso_k.fit(oof_blend[tr_idx], y_log[tr_idx])
    oof_cal_fold[va_idx] = iso_k.transform(oof_blend[va_idx])
    pred_cal_fold_parts.append(iso_k.transform(pred_blend))
cv_fold = float(mean_squared_error(y_log, oof_cal_fold) ** 0.5)
pred_cal_fold = np.mean(np.stack(pred_cal_fold_parts, axis=0), axis=0)
print(f'Per-fold isotonic blended CV RMSLE: {cv_fold:.6f}')

# Global isotonic calibration
iso_full = IsotonicRegression(out_of_bounds='clip')
iso_full.fit(oof_blend, y_log)
oof_cal_full = iso_full.transform(oof_blend)
cv_full = float(mean_squared_error(y_log, oof_cal_full) ** 0.5)
pred_cal_full = iso_full.transform(pred_blend)
print(f'Global isotonic blended CV RMSLE: {cv_full:.6f}')

# Choose best calibration based on OOF CV
use_full = cv_full <= cv_fold
pred_cal = pred_cal_full if use_full else pred_cal_fold
chosen = 'global' if use_full else 'per-fold'
print(f'Chosen calibration: {chosen}')

sub = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'bandgap_energy_ev': np.expm1(pred_cal).clip(0, 6.5)})
sub.to_csv('submission.csv', index=False)
print('Calibrated submission.csv saved:', sub.shape, '| elapsed', f'{time.time()-t0:.1f}s')

Calibration start (per-fold vs global)
NNLS weights (LGB, CB): [0.50270604 0.49729396]
Pre-calibration blended CV RMSLE: 0.085059
Per-fold isotonic blended CV RMSLE: 0.088344
Global isotonic blended CV RMSLE: 0.082182
Chosen calibration: global
Calibrated submission.csv saved: (240, 2) | elapsed 0.0s


In [17]:
# Residual corrector: small LGBM on compact features; add scaled residuals, recalibrate, save submission
import numpy as np, pandas as pd, json, time, gc
from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.isotonic import IsotonicRegression
from scipy.optimize import nnls
import lightgbm as lgb

t0 = time.time()
print('Residual corrector start')

# Load cached frames and matrices
y_log = np.load('y.npy')
fold_ids = np.load('fold_ids.npy')
tr = pd.read_parquet('train_fe.parquet')
te = pd.read_parquet('test_fe.parquet')
X_tr = pd.read_parquet('X.parquet')
X_te = pd.read_parquet('X_test.parquet')

# Rebuild encodings used in base models (spacegroup TE+FE, lattice_system FE)
m_smooth = 18.0
global_mean = float(y_log.mean())
sg_tr = tr['spacegroup'].astype(str)
sg_te = te['spacegroup'].astype(str)
ls_tr = tr['lattice_system'].astype(int)
ls_te = te['lattice_system'].astype(int)
te_sg = np.zeros(len(tr), dtype=float); fe_sg = np.zeros(len(tr), dtype=float)
for k in np.unique(fold_ids):
    tr_idx = np.where(fold_ids != k)[0]; va_idx = np.where(fold_ids == k)[0]
    s_tr = sg_tr.iloc[tr_idx]
    counts = s_tr.groupby(s_tr).size()
    sums = pd.Series(y_log[tr_idx], index=s_tr.index).groupby(s_tr).sum()
    enc = (sums + m_smooth*global_mean) / (counts + m_smooth)
    te_sg[va_idx] = sg_tr.iloc[va_idx].map(enc).fillna(global_mean).values
    fe = counts / counts.sum()
    fe_sg[va_idx] = sg_tr.iloc[va_idx].map(fe).fillna(0.0).values
counts_all = sg_tr.groupby(sg_tr).size()
sums_all = pd.Series(y_log, index=sg_tr.index).groupby(sg_tr).sum()
enc_all = (sums_all + m_smooth*global_mean) / (counts_all + m_smooth)
fe_all = counts_all / counts_all.sum()
te_sg_test = sg_te.map(enc_all).fillna(global_mean).values
fe_sg_test = sg_te.map(fe_all).fillna(0.0).values
fe_ls = np.zeros(len(tr), dtype=float)
for k in np.unique(fold_ids):
    tr_idx = np.where(fold_ids != k)[0]; va_idx = np.where(fold_ids == k)[0]
    ls_counts = ls_tr.iloc[tr_idx].value_counts(normalize=True)
    fe_ls[va_idx] = ls_tr.iloc[va_idx].map(ls_counts).fillna(0.0).values
fe_ls_test = ls_te.map(ls_tr.value_counts(normalize=True)).fillna(0.0).values

# Assemble compact feature subset for residual corrector
feat_names = []
def add_feat(col):
    if col in tr.columns: feat_names.append(col)

for col in ['vegard_bg','bow_in','bow_ga','w_al','w_ga','w_in','w_al_sq','w_ga_sq','w_in_sq',
            'w_al_ga','w_al_in','w_ga_in','N','inv_vpa','log_vpa','o_minus_catw_chi_pauling',
            'catw_chi_pauling_mean','catw_ionic_radius_mean','vo_cat_p_frac','vo_tot_p_frac']:
    add_feat(col)

Xr_tr = tr[feat_names].copy()
Xr_te = te[feat_names].copy()
Xr_tr['te_sg'] = te_sg; Xr_tr['fe_sg'] = fe_sg; Xr_tr['fe_ls'] = fe_ls
Xr_te['te_sg'] = te_sg_test; Xr_te['fe_sg'] = fe_sg_test; Xr_te['fe_ls'] = fe_ls_test
med = Xr_tr.median(numeric_only=True)
Xr_tr = Xr_tr.fillna(med); Xr_te = Xr_te.fillna(med)

# Build base blend logits from saved OOF/test arrays
oof_lgb = np.load('oof_lgbm.npy')
oof_cb = np.load('oof_catboost.npy')
pred_lgb = np.load('pred_lgbm_test.npy')
pred_cb = np.load('pred_catboost_test.npy')
P = np.vstack([oof_lgb, oof_cb]).T
w, _ = nnls(P, y_log); w = w / (w.sum() if w.sum() > 0 else 1.0)
oof_blend = P @ w
Ptest = np.vstack([pred_lgb, pred_cb]).T
pred_blend = Ptest @ w
cv_blend = float(mean_squared_error(y_log, oof_blend) ** 0.5)
print(f'Base blended CV RMSLE: {cv_blend:.6f} | NNLS weights {w}')

# Residuals
residual = y_log - oof_blend

# Train small LGBM on residuals (frozen folds, strong regularization)
params = {
  'objective':'regression','metric':'rmse','learning_rate':0.05,
  'num_leaves':31,'min_data_in_leaf':600,'feature_fraction':0.7,
  'bagging_fraction':0.8,'bagging_freq':1,'lambda_l2':20.0,'lambda_l1':0.0,
  'verbosity':-1,'num_threads': 6, 'deterministic': True, 'force_col_wise': True
}
oof_res = np.zeros(len(Xr_tr)); pred_res = np.zeros(len(Xr_te))
n_splits = len(np.unique(fold_ids))
for k in range(n_splits):
    tr_idx = np.where(fold_ids != k)[0]; va_idx = np.where(fold_ids == k)[0]
    dtr = lgb.Dataset(Xr_tr.iloc[tr_idx], label=residual[tr_idx], free_raw_data=False)
    dva = lgb.Dataset(Xr_tr.iloc[va_idx], label=residual[va_idx], free_raw_data=False)
    m = lgb.train(params, dtr, num_boost_round=1200, valid_sets=[dtr,dva], valid_names=['train','valid'], callbacks=[lgb.early_stopping(200), lgb.log_evaluation(0)])
    oof_res[va_idx] = m.predict(Xr_tr.iloc[va_idx], num_iteration=m.best_iteration)
    pred_res += m.predict(Xr_te, num_iteration=m.best_iteration) / n_splits
    del m, dtr, dva; gc.collect()
cv_res = float(mean_squared_error(residual, oof_res) ** 0.5)
print(f'Residual model CV RMSE (log space): {cv_res:.6f}')

# Line search for alpha scaling on residuals
alphas = np.linspace(0.1, 0.35, 6)
best_alpha, best_cv = 0.0, 1e9
for a in alphas:
    oof_adj = oof_blend + a * oof_res
    cv_a = float(mean_squared_error(y_log, oof_adj) ** 0.5)
    if cv_a < best_cv: best_cv, best_alpha = cv_a, float(a)
print(f'Best alpha: {best_alpha:.3f} | CV RMSLE: {best_cv:.6f}')

# Apply to test logits and calibrate isotonic again
oof_final = oof_blend + best_alpha * oof_res
pred_final = pred_blend + best_alpha * pred_res
iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(oof_final, y_log)
oof_cal = iso.transform(oof_final)
cv_cal = float(mean_squared_error(y_log, oof_cal) ** 0.5)
print(f'Post-residual isotonic CV RMSLE: {cv_cal:.6f}')
pred_cal = iso.transform(pred_final)

# Save final submission
sub = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'bandgap_energy_ev': np.expm1(pred_cal).clip(0, 6.5)})
sub.to_csv('submission.csv', index=False)
print('Final submission.csv saved:', sub.shape, '| elapsed', f'{time.time()-t0:.1f}s')

Residual corrector start
Base blended CV RMSLE: 0.085059 | NNLS weights [0.50270604 0.49729396]
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[41]	train's rmse: 0.0866941	valid's rmse: 0.0727166


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[80]	train's rmse: 0.0829254	valid's rmse: 0.0953313
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[58]	train's rmse: 0.0852044	valid's rmse: 0.0815654


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[63]	train's rmse: 0.082142	valid's rmse: 0.101062
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	train's rmse: 0.0857059	valid's rmse: 0.0798164


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[52]	train's rmse: 0.0866116	valid's rmse: 0.0702811
Training until validation scores don't improve for 200 rounds


Early stopping, best iteration is:
[361]	train's rmse: 0.0816651	valid's rmse: 0.102641
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	train's rmse: 0.0870461	valid's rmse: 0.0678113


Residual model CV RMSE (log space): 0.084967
Best alpha: 0.350 | CV RMSLE: 0.085008
Post-residual isotonic CV RMSLE: 0.082152
Final submission.csv saved: (240, 2) | elapsed 1.2s


In [None]:
# Add stoichiometry-group OOF target mean + freq; retrain LGBM+CatBoost; NNLS blend; isotonic
import numpy as np, pandas as pd, time, gc, json, os, sys, subprocess
from pathlib import Path
from sklearn.metrics import mean_squared_error
from scipy.optimize import nnls
from sklearn.isotonic import IsotonicRegression

print('Group OOF TE + retrain start')

# Ensure CatBoost
try:
    from catboost import CatBoostRegressor, Pool
except Exception:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--prefer-binary', '-q', 'catboost'])
    from catboost import CatBoostRegressor, Pool
import lightgbm as lgb

# Load caches
fold_ids = np.load('fold_ids.npy')
y_log = np.load('y.npy')
tr = pd.read_parquet('train_fe.parquet')
te = pd.read_parquet('test_fe.parquet')
X_tr = pd.read_parquet('X.parquet')
X_te = pd.read_parquet('X_test.parquet')

# Re-create SG and LS encodings (as before) to append consistently
m_smooth_sg = 18.0
global_mean = float(y_log.mean())
sg_tr = tr['spacegroup'].astype(str)
sg_te = te['spacegroup'].astype(str)
ls_tr = tr['lattice_system'].astype(int)
ls_te = te['lattice_system'].astype(int)
te_sg = np.zeros(len(tr), dtype=float); fe_sg = np.zeros(len(tr), dtype=float); fe_ls = np.zeros(len(tr), dtype=float)
for k in np.unique(fold_ids):
    tr_idx = np.where(fold_ids != k)[0]; va_idx = np.where(fold_ids == k)[0]
    s_tr = sg_tr.iloc[tr_idx]
    counts = s_tr.groupby(s_tr).size()
    sums = pd.Series(y_log[tr_idx], index=s_tr.index).groupby(s_tr).sum()
    enc = (sums + m_smooth_sg*global_mean) / (counts + m_smooth_sg)
    te_sg[va_idx] = sg_tr.iloc[va_idx].map(enc).fillna(global_mean).values
    fe = counts / counts.sum()
    fe_sg[va_idx] = sg_tr.iloc[va_idx].map(fe).fillna(0.0).values
    ls_counts = ls_tr.iloc[tr_idx].value_counts(normalize=True)
    fe_ls[va_idx] = ls_tr.iloc[va_idx].map(ls_counts).fillna(0.0).values
counts_all = sg_tr.groupby(sg_tr).size()
sums_all = pd.Series(y_log, index=sg_tr.index).groupby(sg_tr).sum()
enc_all = (sums_all + m_smooth_sg*global_mean) / (counts_all + m_smooth_sg)
fe_all = counts_all / counts_all.sum()
te_sg_test = sg_te.map(enc_all).fillna(global_mean).values
fe_sg_test = sg_te.map(fe_all).fillna(0.0).values
fe_ls_test = ls_te.map(ls_tr.value_counts(normalize=True)).fillna(0.0).values

# Build stoichiometry group key from counts saved in engineered frames
g_tr = tr[['N','n_al','n_ga','n_in']].astype(int).astype(str).agg('_'.join, axis=1)
g_te = te[['N','n_al','n_ga','n_in']].astype(int).astype(str).agg('_'.join, axis=1)

# OOF target encoding for stoich group (log space mean) + group frequency
m_smooth_g = 20.0
te_group = np.zeros(len(tr), dtype=float)
fe_group = np.zeros(len(tr), dtype=float)
for k in np.unique(fold_ids):
    tr_idx = np.where(fold_ids != k)[0]; va_idx = np.where(fold_ids == k)[0]
    g_tr_k = g_tr.iloc[tr_idx]
    counts = g_tr_k.groupby(g_tr_k).size()
    sums = pd.Series(y_log[tr_idx], index=g_tr_k.index).groupby(g_tr_k).sum()
    enc = (sums + m_smooth_g*global_mean) / (counts + m_smooth_g)
    te_group[va_idx] = g_tr.iloc[va_idx].map(enc).fillna(global_mean).values
    fe = counts / counts.sum()
    fe_group[va_idx] = g_tr.iloc[va_idx].map(fe).fillna(0.0).values
# Full-map for test
counts_all_g = g_tr.groupby(g_tr).size()
sums_all_g = pd.Series(y_log, index=g_tr.index).groupby(g_tr).sum()
enc_all_g = (sums_all_g + m_smooth_g*global_mean) / (counts_all_g + m_smooth_g)
fe_all_g = counts_all_g / counts_all_g.sum()
te_group_test = g_te.map(enc_all_g).fillna(global_mean).values
fe_group_test = g_te.map(fe_all_g).fillna(0.0).values

# Assemble modeling matrices by appending encodings
X_tr_enc = X_tr.copy(); X_te_enc = X_te.copy()
for name, arr_tr, arr_te in [('te_sg', te_sg, te_sg_test), ('fe_sg', fe_sg, fe_sg_test), ('fe_ls', fe_ls, fe_ls_test), ('te_group', te_group, te_group_test), ('fe_group', fe_group, fe_group_test)]:
    X_tr_enc[name] = arr_tr; X_te_enc[name] = arr_te
med = X_tr_enc.median(numeric_only=True)
X_tr_enc = X_tr_enc.fillna(med); X_te_enc = X_te_enc.fillna(med)
num_cols = list(X_tr_enc.select_dtypes(include=[np.number]).columns)
X_tr_enc = X_tr_enc[num_cols]; X_te_enc = X_te_enc[num_cols]
print('Matrices with group encodings:', X_tr_enc.shape, X_te_enc.shape)

# Train LGBM (3 seeds, 8 folds)
base_params = {'objective':'regression','metric':'rmse','learning_rate':0.03,'num_leaves':96,'max_depth':-1,'min_data_in_leaf':420,'feature_fraction':0.78,'bagging_fraction':0.8,'bagging_freq':1,'lambda_l2':10.0,'lambda_l1':0.0,'verbosity':-1,'num_threads': N_THREADS, 'deterministic': True, 'force_col_wise': True}
seeds = SEEDS
n_splits = len(np.unique(fold_ids))
oof_lgb_seeds, pred_lgb_seeds = [], []
for SEED in seeds:
    params = dict(base_params); params['seed'] = int(SEED)
    oof = np.zeros(len(X_tr_enc), dtype=float); pred = np.zeros(len(X_te_enc), dtype=float)
    for k in range(n_splits):
        tr_idx = np.where(fold_ids != k)[0]; va_idx = np.where(fold_ids == k)[0]
        dtr = lgb.Dataset(X_tr_enc.iloc[tr_idx], label=y_log[tr_idx], free_raw_data=False)
        dva = lgb.Dataset(X_tr_enc.iloc[va_idx], label=y_log[va_idx], free_raw_data=False)
        m = lgb.train(params, dtr, num_boost_round=7000, valid_sets=[dtr,dva], valid_names=['train','valid'], callbacks=[lgb.early_stopping(400), lgb.log_evaluation(0)])
        oof[va_idx] = m.predict(X_tr_enc.iloc[va_idx], num_iteration=m.best_iteration)
        pred += m.predict(X_te_enc, num_iteration=m.best_iteration)/n_splits
        del m, dtr, dva; gc.collect()
    print('LGB seed', SEED, 'OOF', float(mean_squared_error(y_log, oof) ** 0.5))
    oof_lgb_seeds.append(oof); pred_lgb_seeds.append(pred)
oof_lgb = np.mean(np.vstack(oof_lgb_seeds), axis=0)
pred_lgb = np.mean(np.vstack(pred_lgb_seeds), axis=0)
print('LGB avg CV:', float(mean_squared_error(y_log, oof_lgb) ** 0.5))
np.save('oof_lgbm_grp.npy', oof_lgb); np.save('pred_lgbm_grp_test.npy', pred_lgb)

# Train CatBoost (3 seeds, 8 folds) with raw categoricals + appended encodings
X_tr_cb = X_tr_enc.copy(); X_te_cb = X_te_enc.copy()
X_tr_cb['spacegroup'] = sg_tr.values; X_te_cb['spacegroup'] = sg_te.values
X_tr_cb['lattice_system'] = ls_tr.values; X_te_cb['lattice_system'] = ls_te.values
cat_cols = ['spacegroup','lattice_system']
cat_idx = [X_tr_cb.columns.get_loc(c) for c in cat_cols]
num_only = X_tr_cb.columns.difference(cat_cols)
med_cb = X_tr_cb[num_only].median(numeric_only=True)
X_tr_cb[num_only] = X_tr_cb[num_only].fillna(med_cb); X_te_cb[num_only] = X_te_cb[num_only].fillna(med_cb)
oof_cb_seeds, pred_cb_seeds = [], []
for SEED in seeds:
    params_cb = dict(loss_function='RMSE', iterations=6000, learning_rate=0.028, depth=8, l2_leaf_reg=10.0, subsample=0.8, rsm=0.75, od_type='Iter', od_wait=350, random_seed=int(SEED), verbose=0, allow_writing_files=False, thread_count=N_THREADS)
    oof = np.zeros(len(X_tr_cb), dtype=float); pred = np.zeros(len(X_te_cb), dtype=float)
    for k in range(n_splits):
        tr_idx = np.where(fold_ids != k)[0]; va_idx = np.where(fold_ids == k)[0]
        pool_tr = Pool(X_tr_cb.iloc[tr_idx], y_log[tr_idx], cat_features=cat_idx); pool_va = Pool(X_tr_cb.iloc[va_idx], y_log[va_idx], cat_features=cat_idx)
        m = CatBoostRegressor(**params_cb); m.fit(pool_tr, eval_set=pool_va, use_best_model=True)
        oof[va_idx] = m.predict(pool_va); pred += m.predict(Pool(X_te_cb, cat_features=cat_idx))/n_splits
        del m, pool_tr, pool_va; gc.collect()
    print('CB seed', SEED, 'OOF', float(mean_squared_error(y_log, oof) ** 0.5))
    oof_cb_seeds.append(oof); pred_cb_seeds.append(pred)
oof_cb = np.mean(np.vstack(oof_cb_seeds), axis=0)
pred_cb = np.mean(np.vstack(pred_cb_seeds), axis=0)
print('CB avg CV:', float(mean_squared_error(y_log, oof_cb) ** 0.5))
np.save('oof_catboost_grp.npy', oof_cb); np.save('pred_catboost_grp_test.npy', pred_cb)

# NNLS blend on new OOF logits
P = np.vstack([oof_lgb, oof_cb]).T
w, _ = nnls(P, y_log); w = w / (w.sum() if w.sum() > 0 else 1.0)
oof_blend = P @ w; cv_blend = float(mean_squared_error(y_log, oof_blend) ** 0.5)
print('NNLS w (LGB,CB):', w, '| Blended CV:', cv_blend)
Ptest = np.vstack([pred_lgb, pred_cb]).T
pred_blend = Ptest @ w

# Isotonic calibration
iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(oof_blend, y_log)
oof_cal = iso.transform(oof_blend)
cv_cal = float(mean_squared_error(y_log, oof_cal) ** 0.5)
print('Post-calibration CV:', cv_cal)
pred_cal = iso.transform(pred_blend)

# Save submission
sub = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'bandgap_energy_ev': np.expm1(pred_cal).clip(0, 6.5)})
sub.to_csv('submission.csv', index=False)
print('submission.csv saved (group TE run):', sub.shape)

In [None]:
# KNN prior on stoichiometry groups (OOF, log space) + quick LGBM retrain
import numpy as np, pandas as pd, time, gc, json
from pathlib import Path
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

print('KNN prior build + LGBM retrain start')

# Load caches
fold_ids = np.load('fold_ids.npy')
y_log = np.load('y.npy')
tr = pd.read_parquet('train_fe.parquet')
te = pd.read_parquet('test_fe.parquet')
X_tr = pd.read_parquet('X.parquet')
X_te = pd.read_parquet('X_test.parquet')

# Build stoichiometry vector per sample: (N, n_al, n_ga, n_in) as ints
G_tr = tr[['N','n_al','n_ga','n_in']].astype(int).values
G_te = te[['N','n_al','n_ga','n_in']].astype(int).values

# Helper: KNN prior per fold (no leakage). Distance = L1 on counts. m-smooth to global mean.
def knn_prior_oof(G, y_log, fold_ids, K=7, m=20.0):
    n = len(G)
    out = np.zeros(n, dtype=float)
    gmean = float(y_log.mean())
    for k in np.unique(fold_ids):
        tr_idx = np.where(fold_ids != k)[0]; va_idx = np.where(fold_ids == k)[0]
        Gtr, ytr = G[tr_idx], y_log[tr_idx]
        # unique groups in train fold
        df_tr = pd.DataFrame(Gtr, columns=['N','al','ga','in'])
        grp = df_tr.groupby(['N','al','ga','in']).agg(idx=('N','size')).reset_index()
        # build map: group key -> (mean, count)
        keys = list(map(tuple, grp[['N','al','ga','in']].values))
        # compute mean and count per unique group
        df_stats = pd.DataFrame(Gtr, columns=['N','al','ga','in'])
        df_stats['y'] = ytr
        stats = df_stats.groupby(['N','al','ga','in']).agg(mean=('y','mean'), cnt=('y','size')).reset_index()
        U = stats[['N','al','ga','in']].values
        mu = stats['mean'].values; cnt = stats['cnt'].values
        # for each valid sample, compute L1 distance to all U and take KNN weighted mean (count-weighted), then m-smooth
        Gva = G[va_idx]
        # compute distances
        for i, g in enumerate(Gva):
            d = np.sum(np.abs(U - g), axis=1)
            # take K nearest
            idx = np.argpartition(d, K)[:K]
            w = cnt[idx].astype(float) + 1e-6
            m_knn = np.sum(mu[idx] * w) / np.sum(w)
            # m-smoothing towards global mean using total counts
            c_tot = float(np.sum(cnt[idx]))
            prior = (m_knn * c_tot + m * gmean) / (c_tot + m)
            out[va_idx[i]] = prior
    return out

t0 = time.time()
knn_oof = knn_prior_oof(G_tr, y_log, fold_ids, K=7, m=20.0)
print('KNN OOF built in', f'{time.time()-t0:.1f}s')

# Train-fold full-map for test using all train data
def knn_prior_infer(G_all, y_all, G_query, K=7, m=20.0):
    gmean = float(y_all.mean())
    df = pd.DataFrame(G_all, columns=['N','al','ga','in'])
    df['y'] = y_all
    stats = df.groupby(['N','al','ga','in']).agg(mean=('y','mean'), cnt=('y','size')).reset_index()
    U = stats[['N','al','ga','in']].values
    mu = stats['mean'].values; cnt = stats['cnt'].values
    out = np.zeros(len(G_query), dtype=float)
    for i, g in enumerate(G_query):
        d = np.sum(np.abs(U - g), axis=1)
        idx = np.argpartition(d, K)[:K]
        w = cnt[idx].astype(float) + 1e-6
        m_knn = np.sum(mu[idx] * w) / np.sum(w)
        c_tot = float(np.sum(cnt[idx]))
        out[i] = (m_knn * c_tot + m * gmean) / (c_tot + m)
    return out

t1 = time.time()
knn_te = knn_prior_infer(G_tr, y_log, G_te, K=7, m=20.0)
print('KNN test built in', f'{time.time()-t1:.1f}s')

# Append to matrices and retrain LGBM quickly (3 seeds)
X_tr_knn = X_tr.copy(); X_te_knn = X_te.copy()
X_tr_knn['knn_group_prior'] = knn_oof
X_te_knn['knn_group_prior'] = knn_te
med = X_tr_knn.median(numeric_only=True)
X_tr_knn = X_tr_knn.fillna(med); X_te_knn = X_te_knn.fillna(med)
num_cols = list(X_tr_knn.select_dtypes(include=[np.number]).columns)
X_tr_knn = X_tr_knn[num_cols]; X_te_knn = X_te_knn[num_cols]

params = {'objective':'regression','metric':'rmse','learning_rate':0.03,'num_leaves':96,'max_depth':-1,'min_data_in_leaf':500,'feature_fraction':0.75,'bagging_fraction':0.8,'bagging_freq':1,'lambda_l2':12.0,'lambda_l1':0.0,'verbosity':-1,'num_threads': N_THREADS,'deterministic': True,'force_col_wise': True}
seeds = SEEDS
n_splits = len(np.unique(fold_ids))
oof_seeds = []; pred_seeds = []
for SEED in seeds:
    p = dict(params); p['seed'] = int(SEED)
    oof = np.zeros(len(X_tr_knn)); pred = np.zeros(len(X_te_knn))
    for k in range(n_splits):
        tr_idx = np.where(fold_ids != k)[0]; va_idx = np.where(fold_ids == k)[0]
        dtr = lgb.Dataset(X_tr_knn.iloc[tr_idx], label=y_log[tr_idx], free_raw_data=False)
        dva = lgb.Dataset(X_tr_knn.iloc[va_idx], label=y_log[va_idx], free_raw_data=False)
        m = lgb.train(p, dtr, num_boost_round=7000, valid_sets=[dtr,dva], valid_names=['train','valid'], callbacks=[lgb.early_stopping(400), lgb.log_evaluation(0)])
        oof[va_idx] = m.predict(X_tr_knn.iloc[va_idx], num_iteration=m.best_iteration)
        pred += m.predict(X_te_knn, num_iteration=m.best_iteration)/n_splits
        del m, dtr, dva; gc.collect()
    rmse = float(mean_squared_error(y_log, oof) ** 0.5)
    print(f'KNN-LGB SEED {SEED}: OOF {rmse:.6f}')
    oof_seeds.append(oof); pred_seeds.append(pred)
oof_avg = np.mean(np.vstack(oof_seeds), axis=0)
pred_avg = np.mean(np.vstack(pred_seeds), axis=0)
cv = float(mean_squared_error(y_log, oof_avg) ** 0.5)
print(f'KNN-LGB blended seeds CV: {cv:.6f}')
np.save('oof_lgbm_knn.npy', oof_avg); np.save('pred_lgbm_knn_test.npy', pred_avg)

# Optional: blend with previous CB OOF if available and recalibrate
if Path('oof_catboost.npy').exists() and Path('pred_catboost_test.npy').exists():
    o_cb = np.load('oof_catboost.npy'); p_cb = np.load('pred_catboost_test.npy')
    P = np.vstack([oof_avg, o_cb]).T
    from scipy.optimize import nnls
    w, _ = nnls(P, y_log); w = w/(w.sum() if w.sum()>0 else 1.0)
    oof_blend = P @ w; cv_blend = float(mean_squared_error(y_log, oof_blend) ** 0.5)
    print('KNN-LGB + CB NNLS w:', w, '| CV:', cv_blend)
    Ptest = np.vstack([pred_avg, p_cb]).T
    pred_blend = Ptest @ w
    # Isotonic calibration
    from sklearn.isotonic import IsotonicRegression
    iso = IsotonicRegression(out_of_bounds='clip')
    iso.fit(oof_blend, y_log)
    oof_cal = iso.transform(oof_blend)
    cv_cal = float(mean_squared_error(y_log, oof_cal) ** 0.5)
    print('Post-calibration CV:', cv_cal)
    sub = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'bandgap_energy_ev': np.expm1(iso.transform(pred_blend)).clip(0,6.5)})
    sub.to_csv('submission.csv', index=False)
    print('submission.csv saved (KNN prior run):', sub.shape)
print('Done.')

In [18]:
# Ridge meta-stacker with OOF encodings + physics features + per-fold/global isotonic (choose best)
import numpy as np, pandas as pd, time, gc, json
from pathlib import Path
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import mean_squared_error

print('Ridge meta-stacker start')

# Load base OOF/test
y_log = np.load('y.npy')
fold_ids = np.load('fold_ids.npy')
oof_lgb = np.load('oof_lgbm.npy')
pred_lgb = np.load('pred_lgbm_test.npy')
oof_cb = np.load('oof_catboost.npy')
pred_cb = np.load('pred_catboost_test.npy')

# Load engineered frames for physics features
tr = pd.read_parquet('train_fe.parquet')
te = pd.read_parquet('test_fe.parquet')

# Recompute OOF ordered TEs to get train (OOF) + test encodings for meta features
X_tr_base = pd.read_parquet('X.parquet')
X_te_base = pd.read_parquet('X_test.parquet')
X_tr_enc_tmp, X_te_enc_tmp, meta_oof = add_encoded_features(X_tr_base, X_te_base, tr, te, y_log, fold_ids, seed=PRIMARY_SEED)

# Extract meta encoding columns (train OOF and test mapped)
def get_enc_pair(name):
    tr_col = f'te_{name}'; fe_tr_col = f'fe_{name}'
    te_col = f'te_{name}'; fe_te_col = f'fe_{name}'
    return (X_tr_enc_tmp[tr_col].values, X_te_enc_tmp[te_col].values,
            X_tr_enc_tmp[fe_tr_col].values, X_te_enc_tmp[fe_te_col].values)

te_sg_tr, te_sg_te, fe_sg_tr, fe_sg_te = get_enc_pair('sg')
te_ls_tr, te_ls_te, fe_ls_tr, fe_ls_te = get_enc_pair('ls')
te_Nb_tr, te_Nb_te, fe_Nb_tr, fe_Nb_te = get_enc_pair('Nb')

# Physics features for meta (compact, high-signal set) with availability guard
phys_cols = ['vegard_bg','H_cation','eff_cations','t_ratio','t_dev','rM_var','charge_density_6N',
             'dist_l2_center','w_al','w_ga','w_in','veg_w_al','veg_w_ga','veg_w_in']
phys_cols_avail = [c for c in phys_cols if c in tr.columns]
missing = [c for c in phys_cols if c not in phys_cols_avail]
if missing:
    print('Missing physics cols (skipped):', missing)
phys_tr = tr[phys_cols_avail].copy() if phys_cols_avail else pd.DataFrame(index=tr.index)
phys_te = te[phys_cols_avail].copy() if phys_cols_avail else pd.DataFrame(index=te.index)
if not phys_tr.empty:
    med = phys_tr.median(numeric_only=True)
    phys_tr = phys_tr.fillna(med)
    phys_te = phys_te.fillna(med)

# Build meta matrices
M_tr_list = [oof_lgb, oof_cb, te_sg_tr, te_ls_tr, te_Nb_tr, fe_sg_tr, fe_ls_tr, fe_Nb_tr]
M_te_list = [pred_lgb, pred_cb, te_sg_te, te_ls_te, te_Nb_te, fe_sg_te, fe_ls_te, fe_Nb_te]
if not phys_tr.empty:
    M_tr = np.column_stack(M_tr_list + [phys_tr.values])
    M_te = np.column_stack(M_te_list + [phys_te.values])
else:
    M_tr = np.column_stack(M_tr_list)
    M_te = np.column_stack(M_te_list)
print('Meta matrices:', M_tr.shape, M_te.shape)

# Per-fold standardization + RidgeCV
alphas = [0.1, 1.0, 10.0]
n = len(y_log); n_splits = len(np.unique(fold_ids))
oof_meta = np.zeros(n, dtype=float)
pred_meta = np.zeros(len(M_te), dtype=float)
for k in range(n_splits):
    tr_idx = np.where(fold_ids != k)[0]; va_idx = np.where(fold_ids == k)[0]
    sc = StandardScaler(with_mean=True, with_std=True)
    Mtr = sc.fit_transform(M_tr[tr_idx]); Mva = sc.transform(M_tr[va_idx]); Mte_sc = sc.transform(M_te)
    ridge = RidgeCV(alphas=alphas, fit_intercept=True, cv=None, scoring=None)
    ridge.fit(Mtr, y_log[tr_idx])
    oof_meta[va_idx] = ridge.predict(Mva)
    pred_meta += ridge.predict(Mte_sc) / n_splits
    print(f'Fold {k} Ridge alpha={ridge.alpha_:.3f}')
cv_meta = float(mean_squared_error(y_log, oof_meta) ** 0.5)
print(f'Ridge meta OOF CV RMSLE: {cv_meta:.6f}')

# Per-fold isotonic calibration on meta
oof_cal_fold = np.zeros_like(oof_meta)
pred_cal_fold_parts = []
for k in range(n_splits):
    tr_idx = np.where(fold_ids != k)[0]; va_idx = np.where(fold_ids == k)[0]
    iso_k = IsotonicRegression(out_of_bounds='clip')
    iso_k.fit(oof_meta[tr_idx], y_log[tr_idx])
    oof_cal_fold[va_idx] = iso_k.transform(oof_meta[va_idx])
    pred_cal_fold_parts.append(iso_k.transform(pred_meta))
cv_fold = float(mean_squared_error(y_log, oof_cal_fold) ** 0.5)
pred_cal_fold = np.mean(np.stack(pred_cal_fold_parts, axis=0), axis=0)
print(f'Per-fold isotonic-calibrated meta CV RMSLE: {cv_fold:.6f}')

# Global isotonic calibration on meta
iso_full = IsotonicRegression(out_of_bounds='clip')
iso_full.fit(oof_meta, y_log)
oof_cal_full = iso_full.transform(oof_meta)
cv_full = float(mean_squared_error(y_log, oof_cal_full) ** 0.5)
pred_cal_full = iso_full.transform(pred_meta)
print(f'Global isotonic meta CV RMSLE: {cv_full:.6f}')

# Choose best
use_full = cv_full <= cv_fold
pred_cal = pred_cal_full if use_full else pred_cal_fold
chosen = 'global' if use_full else 'per-fold'
print('Chosen calibration for meta:', chosen)

# Save calibrated submission
sub = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'bandgap_energy_ev': np.expm1(pred_cal).clip(0, 6.5)})
sub.to_csv('submission.csv', index=False)
print('submission.csv saved (ridge meta, best iso):', sub.shape)

Ridge meta-stacker start


Meta matrices: (2160, 22) (240, 22)
Fold 0 Ridge alpha=10.000
Fold 1 Ridge alpha=10.000
Fold 2 Ridge alpha=10.000
Fold 3 Ridge alpha=10.000
Fold 4 Ridge alpha=10.000
Fold 5 Ridge alpha=10.000
Fold 6 Ridge alpha=10.000
Fold 7 Ridge alpha=10.000
Ridge meta OOF CV RMSLE: 0.085054
Per-fold isotonic-calibrated meta CV RMSLE: 0.086810
Global isotonic meta CV RMSLE: 0.081867
Chosen calibration for meta: global
submission.csv saved (ridge meta, best iso): (240, 2)


In [None]:
# Diagnostics: inspect engineered feature columns
import pandas as pd
tr = pd.read_parquet('train_fe.parquet')
te = pd.read_parquet('test_fe.parquet')
cols = set(tr.columns.tolist())
check = ['vegard_bg','H_cation','eff_cations','t_ratio','t_dev','rM_var','charge_density_6N','dist_l2_center','w_al','w_ga','w_in','veg_w_al','veg_w_ga','veg_w_in']
missing = [c for c in check if c not in cols]
present = [c for c in check if c in cols]
print('Present:', present)
print('Missing:', missing)
print('Total columns in train_fe:', len(tr.columns))
print('Sample columns:', tr.columns[:30].tolist())

In [13]:
# Dual-CV base models: build second fold split (seed=777), retrain LGBM (Variant A) and CatBoost,
# average OOF/test across splits, and overwrite base OOF/preds for meta-stacking
import numpy as np, pandas as pd, time, gc, json, sys, subprocess, os
from pathlib import Path
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Ensure CatBoost is available
try:
    from catboost import CatBoostRegressor, Pool
except Exception:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--prefer-binary', '-q', 'catboost'])
    from catboost import CatBoostRegressor, Pool

print('Dual-CV base models start')
t0_all = time.time()

# Load caches and raw train for group key
y_log = np.load('y.npy')
tr = pd.read_parquet('train_fe.parquet')
te = pd.read_parquet('test_fe.parquet')
X_tr = pd.read_parquet('X.parquet')
X_te = pd.read_parquet('X_test.parquet')
train_csv = pd.read_csv('train.csv')

# Build stoichiometry group key from raw train (for fold construction)
gkey_tr, _, _, _, _, _ = compute_stoich_groups(train_csv)
y = train_csv['bandgap_energy_ev'].astype(float)

split_seeds = [PRIMARY_SEED, 777]
oofs_lgb_splits, preds_lgb_splits = [], []
oofs_cb_splits, preds_cb_splits = [], []

for s in split_seeds:
    print(f'--- Split seed {s} ---')
    fold_ids_split = build_stratified_group_folds(tr, gkey_tr.astype(str), y, n_splits=N_FOLDS, seed=int(s))
    # Encoded features (uses strict OOF within this split)
    X_tr_enc, X_te_enc, _ = add_encoded_features(X_tr, X_te, tr, te, y_log, fold_ids_split, seed=int(s))
    # LGBM Variant A, dropping te_* (physics + fe_ only), auto-drop const cols
    drop_te = [c for c in X_tr_enc.columns if c.startswith('te_')]
    X_tr_lgb = X_tr_enc.drop(columns=drop_te, errors='ignore').copy()
    X_te_lgb = X_te_enc.drop(columns=drop_te, errors='ignore').copy()
    std = X_tr_lgb.std(numeric_only=True); const_cols = list(std[std == 0].index)
    if const_cols:
        X_tr_lgb = X_tr_lgb.drop(columns=const_cols, errors='ignore'); X_te_lgb = X_te_lgb.drop(columns=const_cols, errors='ignore')
    params_lgb = {
        'objective':'regression','metric':'rmse','learning_rate':0.023,'num_leaves':48,'max_depth':-1,
        'min_data_in_leaf':160,'feature_fraction':0.62,'bagging_fraction':0.80,'bagging_freq':1,
        'lambda_l2':15.0,'lambda_l1':0.0,'verbosity':-1,'num_threads': N_THREADS,'deterministic': True,'force_col_wise': True
    }
    seeds = SEEDS
    n_splits = len(np.unique(fold_ids_split))
    oof_lgb_seeds, pred_lgb_seeds = [], []
    for SEED in seeds:
        p = dict(params_lgb); p['seed'] = int(SEED)
        oof = np.zeros(len(X_tr_lgb)); pred = np.zeros(len(X_te_lgb))
        t0 = time.time()
        for k in range(n_splits):
            tr_idx = np.where(fold_ids_split != k)[0]; va_idx = np.where(fold_ids_split == k)[0]
            dtr = lgb.Dataset(X_tr_lgb.iloc[tr_idx], label=y_log[tr_idx], free_raw_data=False)
            dva = lgb.Dataset(X_tr_lgb.iloc[va_idx], label=y_log[va_idx], free_raw_data=False)
            m = lgb.train(p, dtr, num_boost_round=9000, valid_sets=[dtr,dva], valid_names=['train','valid'], callbacks=[lgb.early_stopping(500), lgb.log_evaluation(0)])
            oof[va_idx] = m.predict(X_tr_lgb.iloc[va_idx], num_iteration=m.best_iteration)
            pred += m.predict(X_te_lgb, num_iteration=m.best_iteration) / n_splits
            del m, dtr, dva; gc.collect()
        print(f'LGB split {s} seed {SEED} OOF:', float(mean_squared_error(y_log, oof)**0.5))
        oof_lgb_seeds.append(oof); pred_lgb_seeds.append(pred)
    oof_lgb_avg = np.mean(np.vstack(oof_lgb_seeds), axis=0)
    pred_lgb_avg = np.mean(np.vstack(pred_lgb_seeds), axis=0)
    print(f'LGB split {s} blended seeds CV:', float(mean_squared_error(y_log, oof_lgb_avg)**0.5))
    oofs_lgb_splits.append(oof_lgb_avg); preds_lgb_splits.append(pred_lgb_avg)

    # CatBoost: drop te_* and add raw categoricals
    X_tr_cb = X_tr_enc.drop(columns=drop_te, errors='ignore').copy()
    X_te_cb = X_te_enc.drop(columns=drop_te, errors='ignore').copy()
    X_tr_cb['spacegroup'] = tr['spacegroup'].astype(str).values
    X_te_cb['spacegroup'] = te['spacegroup'].astype(str).values
    X_tr_cb['lattice_system'] = tr['lattice_system'].astype(int).astype(str).values
    X_te_cb['lattice_system'] = te['lattice_system'].astype(int).astype(str).values
    # Nb categorical with q=8 based on train bins
    try:
        _, bins = pd.qcut(tr['N'].astype(float), q=8, duplicates='drop', retbins=True)
        bins = np.unique(bins)
        Nb_tr_lab = pd.qcut(tr['N'].astype(float), q=8, labels=False, duplicates='drop').astype('Int64')
        Nb_te_raw = np.digitize(te['N'].astype(float).values, bins[1:-1], right=True)
        Nb_te_lab = pd.Series(Nb_te_raw, index=te.index).astype('Int64')
    except Exception:
        Nb_tr_lab = pd.qcut(tr['N'].astype(float), q=8, labels=False, duplicates='drop').astype('Int64')
        Nb_te_lab = pd.qcut(te['N'].astype(float), q=8, labels=False, duplicates='drop').astype('Int64')
    X_tr_cb['Nb_cat'] = Nb_tr_lab.astype(str).fillna('-1').values
    X_te_cb['Nb_cat'] = Nb_te_lab.astype(str).fillna('-1').values
    cat_cols = ['spacegroup','lattice_system','Nb_cat']
    cat_idx = [X_tr_cb.columns.get_loc(c) for c in cat_cols]
    num_cols = X_tr_cb.columns.difference(cat_cols)
    med = X_tr_cb[num_cols].median(numeric_only=True)
    X_tr_cb[num_cols] = X_tr_cb[num_cols].fillna(med)
    X_te_cb[num_cols] = X_te_cb[num_cols].fillna(med)
    oof_cb_seeds, pred_cb_seeds = [], []
    for SEED in seeds:
        params_cb = dict(loss_function='RMSE', iterations=8000, learning_rate=0.028, depth=7, l2_leaf_reg=15.0, subsample=0.8, rsm=0.78, od_type='Iter', od_wait=400, random_seed=int(SEED), verbose=0, allow_writing_files=False, thread_count=N_THREADS)
        oof = np.zeros(len(X_tr_cb)); pred = np.zeros(len(X_te_cb))
        for k in range(n_splits):
            tr_idx = np.where(fold_ids_split != k)[0]; va_idx = np.where(fold_ids_split == k)[0]
            pool_tr = Pool(X_tr_cb.iloc[tr_idx], y_log[tr_idx], cat_features=cat_idx)
            pool_va = Pool(X_tr_cb.iloc[va_idx], y_log[va_idx], cat_features=cat_idx)
            m = CatBoostRegressor(**params_cb); m.fit(pool_tr, eval_set=pool_va, use_best_model=True)
            oof[va_idx] = m.predict(pool_va)
            pred += m.predict(Pool(X_te_cb, cat_features=cat_idx)) / n_splits
            del m, pool_tr, pool_va; gc.collect()
        print(f'CB split {s} seed {SEED} OOF:', float(mean_squared_error(y_log, oof)**0.5))
        oof_cb_seeds.append(oof); pred_cb_seeds.append(pred)
    oof_cb_avg = np.mean(np.vstack(oof_cb_seeds), axis=0)
    pred_cb_avg = np.mean(np.vstack(pred_cb_seeds), axis=0)
    print(f'CB split {s} blended seeds CV:', float(mean_squared_error(y_log, oof_cb_avg)**0.5))
    oofs_cb_splits.append(oof_cb_avg); preds_cb_splits.append(pred_cb_avg)

# Average across splits (elementwise) and overwrite base arrays for meta
oof_lgb_dual = np.mean(np.vstack(oofs_lgb_splits), axis=0)
pred_lgb_dual = np.mean(np.vstack(preds_lgb_splits), axis=0)
oof_cb_dual = np.mean(np.vstack(oofs_cb_splits), axis=0)
pred_cb_dual = np.mean(np.vstack(preds_cb_splits), axis=0)
cv_lgb_dual = float(mean_squared_error(y_log, oof_lgb_dual) ** 0.5)
cv_cb_dual = float(mean_squared_error(y_log, oof_cb_dual) ** 0.5)
print(f'Dual-split LGB CV: {cv_lgb_dual:.6f} | Dual-split CB CV: {cv_cb_dual:.6f}')
np.save('oof_lgbm.npy', oof_lgb_dual); np.save('pred_lgbm_test.npy', pred_lgb_dual)
np.save('oof_catboost.npy', oof_cb_dual); np.save('pred_catboost_test.npy', pred_cb_dual)
print('Saved averaged base OOF/test arrays for meta. | elapsed', f'{time.time()-t0_all:.1f}s')

Dual-CV base models start
--- Split seed 42 ---


Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1375]	train's rmse: 0.0700237	valid's rmse: 0.0736036
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1337]	train's rmse: 0.0674385	valid's rmse: 0.094339
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1330]	train's rmse: 0.0682411	valid's rmse: 0.0828005
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2700]	train's rmse: 0.060081	valid's rmse: 0.10534
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1035]	train's rmse: 0.0714191	valid's rmse: 0.0849327
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1937]	train's rmse: 0.0664754	valid's rmse: 0.0715234
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2003]	train's rmse: 0.0623148	valid's rmse: 0.102896
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2128]	train's rmse: 0.0657641	valid's rmse: 0.0689785
LGB split 42 seed 7 OOF: 0.08655077648822491
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1842]	train's rmse: 0.0671059	valid's rmse: 0.0738396
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1120]	train's rmse: 0.0692965	valid's rmse: 0.0947926
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1329]	train's rmse: 0.0683401	valid's rmse: 0.083658
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[3367]	train's rmse: 0.0579914	valid's rmse: 0.105053
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[684]	train's rmse: 0.0757667	valid's rmse: 0.0855322
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1737]	train's rmse: 0.0675309	valid's rmse: 0.0718839
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1555]	train's rmse: 0.0648864	valid's rmse: 0.103393
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[5163]	train's rmse: 0.0565444	valid's rmse: 0.0687686
LGB split 42 seed 42 OOF: 0.08685890890565168


Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1291]	train's rmse: 0.0705062	valid's rmse: 0.0742929
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1322]	train's rmse: 0.0676976	valid's rmse: 0.0948128
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1130]	train's rmse: 0.0699983	valid's rmse: 0.0832855
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2791]	train's rmse: 0.0597308	valid's rmse: 0.104716
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[893]	train's rmse: 0.0728169	valid's rmse: 0.0849865
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1601]	train's rmse: 0.0684979	valid's rmse: 0.0728776
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1958]	train's rmse: 0.0625253	valid's rmse: 0.102719
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2447]	train's rmse: 0.0645121	valid's rmse: 0.0690991
LGB split 42 seed 2025 OOF: 0.08679915604398557
LGB split 42 blended seeds CV: 0.08662247995012161


CB split 42 seed 7 OOF: 0.08651405928813617


CB split 42 seed 42 OOF: 0.08649545911911077


CB split 42 seed 2025 OOF: 0.08650920878616801
CB split 42 blended seeds CV: 0.0861648606238306
--- Split seed 777 ---


Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2051]	train's rmse: 0.0645006	valid's rmse: 0.0859913
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[631]	train's rmse: 0.0773221	valid's rmse: 0.0737914
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2456]	train's rmse: 0.0636422	valid's rmse: 0.0823387
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1745]	train's rmse: 0.0671703	valid's rmse: 0.0798162
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2066]	train's rmse: 0.0633397	valid's rmse: 0.104263
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1287]	train's rmse: 0.0682879	valid's rmse: 0.0911895
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[678]	train's rmse: 0.0766524	valid's rmse: 0.0717539
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1940]	train's rmse: 0.063647	valid's rmse: 0.0991081
LGB split 777 seed 7 OOF: 0.08637382185707773
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2257]	train's rmse: 0.0636461	valid's rmse: 0.0855932
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[586]	train's rmse: 0.0781826	valid's rmse: 0.0733462
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2165]	train's rmse: 0.0649358	valid's rmse: 0.08171
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1907]	train's rmse: 0.0665785	valid's rmse: 0.0802221
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2273]	train's rmse: 0.0624058	valid's rmse: 0.103934
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1370]	train's rmse: 0.0676342	valid's rmse: 0.0909848
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[650]	train's rmse: 0.0772517	valid's rmse: 0.0715847
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2232]	train's rmse: 0.0622368	valid's rmse: 0.099482
LGB split 777 seed 42 OOF: 0.08621274051102557
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2290]	train's rmse: 0.0634761	valid's rmse: 0.0858605
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[701]	train's rmse: 0.0763349	valid's rmse: 0.073789
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2194]	train's rmse: 0.0646934	valid's rmse: 0.0815118
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2183]	train's rmse: 0.0652629	valid's rmse: 0.0802758
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1558]	train's rmse: 0.0659906	valid's rmse: 0.103709
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[1501]	train's rmse: 0.0666867	valid's rmse: 0.0907639
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[596]	train's rmse: 0.0782789	valid's rmse: 0.0713732
Training until validation scores don't improve for 500 rounds


Early stopping, best iteration is:
[2023]	train's rmse: 0.0632113	valid's rmse: 0.0994276
LGB split 777 seed 2025 OOF: 0.08618616533255861
LGB split 777 blended seeds CV: 0.08615981693803354


CB split 777 seed 7 OOF: 0.0864775044391502


CB split 777 seed 42 OOF: 0.0866485247088922


CB split 777 seed 2025 OOF: 0.08732978920289618
CB split 777 blended seeds CV: 0.0863866721831346
Dual-split LGB CV: 0.085840 | Dual-split CB CV: 0.085882
Saved averaged base OOF/test arrays for meta. | elapsed 611.1s


In [19]:
# Validate and standardize submission format
import pandas as pd, numpy as np
test_ids = pd.read_csv('test.csv')['id']
sub = pd.read_csv('submission.csv')
print('Submission head:', sub.head())
assert list(sub.columns) == ['id','bandgap_energy_ev'], f'Bad columns: {sub.columns.tolist()}'
assert len(sub) == len(test_ids), f'Row count mismatch: {len(sub)} vs {len(test_ids)}'
assert set(sub['id']) == set(test_ids), 'ID set mismatch with test.csv'
assert sub['bandgap_energy_ev'].notna().all(), 'Found NaNs in predictions'
assert np.isfinite(sub['bandgap_energy_ev']).all(), 'Found non-finite values in predictions'
# enforce types and order by id to be safe
sub = sub[['id','bandgap_energy_ev']].copy()
sub['id'] = sub['id'].astype(int)
sub['bandgap_energy_ev'] = sub['bandgap_energy_ev'].astype(float)
sub = sub.merge(test_ids.to_frame('id'), on='id', how='right')
sub = sub[['id','bandgap_energy_ev']].sort_values('id')
sub.to_csv('submission.csv', index=False)
print('submission.csv validated and saved:', sub.shape, sub.dtypes.to_dict())

Submission head:    id  bandgap_energy_ev
0   1           1.884447
1   2           1.701527
2   3           4.335909
3   4           2.973716
4   5           1.222728
submission.csv validated and saved: (240, 2) {'id': dtype('int64'), 'bandgap_energy_ev': dtype('float64')}


In [20]:
# Diagnose expected submission columns and target availability
import pandas as pd, numpy as np
train = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print('Train columns:', train.columns.tolist())
print('Test columns:', test_df.columns.tolist())
for col in ['bandgap_energy_ev', 'formation_energy_ev_natom']:
    if col in train.columns:
        mn, mx = train[col].min(), train[col].max()
        print(f"{col}: min={mn}, max={mx}")
    else:
        print(col, 'NOT FOUND in train.csv')
print('submission.csv preview:')
print(pd.read_csv('submission.csv').head())

Train columns: ['id', 'spacegroup', 'number_of_total_atoms', 'percent_atom_al', 'percent_atom_ga', 'percent_atom_in', 'lattice_vector_1_ang', 'lattice_vector_2_ang', 'lattice_vector_3_ang', 'lattice_angle_alpha_degree', 'lattice_angle_beta_degree', 'lattice_angle_gamma_degree', 'formation_energy_ev_natom', 'bandgap_energy_ev']
Test columns: ['id', 'spacegroup', 'number_of_total_atoms', 'percent_atom_al', 'percent_atom_ga', 'percent_atom_in', 'lattice_vector_1_ang', 'lattice_vector_2_ang', 'lattice_vector_3_ang', 'lattice_angle_alpha_degree', 'lattice_angle_beta_degree', 'lattice_angle_gamma_degree']
bandgap_energy_ev: min=0.0001, max=5.2861
formation_energy_ev_natom: min=0.0, max=0.6572
submission.csv preview:
   id  bandgap_energy_ev
0   1           1.884447
1   2           1.701527
2   3           4.335909
3   4           2.973716
4   5           1.222728


In [21]:
# Train formation_energy_ev_natom model (LGBM) and build multi-target submission
import numpy as np, pandas as pd, time, gc, json
from pathlib import Path
import lightgbm as lgb

print('Formation energy model start')
t0 = time.time()

# Load caches and folds
fold_ids = np.load('fold_ids.npy')
X_tr = pd.read_parquet('X.parquet')
X_te = pd.read_parquet('X_test.parquet')
tr = pd.read_parquet('train_fe.parquet')
te = pd.read_parquet('test_fe.parquet')
train_csv = pd.read_csv('train.csv')

# Target (log1p)
y_form = train_csv['formation_energy_ev_natom'].astype(float).values
y_form_log = np.log1p(np.clip(y_form, 0, None))

# Centralized encodings using same folds and target y_form_log
X_tr_enc, X_te_enc, _ = add_encoded_features(X_tr, X_te, tr, te, y_form_log, fold_ids, seed=PRIMARY_SEED)

# For LGBM stability: drop te_* (use physics + fe_ only), drop const
drop_te_cols = [c for c in X_tr_enc.columns if c.startswith('te_')]
X_tr_lgb = X_tr_enc.drop(columns=drop_te_cols, errors='ignore').copy()
X_te_lgb = X_te_enc.drop(columns=drop_te_cols, errors='ignore').copy()
std = X_tr_lgb.std(numeric_only=True)
const_cols = list(std[std == 0].index)
if const_cols:
    X_tr_lgb = X_tr_lgb.drop(columns=const_cols, errors='ignore')
    X_te_lgb = X_te_lgb.drop(columns=const_cols, errors='ignore')
print('Form LGB matrices:', X_tr_lgb.shape, X_te_lgb.shape)

# LGBM params (mirrored, slightly stronger leaf regularization for smoother target)
params = {
    'objective': 'regression', 'metric': 'rmse',
    'learning_rate': 0.023, 'num_leaves': 48, 'max_depth': -1,
    'min_data_in_leaf': 200, 'feature_fraction': 0.62,
    'bagging_fraction': 0.80, 'bagging_freq': 1,
    'lambda_l2': 15.0, 'lambda_l1': 0.0,
    'verbosity': -1, 'num_threads': N_THREADS,
    'deterministic': True, 'force_col_wise': True
}

seeds = SEEDS
n_splits = len(np.unique(fold_ids))
oof_seeds, pred_seeds = [], []
for SEED in seeds:
    p = dict(params); p['seed'] = int(SEED)
    oof = np.zeros(len(X_tr_lgb), dtype=float)
    pred = np.zeros(len(X_te_lgb), dtype=float)
    t0s = time.time()
    for k in range(n_splits):
        tr_idx = np.where(fold_ids != k)[0]; va_idx = np.where(fold_ids == k)[0]
        dtr = lgb.Dataset(X_tr_lgb.iloc[tr_idx], label=y_form_log[tr_idx], free_raw_data=False)
        dva = lgb.Dataset(X_tr_lgb.iloc[va_idx], label=y_form_log[va_idx], free_raw_data=False)
        m = lgb.train(p, dtr, num_boost_round=6000, valid_sets=[dtr, dva], valid_names=['train','valid'], callbacks=[lgb.early_stopping(400), lgb.log_evaluation(0)])
        oof[va_idx] = m.predict(X_tr_lgb.iloc[va_idx], num_iteration=m.best_iteration)
        pred += m.predict(X_te_lgb, num_iteration=m.best_iteration) / n_splits
        del m, dtr, dva; gc.collect()
    oof_seeds.append(oof); pred_seeds.append(pred)
    from sklearn.metrics import mean_squared_error
    rmse = float(mean_squared_error(y_form_log, oof) ** 0.5)
    print(f'Form LGB SEED {SEED}: OOF RMSLE {rmse:.6f}')

oof_avg = np.mean(np.vstack(oof_seeds), axis=0)
pred_avg = np.mean(np.vstack(pred_seeds), axis=0)
from sklearn.metrics import mean_squared_error
cv = float(mean_squared_error(y_form_log, oof_avg) ** 0.5)
print(f'Form LGB blended seeds CV RMSLE: {cv:.6f}')
np.save('oof_form_lgbm.npy', oof_avg)
np.save('pred_form_lgbm_test.npy', pred_avg)

# Build multi-target submission by merging formation predictions with current bandgap submission
sub_bg = pd.read_csv('submission.csv')  # contains id, bandgap_energy_ev
form_pred = np.expm1(pred_avg).clip(0, None)
sub = pd.DataFrame({'id': pd.read_csv('test.csv')['id']})
sub['formation_energy_ev_natom'] = form_pred
sub = sub.merge(sub_bg, on='id', how='left')
assert sub['bandgap_energy_ev'].notna().all(), 'Missing bandgap predictions when merging'
sub = sub[['id', 'formation_energy_ev_natom', 'bandgap_energy_ev']]
sub.to_csv('submission.csv', index=False)
print('Multi-target submission.csv saved:', sub.shape, '| elapsed', f'{time.time()-t0:.1f}s')

Formation energy model start


Form LGB matrices: (2160, 125) (240, 125)
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[4280]	train's rmse: 0.0246047	valid's rmse: 0.0313949
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[1174]	train's rmse: 0.0281075	valid's rmse: 0.0352266
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[3231]	train's rmse: 0.0255598	valid's rmse: 0.0299978
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[2419]	train's rmse: 0.0255088	valid's rmse: 0.0355533
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[1170]	train's rmse: 0.0285994	valid's rmse: 0.0290569
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[1333]	train's rmse: 0.0266241	valid's rmse: 0.0388658
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[3260]	train's rmse: 0.0252959	valid's rmse: 0.0310031
Form LGB SEED 7: OOF RMSLE 0.033014
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[4736]	train's rmse: 0.0243078	valid's rmse: 0.0312392


Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[1142]	train's rmse: 0.0281651	valid's rmse: 0.0352259
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[2602]	train's rmse: 0.0261938	valid's rmse: 0.0301218
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[1462]	train's rmse: 0.0275537	valid's rmse: 0.0326305
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[3219]	train's rmse: 0.0245709	valid's rmse: 0.035575
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[1252]	train's rmse: 0.0283729	valid's rmse: 0.0287606
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[1642]	train's rmse: 0.0260012	valid's rmse: 0.0390344
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[3647]	train's rmse: 0.0249991	valid's rmse: 0.0310648
Form LGB SEED 42: OOF RMSLE 0.033064
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[4789]	train's rmse: 0.0242708	valid's rmse: 0.0314578


Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[1056]	train's rmse: 0.0283244	valid's rmse: 0.0353031
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[2570]	train's rmse: 0.0262369	valid's rmse: 0.0303921
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[2184]	train's rmse: 0.0264441	valid's rmse: 0.0324046
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[3144]	train's rmse: 0.0246561	valid's rmse: 0.0355834
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[1322]	train's rmse: 0.0281652	valid's rmse: 0.0289095
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[1372]	train's rmse: 0.0265239	valid's rmse: 0.039015
Training until validation scores don't improve for 400 rounds


Early stopping, best iteration is:
[4177]	train's rmse: 0.0245691	valid's rmse: 0.0308337
Form LGB SEED 2025: OOF RMSLE 0.033096
Form LGB blended seeds CV RMSLE: 0.033022
Multi-target submission.csv saved: (240, 3) | elapsed 31.6s
