# INGV - Volcanic Eruption Prediction: Plan

Goal: Ship a strong, validated baseline fast, then iterate to medal.

Plan v0:
- Verify environment (GPU) and repo contents
- Inspect data schema (train.csv, sample_submission.csv, train/ and test/ folders)
- Establish CV protocol mirroring test (grouped by series, temporal-safe)
- Baseline features from raw signals (per-sensor stats, spectral features) with fast model (XGBoost GPU / CatBoost GPU)
- Create a fast baseline submission
- Iterate: richer features (STFT bands, autocorr, RMS, kurtosis, rolling windows), seed/fold repeats, simple blends

Checkpoints to request expert review:
1) After this plan
2) After EDA + CV choice
3) After baseline model + OOF
4) After feature set v1/v2
5) Before long training jobs / blends

Metric: MAE on time_to_eruption. Output: submission.csv

In [1]:
# Env + Data schema check
import os, sys, glob, subprocess, time, json
from datetime import datetime
import pandas as pd
import numpy as np

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

# 1) GPU check
log('Checking GPU with nvidia-smi...')
try:
    res = subprocess.run(['bash','-lc','nvidia-smi || true'], capture_output=True, text=True)
    print(res.stdout)
except Exception as e:
    log(f'GPU check failed: {e}')

# 2) Repo contents
log('Listing repo contents...')
for p in sorted(os.listdir('.')):
    try:
        print(p, '->', len(os.listdir(p)) if os.path.isdir(p) else os.path.getsize(p))
    except Exception:
        print(p)

# 3) Load CSVs
log('Loading train.csv and sample_submission.csv')
train = pd.read_csv('train.csv')
ss = pd.read_csv('sample_submission.csv')
log(f'train.shape={train.shape}; columns={list(train.columns)}')
log(f'sample_submission.shape={ss.shape}; columns={list(ss.columns)}')
print(train.head(3))
print(ss.head(3))

# Basic target stats
if 'time_to_eruption' in train.columns:
    y = train['time_to_eruption']
    log(f'target describe (secs): min={y.min()}, max={y.max()}, mean={y.mean():.1f}, median={y.median():.1f}')

# 4) Train/Test files
train_files = sorted(glob.glob('train/*.csv'))
test_files = sorted(glob.glob('test/*.csv'))
log(f'Found {len(train_files)} train files, {len(test_files)} test files')
print('Example train files:', train_files[:3])
print('Example test files:', test_files[:3])

# Peek one train segment file to infer schema
if train_files:
    log(f'Loading sample segment file: {train_files[0]}')
    df0 = pd.read_csv(train_files[0])
    log(f'segment shape: {df0.shape}; cols: {list(df0.columns)[:10]}...')
    print(df0.head(3))

# 5) Infer series_id groups for CV suggestion
if {'segment_id','time_to_eruption'}.issubset(train.columns):
    tmp = train[['segment_id','time_to_eruption']].sort_values('segment_id').copy()
    # Heuristic: series boundaries where t diff increases vs previous row
    d = tmp['time_to_eruption'].diff()
    series_id = (d > 0).cumsum().astype(int)
    train['series_id_inferred'] = series_id.values
    log(f'Inferred series count: {train.series_id_inferred.nunique()}')
    print(train[['segment_id','time_to_eruption','series_id_inferred']].head(10))

log('Env/data check complete.')

[2025-09-27T02:19:20.199431Z] Checking GPU with nvidia-smi...


Sat Sep 27 02:19:20 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.06             Driver Version: 550.144.06     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A10-24Q                 On  |   00000002:00:00.0 Off |                    0 |
| N/A   N/A    P0             N/A /  N/A  |     182MiB /  24512MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

.00_eda_and_planning_kernel_state.json -> 182
00_eda_and_planning.ipynb -> 4911
agent_metadata -> 10
description.md -> 4726
docker_run.log -> 39871
requirements.txt -> 2021
sample_submission.csv -> 5559
submission.csv -> 5559
task.txt -> 2836
test -> 444
train -> 3987
train.csv -> 76704
[2025-09-27T02:19:20.227044Z] Loading train.csv and sample_submission.csv


[2025-09-27T02:19:20.230176Z] train.shape=(3987, 2); columns=['segment_id', 'time_to_eruption']


[2025-09-27T02:19:20.230595Z] sample_submission.shape=(444, 2); columns=['segment_id', 'time_to_eruption']


   segment_id  time_to_eruption
0  1410693848          20109998
1   987159268           7202883
2  1990984540          28138930
   segment_id  time_to_eruption
0   951290289                 0
1   508758258                 0
2  1566132188                 0
[2025-09-27T02:19:20.233278Z] target describe (secs): min=6250, max=49046087, mean=22675929.1, median=22300345.0


[2025-09-27T02:19:20.239174Z] Found 3987 train files, 444 test files


Example train files: ['train/1000015382.csv', 'train/1000554676.csv', 'train/1000745424.csv']
Example test files: ['test/1003520023.csv', 'test/1004346803.csv', 'test/1007996426.csv']
[2025-09-27T02:19:20.239677Z] Loading sample segment file: train/1000015382.csv


[2025-09-27T02:19:20.275681Z] segment shape: (60001, 10); cols: ['sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6', 'sensor_7', 'sensor_8', 'sensor_9', 'sensor_10']...


   sensor_1  sensor_2  sensor_3  sensor_4  sensor_5  sensor_6  sensor_7  \
0     260.0      64.0    -232.0     -36.0      -2.0     -35.0     103.0   
1     233.0     175.0     146.0     160.0      -4.0      29.0    -120.0   
2     216.0     236.0     321.0     202.0       2.0     113.0    -230.0   

   sensor_8  sensor_9  sensor_10  
0     389.0      67.0       41.0  
1     498.0      59.0       63.0  
2     554.0      97.0       90.0  
[2025-09-27T02:19:20.282421Z] Inferred series count: 1973


   segment_id  time_to_eruption  series_id_inferred
0  1410693848          20109998                   0
1   987159268           7202883                   0
2  1990984540          28138930                   0
3   983270799            759643                   1
4   116548092          47441128                   1
5  1211468948          20770278                   2
6   256946052          19237462                   2
7  1109804924           1153377                   2
8    48233383          28213217                   3
9  1305598339          34191056                   4
[2025-09-27T02:19:20.284125Z] Env/data check complete.


In [2]:
# CV groups (series_id) + Fast feature pipeline smoke test
import os, glob, gc
import numpy as np
import pandas as pd
from datetime import datetime

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

# 1) Infer series_id from ORIGINAL train.csv order (per expert guidance)
train_csv = pd.read_csv('train.csv')
t = train_csv['time_to_eruption']
series_id = (t.diff() > 0).cumsum().astype(int)
train_csv['series_id_inferred'] = series_id
log(f"Series inferred (orig order): n_series={train_csv.series_id_inferred.nunique()} with sizes head: {train_csv.series_id_inferred.value_counts().head(5).to_dict()}")

# 2) File maps
train_map = {int(os.path.splitext(os.path.basename(p))[0]): p for p in glob.glob('train/*.csv')}
test_map = {int(os.path.splitext(os.path.basename(p))[0]): p for p in glob.glob('test/*.csv')}
assert len(train_map) == len(train_csv), 'Mismatch train files vs train.csv rows'
log(f"Train files mapped: {len(train_map)}; Test files mapped: {len(test_map)}")

# 3) Feature extraction
SENSORS = [f'sensor_{i}' for i in range(1,11)]
WINDOW_SPECS = {
    'full': (0.0, 1.0),
    'half': (0.5, 1.0),
    'quarter': (0.75, 1.0),
}

def robust_stats(x):
    x = x.astype(np.float32)
    med = np.median(x)
    mad = np.median(np.abs(x - med))
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    q05, q95 = np.percentile(x, [5, 95])
    return {
        'mean': float(np.mean(x)),
        'std': float(np.std(x)),
        'median': float(med),
        'mad': float(mad),
        'iqr': float(iqr),
        'min': float(np.min(x)),
        'max': float(np.max(x)),
        'ptp': float(np.ptp(x)),
        'q05': float(q05),
        'q95': float(q95),
    }

def zcr(x):
    x = x.astype(np.float32)
    s = np.signbit(x)
    return float(np.count_nonzero(s[1:] != s[:-1]) / (len(x) - 1))

def line_length(x):
    x = x.astype(np.float32)
    return float(np.sum(np.abs(np.diff(x))) / (len(x) - 1))

def rms(x):
    x = x.astype(np.float32)
    return float(np.sqrt(np.mean(x * x)))

def extract_features_for_segment(segment_path):
    df = pd.read_csv(segment_path)
    n = len(df)
    feats = {}
    for wname, (start_frac, end_frac) in WINDOW_SPECS.items():
        s = int(n * start_frac)
        e = int(n * end_frac)
        win = df.iloc[s:e]
        for col in SENSORS:
            x = win[col].values
            rs = robust_stats(x)
            feats.update({f"{col}__{wname}__{k}": v for k, v in rs.items()})
            feats[f"{col}__{wname}__rms"] = rms(x)
            feats[f"{col}__{wname}__zcr"] = zcr(x)
            feats[f"{col}__{wname}__linelen"] = line_length(x)
    # cross-sensor aggregates (mean/std across sensors for a couple stats on full window)
    # Example: mean/std of rms across sensors (full window)
    rms_vals = [feats[f"sensor_{i}__full__rms"] for i in range(1,11)]
    feats['cross_full_rms_mean'] = float(np.mean(rms_vals))
    feats['cross_full_rms_std'] = float(np.std(rms_vals))
    return feats

# 4) Smoke test on a subset for speed
FAST_N = 200  # increase later to all
train_rows = train_csv.head(FAST_N).copy()
test_ids = list(test_map.keys())[:min(100, len(test_map))]  # quick smoke on test

log(f"Extracting features for {len(train_rows)} train segments (smoke)")
train_feat_rows = []
for i, row in train_rows.iterrows():
    seg_id = int(row['segment_id'])
    path = train_map[seg_id]
    f = extract_features_for_segment(path)
    f['segment_id'] = seg_id
    f['series_id'] = int(row['series_id_inferred'])
    f['time_to_eruption'] = int(row['time_to_eruption'])
    train_feat_rows.append(f)
    if (len(train_feat_rows) % 25) == 0:
        log(f"Processed {len(train_feat_rows)}/{len(train_rows)} train segments")

train_feats = pd.DataFrame(train_feat_rows).set_index('segment_id')
log(f"Train features shape: {train_feats.shape}")

log(f"Extracting features for {len(test_ids)} test segments (smoke)")
test_feat_rows = []
for idx, seg_id in enumerate(test_ids, 1):
    path = test_map[seg_id]
    f = extract_features_for_segment(path)
    f['segment_id'] = seg_id
    test_feat_rows.append(f)
    if (idx % 25) == 0:
        log(f"Processed {idx}/{len(test_ids)} test segments")

test_feats = pd.DataFrame(test_feat_rows).set_index('segment_id')
log(f"Test features shape: {test_feats.shape}")
log(f"Feature columns: {len(train_feats.columns)}")
print(sorted(train_feats.columns)[:15], '...')

gc.collect();
log('Smoke feature build complete. Next: scale up to full dataset and train XGBoost GPU with GroupKFold.')

[2025-09-27T02:20:21.792431Z] Series inferred (orig order): n_series=2003 with sizes head: {1593: 6, 1795: 6, 131: 6, 671: 5, 1268: 5}


[2025-09-27T02:20:21.801259Z] Train files mapped: 3987; Test files mapped: 444


[2025-09-27T02:20:21.802426Z] Extracting features for 200 train segments (smoke)


[2025-09-27T02:20:23.648586Z] Processed 25/200 train segments


[2025-09-27T02:20:25.439184Z] Processed 50/200 train segments


[2025-09-27T02:20:27.245325Z] Processed 75/200 train segments


[2025-09-27T02:20:29.082433Z] Processed 100/200 train segments


[2025-09-27T02:20:30.831002Z] Processed 125/200 train segments


[2025-09-27T02:20:32.661424Z] Processed 150/200 train segments


[2025-09-27T02:20:34.439953Z] Processed 175/200 train segments


[2025-09-27T02:20:36.296617Z] Processed 200/200 train segments


[2025-09-27T02:20:36.308461Z] Train features shape: (200, 394)


[2025-09-27T02:20:36.308899Z] Extracting features for 100 test segments (smoke)


[2025-09-27T02:20:38.130220Z] Processed 25/100 test segments


[2025-09-27T02:20:39.941790Z] Processed 50/100 test segments


[2025-09-27T02:20:41.753577Z] Processed 75/100 test segments


[2025-09-27T02:20:43.585741Z] Processed 100/100 test segments


[2025-09-27T02:20:43.592514Z] Test features shape: (100, 392)


[2025-09-27T02:20:43.592872Z] Feature columns: 394


['cross_full_rms_mean', 'cross_full_rms_std', 'sensor_10__full__iqr', 'sensor_10__full__linelen', 'sensor_10__full__mad', 'sensor_10__full__max', 'sensor_10__full__mean', 'sensor_10__full__median', 'sensor_10__full__min', 'sensor_10__full__ptp', 'sensor_10__full__q05', 'sensor_10__full__q95', 'sensor_10__full__rms', 'sensor_10__full__std', 'sensor_10__full__zcr'] ...
[2025-09-27T02:20:43.637955Z] Smoke feature build complete. Next: scale up to full dataset and train XGBoost GPU with GroupKFold.


In [8]:
# Full feature build with rFFT spectral features + short windows + ACF, caching (CSV)
import os, glob, gc, math
import numpy as np
import pandas as pd
from datetime import datetime
from joblib import Parallel, delayed

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

# Load train meta and infer series_id from original order
train_meta = pd.read_csv('train.csv')
t = train_meta['time_to_eruption']
train_meta['series_id'] = (t.diff() > 0).cumsum().astype(int)

# Maps
train_map = {int(os.path.splitext(os.path.basename(p))[0]): p for p in glob.glob('train/*.csv')}
test_map = {int(os.path.splitext(os.path.basename(p))[0]): p for p in glob.glob('test/*.csv')}
assert len(train_map) == len(train_meta), 'Mismatch train files vs train.csv rows'

SENSORS = [f'sensor_{i}' for i in range(1,11)]
WINDOW_SPECS = {
    'full': (0.0, 1.0),
    'quarter': (0.75, 1.0),
    'p10': (0.90, 1.0),
    'p5': (0.95, 1.0),
    'p2': (0.98, 1.0),
    'p1': (0.99, 1.0),
}

# Time-domain feature helpers
def robust_stats(x):
    x = x.astype(np.float32)
    med = np.median(x)
    mad = np.median(np.abs(x - med))
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    q05, q95 = np.percentile(x, [5, 95])
    return {
        'mean': float(np.mean(x)),
        'std': float(np.std(x)),
        'median': float(med),
        'mad': float(mad),
        'iqr': float(iqr),
        'min': float(np.min(x)),
        'max': float(np.max(x)),
        'ptp': float(np.ptp(x)),
        'q05': float(q05),
        'q95': float(q95),
    }

def zcr(x):
    x = x.astype(np.float32)
    s = np.signbit(x)
    return float(np.count_nonzero(s[1:] != s[:-1]) / max(1, (len(x) - 1)))

def line_length(x):
    x = x.astype(np.float32)
    return float(np.sum(np.abs(np.diff(x))) / max(1, (len(x) - 1)))

def rms(x):
    x = x.astype(np.float32)
    return float(np.sqrt(np.mean(x * x)))

def acf_feats(x, lags=(1,2,4,8,16)):
    x = x.astype(np.float32)
    n = len(x)
    if n < 3:
        return {**{f'acf_lag_{l}': 0.0 for l in lags}, 'acf_decay_e': float(lags[-1]), 'acf_decay_0p5': float(lags[-1])}
    xm = x - np.mean(x)
    var = float(np.sum(xm * xm) / max(1, n))
    out = {}
    acf_vals = []
    for lag in lags:
        if lag >= n:
            r = 0.0
        else:
            num = float(np.dot(xm[:-lag], xm[lag:]) / max(1, (n - lag)))
            r = num / (var + 1e-8)
        out[f'acf_lag_{lag}'] = r
        acf_vals.append((lag, r))
    # decay: first lag where |acf| < 1/e and < 0.5
    decay_e = lags[-1]
    decay_05 = lags[-1]
    for lag, r in acf_vals:
        if decay_e == lags[-1] and abs(r) < (1.0 / math.e):
            decay_e = lag
        if decay_05 == lags[-1] and abs(r) < 0.5:
            decay_05 = lag
    out['acf_decay_e'] = float(decay_e)
    out['acf_decay_0p5'] = float(decay_05)
    return out

# Spectral helpers (rFFT, fs=100 Hz, n_fft=4096, Hann)
FS = 100.0
N_FFT = 4096
FREQS = np.fft.rfftfreq(N_FFT, 1.0/FS).astype(np.float32)
BANDS = [(0.0,2.0),(2.0,5.0),(5.0,10.0),(10.0,20.0),(20.0,50.0)]
HANN = np.hanning(N_FFT).astype(np.float32)

def band_indices(freqs, band):
    lo, hi = band
    return np.where((freqs >= lo) & (freqs < hi))[0]

BAND_IDXS = [band_indices(FREQS, b) for b in BANDS]

def spectral_feats(x):
    x = x.astype(np.float32)
    # take last N_FFT samples (pad if needed)
    if len(x) >= N_FFT:
        xw = x[-N_FFT:]
    else:
        pad = np.zeros(N_FFT, dtype=np.float32)
        pad[-len(x):] = x
        xw = pad
    xw = xw * HANN
    X = np.fft.rfft(xw, n=N_FFT)
    P = (np.abs(X) ** 2).astype(np.float32)
    total = float(P.sum() + 1e-12)
    # bandpowers and fractions
    bands_power = [float(P[idx].sum()) for idx in BAND_IDXS]
    bands_frac = [bp / total for bp in bands_power]
    centroid = float((FREQS * P).sum() / total)
    p_norm = P / total
    entropy = float(-(p_norm * np.log(p_norm + 1e-12)).sum())
    flatness = float(np.exp(np.mean(np.log(P + 1e-12))) / (np.mean(P) + 1e-12))
    out = {
        'spec_total_power': total,
        'spec_centroid': centroid,
        'spec_entropy': entropy,
        'spec_flatness': flatness,
    }
    for i, (lo,hi) in enumerate(BANDS):
        out[f'spec_band_{int(lo)}_{int(hi)}_power'] = bands_power[i]
        out[f'spec_band_{int(lo)}_{int(hi)}_frac'] = bands_frac[i]
    return out

def extract_one(seg_id, path):
    try:
        df = pd.read_csv(path)
        n = len(df)
        feats = {}
        # per-window, per-sensor
        for wname, (start_frac, end_frac) in WINDOW_SPECS.items():
            s = int(n * start_frac)
            e = int(n * end_frac)
            win = df.iloc[s:e]
            # time-domain + ACF
            for col in SENSORS:
                x = win[col].values.astype(np.float32)
                rs = robust_stats(x)
                for k,v in rs.items():
                    feats[f"{col}__{wname}__{k}"] = np.float32(v)
                feats[f"{col}__{wname}__rms"] = np.float32(rms(x))
                feats[f"{col}__{wname}__zcr"] = np.float32(zcr(x))
                feats[f"{col}__{wname}__linelen"] = np.float32(line_length(x))
                acf = acf_feats(x)
                for k,v in acf.items():
                    feats[f"{col}__{wname}__{k}"] = np.float32(v)
            # spectral on last N_FFT samples of this window
            total_powers = []
            centroids = []
            for col in SENSORS:
                x = win[col].values.astype(np.float32)
                sf = spectral_feats(x)
                for k,v in sf.items():
                    feats[f"{col}__{wname}__{k}"] = np.float32(v)
                total_powers.append(sf['spec_total_power'])
                centroids.append(sf['spec_centroid'])
            feats[f"cross_{wname}__spec_total_power_mean"] = np.float32(np.mean(total_powers))
            feats[f"cross_{wname}__spec_total_power_std"] = np.float32(np.std(total_powers))
            feats[f"cross_{wname}__spec_centroid_mean"] = np.float32(np.mean(centroids))
            feats[f"cross_{wname}__spec_centroid_std"] = np.float32(np.std(centroids))
        feats['segment_id'] = int(seg_id)
        return feats
    except Exception as e:
        log(f"Error processing {seg_id}: {e}")
        return {'segment_id': int(seg_id)}

def build_features(map_dict, is_train=False):
    ids = list(map_dict.keys())
    ids.sort()
    log(f"Starting feature extraction for {len(ids)} segments | train={is_train}")
    def _proc(i, seg_id):
        feats = extract_one(seg_id, map_dict[seg_id])
        if (i+1) % 200 == 0:
            log(f"Processed {i+1}/{len(ids)} segments")
        return feats
    res = Parallel(n_jobs=-1, backend='loky')(delayed(_proc)(i, seg_id) for i, seg_id in enumerate(ids))
    feats_df = pd.DataFrame(res).set_index('segment_id')
    # ensure float32
    for c in feats_df.columns:
        feats_df[c] = feats_df[c].astype(np.float32)
    if is_train:
        meta = train_meta.set_index('segment_id').loc[feats_df.index, ['time_to_eruption','series_id']].copy()
        feats_df = feats_df.join(meta)
    log(f"Built features shape: {feats_df.shape}")
    return feats_df

# Build and cache (overwrite v1 files for simplicity)
train_out = 'features_train_v1.csv'
test_out = 'features_test_v1.csv'
rebuild = True
if rebuild or not os.path.exists(train_out):
    train_feats = build_features(train_map, is_train=True)
    train_feats.to_csv(train_out)
    log(f"Saved {train_out} ({os.path.getsize(train_out)} bytes)")
else:
    train_feats = pd.read_csv(train_out).set_index('segment_id')
    log(f"Loaded cached {train_out} -> {train_feats.shape}")

if rebuild or not os.path.exists(test_out):
    test_feats = build_features(test_map, is_train=False)
    test_feats.to_csv(test_out)
    log(f"Saved {test_out} ({os.path.getsize(test_out)} bytes)")
else:
    test_feats = pd.read_csv(test_out).set_index('segment_id')
    log(f"Loaded cached {test_out} -> {test_feats.shape}")

log('Feature build v2 (short windows + ACF + rFFT) complete. Next: retrain XGBoost and blend if time allows.')

[2025-09-27T02:38:19.765384Z] Starting feature extraction for 3987 segments | train=True


[2025-09-27T02:38:21.834093Z] Processed 200/3987 segments


[2025-09-27T02:38:22.899997Z] Processed 400/3987 segments


[2025-09-27T02:38:23.954161Z] Processed 600/3987 segments


[2025-09-27T02:38:26.051684Z] Processed 1000/3987 segments


[2025-09-27T02:38:27.082591Z] Processed 1200/3987 segments


[2025-09-27T02:38:28.169339Z] Processed 1400/3987 segments


[2025-09-27T02:38:29.184635Z] Processed 1600/3987 segments


[2025-09-27T02:38:30.251582Z] Processed 1800/3987 segments


[2025-09-27T02:38:31.346414Z] Processed 2000/3987 segments


[2025-09-27T02:38:32.378191Z] Processed 2200/3987 segments


[2025-09-27T02:38:33.393966Z] Processed 2400/3987 segments


[2025-09-27T02:38:34.447941Z] Processed 2600/3987 segments


[2025-09-27T02:38:35.495697Z] Processed 2800/3987 segments


[2025-09-27T02:38:36.486382Z] Processed 3000/3987 segments


[2025-09-27T02:38:37.498437Z] Processed 3200/3987 segments


[2025-09-27T02:38:38.507645Z] Processed 3400/3987 segments


[2025-09-27T02:38:39.503032Z] Processed 3600/3987 segments


[2025-09-27T02:38:40.514907Z] Processed 3800/3987 segments


[2025-09-27T02:38:43.376351Z] Built features shape: (3987, 2066)


[2025-09-27T02:38:47.183811Z] Saved features_train_v1.csv (75307100 bytes)


[2025-09-27T02:38:47.185760Z] Starting feature extraction for 444 segments | train=False


[2025-09-27T02:38:48.209334Z] Processed 200/444 segments


[2025-09-27T02:38:49.022586Z] Processed 400/444 segments


[2025-09-27T02:38:49.527580Z] Built features shape: (444, 2064)


[2025-09-27T02:38:50.244176Z] Saved features_test_v1.csv (8422404 bytes)


[2025-09-27T02:38:50.245022Z] Feature build v2 (short windows + ACF + rFFT) complete. Next: retrain XGBoost and blend if time allows.


In [13]:
# XGBoost GPU training with 5-fold GroupKFold using xgb.train (handles early stopping), OOF MAE, clipping sweep, and submission
import os, time, subprocess, sys, gc
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

# Ensure xgboost is installed (GPU-capable).
try:
    import xgboost as xgb
except Exception:
    log('Installing xgboost...')
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'xgboost==2.1.1'], check=True)
    import xgboost as xgb

# Load cached features
train_path = 'features_train_v1.csv'
test_path = 'features_test_v1.csv'
assert os.path.exists(train_path) and os.path.exists(test_path), 'Feature CSVs not found. Run feature build first.'
train_feats = pd.read_csv(train_path).set_index('segment_id')
test_feats = pd.read_csv(test_path).set_index('segment_id')
log(f'Loaded features: train {train_feats.shape}, test {test_feats.shape}')

# Split X, y, groups; align columns between train and test
y = train_feats['time_to_eruption'].astype(np.float32).values
groups = train_feats['series_id'].values
feature_cols = [c for c in train_feats.columns if c not in ('time_to_eruption','series_id')]
X = train_feats[feature_cols].astype(np.float32).copy()
X_test = test_feats.reindex(columns=feature_cols).astype(np.float32).copy()
X_test = X_test.fillna(0.0)
log(f'Feature columns: {len(feature_cols)}')

# CV setup
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

# XGBoost params for xgb.train
params = {
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'objective': 'reg:squarederror',  # use MAE metric for optimization signal
    'max_depth': 6,
    'eta': 0.03,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'min_child_weight': 12,
    'alpha': 0.1,
    'lambda': 3.0,
    'gamma': 0.1,
    'eval_metric': 'mae',
    'seed': 42,
    'verbosity': 0,
}

oof = np.zeros(len(X), dtype=np.float32)
test_preds_folds = []
fold_maes = []
start_all = time.time()
DM_test = xgb.DMatrix(X_test.values, feature_names=feature_cols)

for fold, (tr_idx, va_idx) in enumerate(gkf.split(X, y, groups=groups), 1):
    t0 = time.time()
    log(f'Fold {fold}/{n_splits}: train={len(tr_idx)} valid={len(va_idx)}')
    X_tr, y_tr = X.iloc[tr_idx], y[tr_idx]
    X_va, y_va = X.iloc[va_idx], y[va_idx]
    DM_tr = xgb.DMatrix(X_tr.values, label=y_tr, feature_names=feature_cols)
    DM_va = xgb.DMatrix(X_va.values, label=y_va, feature_names=feature_cols)
    evals = [(DM_tr, 'train'), (DM_va, 'valid')]
    bst = xgb.train(
        params=params,
        dtrain=DM_tr,
        num_boost_round=10000,
        evals=evals,
        early_stopping_rounds=300,
        verbose_eval=False,
    )
    best_iter = getattr(bst, 'best_iteration', None)
    if best_iter is None:
        best_iter = bst.num_boosted_rounds
    va_pred = bst.predict(DM_va, iteration_range=(0, int(best_iter)))
    oof[va_idx] = va_pred.astype(np.float32)
    fold_mae = mean_absolute_error(y_va, va_pred)
    fold_maes.append(fold_mae)
    log(f'Fold {fold} MAE={fold_mae:,.0f} | best_iter={best_iter}')
    tp = bst.predict(DM_test, iteration_range=(0, int(best_iter)))
    test_preds_folds.append(tp.astype(np.float32))
    log(f'Fold {fold} done in {time.time()-t0:.1f}s')

oof_mae = mean_absolute_error(y, oof)
log(f'OOF MAE={oof_mae:,.0f}; folds: {[int(m) for m in fold_maes]} | elapsed {time.time()-start_all:.1f}s')

# Averaged test preds
test_pred = np.mean(test_preds_folds, axis=0).astype(np.float32)

# Clipping sweep on OOF to pick upper bound
quantiles = [0.99, 0.995, 0.999, 1.0]
best_upper = None
best_mae = oof_mae
for q in quantiles:
    upper = float(np.quantile(y, q)) if q < 1.0 else float(np.max(y))
    oof_clip = np.clip(oof, 0.0, upper)
    mae = mean_absolute_error(y, oof_clip)
    log(f'Clip upper {q} -> {upper:,.0f}: OOF MAE {mae:,.0f}')
    if mae < best_mae:
        best_mae = mae
        best_upper = upper
if best_upper is None:
    best_upper = float(np.max(y))
log(f'Chosen clip upper={best_upper:,.0f} (OOF {best_mae:,.0f})')

# Save raw XGB test predictions (unaligned, then aligned to sample order) without clipping for later blending
ss = pd.read_csv('sample_submission.csv')
test_idx = X_test.index.values
pred_series_raw = pd.Series(test_pred, index=test_idx)
pred_aligned_raw = pred_series_raw.reindex(ss['segment_id'].values).fillna(pred_series_raw.median()).values
pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': pred_aligned_raw.astype(np.float32)}).to_csv('pred_test_xgb_raw.csv', index=False)
log(f'Saved pred_test_xgb_raw.csv for blending.')

# Create submission from clipped XGB preds
pred_aligned = np.clip(pred_aligned_raw, 0.0, best_upper)
sub = pd.DataFrame({
    'segment_id': ss['segment_id'].values,
    'time_to_eruption': pred_aligned.astype(np.int64),
})
sub.to_csv('submission.csv', index=False)
log(f'Saved submission.csv ({os.path.getsize("submission.csv")} bytes). Head:')
print(sub.head())

# Save OOF for diagnostics
pd.DataFrame({'segment_id': X.index.values, 'y': y.astype(np.int64), 'oof': oof.astype(np.float32)}).to_csv('oof_xgb_v1.csv', index=False)
log('Training+pseudo-inference complete.')

[2025-09-27T03:22:44.211410Z] Loaded features: train (3987, 2066), test (444, 2064)


[2025-09-27T03:22:44.241174Z] Feature columns: 2064


[2025-09-27T03:22:44.252316Z] Fold 1/5: train=3189 valid=798


[2025-09-27T03:24:01.871375Z] Fold 1 MAE=3,166,326 | best_iter=3748


[2025-09-27T03:24:01.918948Z] Fold 1 done in 77.7s


[2025-09-27T03:24:01.919728Z] Fold 2/5: train=3189 valid=798


[2025-09-27T03:25:23.122693Z] Fold 2 MAE=3,246,702 | best_iter=3910


[2025-09-27T03:25:23.171576Z] Fold 2 done in 81.3s


[2025-09-27T03:25:23.172255Z] Fold 3/5: train=3190 valid=797


[2025-09-27T03:26:38.986211Z] Fold 3 MAE=3,358,654 | best_iter=3584


[2025-09-27T03:26:39.031233Z] Fold 3 done in 75.9s


[2025-09-27T03:26:39.031868Z] Fold 4/5: train=3190 valid=797


[2025-09-27T03:28:07.575464Z] Fold 4 MAE=3,181,652 | best_iter=4408


[2025-09-27T03:28:07.630099Z] Fold 4 done in 88.6s


[2025-09-27T03:28:07.630838Z] Fold 5/5: train=3190 valid=797


[2025-09-27T03:29:35.283277Z] Fold 5 MAE=3,223,680 | best_iter=4270


[2025-09-27T03:29:35.336194Z] Fold 5 done in 87.7s


[2025-09-27T03:29:35.337189Z] OOF MAE=3,235,388; folds: [3166325, 3246702, 3358653, 3181652, 3223680] | elapsed 411.1s


[2025-09-27T03:29:35.338403Z] Clip upper 0.99 -> 46,962,008: OOF MAE 3,234,676


[2025-09-27T03:29:35.339230Z] Clip upper 0.995 -> 47,715,640: OOF MAE 3,233,765


[2025-09-27T03:29:35.340285Z] Clip upper 0.999 -> 48,356,643: OOF MAE 3,233,824


[2025-09-27T03:29:35.340980Z] Clip upper 1.0 -> 49,046,088: OOF MAE 3,234,111


[2025-09-27T03:29:35.341354Z] Chosen clip upper=47,715,640 (OOF 3,233,765)


[2025-09-27T03:29:35.344878Z] Saved pred_test_xgb_raw.csv for blending.


[2025-09-27T03:29:35.346140Z] Saved submission.csv (8613 bytes). Head:


   segment_id  time_to_eruption
0   951290289          30333056
1   508758258          15063424
2  1566132188          33993968
3  1891418251          21747250
4  1968343855           7629103
[2025-09-27T03:29:35.351334Z] Training+pseudo-inference complete.


In [10]:
# XGB dual-model (MAE + log1p) 5-fold GroupKFold, blend and submit
import os, time, sys, subprocess
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

try:
    import xgboost as xgb
except Exception:
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'xgboost==2.1.1'], check=True)
    import xgboost as xgb

# Load features v2
train_path = 'features_train_v1.csv'
test_path = 'features_test_v1.csv'
train_feats = pd.read_csv(train_path).set_index('segment_id')
test_feats = pd.read_csv(test_path).set_index('segment_id')
y = train_feats['time_to_eruption'].astype(np.float32).values
groups = train_feats['series_id'].values
feature_cols = [c for c in train_feats.columns if c not in ('time_to_eruption','series_id')]
X = train_feats[feature_cols].astype(np.float32).copy()
X_test = test_feats.reindex(columns=feature_cols).astype(np.float32).copy().fillna(0.0)
DM_test = xgb.DMatrix(X_test.values, feature_names=feature_cols)
log(f'Features loaded: X={X.shape}, X_test={X_test.shape}')

n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

# Model A: MAE objective (direct)
params_mae = {
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'objective': 'reg:absoluteerror',
    'max_depth': 6,
    'eta': 0.03,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'min_child_weight': 12,
    'alpha': 0.1,
    'lambda': 3.0,
    'gamma': 0.1,
    'eval_metric': 'mae',
    'seed': 42,
    'verbosity': 0,
}

oof_mae_direct = np.zeros(len(X), dtype=np.float32)
test_mae_direct = []
fold_maes_direct = []
t0_all = time.time()
for fold, (tr_idx, va_idx) in enumerate(gkf.split(X, y, groups=groups), 1):
    DM_tr = xgb.DMatrix(X.iloc[tr_idx].values, label=y[tr_idx], feature_names=feature_cols)
    DM_va = xgb.DMatrix(X.iloc[va_idx].values, label=y[va_idx], feature_names=feature_cols)
    bst = xgb.train(params_mae, DM_tr, num_boost_round=10000, evals=[(DM_va,'valid')], early_stopping_rounds=300, verbose_eval=False)
    best_iter = getattr(bst, 'best_iteration', None) or bst.num_boosted_rounds
    pred_va = bst.predict(DM_va, iteration_range=(0, int(best_iter))).astype(np.float32)
    oof_mae_direct[va_idx] = pred_va
    fold_mae = mean_absolute_error(y[va_idx], pred_va)
    fold_maes_direct.append(fold_mae)
    log(f'[MAE] Fold {fold} MAE={fold_mae:,.0f} | best_iter={best_iter}')
    pred_te = bst.predict(DM_test, iteration_range=(0, int(best_iter))).astype(np.float32)
    test_mae_direct.append(pred_te)
log(f'[MAE] OOF MAE={mean_absolute_error(y, oof_mae_direct):,.0f}; folds={[(i+1,int(m)) for i,m in enumerate(fold_maes_direct)]} | {time.time()-t0_all:.1f}s')

# Model B: log1p target with RMSE objective
y_log = np.log1p(y.astype(np.float64)).astype(np.float32)
params_log = params_mae.copy()
params_log.update({'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'seed': 43})
oof_log = np.zeros(len(X), dtype=np.float32)
test_log = []
fold_rmses = []
t0_all = time.time()
for fold, (tr_idx, va_idx) in enumerate(gkf.split(X, y_log, groups=groups), 1):
    DM_tr = xgb.DMatrix(X.iloc[tr_idx].values, label=y_log[tr_idx], feature_names=feature_cols)
    DM_va = xgb.DMatrix(X.iloc[va_idx].values, label=y_log[va_idx], feature_names=feature_cols)
    bst = xgb.train(params_log, DM_tr, num_boost_round=10000, evals=[(DM_va,'valid')], early_stopping_rounds=300, verbose_eval=False)
    best_iter = getattr(bst, 'best_iteration', None) or bst.num_boosted_rounds
    pred_va_log = bst.predict(DM_va, iteration_range=(0, int(best_iter))).astype(np.float32)
    oof_log[va_idx] = pred_va_log
    # report MAE on original scale for reference
    pred_va = np.expm1(pred_va_log).astype(np.float32)
    mae_val = mean_absolute_error(y[va_idx], pred_va)
    fold_rmses.append(mae_val)
    log(f'[LOG] Fold {fold} MAE(orig)={mae_val:,.0f} | best_iter={best_iter}')
    pred_te = bst.predict(DM_test, iteration_range=(0, int(best_iter))).astype(np.float32)
    test_log.append(pred_te)
oof_log_orig = np.expm1(oof_log).astype(np.float32)
log(f'[LOG] OOF MAE on orig scale={mean_absolute_error(y, oof_log_orig):,.0f}; folds={[(i+1,int(m)) for i,m in enumerate(fold_rmses)]} | {time.time()-t0_all:.1f}s')

# Blend OOF and test predictions
test_mae_direct = np.mean(np.stack(test_mae_direct, axis=0), axis=0).astype(np.float32)
test_log_orig = np.expm1(np.mean(np.stack(test_log, axis=0), axis=0)).astype(np.float32)
w_log = 0.35  # 65% direct MAE model, 35% log1p model
oof_blend = (1.0 - w_log) * oof_mae_direct + w_log * oof_log_orig
oof_mae_blend = mean_absolute_error(y, oof_blend)
log(f'[BLEND] OOF MAE blend={oof_mae_blend:,.0f} (direct={mean_absolute_error(y, oof_mae_direct):,.0f}, log1p={mean_absolute_error(y, oof_log_orig):,.0f})')
test_blend = (1.0 - w_log) * test_mae_direct + w_log * test_log_orig

# Clipping sweep using blended OOF
quantiles = [0.99, 0.995, 0.999, 1.0]
best_upper = None
best_mae = oof_mae_blend
for q in quantiles:
    upper = float(np.quantile(y, q)) if q < 1.0 else float(np.max(y))
    mae = mean_absolute_error(y, np.clip(oof_blend, 0.0, upper))
    log(f'[BLEND] Clip upper {q} -> {upper:,.0f}: OOF MAE {mae:,.0f}')
    if mae < best_mae:
        best_mae = mae
        best_upper = upper
if best_upper is None:
    best_upper = float(np.max(y))
log(f'[BLEND] Chosen upper={best_upper:,.0f} (OOF {best_mae:,.0f})')

# Build submission
ss = pd.read_csv('sample_submission.csv')
pred_series = pd.Series(test_blend, index=X_test.index.values)
pred_aligned = pred_series.reindex(ss['segment_id'].values).fillna(pred_series.median()).values
pred_aligned = np.clip(pred_aligned, 0.0, best_upper)
sub = pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': pred_aligned.astype(np.int64)})
sub.to_csv('submission.csv', index=False)
pd.DataFrame({'segment_id': X.index.values, 'y': y.astype(np.int64), 'oof_blend': oof_blend.astype(np.float32)}).to_csv('oof_xgb_blend.csv', index=False)
log(f'Saved submission.csv ({os.path.getsize("submission.csv")} bytes). Head:')
print(sub.head())
log('Dual-model blend complete.')

[2025-09-27T02:47:14.284848Z] Features loaded: X=(3987, 2064), X_test=(444, 2064)


[2025-09-27T02:50:14.175820Z] [MAE] Fold 1 MAE=3,420,550 | best_iter=9999


[2025-09-27T02:53:14.332124Z] [MAE] Fold 2 MAE=3,666,292 | best_iter=9991


[2025-09-27T02:56:14.361260Z] [MAE] Fold 3 MAE=3,772,811 | best_iter=9985


[2025-09-27T02:59:15.489098Z] [MAE] Fold 4 MAE=3,639,105 | best_iter=9998


[2025-09-27T03:02:15.717516Z] [MAE] Fold 5 MAE=3,564,985 | best_iter=9997


[2025-09-27T03:02:15.840011Z] [MAE] OOF MAE=3,612,714; folds=[(1, 3420549), (2, 3666292), (3, 3772811), (4, 3639105), (5, 3564985)] | 901.6s


[2025-09-27T03:02:36.751845Z] [LOG] Fold 1 MAE(orig)=3,948,449 | best_iter=2220


[2025-09-27T03:03:07.227654Z] [LOG] Fold 2 MAE(orig)=4,195,308 | best_iter=3603


[2025-09-27T03:03:34.723769Z] [LOG] Fold 3 MAE(orig)=4,087,214 | best_iter=3198


[2025-09-27T03:03:54.248148Z] [LOG] Fold 4 MAE(orig)=4,002,513 | best_iter=2322


[2025-09-27T03:04:17.350544Z] [LOG] Fold 5 MAE(orig)=4,044,058 | best_iter=2963


[2025-09-27T03:04:17.368160Z] [LOG] OOF MAE on orig scale=4,055,516; folds=[(1, 3948449), (2, 4195307), (3, 4087213), (4, 4002512), (5, 4044057)] | 121.5s


[2025-09-27T03:04:17.369736Z] [BLEND] OOF MAE blend=3,634,965 (direct=3,612,714, log1p=4,055,516)


[2025-09-27T03:04:17.370786Z] [BLEND] Clip upper 0.99 -> 46,962,008: OOF MAE 3,634,257


[2025-09-27T03:04:17.371567Z] [BLEND] Clip upper 0.995 -> 47,715,640: OOF MAE 3,633,760


[2025-09-27T03:04:17.372254Z] [BLEND] Clip upper 0.999 -> 48,356,643: OOF MAE 3,633,702


[2025-09-27T03:04:17.372851Z] [BLEND] Clip upper 1.0 -> 49,046,088: OOF MAE 3,634,047


[2025-09-27T03:04:17.373172Z] [BLEND] Chosen upper=48,356,643 (OOF 3,633,702)


[2025-09-27T03:04:17.380445Z] Saved submission.csv (8606 bytes). Head:


   segment_id  time_to_eruption
0   951290289          30214176
1   508758258          11675458
2  1566132188          30444312
3  1891418251          22872668
4  1968343855           5564844
[2025-09-27T03:04:17.381466Z] Dual-model blend complete.


In [11]:
# LightGBM (MAE) 5-fold GroupKFold on v2 features, predict and submit
import os, sys, subprocess, time
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

try:
    import lightgbm as lgb
except Exception:
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'lightgbm==4.6.0'], check=True)
    import lightgbm as lgb

# Load features v2
train_path = 'features_train_v1.csv'
test_path = 'features_test_v1.csv'
train_feats = pd.read_csv(train_path).set_index('segment_id')
test_feats = pd.read_csv(test_path).set_index('segment_id')
y = train_feats['time_to_eruption'].astype(np.float32).values
groups = train_feats['series_id'].values
feature_cols = [c for c in train_feats.columns if c not in ('time_to_eruption','series_id')]
X = train_feats[feature_cols].astype(np.float32).copy()
X_test = test_feats.reindex(columns=feature_cols).astype(np.float32).copy()
X_test = X_test.fillna(0.0)
log(f'LGB data: X={X.shape}, X_test={X_test.shape}, feats={len(feature_cols)}')

params = {
    'objective': 'mae',
    'metric': 'mae',
    'learning_rate': 0.03,
    'num_leaves': 31,
    'max_depth': 8,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'min_data_in_leaf': 40,
    'lambda_l1': 0.1,
    'lambda_l2': 3.0,
    'force_col_wise': True,
    'verbose': -1,
    'num_threads': 0,
    'seed': 1234,
}

n_splits = 5
gkf = GroupKFold(n_splits=n_splits)
oof = np.zeros(len(X), dtype=np.float32)
test_preds = []
fold_maes = []
t0 = time.time()
for fold, (tr_idx, va_idx) in enumerate(gkf.split(X, y, groups=groups), 1):
    X_tr, y_tr = X.iloc[tr_idx], y[tr_idx]
    X_va, y_va = X.iloc[va_idx], y[va_idx]
    dtrain = lgb.Dataset(X_tr, label=y_tr, free_raw_data=False)
    dvalid = lgb.Dataset(X_va, label=y_va, reference=dtrain, free_raw_data=False)
    log(f'LGB Fold {fold}/{n_splits}: train={len(tr_idx)} valid={len(va_idx)}')
    model = lgb.train(
        params,
        dtrain,
        num_boost_round=20000,
        valid_sets=[dvalid],
        valid_names=['valid'],
        callbacks=[lgb.early_stopping(stopping_rounds=300, verbose=False)],
    )
    va_pred = model.predict(X_va, num_iteration=model.best_iteration).astype(np.float32)
    oof[va_idx] = va_pred
    mae = mean_absolute_error(y_va, va_pred)
    fold_maes.append(mae)
    log(f'LGB Fold {fold} MAE={mae:,.0f} | best_iter={model.best_iteration}')
    tp = model.predict(X_test, num_iteration=model.best_iteration).astype(np.float32)
    test_preds.append(tp)

oof_mae = mean_absolute_error(y, oof)
log(f'LGB OOF MAE={oof_mae:,.0f}; folds={[(i+1,int(m)) for i,m in enumerate(fold_maes)]} | elapsed {time.time()-t0:.1f}s')

# Average test preds and clip based on OOF sweep
test_mean = np.mean(np.stack(test_preds, axis=0), axis=0).astype(np.float32)
best_upper = float(np.max(y))
best_mae = oof_mae
for q in [0.99, 0.995, 0.999, 1.0]:
    upper = float(np.quantile(y, q)) if q < 1.0 else float(np.max(y))
    mae = mean_absolute_error(y, np.clip(oof, 0.0, upper))
    log(f'LGB clip {q}: {upper:,.0f} -> OOF {mae:,.0f}')
    if mae < best_mae:
        best_mae = mae
        best_upper = upper
log(f'LGB chosen upper={best_upper:,.0f} (OOF {best_mae:,.0f})')

ss = pd.read_csv('sample_submission.csv')
pred_series = pd.Series(test_mean, index=X_test.index.values)
pred_aligned = pred_series.reindex(ss['segment_id'].values).fillna(pred_series.median()).values
pred_aligned = np.clip(pred_aligned, 0.0, best_upper)
sub = pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': pred_aligned.astype(np.int64)})
sub.to_csv('submission.csv', index=False)
pd.DataFrame({'segment_id': X.index.values, 'y': y.astype(np.int64), 'oof_lgb': oof.astype(np.float32)}).to_csv('oof_lgb.csv', index=False)
log(f'Saved submission.csv ({os.path.getsize("submission.csv")} bytes). Head:')
print(sub.head())
log('LGB training complete; ready to submit or blend with XGB if needed.')

[2025-09-27T03:05:33.233143Z] LGB data: X=(3987, 2064), X_test=(444, 2064), feats=2064


[2025-09-27T03:05:33.243138Z] LGB Fold 1/5: train=3189 valid=798


[2025-09-27T03:08:38.910964Z] LGB Fold 1 MAE=3,471,504 | best_iter=19988


[2025-09-27T03:08:39.001075Z] LGB Fold 2/5: train=3189 valid=798


[2025-09-27T03:11:51.931349Z] LGB Fold 2 MAE=3,667,655 | best_iter=20000


[2025-09-27T03:11:52.015434Z] LGB Fold 3/5: train=3190 valid=797


[2025-09-27T03:14:52.429707Z] LGB Fold 3 MAE=3,714,156 | best_iter=18950


[2025-09-27T03:14:52.514292Z] LGB Fold 4/5: train=3190 valid=797


[2025-09-27T03:17:54.052613Z] LGB Fold 4 MAE=3,591,186 | best_iter=19964


[2025-09-27T03:17:54.137811Z] LGB Fold 5/5: train=3190 valid=797


[2025-09-27T03:20:59.048819Z] LGB Fold 5 MAE=3,604,946 | best_iter=19976


[2025-09-27T03:20:59.130081Z] LGB OOF MAE=3,609,869; folds=[(1, 3471503), (2, 3667655), (3, 3714156), (4, 3591186), (5, 3604945)] | elapsed 925.9s


[2025-09-27T03:20:59.131516Z] LGB clip 0.99: 46,962,008 -> OOF 3,610,533


[2025-09-27T03:20:59.132325Z] LGB clip 0.995: 47,715,640 -> OOF 3,609,608


[2025-09-27T03:20:59.133104Z] LGB clip 0.999: 48,356,643 -> OOF 3,609,537


[2025-09-27T03:20:59.133759Z] LGB clip 1.0: 49,046,088 -> OOF 3,609,601


[2025-09-27T03:20:59.134175Z] LGB chosen upper=48,356,643 (OOF 3,609,537)


[2025-09-27T03:20:59.141485Z] Saved submission.csv (8622 bytes). Head:


   segment_id  time_to_eruption
0   951290289          30272764
1   508758258          14239392
2  1566132188          31933286
3  1891418251          24881624
4  1968343855           7650647
[2025-09-27T03:20:59.142639Z] LGB training complete; ready to submit or blend with XGB if needed.


In [12]:
# Preserve current LGB submission as preds file
import pandas as pd, shutil, os
src = 'submission.csv'
dst = 'pred_test_lgb.csv'
if os.path.exists(src):
    df = pd.read_csv(src)
    # Ensure correct columns and order
    df = df[['segment_id','time_to_eruption']].copy()
    df.to_csv(dst, index=False)
    print(f'Saved {dst} with shape {df.shape}')
else:
    print('submission.csv not found; nothing to preserve.')

Saved pred_test_lgb.csv with shape (444, 2)


In [16]:
# Blend XGB (v2) and LGB predictions using OOF-optimized weights; clip via OOF; write submission.csv
import numpy as np, pandas as pd, os
from datetime import datetime

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

# Load OOFs
oof_xgb = pd.read_csv('oof_xgb_v1.csv')  # columns: segment_id, y, oof
oof_lgb = pd.read_csv('oof_lgb.csv')     # columns: segment_id, y, oof_lgb
oof = oof_xgb.merge(oof_lgb, on=['segment_id'], how='inner', suffixes=('_xgb','_lgb'))

# Handle duplicate 'y' columns after merge (expect y_xgb and y_lgb)
if 'y_xgb' in oof.columns and 'y_lgb' in oof.columns:
    if not np.allclose(oof['y_xgb'].astype(np.float64).values, oof['y_lgb'].astype(np.float64).values):
        log('Warning: y_xgb and y_lgb differ after merge; proceeding with y_xgb')
    y = oof['y_xgb'].astype(np.float64).values
elif 'y' in oof.columns:
    y = oof['y'].astype(np.float64).values
else:
    raise KeyError(f"Target column y not found after merging OOF files; cols={list(oof.columns)}")

px = oof['oof'].values.astype(np.float64)
pl = oof['oof_lgb'].values.astype(np.float64)
log(f'Merged OOF: {oof.shape}, y range [{y.min()},{y.max()}]')

# Grid search weight w for LGB in blend: p = (1-w)*px + w*pl minimizing MAE
weights = np.linspace(0.0, 1.0, 21)
def mae(a,b):
    return float(np.mean(np.abs(a-b)))
best_w, best_mae = None, 1e18
for w in weights:
    p = (1.0 - w) * px + w * pl
    m = mae(y, p)
    log(f'w={w:.2f} -> OOF MAE {m:,.0f}')
    if m < best_mae:
        best_mae, best_w = m, w
log(f'Chosen weight w_LGB={best_w:.2f} -> OOF MAE {best_mae:,.0f}')

# Choose clipping upper bound based on blended OOF
p_blend = (1.0 - best_w) * px + best_w * pl
best_upper, best_clip_mae = None, best_mae
for q in [0.99, 0.995, 0.999, 1.0]:
    upper = float(np.quantile(y, q)) if q < 1.0 else float(np.max(y))
    m = mae(y, np.clip(p_blend, 0.0, upper))
    log(f'Clip upper {q} -> {upper:,.0f}: OOF MAE {m:,.0f}')
    if m < best_clip_mae:
        best_clip_mae, best_upper = m, upper
if best_upper is None:
    best_upper = float(np.max(y))
log(f'Chosen clip upper={best_upper:,.0f} (OOF {best_clip_mae:,.0f})')

# Load test predictions
pred_xgb = pd.read_csv('pred_test_xgb_raw.csv')   # aligned to sample order, raw (unclipped) float32
pred_lgb = pd.read_csv('pred_test_lgb.csv')       # aligned to sample order, int (from submission), but we can treat as floats

# Align to sample_submission order explicitly
ss = pd.read_csv('sample_submission.csv')
px_test = pred_xgb.set_index('segment_id').reindex(ss['segment_id'].values)['time_to_eruption'].astype(np.float64).values
pl_test = pred_lgb.set_index('segment_id').reindex(ss['segment_id'].values)['time_to_eruption'].astype(np.float64).values

# Blend and clip
p_test = (1.0 - best_w) * px_test + best_w * pl_test
p_test = np.clip(p_test, 0.0, best_upper)

# Write submission
sub = pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': p_test.astype(np.int64)})
sub.to_csv('submission.csv', index=False)
log(f'Blended submission saved ({os.path.getsize("submission.csv")} bytes). Head:')
print(sub.head())

[2025-09-27T03:32:00.839455Z] Merged OOF: (3987, 5), y range [6250.0,49046088.0]


[2025-09-27T03:32:00.840036Z] w=0.00 -> OOF MAE 3,235,388


[2025-09-27T03:32:00.840330Z] w=0.05 -> OOF MAE 3,238,633


[2025-09-27T03:32:00.840598Z] w=0.10 -> OOF MAE 3,243,832


[2025-09-27T03:32:00.840855Z] w=0.15 -> OOF MAE 3,250,681


[2025-09-27T03:32:00.841103Z] w=0.20 -> OOF MAE 3,259,396


[2025-09-27T03:32:00.841332Z] w=0.25 -> OOF MAE 3,269,461


[2025-09-27T03:32:00.841585Z] w=0.30 -> OOF MAE 3,281,221


[2025-09-27T03:32:00.841858Z] w=0.35 -> OOF MAE 3,294,968


[2025-09-27T03:32:00.842078Z] w=0.40 -> OOF MAE 3,310,771


[2025-09-27T03:32:00.842325Z] w=0.45 -> OOF MAE 3,328,371


[2025-09-27T03:32:00.842569Z] w=0.50 -> OOF MAE 3,347,208


[2025-09-27T03:32:00.842868Z] w=0.55 -> OOF MAE 3,367,667


[2025-09-27T03:32:00.843156Z] w=0.60 -> OOF MAE 3,389,620


[2025-09-27T03:32:00.843383Z] w=0.65 -> OOF MAE 3,413,142


[2025-09-27T03:32:00.843639Z] w=0.70 -> OOF MAE 3,437,734


[2025-09-27T03:32:00.843921Z] w=0.75 -> OOF MAE 3,463,636


[2025-09-27T03:32:00.844195Z] w=0.80 -> OOF MAE 3,490,705


[2025-09-27T03:32:00.844404Z] w=0.85 -> OOF MAE 3,518,808


[2025-09-27T03:32:00.844654Z] w=0.90 -> OOF MAE 3,547,991


[2025-09-27T03:32:00.844919Z] w=0.95 -> OOF MAE 3,578,254


[2025-09-27T03:32:00.845136Z] w=1.00 -> OOF MAE 3,609,869


[2025-09-27T03:32:00.845358Z] Chosen weight w_LGB=0.00 -> OOF MAE 3,235,388


[2025-09-27T03:32:00.846060Z] Clip upper 0.99 -> 46,962,008: OOF MAE 3,234,677


[2025-09-27T03:32:00.846543Z] Clip upper 0.995 -> 47,715,640: OOF MAE 3,233,765


[2025-09-27T03:32:00.847004Z] Clip upper 0.999 -> 48,356,643: OOF MAE 3,233,824


[2025-09-27T03:32:00.847232Z] Clip upper 1.0 -> 49,046,088: OOF MAE 3,234,111


[2025-09-27T03:32:00.847487Z] Chosen clip upper=47,715,640 (OOF 3,233,765)


[2025-09-27T03:32:00.851844Z] Blended submission saved (8613 bytes). Head:


   segment_id  time_to_eruption
0   951290289          30333056
1   508758258          15063424
2  1566132188          33993970
3  1891418251          21747250
4  1968343855           7629103


In [17]:
# XGBoost 3-seed bagging (same CV/params), averaged test preds, OOF-based clip, save submission + raw preds
import os, time, subprocess, sys, gc
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

try:
    import xgboost as xgb
except Exception:
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'xgboost==2.1.1'], check=True)
    import xgboost as xgb

# Load features
train_path = 'features_train_v1.csv'
test_path = 'features_test_v1.csv'
assert os.path.exists(train_path) and os.path.exists(test_path)
train_feats = pd.read_csv(train_path).set_index('segment_id')
test_feats = pd.read_csv(test_path).set_index('segment_id')
y = train_feats['time_to_eruption'].astype(np.float32).values
groups = train_feats['series_id'].values
feature_cols = [c for c in train_feats.columns if c not in ('time_to_eruption','series_id')]
X = train_feats[feature_cols].astype(np.float32).copy()
X_test = test_feats.reindex(columns=feature_cols).astype(np.float32).copy().fillna(0.0)
ss = pd.read_csv('sample_submission.csv')
log(f'Data loaded: X={X.shape}, X_test={X_test.shape}, feats={len(feature_cols)}')

seeds = [42, 123, 456]
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

# Base params; vary seed only
base_params = {
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'eta': 0.03,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'min_child_weight': 12,
    'alpha': 0.1,
    'lambda': 3.0,
    'gamma': 0.1,
    'eval_metric': 'mae',
    'verbosity': 0,
}

all_oof = []
all_test_preds = []
start_all = time.time()
DM_test = xgb.DMatrix(X_test.values, feature_names=feature_cols)

for si, seed in enumerate(seeds, 1):
    params = dict(base_params)
    params['seed'] = int(seed)
    oof = np.zeros(len(X), dtype=np.float32)
    test_preds_folds = []
    fold_maes = []
    log(f'[Seed {seed}] Training with GroupKFold={n_splits}')
    for fold, (tr_idx, va_idx) in enumerate(gkf.split(X, y, groups=groups), 1):
        t0 = time.time()
        DM_tr = xgb.DMatrix(X.iloc[tr_idx].values, label=y[tr_idx], feature_names=feature_cols)
        DM_va = xgb.DMatrix(X.iloc[va_idx].values, label=y[va_idx], feature_names=feature_cols)
        bst = xgb.train(params, DM_tr, num_boost_round=10000, evals=[(DM_va,'valid')], early_stopping_rounds=300, verbose_eval=False)
        best_iter = getattr(bst, 'best_iteration', None) or bst.num_boosted_rounds
        va_pred = bst.predict(DM_va, iteration_range=(0, int(best_iter))).astype(np.float32)
        oof[va_idx] = va_pred
        mae = mean_absolute_error(y[va_idx], va_pred)
        fold_maes.append(mae)
        tp = bst.predict(DM_test, iteration_range=(0, int(best_iter))).astype(np.float32)
        test_preds_folds.append(tp)
        log(f'[Seed {seed}] Fold {fold}/{n_splits} MAE={mae:,.0f} | best_iter={best_iter} | {time.time()-t0:.1f}s')
    oof_mae = mean_absolute_error(y, oof)
    log(f'[Seed {seed}] OOF MAE={oof_mae:,.0f}; folds={[(i+1,int(m)) for i,m in enumerate(fold_maes)]}')
    all_oof.append(oof.astype(np.float32))
    all_test_preds.append(np.mean(test_preds_folds, axis=0).astype(np.float32))

log(f'All seeds done in {time.time()-start_all:.1f}s')

# Average OOF and test preds across seeds
oof_mean = np.mean(np.stack(all_oof, axis=0), axis=0).astype(np.float32)
test_mean = np.mean(np.stack(all_test_preds, axis=0), axis=0).astype(np.float32)
oof_mae_mean = mean_absolute_error(y, oof_mean)
log(f'[3-seed] OOF MAE (mean across seeds) = {oof_mae_mean:,.0f}')

# Clip selection via OOF sweep
best_upper = float(np.max(y))
best_mae = oof_mae_mean
for q in [0.99, 0.995, 0.999, 1.0]:
    upper = float(np.quantile(y, q)) if q < 1.0 else float(np.max(y))
    mae_clip = mean_absolute_error(y, np.clip(oof_mean, 0.0, upper))
    log(f'[3-seed] Clip {q}: {upper:,.0f} -> OOF {mae_clip:,.0f}')
    if mae_clip < best_mae:
        best_mae = mae_clip
        best_upper = upper
log(f'[3-seed] Chosen upper={best_upper:,.0f} (OOF {best_mae:,.0f})')

# Align to sample order, save raw and submission
pred_series_raw = pd.Series(test_mean, index=X_test.index.values)
pred_aligned_raw = pred_series_raw.reindex(ss['segment_id'].values).fillna(pred_series_raw.median()).values.astype(np.float32)
pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': pred_aligned_raw}).to_csv('pred_test_xgb_3seed_raw.csv', index=False)
log('Saved pred_test_xgb_3seed_raw.csv')

pred_clipped = np.clip(pred_aligned_raw, 0.0, best_upper).astype(np.int64)
sub = pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': pred_clipped})
sub.to_csv('submission.csv', index=False)
pd.DataFrame({'segment_id': X.index.values, 'y': y.astype(np.int64), 'oof_mean': oof_mean}).to_csv('oof_xgb_3seed.csv', index=False)
log(f'[3-seed] Saved submission.csv ({os.path.getsize("submission.csv")} bytes). Head:')
print(sub.head())

[2025-09-27T03:36:36.240732Z] Data loaded: X=(3987, 2064), X_test=(444, 2064), feats=2064


[2025-09-27T03:36:36.248324Z] [Seed 42] Training with GroupKFold=5


[2025-09-27T03:37:48.221769Z] [Seed 42] Fold 1/5 MAE=3,166,326 | best_iter=3748 | 72.0s


[2025-09-27T03:39:03.576041Z] [Seed 42] Fold 2/5 MAE=3,246,702 | best_iter=3910 | 75.4s


[2025-09-27T03:40:13.947053Z] [Seed 42] Fold 3/5 MAE=3,358,654 | best_iter=3584 | 70.4s


[2025-09-27T03:41:36.022978Z] [Seed 42] Fold 4/5 MAE=3,181,652 | best_iter=4408 | 82.1s


[2025-09-27T03:42:57.316814Z] [Seed 42] Fold 5/5 MAE=3,223,680 | best_iter=4270 | 81.3s


[2025-09-27T03:42:57.317751Z] [Seed 42] OOF MAE=3,235,388; folds=[(1, 3166325), (2, 3246702), (3, 3358653), (4, 3181652), (5, 3223680)]


[2025-09-27T03:42:57.318169Z] [Seed 123] Training with GroupKFold=5


[2025-09-27T03:44:01.282771Z] [Seed 123] Fold 1/5 MAE=3,169,532 | best_iter=3384 | 64.0s


[2025-09-27T03:45:06.321271Z] [Seed 123] Fold 2/5 MAE=3,292,624 | best_iter=3406 | 65.0s


[2025-09-27T03:46:25.173599Z] [Seed 123] Fold 3/5 MAE=3,313,426 | best_iter=4080 | 78.9s


[2025-09-27T03:47:44.670866Z] [Seed 123] Fold 4/5 MAE=3,215,607 | best_iter=4161 | 79.5s


[2025-09-27T03:48:56.832409Z] [Seed 123] Fold 5/5 MAE=3,289,002 | best_iter=3752 | 72.2s


[2025-09-27T03:48:56.833356Z] [Seed 123] OOF MAE=3,256,026; folds=[(1, 3169532), (2, 3292623), (3, 3313425), (4, 3215607), (5, 3289002)]


[2025-09-27T03:48:56.833823Z] [Seed 456] Training with GroupKFold=5


[2025-09-27T03:50:10.356266Z] [Seed 456] Fold 1/5 MAE=3,148,158 | best_iter=3900 | 73.5s


[2025-09-27T03:51:26.120670Z] [Seed 456] Fold 2/5 MAE=3,297,580 | best_iter=3996 | 75.8s


[2025-09-27T03:52:27.846598Z] [Seed 456] Fold 3/5 MAE=3,367,945 | best_iter=3119 | 61.7s


[2025-09-27T03:53:17.663478Z] [Seed 456] Fold 4/5 MAE=3,134,891 | best_iter=2476 | 49.8s


[2025-09-27T03:54:31.277938Z] [Seed 456] Fold 5/5 MAE=3,252,836 | best_iter=3788 | 73.6s


[2025-09-27T03:54:31.278897Z] [Seed 456] OOF MAE=3,240,273; folds=[(1, 3148157), (2, 3297579), (3, 3367945), (4, 3134891), (5, 3252835)]


[2025-09-27T03:54:31.279430Z] All seeds done in 1075.0s


[2025-09-27T03:54:31.280256Z] [3-seed] OOF MAE (mean across seeds) = 3,192,504


[2025-09-27T03:54:31.281435Z] [3-seed] Clip 0.99: 46,962,008 -> OOF 3,192,076


[2025-09-27T03:54:31.282102Z] [3-seed] Clip 0.995: 47,715,640 -> OOF 3,191,561


[2025-09-27T03:54:31.282835Z] [3-seed] Clip 0.999: 48,356,643 -> OOF 3,191,449


[2025-09-27T03:54:31.283364Z] [3-seed] Clip 1.0: 49,046,088 -> OOF 3,191,652


[2025-09-27T03:54:31.283726Z] [3-seed] Chosen upper=48,356,643 (OOF 3,191,449)


[2025-09-27T03:54:31.286882Z] Saved pred_test_xgb_3seed_raw.csv


[2025-09-27T03:54:31.291913Z] [3-seed] Saved submission.csv (8616 bytes). Head:


   segment_id  time_to_eruption
0   951290289          30329878
1   508758258          14477632
2  1566132188          34124112
3  1891418251          22477562
4  1968343855           7643206


In [18]:
# LightGBM re-run (CPU) to save RAW test preds as float32 (pred_test_lgb_raw.csv) without overwriting submission
import os, sys, subprocess, time
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

try:
    import lightgbm as lgb
except Exception:
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'lightgbm==4.6.0'], check=True)
    import lightgbm as lgb

# Load features v2
train_path = 'features_train_v1.csv'
test_path = 'features_test_v1.csv'
assert os.path.exists(train_path) and os.path.exists(test_path)
train_feats = pd.read_csv(train_path).set_index('segment_id')
test_feats = pd.read_csv(test_path).set_index('segment_id')
y = train_feats['time_to_eruption'].astype(np.float32).values
groups = train_feats['series_id'].values
feature_cols = [c for c in train_feats.columns if c not in ('time_to_eruption','series_id')]
X = train_feats[feature_cols].astype(np.float32).copy()
X_test = test_feats.reindex(columns=feature_cols).astype(np.float32).copy().fillna(0.0)
ss = pd.read_csv('sample_submission.csv')
log(f'LGB re-run: X={X.shape}, X_test={X_test.shape}, feats={len(feature_cols)}')

params = {
    'objective': 'mae',
    'metric': 'mae',
    'learning_rate': 0.03,
    'num_leaves': 31,
    'max_depth': 8,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'min_data_in_leaf': 40,
    'lambda_l1': 0.1,
    'lambda_l2': 3.0,
    'force_col_wise': True,
    'verbose': -1,
    'num_threads': 0,
    'seed': 1234,
}

n_splits = 5
gkf = GroupKFold(n_splits=n_splits)
test_preds = []
oof = np.zeros(len(X), dtype=np.float32)
fold_maes = []
t0 = time.time()
for fold, (tr_idx, va_idx) in enumerate(gkf.split(X, y, groups=groups), 1):
    dtrain = lgb.Dataset(X.iloc[tr_idx], label=y[tr_idx], free_raw_data=False)
    dvalid = lgb.Dataset(X.iloc[va_idx], label=y[va_idx], reference=dtrain, free_raw_data=False)
    log(f'[LGB RAW] Fold {fold}/{n_splits}: train={len(tr_idx)} valid={len(va_idx)}')
    model = lgb.train(
        params,
        dtrain,
        num_boost_round=20000,
        valid_sets=[dvalid],
        valid_names=['valid'],
        callbacks=[lgb.early_stopping(stopping_rounds=300, verbose=False)],
    )
    va_pred = model.predict(X.iloc[va_idx], num_iteration=model.best_iteration).astype(np.float32)
    oof[va_idx] = va_pred
    mae = mean_absolute_error(y[va_idx], va_pred)
    fold_maes.append(mae)
    log(f'[LGB RAW] Fold {fold} MAE={mae:,.0f} | best_iter={model.best_iteration}')
    tp = model.predict(X_test, num_iteration=model.best_iteration).astype(np.float32)
    test_preds.append(tp)

oof_mae = mean_absolute_error(y, oof)
log(f'[LGB RAW] OOF MAE={oof_mae:,.0f}; folds={[(i+1,int(m)) for i,m in enumerate(fold_maes)]} | elapsed {time.time()-t0:.1f}s')

# Average raw test preds and save without clipping/casting
test_mean = np.mean(np.stack(test_preds, axis=0), axis=0).astype(np.float32)
pred_series = pd.Series(test_mean, index=X_test.index.values)
pred_aligned = pred_series.reindex(ss['segment_id'].values).values.astype(np.float32)
out_path = 'pred_test_lgb_raw.csv'
pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': pred_aligned}).to_csv(out_path, index=False)
log(f'Saved {out_path} (raw float) for blending. Head:')
print(pd.read_csv(out_path).head())

[2025-09-27T03:54:44.386390Z] LGB re-run: X=(3987, 2064), X_test=(444, 2064), feats=2064


[2025-09-27T03:54:44.399329Z] [LGB RAW] Fold 1/5: train=3189 valid=798


[2025-09-27T03:57:54.755627Z] [LGB RAW] Fold 1 MAE=3,471,504 | best_iter=19988


[2025-09-27T03:57:54.849898Z] [LGB RAW] Fold 2/5: train=3189 valid=798


[2025-09-27T04:01:03.014599Z] [LGB RAW] Fold 2 MAE=3,667,655 | best_iter=20000


[2025-09-27T04:01:03.110507Z] [LGB RAW] Fold 3/5: train=3190 valid=797


[2025-09-27T04:04:02.308809Z] [LGB RAW] Fold 3 MAE=3,714,156 | best_iter=18950


[2025-09-27T04:04:02.395807Z] [LGB RAW] Fold 4/5: train=3190 valid=797


[2025-09-27T04:07:05.287647Z] [LGB RAW] Fold 4 MAE=3,591,186 | best_iter=19964


[2025-09-27T04:07:05.381025Z] [LGB RAW] Fold 5/5: train=3190 valid=797


[2025-09-27T04:10:11.773150Z] [LGB RAW] Fold 5 MAE=3,604,946 | best_iter=19976


[2025-09-27T04:10:11.851994Z] [LGB RAW] OOF MAE=3,609,869; folds=[(1, 3471503), (2, 3667655), (3, 3714156), (4, 3591186), (5, 3604945)] | elapsed 927.5s


[2025-09-27T04:10:11.854960Z] Saved pred_test_lgb_raw.csv (raw float) for blending. Head:


   segment_id  time_to_eruption
0   951290289        30272764.0
1   508758258        14239392.0
2  1566132188        31933286.0
3  1891418251        24881624.0
4  1968343855         7650647.0


In [20]:
# Small-weight RAW blend: XGB (prefer 3-seed) + LGB (raw) with w in {0.10, 0.15, 0.20}; OOF-driven clip; write submission.csv
import os, numpy as np, pandas as pd
from datetime import datetime

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

# Load OOFs for weight selection (restricted small weights, but include 0.0 baseline)
oof_xgb = pd.read_csv('oof_xgb_v1.csv')  # segment_id, y, oof
oof_lgb = pd.read_csv('oof_lgb.csv')     # segment_id, y, oof_lgb
oof = oof_xgb.merge(oof_lgb, on='segment_id', how='inner', suffixes=('_xgb','_lgb'))
if 'y_xgb' in oof.columns and 'y_lgb' in oof.columns:
    y_for_w = oof['y_xgb'].astype(np.float64).values
else:
    y_for_w = oof['y'].astype(np.float64).values
px = oof['oof'].astype(np.float64).values
pl = oof['oof_lgb'].astype(np.float64).values
log(f'Merged OOF for weight selection: {oof.shape}')

cand_weights = [0.0, 0.10, 0.15, 0.20]
def mae(a,b): return float(np.mean(np.abs(a-b)))
best_w, best_mae_w = None, 1e18
for w in cand_weights:
    p = (1.0 - w) * px + w * pl
    m = mae(y_for_w, p)
    log(f'w={w:.2f} -> OOF MAE {m:,.0f}')
    if m < best_mae_w:
        best_mae_w, best_w = m, w
log(f'Chosen small-weight w_LGB={best_w:.2f} (OOF {best_mae_w:,.0f})')

# Prefer clipping upper bound from 3-seed XGB OOF if available
best_upper = None
if os.path.exists('oof_xgb_3seed.csv'):
    o3 = pd.read_csv('oof_xgb_3seed.csv')  # segment_id, y, oof_mean
    y3 = o3['y'].astype(np.float64).values
    p3 = o3['oof_mean'].astype(np.float64).values
    best_upper_3s, best_mae_3s = None, mae(y3, p3)
    for q in [0.99, 0.995, 0.999, 1.0]:
        upper = float(np.quantile(y3, q)) if q < 1.0 else float(np.max(y3))
        m = mae(y3, np.clip(p3, 0.0, upper))
        log(f'[3-seed OOF] Clip {q}: {upper:,.0f} -> OOF {m:,.0f}')
        if (best_upper_3s is None) or (m < best_mae_3s):
            best_mae_3s, best_upper_3s = m, upper
    best_upper = float(best_upper_3s)
    log(f'Chosen clip upper from 3-seed OOF = {best_upper:,.0f}')
else:
    # Fallback: use current merged OOF blend for clipping
    p_blend = (1.0 - best_w) * px + best_w * pl
    best_clip = best_mae_w
    for q in [0.99, 0.995, 0.999, 1.0]:
        upper = float(np.quantile(y_for_w, q)) if q < 1.0 else float(np.max(y_for_w))
        m = mae(y_for_w, np.clip(p_blend, 0.0, upper))
        log(f'[fallback] Clip {q}: {upper:,.0f} -> OOF {m:,.0f}')
        if (best_upper is None) or (m < best_clip):
            best_clip, best_upper = m, upper
    log(f'Chosen clip upper (fallback)={best_upper:,.0f}')

# Load RAW test predictions
xgb_path = 'pred_test_xgb_3seed_raw.csv' if os.path.exists('pred_test_xgb_3seed_raw.csv') else 'pred_test_xgb_raw.csv'
lgb_path = 'pred_test_lgb_raw.csv'
assert os.path.exists(xgb_path), f'Missing {xgb_path}'
assert os.path.exists(lgb_path), f'Missing {lgb_path} (run LGB RAW cell first)'
px_test_df = pd.read_csv(xgb_path)
pl_test_df = pd.read_csv(lgb_path)
ss = pd.read_csv('sample_submission.csv')
px_test = px_test_df.set_index('segment_id').reindex(ss['segment_id'].values)['time_to_eruption'].astype(np.float64).values
pl_test = pl_test_df.set_index('segment_id').reindex(ss['segment_id'].values)['time_to_eruption'].astype(np.float64).values

# Blend and clip
p_test = (1.0 - best_w) * px_test + best_w * pl_test
p_test = np.clip(p_test, 0.0, best_upper).astype(np.int64)
sub = pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': p_test})
sub.to_csv('submission.csv', index=False)
log(f'Saved submission.csv using {os.path.basename(xgb_path)} + pred_test_lgb_raw.csv, w_LGB={best_w:.2f}, upper={best_upper:,.0f}. Head:')
print(sub.head())

[2025-09-27T04:11:31.699444Z] Merged OOF for weight selection: (3987, 5)


[2025-09-27T04:11:31.700021Z] w=0.00 -> OOF MAE 3,235,388


[2025-09-27T04:11:31.700573Z] w=0.10 -> OOF MAE 3,243,832


[2025-09-27T04:11:31.700922Z] w=0.15 -> OOF MAE 3,250,681


[2025-09-27T04:11:31.701278Z] w=0.20 -> OOF MAE 3,259,396


[2025-09-27T04:11:31.701610Z] Chosen small-weight w_LGB=0.00 (OOF 3,235,388)


[2025-09-27T04:11:31.703965Z] [3-seed OOF] Clip 0.99: 46,962,008 -> OOF 3,192,076


[2025-09-27T04:11:31.704631Z] [3-seed OOF] Clip 0.995: 47,715,640 -> OOF 3,191,561


[2025-09-27T04:11:31.705247Z] [3-seed OOF] Clip 0.999: 48,356,643 -> OOF 3,191,449


[2025-09-27T04:11:31.705620Z] [3-seed OOF] Clip 1.0: 49,046,088 -> OOF 3,191,652


[2025-09-27T04:11:31.705904Z] Chosen clip upper from 3-seed OOF = 48,356,643


[2025-09-27T04:11:31.709726Z] Saved submission.csv using pred_test_xgb_3seed_raw.csv + pred_test_lgb_raw.csv, w_LGB=0.00, upper=48,356,643. Head:


   segment_id  time_to_eruption
0   951290289          30329878
1   508758258          14477632
2  1566132188          34124110
3  1891418251          22477562
4  1968343855           7643206


In [24]:
# Manual variant: force small LGB weight and chosen clip to probe LB; writes submission.csv
import os, numpy as np, pandas as pd
from datetime import datetime

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

# Config: try pure 3-seed XGB (w=0.0) and no clipping (q=1.0)
forced_w = 0.00
forced_q = 1.0

xgb_path = 'pred_test_xgb_3seed_raw.csv' if os.path.exists('pred_test_xgb_3seed_raw.csv') else 'pred_test_xgb_raw.csv'
lgb_path = 'pred_test_lgb_raw.csv'
assert os.path.exists(xgb_path) and os.path.exists(lgb_path)

# Determine clip upper from 3-seed OOF if available for the chosen quantile (q=1.0 -> max(y))
if os.path.exists('oof_xgb_3seed.csv'):
    o3 = pd.read_csv('oof_xgb_3seed.csv')
    y3 = o3['y'].astype(np.float64).values
    if forced_q < 1.0:
        upper = float(np.quantile(y3, forced_q))
    else:
        upper = float(np.max(y3))
else:
    y_all = pd.read_csv('oof_xgb_v1.csv')['y'].astype(np.float64).values
    upper = float(np.quantile(y_all, forced_q)) if forced_q < 1.0 else float(np.max(y_all))

px_df = pd.read_csv(xgb_path).set_index('segment_id')
pl_df = pd.read_csv(lgb_path).set_index('segment_id')
ss = pd.read_csv('sample_submission.csv')
px = px_df.reindex(ss['segment_id'].values)['time_to_eruption'].astype(np.float64).values
pl = pl_df.reindex(ss['segment_id'].values)['time_to_eruption'].astype(np.float64).values

p = (1.0 - forced_w) * px + forced_w * pl
p = np.clip(p, 0.0, upper).astype(np.int64)
sub = pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': p})
sub.to_csv('submission.csv', index=False)
log(f'Variant submission saved with w_LGB={forced_w}, clip_q={forced_q} (upper={upper:,.0f}). Head:')
print(sub.head())

[2025-09-27T04:22:30.807987Z] Variant submission saved with w_LGB=0.0, clip_q=1.0 (upper=49,046,088). Head:


   segment_id  time_to_eruption
0   951290289          30329878
1   508758258          14477632
2  1566132188          34124110
3  1891418251          22477562
4  1968343855           7643206


In [23]:
# Robust XGBoost (single-seed) with stronger regularization to fight LB shift; save RAW preds for later blend
import os, time, subprocess, sys, gc
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

try:
    import xgboost as xgb
except Exception:
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'xgboost==2.1.1'], check=True)
    import xgboost as xgb

# Load features
train_path = 'features_train_v1.csv'
test_path = 'features_test_v1.csv'
assert os.path.exists(train_path) and os.path.exists(test_path)
train_feats = pd.read_csv(train_path).set_index('segment_id')
test_feats = pd.read_csv(test_path).set_index('segment_id')
y = train_feats['time_to_eruption'].astype(np.float32).values
groups = train_feats['series_id'].values
feature_cols = [c for c in train_feats.columns if c not in ('time_to_eruption','series_id')]
X = train_feats[feature_cols].astype(np.float32).copy()
X_test = test_feats.reindex(columns=feature_cols).astype(np.float32).copy().fillna(0.0)
ss = pd.read_csv('sample_submission.csv')
log(f'Robust XGB data: X={X.shape}, X_test={X_test.shape}, feats={len(feature_cols)}')

# Robust regularization params (conservative)
params = {
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'objective': 'reg:squarederror',
    'max_depth': 5,
    'eta': 0.02,
    'subsample': 0.7,
    'colsample_bytree': 0.5,
    'min_child_weight': 25,
    'lambda': 12.0,
    'alpha': 1.0,
    'eval_metric': 'mae',
    'seed': 777,
    'verbosity': 0,
}

n_splits = 5
gkf = GroupKFold(n_splits=n_splits)
oof = np.zeros(len(X), dtype=np.float32)
test_preds_folds = []
DM_test = xgb.DMatrix(X_test.values, feature_names=feature_cols)
t0_all = time.time()
for fold, (tr_idx, va_idx) in enumerate(gkf.split(X, y, groups=groups), 1):
    t0 = time.time()
    DM_tr = xgb.DMatrix(X.iloc[tr_idx].values, label=y[tr_idx], feature_names=feature_cols)
    DM_va = xgb.DMatrix(X.iloc[va_idx].values, label=y[va_idx], feature_names=feature_cols)
    bst = xgb.train(params, DM_tr, num_boost_round=20000, evals=[(DM_va,'valid')], early_stopping_rounds=500, verbose_eval=False)
    best_iter = getattr(bst, 'best_iteration', None) or bst.num_boosted_rounds
    va_pred = bst.predict(DM_va, iteration_range=(0, int(best_iter))).astype(np.float32)
    oof[va_idx] = va_pred
    mae = mean_absolute_error(y[va_idx], va_pred)
    log(f'[Robust] Fold {fold}/{n_splits} MAE={mae:,.0f} | best_iter={best_iter} | {time.time()-t0:.1f}s')
    tp = bst.predict(DM_test, iteration_range=(0, int(best_iter))).astype(np.float32)
    test_preds_folds.append(tp)

oof_mae = mean_absolute_error(y, oof)
log(f'[Robust] OOF MAE={oof_mae:,.0f} | total {time.time()-t0_all:.1f}s')

# Choose clip from robust OOF
best_upper = float(np.max(y))
best_mae_clip = oof_mae
for q in [0.99, 0.995, 0.999, 1.0]:
    upper = float(np.quantile(y, q)) if q < 1.0 else float(np.max(y))
    m = mean_absolute_error(y, np.clip(oof, 0.0, upper))
    log(f'[Robust] Clip {q}: {upper:,.0f} -> OOF {m:,.0f}')
    if m < best_mae_clip:
        best_mae_clip = m
        best_upper = upper
log(f'[Robust] Chosen upper={best_upper:,.0f} (OOF {best_mae_clip:,.0f})')

# Save RAW preds for blending; also save OOF
test_mean = np.mean(np.stack(test_preds_folds, axis=0), axis=0).astype(np.float32)
pred_series_raw = pd.Series(test_mean, index=X_test.index.values)
pred_aligned_raw = pred_series_raw.reindex(ss['segment_id'].values).fillna(pred_series_raw.median()).values.astype(np.float32)
pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': pred_aligned_raw}).to_csv('pred_test_xgb_robust_raw.csv', index=False)
pd.DataFrame({'segment_id': X.index.values, 'y': y.astype(np.int64), 'oof_robust': oof.astype(np.float32)}).to_csv('oof_xgb_robust.csv', index=False)
log('Saved pred_test_xgb_robust_raw.csv and oof_xgb_robust.csv')

# Optional: produce a standalone robust submission (clipped) for sanity
pred_clipped = np.clip(pred_aligned_raw, 0.0, best_upper).astype(np.int64)
sub = pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': pred_clipped})
sub.to_csv('submission.csv', index=False)
log(f'[Robust] Wrote submission.csv ({os.path.getsize("submission.csv")} bytes). Head:')
print(sub.head())

[2025-09-27T04:13:42.439186Z] Robust XGB data: X=(3987, 2064), X_test=(444, 2064), feats=2064


[2025-09-27T04:15:27.565045Z] [Robust] Fold 1/5 MAE=3,250,181 | best_iter=9013 | 105.1s


[2025-09-27T04:17:00.596322Z] [Robust] Fold 2/5 MAE=3,394,504 | best_iter=7672 | 92.9s


[2025-09-27T04:18:26.903612Z] [Robust] Fold 3/5 MAE=3,349,655 | best_iter=7158 | 86.2s


[2025-09-27T04:19:59.605831Z] [Robust] Fold 4/5 MAE=3,295,567 | best_iter=7628 | 92.6s


[2025-09-27T04:22:00.804112Z] [Robust] Fold 5/5 MAE=3,421,705 | best_iter=10304 | 121.1s


[2025-09-27T04:22:00.913407Z] [Robust] OOF MAE=3,342,313 | total 498.5s


[2025-09-27T04:22:00.914705Z] [Robust] Clip 0.99: 46,962,008 -> OOF 3,340,892


[2025-09-27T04:22:00.915446Z] [Robust] Clip 0.995: 47,715,640 -> OOF 3,340,312


[2025-09-27T04:22:00.916164Z] [Robust] Clip 0.999: 48,356,643 -> OOF 3,340,761


[2025-09-27T04:22:00.916685Z] [Robust] Clip 1.0: 49,046,088 -> OOF 3,341,257


[2025-09-27T04:22:00.916963Z] [Robust] Chosen upper=47,715,640 (OOF 3,340,312)


[2025-09-27T04:22:00.923632Z] Saved pred_test_xgb_robust_raw.csv and oof_xgb_robust.csv


[2025-09-27T04:22:00.924783Z] [Robust] Wrote submission.csv (8584 bytes). Head:


   segment_id  time_to_eruption
0   951290289          29999788
1   508758258          13026953
2  1566132188          33837308
3  1891418251          23766566
4  1968343855           5129763


In [25]:
# Tri-model RAW blend: XGB 3-seed (anchor) + XGB robust + LGB (raw); OOF grid-search weights; OOF-based clip; write submission.csv
import os, numpy as np, pandas as pd
from datetime import datetime

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

# Load OOFs
o3 = pd.read_csv('oof_xgb_3seed.csv')    # segment_id, y, oof_mean
orob = pd.read_csv('oof_xgb_robust.csv')  # segment_id, y, oof_robust
olgb = pd.read_csv('oof_lgb.csv')        # segment_id, y, oof_lgb
o = o3.merge(orob[['segment_id','oof_robust']], on='segment_id', how='inner').merge(olgb[['segment_id','oof_lgb']], on='segment_id', how='inner')
y = o['y'].astype(np.float64).values
p3 = o['oof_mean'].astype(np.float64).values
pr = o['oof_robust'].astype(np.float64).values
pl = o['oof_lgb'].astype(np.float64).values
log(f'OOF merged shape: {o.shape}')

def mae(a,b): return float(np.mean(np.abs(a-b)))
best = {'b':0.0,'c':0.0,'mae':1e18}
b_grid = np.linspace(0.0, 0.50, 11)   # robust weight
c_grid = np.linspace(0.0, 0.30, 7)    # lgb weight
for b in b_grid:
    for c in c_grid:
        if b + c <= 0.7:
            a = 1.0 - b - c
            p = a*p3 + b*pr + c*pl
            m = mae(y, p)
            if m < best['mae']:
                best = {'b':float(b), 'c':float(c), 'a':float(a), 'mae':m}
log(f"Chosen weights a(3seed)={best['a']:.2f}, b(robust)={best['b']:.2f}, c(lgb)={best['c']:.2f} -> OOF MAE {best['mae']:,.0f}")

# Clip upper via OOF sweep using blended OOF
p_blend_oof = best['a']*p3 + best['b']*pr + best['c']*pl
best_upper, best_mae_clip = None, best['mae']
for q in [0.99, 0.995, 0.999, 1.0]:
    upper = float(np.quantile(y, q)) if q < 1.0 else float(np.max(y))
    m = mae(y, np.clip(p_blend_oof, 0.0, upper))
    log(f'Clip {q}: {upper:,.0f} -> OOF {m:,.0f}')
    if (best_upper is None) or (m < best_mae_clip):
        best_upper, best_mae_clip = upper, m
log(f'Chosen clip upper={best_upper:,.0f} (OOF {best_mae_clip:,.0f})')

# Load RAW test preds
px3 = pd.read_csv('pred_test_xgb_3seed_raw.csv').set_index('segment_id')
pxr = pd.read_csv('pred_test_xgb_robust_raw.csv').set_index('segment_id')
plg = pd.read_csv('pred_test_lgb_raw.csv').set_index('segment_id')
ss = pd.read_csv('sample_submission.csv')
x3 = px3.reindex(ss['segment_id'].values)['time_to_eruption'].astype(np.float64).values
xr = pxr.reindex(ss['segment_id'].values)['time_to_eruption'].astype(np.float64).values
xl = plg.reindex(ss['segment_id'].values)['time_to_eruption'].astype(np.float64).values

# Blend and clip
ptest = best['a']*x3 + best['b']*xr + best['c']*xl
ptest = np.clip(ptest, 0.0, best_upper).astype(np.int64)
sub = pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': ptest})
sub.to_csv('submission.csv', index=False)
log(f"Saved submission.csv (tri-blend). Weights a={best['a']:.2f}, b={best['b']:.2f}, c={best['c']:.2f}; upper={best_upper:,.0f}. Head:")
print(sub.head())

[2025-09-27T04:23:08.838827Z] OOF merged shape: (3987, 5)


[2025-09-27T04:23:08.840919Z] Chosen weights a(3seed)=1.00, b(robust)=0.00, c(lgb)=0.00 -> OOF MAE 3,192,504


[2025-09-27T04:23:08.841618Z] Clip 0.99: 46,962,008 -> OOF 3,192,076


[2025-09-27T04:23:08.842157Z] Clip 0.995: 47,715,640 -> OOF 3,191,561


[2025-09-27T04:23:08.842638Z] Clip 0.999: 48,356,643 -> OOF 3,191,449


[2025-09-27T04:23:08.843003Z] Clip 1.0: 49,046,088 -> OOF 3,191,652


[2025-09-27T04:23:08.843261Z] Chosen clip upper=48,356,643 (OOF 3,191,449)


[2025-09-27T04:23:08.847699Z] Saved submission.csv (tri-blend). Weights a=1.00, b=0.00, c=0.00; upper=48,356,643. Head:


   segment_id  time_to_eruption
0   951290289          30329878
1   508758258          14477632
2  1566132188          34124110
3  1891418251          22477562
4  1968343855           7643206


In [26]:
# Huber calibration submissions: (1) 3-seed XGB calibrated @0.999 clip; (2) fixed-weight tri-blend calibrated @0.995 clip
import os, numpy as np, pandas as pd
from sklearn.linear_model import HuberRegressor
from datetime import datetime

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

ss = pd.read_csv('sample_submission.csv')

def load_test_series(path):
    df = pd.read_csv(path).set_index('segment_id')
    return df.reindex(ss['segment_id'].values)['time_to_eruption'].astype(np.float64).values

def calibrate_and_save(y, oof_pred, test_pred, clip_q, out_path):
    # Fit Huber y ~ oof_pred
    X = oof_pred.reshape(-1,1)
    hub = HuberRegressor(epsilon=1.35, alpha=0.0, fit_intercept=True, max_iter=1000)
    hub.fit(X, y.astype(np.float64))
    # Apply to test
    test_cal = hub.predict(test_pred.reshape(-1,1)).astype(np.float64)
    # Clip upper bound from y quantile
    if clip_q < 1.0:
        upper = float(np.quantile(y, clip_q))
    else:
        upper = float(np.max(y))
    test_cal = np.clip(test_cal, 0.0, upper).astype(np.int64)
    sub = pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': test_cal})
    sub.to_csv(out_path, index=False)
    log(f'Saved {out_path} (clip_q={clip_q}, upper={upper:,.0f}); head:')
    print(sub.head())
    return out_path

# 1) Bias-corrected 3-seed XGB (anchor)
o3 = pd.read_csv('oof_xgb_3seed.csv')  # columns: segment_id, y, oof_mean
y3 = o3['y'].astype(np.float64).values
oof3 = o3['oof_mean'].astype(np.float64).values
test3 = load_test_series('pred_test_xgb_3seed_raw.csv')
out1 = calibrate_and_save(y3, oof3, test3, clip_q=0.999, out_path='sub_calib_3seed_q0999.csv')

# 2) Bias-corrected small-weight tri-blend (fixed weights 0.85 3-seed, 0.10 robust XGB, 0.05 LGB)
orob = pd.read_csv('oof_xgb_robust.csv')   # segment_id, y, oof_robust
olgb = pd.read_csv('oof_lgb.csv')          # segment_id, y, oof_lgb
oo = o3.merge(orob[['segment_id','oof_robust']], on='segment_id').merge(olgb[['segment_id','oof_lgb']], on='segment_id')
y_tri = oo['y'].astype(np.float64).values
oof_tri = (0.85*oo['oof_mean'].astype(np.float64).values + 0.10*oo['oof_robust'].astype(np.float64).values + 0.05*oo['oof_lgb'].astype(np.float64).values)
t3 = load_test_series('pred_test_xgb_3seed_raw.csv')
tr = load_test_series('pred_test_xgb_robust_raw.csv')
tl = load_test_series('pred_test_lgb_raw.csv')
test_tri = (0.85*t3 + 0.10*tr + 0.05*tl).astype(np.float64)
out2 = calibrate_and_save(y_tri, oof_tri, test_tri, clip_q=0.995, out_path='sub_calib_tri_q0995.csv')

log(f'Calibration files ready: {out1}, {out2}. To submit, copy desired file to submission.csv and submit.')

[2025-09-27T04:28:19.188757Z] Saved sub_calib_3seed_q0999.csv (clip_q=0.999, upper=48,356,643); head:


   segment_id  time_to_eruption
0   951290289          30976723
1   508758258          14786396
2  1566132188          34851875
3  1891418251          22956941
4  1968343855           7806213
[2025-09-27T04:28:19.200375Z] Saved sub_calib_tri_q0995.csv (clip_q=0.995, upper=47,715,640); head:


   segment_id  time_to_eruption
0   951290289          30919273
1   508758258          14616226
2  1566132188          34687348
3  1891418251          23195738
4  1968343855           7544808
[2025-09-27T04:28:19.201211Z] Calibration files ready: sub_calib_3seed_q0999.csv, sub_calib_tri_q0995.csv. To submit, copy desired file to submission.csv and submit.


In [32]:
# Set isotonic-calibrated 3-seed submission as final submission.csv; validate shape/order/dtypes
import pandas as pd, numpy as np, os
from datetime import datetime
def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

src = 'sub_iso_3seed_q0999.csv'
ss = pd.read_csv('sample_submission.csv')
sub = pd.read_csv(src)

# Basic validations
assert list(sub.columns) == ['segment_id','time_to_eruption'], 'Columns mismatch'
assert len(sub) == len(ss) == 444, 'Row count mismatch'
assert np.array_equal(sub['segment_id'].values, ss['segment_id'].values), 'Order/IDs mismatch vs sample_submission'
assert sub['time_to_eruption'].dtype == np.int64 or sub['time_to_eruption'].dtype == np.int32, 'time_to_eruption not integer'
assert sub['time_to_eruption'].isna().sum() == 0, 'NaNs present'
assert (sub['time_to_eruption'] >= 0).all(), 'Negative predictions present'

sub.to_csv('submission.csv', index=False)
log(f'submission.csv written from {src} ({os.path.getsize("submission.csv")} bytes). Head:')
print(sub.head())

[2025-09-27T04:45:55.189122Z] submission.csv written from sub_iso_3seed_q0999.csv (8595 bytes). Head:


   segment_id  time_to_eruption
0   951290289          31867039
1   508758258          12945360
2  1566132188          37029393
3  1891418251          22227369
4  1968343855           6864116


In [29]:
# Sample-weighted robust XGB (GPU, MAE) with Huber calibration + 0.995 clip; saves RAW preds and submission
import os, time, subprocess, sys
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import HuberRegressor

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

try:
    import xgboost as xgb
except Exception:
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'xgboost==2.1.1'], check=True)
    import xgboost as xgb

# Load features
train_path = 'features_train_v1.csv'
test_path = 'features_test_v1.csv'
assert os.path.exists(train_path) and os.path.exists(test_path), 'Run feature build first.'
train_feats = pd.read_csv(train_path).set_index('segment_id')
test_feats = pd.read_csv(test_path).set_index('segment_id')
y = train_feats['time_to_eruption'].astype(np.float32).values
groups = train_feats['series_id'].values
feature_cols = [c for c in train_feats.columns if c not in ('time_to_eruption','series_id')]
X = train_feats[feature_cols].astype(np.float32).copy()
X_test = test_feats.reindex(columns=feature_cols).astype(np.float32).copy().fillna(0.0)
ss = pd.read_csv('sample_submission.csv')
log(f'Weighted XGB data: X={X.shape}, X_test={X_test.shape}, feats={len(feature_cols)}')

# Sample weights: w = 1 / (1 + y/1e6)
w_full = (1.0 / (1.0 + (y.astype(np.float64) / 1e6))).astype(np.float32)

# Params per expert advice
params = {
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'objective': 'reg:absoluteerror',
    'eval_metric': 'mae',
    'max_depth': 5,
    'eta': 0.02,
    'subsample': 0.7,
    'colsample_bytree': 0.5,
    'min_child_weight': 28,
    'lambda': 13.0,
    'alpha': 1.0,
    'seed': 2025,
    'verbosity': 0,
}

n_splits = 5
gkf = GroupKFold(n_splits=n_splits)
oof = np.zeros(len(X), dtype=np.float32)
test_preds_folds = []
DM_test = xgb.DMatrix(X_test.values, feature_names=feature_cols)
t0_all = time.time()
for fold, (tr_idx, va_idx) in enumerate(gkf.split(X, y, groups=groups), 1):
    t0 = time.time()
    DM_tr = xgb.DMatrix(X.iloc[tr_idx].values, label=y[tr_idx], weight=w_full[tr_idx], feature_names=feature_cols)
    DM_va = xgb.DMatrix(X.iloc[va_idx].values, label=y[va_idx], weight=w_full[va_idx], feature_names=feature_cols)
    bst = xgb.train(params, DM_tr, num_boost_round=20000, evals=[(DM_va,'valid')], early_stopping_rounds=700, verbose_eval=False)
    best_iter = getattr(bst, 'best_iteration', None) or bst.num_boosted_rounds
    va_pred = bst.predict(DM_va, iteration_range=(0, int(best_iter))).astype(np.float32)
    oof[va_idx] = va_pred
    mae = mean_absolute_error(y[va_idx], va_pred)
    log(f'[Weighted] Fold {fold}/{n_splits} MAE={mae:,.0f} | best_iter={best_iter} | {time.time()-t0:.1f}s')
    tp = bst.predict(DM_test, iteration_range=(0, int(best_iter))).astype(np.float32)
    test_preds_folds.append(tp)

oof_mae = mean_absolute_error(y, oof)
log(f'[Weighted] OOF MAE={oof_mae:,.0f} | total {time.time()-t0_all:.1f}s')

# Save OOF for later blending/calibration and RAW test preds
pd.DataFrame({'segment_id': X.index.values, 'y': y.astype(np.int64), 'oof_weighted': oof.astype(np.float32)}).to_csv('oof_xgb_weighted.csv', index=False)
test_mean = np.mean(np.stack(test_preds_folds, axis=0), axis=0).astype(np.float32)
pred_series_raw = pd.Series(test_mean, index=X_test.index.values)
pred_aligned_raw = pred_series_raw.reindex(ss['segment_id'].values).fillna(pred_series_raw.median()).values.astype(np.float32)
pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': pred_aligned_raw}).to_csv('pred_test_xgb_weighted_raw.csv', index=False)
log('Saved oof_xgb_weighted.csv and pred_test_xgb_weighted_raw.csv')

# Huber calibration on weighted OOF -> apply to test; clip at 0.995
hub = HuberRegressor(epsilon=1.35, alpha=0.0, fit_intercept=True, max_iter=1000)
hub.fit(oof.reshape(-1,1), y.astype(np.float64))
test_cal = hub.predict(pred_aligned_raw.reshape(-1,1)).astype(np.float64)
upper = float(np.quantile(y.astype(np.float64), 0.995))
test_cal = np.clip(test_cal, 0.0, upper).astype(np.int64)
sub = pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': test_cal})
sub.to_csv('submission.csv', index=False)
log(f'[Weighted Calibrated] Wrote submission.csv with 0.995 clip (upper={upper:,.0f}). Head:')
print(sub.head())

[2025-09-27T04:30:11.363694Z] Weighted XGB data: X=(3987, 2064), X_test=(444, 2064), feats=2064


[2025-09-27T04:32:57.103877Z] [Weighted] Fold 1/5 MAE=9,181,826 | best_iter=19956 | 165.7s


[2025-09-27T04:35:41.167798Z] [Weighted] Fold 2/5 MAE=9,279,052 | best_iter=19954 | 163.9s


[2025-09-27T04:38:24.974281Z] [Weighted] Fold 3/5 MAE=8,780,537 | best_iter=19999 | 163.6s


[2025-09-27T04:41:09.158075Z] [Weighted] Fold 4/5 MAE=9,014,827 | best_iter=19998 | 164.0s


[2025-09-27T04:43:52.950375Z] [Weighted] Fold 5/5 MAE=9,139,727 | best_iter=19999 | 163.6s


[2025-09-27T04:43:53.107852Z] [Weighted] OOF MAE=9,079,269 | total 821.7s


[2025-09-27T04:43:53.114646Z] Saved oof_xgb_weighted.csv and pred_test_xgb_weighted_raw.csv


[2025-09-27T04:43:53.119124Z] [Weighted Calibrated] Wrote submission.csv with 0.995 clip (upper=47,715,640). Head:


   segment_id  time_to_eruption
0   951290289          24163019
1   508758258          12535127
2  1566132188          30433764
3  1891418251          26285762
4  1968343855           7249249


In [31]:
# Isotonic calibration submissions: (1) 3-seed XGB @0.999 clip; (2) fixed-weight tri-blend @0.995 clip
import numpy as np, pandas as pd, os
from sklearn.isotonic import IsotonicRegression
from datetime import datetime

def log(msg):
    print(f"[{datetime.utcnow().isoformat()}Z] {msg}", flush=True)

ss = pd.read_csv('sample_submission.csv')

def load_test_series(path):
    df = pd.read_csv(path).set_index('segment_id')
    return df.reindex(ss['segment_id'].values)['time_to_eruption'].astype(np.float64).values

def iso_calibrate_and_save(y, oof_pred, test_pred, clip_q, out_path):
    # Fit monotonic isotonic regression: y = f(oof_pred)
    x = oof_pred.astype(np.float64)
    y = y.astype(np.float64)
    iso = IsotonicRegression(out_of_bounds='clip')
    iso.fit(x, y)
    test_cal = iso.predict(test_pred.astype(np.float64))
    if clip_q < 1.0:
        upper = float(np.quantile(y, clip_q))
    else:
        upper = float(np.max(y))
    test_cal = np.clip(test_cal, 0.0, upper).astype(np.int64)
    sub = pd.DataFrame({'segment_id': ss['segment_id'].values, 'time_to_eruption': test_cal})
    sub.to_csv(out_path, index=False)
    log(f'Saved {out_path} (clip_q={clip_q}, upper={upper:,.0f}); head:')
    print(sub.head())
    return out_path

# 1) Isotonic-calibrated 3-seed XGB
o3 = pd.read_csv('oof_xgb_3seed.csv')  # segment_id,y,oof_mean
y3 = o3['y'].astype(np.float64).values
oof3 = o3['oof_mean'].astype(np.float64).values
test3 = load_test_series('pred_test_xgb_3seed_raw.csv')
out1 = iso_calibrate_and_save(y3, oof3, test3, clip_q=0.999, out_path='sub_iso_3seed_q0999.csv')

# 2) Isotonic-calibrated fixed-weight tri-blend (0.85/0.10/0.05)
orob = pd.read_csv('oof_xgb_robust.csv')
olgb = pd.read_csv('oof_lgb.csv')
oo = o3.merge(orob[['segment_id','oof_robust']], on='segment_id').merge(olgb[['segment_id','oof_lgb']], on='segment_id')
y_tri = oo['y'].astype(np.float64).values
oof_tri = (0.85*oo['oof_mean'].astype(np.float64).values + 0.10*oo['oof_robust'].astype(np.float64).values + 0.05*oo['oof_lgb'].astype(np.float64).values)
t3 = load_test_series('pred_test_xgb_3seed_raw.csv')
tr = load_test_series('pred_test_xgb_robust_raw.csv')
tl = load_test_series('pred_test_lgb_raw.csv')
test_tri = (0.85*t3 + 0.10*tr + 0.05*tl).astype(np.float64)
out2 = iso_calibrate_and_save(y_tri, oof_tri, test_tri, clip_q=0.995, out_path='sub_iso_tri_q0995.csv')

log(f'Isotonic calibration files ready: {out1}, {out2}. Copy chosen file to submission.csv then submit.')

[2025-09-27T04:45:27.660296Z] Saved sub_iso_3seed_q0999.csv (clip_q=0.999, upper=48,356,643); head:


   segment_id  time_to_eruption
0   951290289          31867039
1   508758258          12945360
2  1566132188          37029393
3  1891418251          22227369
4  1968343855           6864116
[2025-09-27T04:45:27.670162Z] Saved sub_iso_tri_q0995.csv (clip_q=0.995, upper=47,715,640); head:


   segment_id  time_to_eruption
0   951290289          31585738
1   508758258          13481325
2  1566132188          35559280
3  1891418251          21920915
4  1968343855           6705357
[2025-09-27T04:45:27.670924Z] Isotonic calibration files ready: sub_iso_3seed_q0999.csv, sub_iso_tri_q0995.csv. Copy chosen file to submission.csv then submit.
