In [None]:
# Stage 2: Train LightGBM on cached Feather/NumPy with robust logging + stdout heartbeats (CPU-only streamlined)
import os, sys, time, gc, json, logging, importlib, subprocess, traceback
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

print('[HEARTBEAT] 02_train starting...'); sys.stdout.flush()

# --- Logging ---
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s [%(levelname)s] %(message)s',
                    handlers=[
                        logging.FileHandler('run_train.log', mode='w'),
                        logging.StreamHandler(sys.stdout)
                    ],
                    force=True)
os.environ['PYTHONUNBUFFERED'] = '1'

def ensure_package(pkg: str, import_name: str = None):
    name = import_name or pkg
    try:
        return importlib.import_module(name)
    except ImportError:
        print(f'[HEARTBEAT] Installing {pkg}...'); sys.stdout.flush()
        logging.info(f'Installing {pkg}...')
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg])
        return importlib.import_module(name)

try:
    lgb = ensure_package('lightgbm', 'lightgbm')
    print('[HEARTBEAT] lightgbm imported OK'); sys.stdout.flush()

    SEED = 42
    N_SPLITS = None  # set after loading folds

    t0 = time.time()
    logging.info('Loading cached datasets (Feather/NumPy)...')
    X = pd.read_feather('X.feather')
    X_test = pd.read_feather('X_test.feather')
    y = np.load('y.npy')
    test_ids = np.load('test_ids.npy')
    with open('features.json', 'r') as f:
        features = json.load(f)
    elev_threshold = None
    if os.path.exists('preprocess_meta.json'):
        try:
            with open('preprocess_meta.json', 'r') as f:
                meta = json.load(f)
                elev_threshold = meta.get('elev_threshold', None)
        except Exception:
            elev_threshold = None
    logging.info(f'X shape: {X.shape}, X_test shape: {X_test.shape}, y shape: {y.shape}, features: {len(features)}')
    if elev_threshold is not None:
        logging.info(f'Using persisted elev_threshold: {elev_threshold}')
    print('[HEARTBEAT] Data loaded'); sys.stdout.flush()

    # Enforce feature column order
    missing_in_X = [c for c in features if c not in X.columns]
    missing_in_Xt = [c for c in features if c not in X_test.columns]
    if missing_in_X or missing_in_Xt:
        logging.warning(f'Feature mismatch. Missing in X: {missing_in_X[:5]}... Missing in X_test: {missing_in_Xt[:5]}...')
    X = X[features]
    X_test = X_test[features]

    # Convert to NumPy
    X_np = X.to_numpy()
    X_test_np = X_test.to_numpy()
    del X, X_test
    gc.collect()
    print('[HEARTBEAT] Converted to NumPy'); sys.stdout.flush()

    # Load folds
    if os.path.exists('fold_indices.npy'):
        folds = np.load('fold_indices.npy', allow_pickle=True).tolist()
        logging.info('Loaded fold_indices.npy for consistent CV splits.')
    else:
        logging.info('fold_indices.npy not found; creating new StratifiedKFold splits.')
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
        folds = list(skf.split(X_np, y))
        np.save('fold_indices.npy', np.array(folds, dtype=object))

    # Tie N_SPLITS to folds
    N_SPLITS = len(folds)
    logging.info(f'Using N_SPLITS={N_SPLITS} based on loaded folds.')
    print(f'[HEARTBEAT] N_SPLITS={N_SPLITS}', flush=True)

    # CPU-only LightGBM parameters (stable and efficient for large data).
    params = {
        'objective': 'multiclass',
        'num_class': 7,
        'metric': 'multi_logloss',
        'learning_rate': 0.025,
        'num_leaves': 110,
        'min_data_in_leaf': 60,
        'max_depth': 9,
        'feature_fraction': 0.82,
        'bagging_fraction': 0.78,
        'bagging_freq': 1,
        'lambda_l1': 1.2,
        'lambda_l2': 2.5,
        'max_bin': 127,
        'bin_construct_sample_cnt': 50000,
        'force_col_wise': True,
        'verbose': -1,
        'seed': SEED,
        'bagging_seed': SEED,
        'feature_fraction_seed': SEED,
        'num_threads': 8,
        'first_metric_only': True,
        'deterministic': True,
        'feature_pre_filter': False,
        'device': 'cpu'
    }

    NUM_BOOST_ROUND = 4000
    EARLY_STOP_ROUNDS = 200
    LOG_PERIOD = 50

    oof_preds = np.zeros((y.shape[0], 7), dtype=np.float32)
    test_preds = np.zeros((X_test_np.shape[0], 7), dtype=np.float32)
    fold_acc = []

    logging.info(f'Starting {N_SPLITS}-fold CV training...')
    print('[HEARTBEAT] Training loop start'); sys.stdout.flush()
    for i, (trn_idx, val_idx) in enumerate(folds, 1):
        fts = time.time()
        logging.info(f'[FOLD {i}/{N_SPLITS}] Train: {len(trn_idx)} | Valid: {len(val_idx)}')
        print(f'[HEARTBEAT] Fold {i} start: trn={len(trn_idx)} val={len(val_idx)}'); sys.stdout.flush()
        X_trn = X_np[trn_idx]
        y_trn = y[trn_idx]
        X_val = X_np[val_idx]
        y_val = y[val_idx]

        print('[HEARTBEAT] Building Datasets...'); sys.stdout.flush()
        dtrain = lgb.Dataset(X_trn, label=y_trn, free_raw_data=True, params={'bin_construct_sample_cnt': 50000})
        dvalid = lgb.Dataset(X_val, label=y_val, reference=dtrain, free_raw_data=True)
        print('[HEARTBEAT] Datasets ready. Starting lgb.train...'); sys.stdout.flush()

        model = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=NUM_BOOST_ROUND,
            valid_sets=[dtrain, dvalid],
            valid_names=['train', 'valid'],
            callbacks=[
                lgb.early_stopping(stopping_rounds=EARLY_STOP_ROUNDS, verbose=False),
                lgb.log_evaluation(period=LOG_PERIOD)
            ]
        )

        val_proba = model.predict(X_val, num_iteration=model.best_iteration)
        oof_preds[val_idx] = val_proba
        val_pred = np.argmax(val_proba, axis=1)
        acc = accuracy_score(y_val, val_pred)
        fold_acc.append(acc)
        logging.info(f'[FOLD {i}] ACC={acc:.6f} | best_iter={model.best_iteration} | elapsed={time.time()-fts:.1f}s')
        print(f'[HEARTBEAT] Fold {i} done: acc={acc:.6f} best_iter={model.best_iteration}'); sys.stdout.flush()

        test_fold = model.predict(X_test_np, num_iteration=model.best_iteration)
        test_preds += test_fold / N_SPLITS

        # Save partial artifacts after each fold
        np.save('lgb_oof_preds_partial.npy', oof_preds)
        np.save('lgb_test_preds_partial.npy', test_preds)

        del X_trn, X_val, y_trn, y_val, dtrain, dvalid, model, val_proba, test_fold
        gc.collect()

    oof_labels = np.argmax(oof_preds, axis=1)
    cv_acc = accuracy_score(y, oof_labels)
    logging.info(f'[CV] Mean ACC over folds: {np.mean(fold_acc):.6f} | OOF ACC: {cv_acc:.6f}')
    print(f'[HEARTBEAT] CV done: mean_acc={np.mean(fold_acc):.6f} OOF={cv_acc:.6f}'); sys.stdout.flush()

    # Save artifacts
    np.save('lgb_oof_preds.npy', oof_preds)
    np.save('lgb_test_preds.npy', test_preds)
    logging.info('Saved lgb_oof_preds.npy and lgb_test_preds.npy')
    submission = pd.DataFrame({'Id': test_ids, 'Cover_Type': np.argmax(test_preds, axis=1) + 1})
    submission.to_csv('submission.csv', index=False)
    logging.info('Saved submission.csv')
    logging.info(f'Done. Total elapsed: {time.time()-t0:.1f}s')
    print('[HEARTBEAT] 02_train finished.'); sys.stdout.flush()

except Exception as e:
    err = traceback.format_exc()
    print('[ERROR] Exception in 02_train:', e); sys.stdout.flush()
    print(err); sys.stdout.flush()
    try:
        with open('train_error.log', 'w') as f:
            f.write(err)
    except Exception:
        pass
    logging.error('Exception occurred in training pipeline:\n' + err)

In [None]:
# Sanity: ensure LightGBM is installed and importable, print version
import sys, subprocess, importlib, logging
try:
    import lightgbm as lgb
    print('[SANITY] lightgbm already installed:', lgb.__version__); sys.stdout.flush()
except Exception as e:
    print('[SANITY] Installing lightgbm due to import error:', e); sys.stdout.flush()
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'lightgbm'])
    import lightgbm as lgb
    print('[SANITY] lightgbm installed:', lgb.__version__); sys.stdout.flush()

In [None]:
# Diagnostics: check artifacts, logs, and environment status
import os, time, glob, json
from datetime import datetime
import numpy as np
import pandas as pd

def stat_file(p):
    if os.path.exists(p):
        sz = os.path.getsize(p)
        mt = os.path.getmtime(p)
        return f'exists size={sz} mtime={datetime.fromtimestamp(mt)}'
    return 'MISSING'

targets = [
    'run_train.log',
    'lgb_oof_preds_partial.npy',
    'lgb_test_preds_partial.npy',
    'lgb_oof_preds.npy',
    'lgb_test_preds.npy',
    'submission.csv',
    'X.feather',
    'X_test.feather',
    'y.npy',
    'test_ids.npy',
    'features.json',
    'fold_indices.npy'
]
print('=== File stats ===')
for p in targets:
    print(f'{p}: {stat_file(p)}')

print('\n=== Try tail of run_train.log (last 1000 chars) ===')
if os.path.exists('run_train.log'):
    try:
        with open('run_train.log', 'r') as f:
            f.seek(0, os.SEEK_END)
            size = f.tell()
            f.seek(max(size-1000, 0))
            print(f.read())
    except Exception as e:
        print('Could not read run_train.log:', e)
else:
    print('run_train.log not found')

print('\n=== Quick dataset/metadata checks ===')
try:
    with open('features.json', 'r') as f:
        feats = json.load(f)
    print('features.json count:', len(feats))
except Exception as e:
    print('features.json read error:', e)

try:
    y = np.load('y.npy')
    print('y.npy shape:', y.shape)
except Exception as e:
    print('y.npy read error:', e)

try:
    X = pd.read_feather('X.feather')
    X_test = pd.read_feather('X_test.feather')
    print('X.feather shape:', X.shape, '| X_test.feather shape:', X_test.shape)
except Exception as e:
    print('Feather read error:', e)

print('\n=== Folds info ===')
try:
    folds = np.load('fold_indices.npy', allow_pickle=True).tolist()
    print('fold_indices.npy len:', len(folds))
except Exception as e:
    print('fold_indices load error:', e)

print('\n=== Env/threads ===')
try:
    import multiprocessing as mp
    print('CPU count:', mp.cpu_count())
except Exception as e:
    print('cpu_count error:', e)
print('LIGHTGBM_THREADS env:', os.environ.get('OMP_NUM_THREADS'), os.environ.get('NUMEXPR_MAX_THREADS'), os.environ.get('MKL_NUM_THREADS'))

print('\n=== Done diagnostics ===')

In [None]:
# Quick sanity: write markers to verify execution and environment
import os, time, json, numpy as np
ts = time.strftime('%Y-%m-%d %H:%M:%S')
with open('sanity_marker.txt', 'w') as f:
    f.write(f'started at {ts}')
try:
    import lightgbm as lgb
    with open('sanity_lgb_version.txt', 'w') as f:
        f.write(lgb.__version__)
except Exception as e:
    with open('sanity_lgb_version.txt', 'w') as f:
        f.write(f'import_error: {e}')

with open('features_count.txt', 'w') as f:
    try:
        with open('features.json', 'r') as jf:
            feats = json.load(jf)
        f.write(str(len(feats)))
    except Exception as e:
        f.write(f'error: {e}')

# write a tiny npy so we can confirm file writes work
np.save('tmp_probe.npy', np.array([1,2,3], dtype=np.int32))

# dump a lightweight directory listing snapshot
try:
    files = sorted(os.listdir('.'))
    with open('dir_list.txt', 'w') as f:
        for p in files:
            try:
                sz = os.path.getsize(p)
                f.write(f'{p}\t{sz}\n')
            except Exception:
                f.write(f'{p}\t-1\n')
except Exception:
    pass

In [None]:
# Quick baseline: fast subsample training to validate pipeline and produce a submission (tiny smoke test)
import os, sys, time, json, logging, gc, traceback
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

print('[QUICK] Starting quick baseline training (SMOKE TEST)...'); sys.stdout.flush()
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s [%(levelname)s] %(message)s',
                    handlers=[
                        logging.FileHandler('run_quick.log', mode='w'),
                        logging.StreamHandler(sys.stdout)
                    ],
                    force=True)

try:
    t0 = time.time()
    print('[QUICK] Loading cached data...'); sys.stdout.flush()
    X = pd.read_feather('X.feather')
    X_test = pd.read_feather('X_test.feather')
    y = np.load('y.npy')
    with open('features.json', 'r') as f:
        features = json.load(f)
    X = X[features]
    X_test = X_test[features]
    print('[QUICK] Converting to NumPy...'); sys.stdout.flush()
    X_np = X.to_numpy()
    X_test_np = X_test.to_numpy()
    del X, X_test; gc.collect()
    logging.info(f'Data loaded. X: {X_np.shape}, X_test: {X_test_np.shape}')

    # Balanced per-class subsample to avoid stratification edge-cases
    print('[QUICK] Building balanced per-class subsample...'); sys.stdout.flush()
    rng = np.random.default_rng(42)
    classes = np.unique(y)
    per_class = 8000  # target per-class; will cap by available
    idx_list = []
    for c in classes:
        cls_idx = np.where(y == c)[0]
        take = min(per_class, cls_idx.shape[0])
        # If class is extremely rare (<=take), take all (no replace)
        sel = rng.choice(cls_idx, size=take, replace=False) if take > 0 else np.array([], dtype=np.int64)
        idx_list.append(sel)
    sub_idx = np.concatenate([arr for arr in idx_list if arr.size > 0])
    rng.shuffle(sub_idx)
    X_sub = X_np[sub_idx]
    y_sub = y[sub_idx]
    counts = np.array([(y_sub == c).sum() for c in classes])
    logging.info(f'Subsample built: {X_sub.shape}, class counts: ' + ','.join(str(int(k)) for k in counts))

    # Decide if stratification is possible (need at least 2 samples in every class)
    do_strat = (counts.min() >= 2)
    print(f"[QUICK] Train/valid split... (stratify={'yes' if do_strat else 'no'})"); sys.stdout.flush()
    X_trn, X_val, y_trn, y_val = train_test_split(
        X_sub, y_sub, test_size=0.1, random_state=42, stratify=(y_sub if do_strat else None)
    )
    logging.info(f'Train: {X_trn.shape}, Valid: {X_val.shape}')

    import lightgbm as lgb
    params = {
        'objective': 'multiclass',
        'num_class': 7,
        'metric': 'multi_logloss',
        'learning_rate': 0.05,
        'num_leaves': 96,
        'min_data_in_leaf': 64,
        'max_depth': 8,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'lambda_l1': 1.0,
        'lambda_l2': 2.0,
        'max_bin': 127,
        'bin_construct_sample_cnt': 100000,
        'force_col_wise': True,
        'num_threads': 8,
        'deterministic': True,
        'feature_pre_filter': False,
        'seed': 42,
        'device': 'cpu'
    }
    print('[QUICK] Building Datasets...'); sys.stdout.flush()
    dtrain = lgb.Dataset(X_trn, label=y_trn, free_raw_data=True, params={'bin_construct_sample_cnt': 100000})
    dvalid = lgb.Dataset(X_val, label=y_val, reference=dtrain, free_raw_data=True)

    print('[QUICK] Training LightGBM (200 rounds, ES=50)...'); sys.stdout.flush()
    model = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=200,
        valid_sets=[dtrain, dvalid],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=20)
        ]
    )
    print('[QUICK] Training complete. Predicting...'); sys.stdout.flush()

    val_proba = model.predict(X_val, num_iteration=model.best_iteration)
    val_pred = np.argmax(val_proba, axis=1)
    val_acc = accuracy_score(y_val, val_pred)
    logging.info(f'Quick baseline valid ACC: {val_acc:.6f} at iter {model.best_iteration}')
    print(f'[QUICK] Valid ACC: {val_acc:.6f} iter={model.best_iteration}'); sys.stdout.flush()

    # Predict test and save submission
    test_proba = model.predict(X_test_np, num_iteration=model.best_iteration)
    test_pred = np.argmax(test_proba, axis=1) + 1
    test_ids = np.load('test_ids.npy')
    sub = pd.DataFrame({'Id': test_ids, 'Cover_Type': test_pred})
    sub.to_csv('submission.csv', index=False)
    logging.info('Saved submission.csv from quick baseline.')
    print(f'[QUICK] Done in {time.time()-t0:.1f}s'); sys.stdout.flush()
except Exception as e:
    print('[QUICK][ERROR]', e); sys.stdout.flush()
    with open('quick_error.log', 'w') as f:
        f.write(traceback.format_exc())

In [None]:
# Diagnostics: check target distribution end-to-end
import numpy as np, pandas as pd, json, os
from collections import Counter
print('[DIAG] Loading y.npy ...');
y = np.load('y.npy')
uniq, cnts = np.unique(y, return_counts=True)
print('[DIAG] y.npy classes:', uniq.tolist())
print('[DIAG] y.npy counts:', cnts.tolist(), ' total=', int(cnts.sum()))

print('[DIAG] Loading train.csv to cross-check Cover_Type ...')
df = pd.read_csv('train.csv', usecols=['Cover_Type'])
vc = df['Cover_Type'].value_counts().sort_index()
print('[DIAG] train.csv Cover_Type counts (1..7):', vc.to_dict())
print('[DIAG] train.csv total:', int(df.shape[0]))

print('[DIAG] features.json length:')
with open('features.json','r') as f:
    feats = json.load(f)
print(len(feats))

print('[DIAG] test_ids vs sample_submission length:')
test_ids = np.load('test_ids.npy')
print('test_ids:', len(test_ids))
ss = pd.read_csv('sample_submission.csv')
print('sample_submission rows:', ss.shape[0])
print('[DIAG] Done.')

In [None]:
# Single-model fast training (no CV): sub-sample mid-size train, small valid, quick submission with class weights
import os, sys, time, json, logging, gc, traceback
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

print('[SINGLE] Starting single-model fast training...'); sys.stdout.flush()
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s [%(levelname)s] %(message)s',
                    handlers=[
                        logging.FileHandler('run_single.log', mode='w'),
                        logging.StreamHandler(sys.stdout)
                    ],
                    force=True)

try:
    t0 = time.time()
    # Load cached arrays/feather
    logging.info('Loading cached data...')
    X = pd.read_feather('X.feather')
    X_test = pd.read_feather('X_test.feather')
    y = np.load('y.npy')
    with open('features.json','r') as f:
        features = json.load(f)
    X = X[features]
    X_test = X_test[features]
    X_np = X.to_numpy()
    X_test_np = X_test.to_numpy()
    del X, X_test; gc.collect()
    logging.info(f'X: {X_np.shape}, X_test: {X_test_np.shape}')

    # Build a mid-size sub-sample for speed, ensure all classes appear
    rng = np.random.default_rng(42)
    n_total = X_np.shape[0]
    target_rows = min(700000, n_total)  # ~0.7M rows for faster training
    base_idx = rng.choice(n_total, size=target_rows, replace=False)
    # Ensure include at least one sample from every class (especially the rare ones)
    classes = np.unique(y)
    ensure_idx = []
    for c in classes:
        loc = np.where(y == c)[0]
        if loc.size > 0:
            ensure_idx.append(loc[0])
    ensure_idx = np.array(list(set(ensure_idx)), dtype=np.int64)
    sub_idx = np.unique(np.concatenate([base_idx, ensure_idx]))
    X_sub = X_np[sub_idx]
    y_sub = y[sub_idx]
    logging.info(f'Sub-sample: {X_sub.shape}')

    # Train/valid split (not stratified; rare class may be only in train)
    X_trn, X_val, y_trn, y_val = train_test_split(X_sub, y_sub, test_size=0.05, random_state=42, shuffle=True, stratify=None)
    logging.info(f'Train: {X_trn.shape}, Valid: {X_val.shape}')

    # Compute class weights (inverse freq) and assign per-row weights for training
    cls, counts = np.unique(y_trn, return_counts=True)
    freq = counts / counts.sum()
    inv = 1.0 / np.clip(freq, 1e-12, None)
    inv = inv / inv.mean()  # normalize around 1
    weight_map = {int(c): float(w) for c, w in zip(cls, inv)}
    w_trn = np.array([weight_map[int(t)] for t in y_trn], dtype=np.float32)
    logging.info('Class weights: ' + ','.join(f'{int(c)}:{weight_map[int(c)]:.3f}' for c in sorted(weight_map.keys())))

    import lightgbm as lgb
    params = {
        'objective': 'multiclass',
        'num_class': 7,
        'metric': 'multi_logloss',
        'learning_rate': 0.025,
        'num_leaves': 110,
        'min_data_in_leaf': 60,
        'max_depth': 9,
        'feature_fraction': 0.82,
        'bagging_fraction': 0.78,
        'bagging_freq': 1,
        'lambda_l1': 1.2,
        'lambda_l2': 2.5,
        'max_bin': 127,
        'bin_construct_sample_cnt': 50000,
        'force_col_wise': True,
        'num_threads': 8,
        'deterministic': True,
        'feature_pre_filter': False,
        'seed': 42,
        'device': 'cpu',
        'first_metric_only': True
    }

    dtrain = lgb.Dataset(X_trn, label=y_trn, weight=w_trn, free_raw_data=True, params={'bin_construct_sample_cnt': 50000})
    dvalid = lgb.Dataset(X_val, label=y_val, reference=dtrain, free_raw_data=True)
    logging.info('Training LightGBM (single model, weighted) ...')
    model = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=1500,
        valid_sets=[dtrain, dvalid],
        valid_names=['train','valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=150, verbose=False),
            lgb.log_evaluation(period=50)
        ]
    )

    # Validation report
    val_pred_proba = model.predict(X_val, num_iteration=model.best_iteration)
    val_pred = np.argmax(val_pred_proba, axis=1)
    val_acc = accuracy_score(y_val, val_pred)
    logging.info(f'[SINGLE] Valid ACC: {val_acc:.6f} @iter {model.best_iteration}')
    print(f'[SINGLE] Valid ACC: {val_acc:.6f} @iter {model.best_iteration}'); sys.stdout.flush()

    # Predict test and save submission
    test_pred_proba = model.predict(X_test_np, num_iteration=model.best_iteration)
    test_pred = np.argmax(test_pred_proba, axis=1) + 1
    test_ids = np.load('test_ids.npy')
    sub = pd.DataFrame({'Id': test_ids, 'Cover_Type': test_pred})
    sub.to_csv('submission.csv', index=False)
    logging.info('Saved submission.csv (single model).')
    print(f'[SINGLE] Done in {time.time()-t0:.1f}s'); sys.stdout.flush()
except Exception as e:
    print('[SINGLE][ERROR]', e); sys.stdout.flush()
    with open('single_error.log', 'w') as f:
        f.write(traceback.format_exc())

In [6]:
# GPU-first Bagging ensemble: stronger LightGBM models on larger subsamples with robust imbalance handling
import os, sys, time, json, logging, gc, traceback
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Immediate heartbeat files to verify execution start and CWD
try:
    with open('bag_start.txt', 'w') as f:
        f.write(time.strftime('%Y-%m-%d %H:%M:%S'))
except Exception:
    pass

def hb(msg):
    try:
        with open('bag_heartbeats.txt', 'a') as f:
            f.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} | {msg}\n")
    except Exception:
        pass

print('[BAG] Starting bagging ensemble training (GPU-first)...'); sys.stdout.flush()
hb('Script start')
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s [%(levelname)s] %(message)s',
                    handlers=[
                        logging.FileHandler('run_bagging.log', mode='w'),
                        logging.StreamHandler(sys.stdout)
                    ],
                    force=True)
hb('Logging configured; run_bagging.log handler created')

try:
    t0_all = time.time()
    logging.info('Loading cached data...')
    hb('Loading cached data (feather/npy)')
    X = pd.read_feather('X.feather')
    X_test = pd.read_feather('X_test.feather')
    y = np.load('y.npy')
    with open('features.json','r') as f:
        features = json.load(f)
    X = X[features]
    X_test = X_test[features]
    X_np = X.to_numpy(); X_test_np = X_test.to_numpy()
    test_ids = np.load('test_ids.npy')
    del X; gc.collect()
    logging.info(f'Data ready. X: {X_np.shape}, X_test: {X_test_np.shape}')
    hb(f'Data ready {X_np.shape} / {X_test_np.shape}')

    # Thread/env stability before importing LightGBM
    os.environ['OMP_NUM_THREADS'] = os.environ.get('OMP_NUM_THREADS', '16')
    os.environ['MKL_NUM_THREADS'] = os.environ.get('MKL_NUM_THREADS', '1')
    hb('Thread env set')

    import lightgbm as lgb
    hb('lightgbm imported')

    # GPU-optimized base params
    gpu_params = {
        'objective': 'multiclass',
        'num_class': 7,
        'metric': 'multi_logloss',
        'learning_rate': 0.02,
        'num_leaves': 96,
        'min_data_in_leaf': 80,
        'max_depth': -1,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'lambda_l1': 1.0,
        'lambda_l2': 2.0,
        'max_bin': 255,
        'min_sum_hessian_in_leaf': 1e-2,
        'force_col_wise': True,
        'num_threads': 16,
        'deterministic': True,
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0
    }

    # CPU fallback params (used on per-model fallback if GPU fails)
    cpu_params = {
        'objective': 'multiclass',
        'num_class': 7,
        'metric': 'multi_logloss',
        'learning_rate': 0.025,
        'num_leaves': 128,
        'min_data_in_leaf': 80,
        'max_depth': -1,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'lambda_l1': 1.0,
        'lambda_l2': 2.0,
        'max_bin': 255,
        'force_col_wise': True,
        'num_threads': 8,
        'deterministic': True,
        'device': 'cpu'
    }

    # Ensemble config (scale up for medal attempt)
    M = 7
    SUB_SIZE = min(600_000, X_np.shape[0])
    VAL_FRAC = 0.05
    N_ROUNDS_GPU = 4000
    ES_ROUNDS_GPU = 200
    LOG_PERIOD = 100

    # CPU fallback rounds
    N_ROUNDS_CPU = 1000
    ES_ROUNDS_CPU = 160

    # Pre-compute class counts for robust sampling
    class_counts = np.bincount(y, minlength=7)
    if class_counts.size < 7:
        class_counts = np.pad(class_counts, (0, 7 - class_counts.size))
    singleton_classes = np.where(class_counts == 1)[0]
    singleton_class = int(singleton_classes[0]) if singleton_classes.size > 0 else None
    hb(f'class_counts={class_counts.tolist()} singleton={singleton_class}')

    test_preds_agg = np.zeros((X_test_np.shape[0], 7), dtype=np.float32)
    oof_like_agg = np.zeros((X_np.shape[0], 7), dtype=np.float32)
    val_acc_list = []

    for m in range(M):
        t0 = time.time()
        seed = 42 + m
        rng = np.random.default_rng(seed)
        logging.info(f'[BAG {m+1}/{M}] Sampling {SUB_SIZE} rows with seed {seed}...')
        hb(f'Bag {m+1}/{M} sampling start')
        idx = rng.choice(X_np.shape[0], size=SUB_SIZE, replace=False)

        # Ensure presence of rare classes in each bag
        ensure_list = []
        for c in range(7):
            loc = np.where(y == c)[0]
            if loc.size == 0:
                continue
            if singleton_class is not None and c == singleton_class:
                k = 1
                sel = rng.choice(loc, size=k, replace=True)
                ensure_list.append(sel)
            elif class_counts[c] < 2000:
                k = min(50, loc.size)
                sel = rng.choice(loc, size=k, replace=(loc.size < k))
                ensure_list.append(sel)
            else:
                ensure_list.append(loc[:1])
        if ensure_list:
            ensure_idx = np.unique(np.concatenate(ensure_list))
            idx = np.unique(np.concatenate([idx, ensure_idx]))

        X_sub = X_np[idx]
        y_sub = y[idx]
        hb(f'Bag {m+1}/{M} subsample ready: {X_sub.shape}')

        X_trn, X_val, y_trn, y_val, trn_mask, val_mask = train_test_split(
            X_sub, y_sub, np.arange(idx.size), test_size=VAL_FRAC, random_state=seed, shuffle=True
        )
        logging.info(f'[BAG {m+1}/{M}] Train: {X_trn.shape}, Valid: {X_val.shape}')
        hb(f'Bag {m+1}/{M} split: trn {X_trn.shape} val {X_val.shape}')

        # Per-row weights: inverse-frequency, capped for stability
        cls, counts = np.unique(y_trn, return_counts=True)
        inv = (counts.sum() / counts)
        inv = inv / inv.mean()
        inv = np.minimum(inv, 10.0)
        w_map = {int(c): float(w) for c, w in zip(cls, inv)}
        w_trn = np.array([w_map.get(int(t), 1.0) for t in y_trn], dtype=np.float32)
        logging.info(f"[BAG {m+1}/{M}] Class weights (capped): " + ','.join(f'{int(c)}:{w_map[int(c)]:.3f}' for c in sorted(w_map.keys())))
        hb(f'Bag {m+1}/{M} weights ready')

        # Ensure consistent binning across GPU and CPU by fixing Dataset params
        dtrain = lgb.Dataset(X_trn, label=y_trn, weight=w_trn, free_raw_data=True, params={'max_bin': 255})
        dvalid = lgb.Dataset(X_val, label=y_val, reference=dtrain, free_raw_data=True)

        # Try GPU first, fall back to CPU on failure
        params = dict(gpu_params)
        params['seed'] = seed; params['bagging_seed'] = seed; params['feature_fraction_seed'] = seed
        num_rounds = N_ROUNDS_GPU
        es_rounds = ES_ROUNDS_GPU
        used_device = 'gpu'
        logging.info(f'[BAG {m+1}/{M}] Training LightGBM on GPU...')
        print(f'[BAG-HEARTBEAT] Model {m+1}/{M} training start (GPU)...', flush=True)
        hb(f'Bag {m+1}/{M} lgb.train start (GPU)')
        try:
            model = lgb.train(
                params=params,
                train_set=dtrain,
                num_boost_round=num_rounds,
                valid_sets=[dtrain, dvalid],
                valid_names=['train','valid'],
                callbacks=[
                    lgb.early_stopping(stopping_rounds=es_rounds, verbose=False),
                    lgb.log_evaluation(period=LOG_PERIOD)
                ]
            )
        except Exception as e_gpu:
            logging.warning(f'[BAG {m+1}/{M}] GPU failed ({e_gpu}); falling back to CPU...')
            hb(f'Bag {m+1}/{M} GPU failed; fallback to CPU')
            params = dict(cpu_params);
            params['seed'] = seed; params['bagging_seed'] = seed; params['feature_fraction_seed'] = seed
            num_rounds = N_ROUNDS_CPU
            es_rounds = ES_ROUNDS_CPU
            used_device = 'cpu'
            print(f'[BAG-HEARTBEAT] Model {m+1}/{M} retry training (CPU)...', flush=True)
            model = lgb.train(
                params=params,
                train_set=dtrain,
                num_boost_round=num_rounds,
                valid_sets=[dtrain, dvalid],
                valid_names=['train','valid'],
                callbacks=[
                    lgb.early_stopping(stopping_rounds=es_rounds, verbose=False),
                    lgb.log_evaluation(period=LOG_PERIOD)
                ]
            )

        print(f'[BAG-HEARTBEAT] Model {m+1}/{M} trained on {used_device}: best_iter={model.best_iteration}', flush=True)
        hb(f'Bag {m+1}/{M} trained on {used_device}, best_iter={model.best_iteration}')

        # Validation metrics
        val_proba = model.predict(X_val, num_iteration=model.best_iteration)
        val_pred = np.argmax(val_proba, axis=1)
        val_acc = accuracy_score(y_val, val_pred)
        val_acc_list.append(val_acc)
        logging.info(f'[BAG {m+1}/{M}] Valid ACC: {val_acc:.6f} @iter {model.best_iteration} (device={used_device})')
        hb(f'Bag {m+1}/{M} valid acc={val_acc:.6f}')

        # OOF-like storage back to original indices
        sub_idx = idx[val_mask]
        oof_like_agg[sub_idx] = val_proba

        # Predict test
        test_fold = model.predict(X_test_np, num_iteration=model.best_iteration)
        test_preds_agg += (test_fold / M)

        # Save partial artifacts
        np.save('bag_test_preds_partial.npy', test_preds_agg)
        np.save('bag_oof_like_partial.npy', oof_like_agg)
        pd.DataFrame({'Id': test_ids, 'Cover_Type': np.argmax(test_preds_agg, axis=1) + 1}).to_csv('submission_partial.csv', index=False)
        logging.info(f'[BAG {m+1}/{M}] Partial artifacts saved. Elapsed this model: {time.time()-t0:.1f}s')
        hb(f'Bag {m+1}/{M} partial artifacts saved')

        # Cleanup
        del X_trn, X_val, y_trn, y_val, dtrain, dvalid, model, val_proba, test_fold, w_trn
        gc.collect()
        hb(f'Bag {m+1}/{M} cleanup done')

    # Post-processing for class 5 (singleton in train) before saving final submission
    logging.info('Applying post-processing for class 5...')
    hb('Post-processing class 5 start')
    try:
        X_test_df = pd.read_feather('X_test.feather')[features]
        col_c5 = 'Wilderness_Area_4' if 'Wilderness_Area_4' in X_test_df.columns else ('Wilderness_Area4' if 'Wilderness_Area4' in X_test_df.columns else None)
        if col_c5 is not None and 'Elevation' in X_test_df.columns:
            class5_mask = (X_test_df[col_c5] == 1) & (X_test_df['Elevation'] > 3400)
            num_overridden = int(class5_mask.sum())
        else:
            class5_mask = np.zeros(X_test_df.shape[0], dtype=bool)
            num_overridden = 0
    except Exception:
        class5_mask = np.zeros(X_test_np.shape[0], dtype=bool)
        num_overridden = 0

    final_pred_labels = np.argmax(test_preds_agg, axis=1)
    if num_overridden > 0:
        # zero-based class index 4 corresponds to Cover_Type=5
        final_pred_labels[class5_mask.values if hasattr(class5_mask, 'values') else class5_mask] = 4
        logging.info(f'Overrode {num_overridden} test predictions to class 5 (index 4).')
    hb(f'Post-processing overrides: {num_overridden}')

    # Final artifacts
    np.save('bag_test_preds.npy', test_preds_agg)
    np.save('bag_oof_like.npy', oof_like_agg)
    submission = pd.DataFrame({'Id': test_ids, 'Cover_Type': final_pred_labels + 1})
    submission.to_csv('submission.csv', index=False)
    logging.info(f'[BAG] Done. Models: {M}, mean val ACC: {np.mean(val_acc_list):.6f}. Total elapsed: {time.time()-t0_all:.1f}s')
    print('[BAG] Finished bagging. Submission saved as submission.csv'); sys.stdout.flush()
    hb('Finished and saved submission.csv')

except Exception as e:
    print('[BAG][ERROR]', e); sys.stdout.flush()
    hb(f'ERROR: {e}')
    with open('bagging_error.log', 'w') as f:
        f.write(traceback.format_exc())

[BAG] Starting bagging ensemble training (GPU-first)...


2025-09-08 16:09:58,938 [INFO] Loading cached data...


2025-09-08 16:10:01,456 [INFO] Data ready. X: (3600000, 70), X_test: (400000, 70)


2025-09-08 16:10:01,507 [INFO] [BAG 1/7] Sampling 600000 rows with seed 42...


2025-09-08 16:10:02,511 [INFO] [BAG 1/7] Train: (570044, 70), Valid: (30003, 70)


2025-09-08 16:10:02,671 [INFO] [BAG 1/7] Class weights (capped): 0:0.000,1:0.000,2:0.000,3:0.079,4:6.916,5:0.004,6:0.001


2025-09-08 16:10:02,672 [INFO] [BAG 1/7] Training LightGBM on GPU...


[BAG-HEARTBEAT] Model 1/7 training start (GPU)...




[LightGBM] [Info] This is the GPU trainer!!


[LightGBM] [Info] Total Bins 6045
[LightGBM] [Info] Number of data points in the train set: 570044, number of used features: 68


In [1]:
# Probe: verify CWD and file writes work, and list current directory
import os, time, json, numpy as np, pandas as pd
ts = time.strftime('%Y-%m-%d %H:%M:%S')
with open('probe_marker.txt', 'w') as f:
    f.write(f'probe at {ts}')
np.save('probe_array.npy', np.array([42], dtype=np.int32))
print('[PROBE] Wrote probe_marker.txt and probe_array.npy at', ts)
files = sorted(os.listdir('.'))
print('[PROBE] Dir has', len(files), 'entries; showing first 50:')
for p in files[:50]:
    try:
        print(p, os.path.getsize(p))
    except Exception:
        print(p, -1)

[PROBE] Wrote probe_marker.txt and probe_array.npy at 2025-09-08 15:48:22
[PROBE] Dir has 33 entries; showing first 50:
01_preprocess.ipynb 10811
02_train.ipynb 48627
X.feather 243019218
X_test.feather 27051754
agent_metadata 4096
catboost.ipynb 10575
catboost_info 4096
description.md 3903
dir_list.txt 580
docker_run.log 1235073
features.json 1191
features_count.txt 2
fold_indices.npy 144000751
main.ipynb 22020
preprocess_meta.json 26
probe_array.npy 132
probe_marker.txt 28
quick_error.log 1168
requirements.txt 2021
run_preprocess.log 600
run_quick.log 409
run_train.log 548
sample_submission.csv 3889096
sanity_lgb_version.txt 5
sanity_marker.txt 30
submission.csv 3889096
task.txt 2943
test.csv 53996233
test_ids.npy 1600128
tmp_probe.npy 140
train.csv 493166255
xgboost.ipynb 10608
y.npy 3600128
