# TPS Dec 2021 - Plan, Log, and Baseline

## Plan
- Goal: Achieve medal-level accuracy (>= 0.9566) on Tabular Playground Series Dec 2021.
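- Dataset: Synthetic forest cover type data; the target `Cover_Type` takes classes 1..7. Features likely include numeric terrain measurements (e.g., elevation, aspect, hillshade, distances) plus one-hot Wilderness_Area and Soil_Type indicator columns.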
- Approach:
  1) Quick EDA: shapes, dtypes, target distribution.
  2) Baseline model: LightGBM with stratified 5-fold CV, strong regularization and early stopping; log CV accuracy.
  3) Improve: Tune LGBM params; try CatBoost/XGBoost; blending/ensemble if needed.
  4) Feature engineering: simple interactions if beneficial (e.g., sums of soil/wilderness, elevation-related ratios).
  5) Generate submission when CV >= 0.9566.
- Reproducibility: fixed seeds, clear logging.

## Experiment Log
- [ ] Exp001: LGBM baseline 5-fold, early stopping, default/tuned params.
- [ ] Exp002: Param tune LGBM.
- [ ] Exp003: CatBoost baseline.
- [ ] Exp004: XGBoost baseline.
- [ ] Exp005: Blend/stack best models.

We will request expert reviews at key milestones (post-plan, post-EDA, post-baseline, and before long training).

In [6]:
import os, sys, time, gc, warnings, math, subprocess, importlib
from typing import List, Tuple
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Reproducibility / cross-validation constants used throughout this cell.
SEED = RANDOM_STATE = 42
N_SPLITS = 5  # switched to 5-fold for faster iteration

# Notebook output settings: silence warnings, print floats without scientific
# notation, and show up to 200 DataFrame columns.
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True)
pd.options.display.max_columns = 200

def set_seed(seed: int = 42):
    """Seed the stdlib ``random`` module and NumPy's global RNG with ``seed``."""
    import random

    for seeder in (random.seed, np.random.seed):
        seeder(seed)

def ensure_package(pkg_name: str, import_name: str = None, extra_install: str = ''):
    """Import a module, pip-installing its distribution first if the import fails.

    Args:
        pkg_name: Name passed to ``pip install``.
        import_name: Module name to import; defaults to ``pkg_name``.
        extra_install: Extra pip command-line arguments as one string. It is
            split shell-style, so several flags (e.g. ``"--upgrade --no-deps"``)
            are passed to pip as separate arguments.

    Returns:
        The imported module object.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
        ImportError: If the module still cannot be imported after installing.
    """
    import shlex

    name = import_name or pkg_name
    try:
        return importlib.import_module(name)
    except ImportError:
        print(f"[INFO] Installing {pkg_name}...")

    cmd = [sys.executable, '-m', 'pip', 'install', pkg_name, *shlex.split(extra_install or '')]
    subprocess.check_call(cmd)
    # The install happened while this interpreter was running; drop the import
    # system's cached directory listings so the new package is discoverable.
    importlib.invalidate_caches()
    return importlib.import_module(name)

set_seed(SEED)

# LightGBM (pip-installed on demand if the import fails)
lgb = ensure_package('lightgbm', 'lightgbm')

t0 = time.time()  # start of the wall-clock timer reported at the end of the cell
print("[INFO] Loading data...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print(f"[INFO] train shape: {train.shape}, test shape: {test.shape}")

# Basic checks. Raise explicitly: a bare `assert` is skipped under `python -O`.
if 'Cover_Type' not in train.columns:
    raise ValueError('Target Cover_Type not found in train.csv')
for _name, _frame in (('train', train), ('test', test)):
    if 'Id' in _frame.columns:
        print(f'[INFO] Found Id column in {_name}')

print('[INFO] Missing values train:', train.isnull().sum().sum(), ' | test:', test.isnull().sum().sum())

# Target re-indexing to 0..6 (the submission step adds 1 back)
y_raw = train['Cover_Type'].values
y = y_raw - 1

# Feature columns: everything in train except the target and the Id
feature_cols = [c for c in train.columns if c not in ('Cover_Type', 'Id')]

# Fail early, with a readable message, if test lacks any training feature.
_missing = [c for c in feature_cols if c not in test.columns]
if _missing:
    raise ValueError(f'test.csv is missing feature columns: {_missing}')

X = train[feature_cols].copy()
# Select test columns by the training feature list so both frames share the
# same columns in the same order (extra test-only columns are dropped).
X_test = test[feature_cols].copy()

# --- Feature Engineering ---
print('[INFO] Feature engineering...')


def _indicator_columns(columns, prefix):
    """Return the columns named ``prefix`` + optional underscores + digits.

    Accepts both ``Soil_Type_12`` and ``Soil_Type12`` spellings and skips
    derived columns such as ``Soil_Type_Count``.
    """
    return [
        c for c in columns
        if c.startswith(prefix) and c[len(prefix):].lstrip('_').isdigit()
    ]


# Every feature is added to X and X_test in the same order, so the two frames
# keep identical column layouts. Presence checks are done on X only.

# 1) Distribution shift feature on Elevation
elev_threshold = X_test['Elevation'].median()  # capture test distribution shift
for _df in (X, X_test):
    _df['is_high_elevation'] = (_df['Elevation'] > elev_threshold).astype(np.int8)

# 2) Hydrology features
if {'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology'}.issubset(X.columns):
    for _df in (X, X_test):
        _df['Hydrology_Euclid'] = np.sqrt(
            _df['Horizontal_Distance_To_Hydrology']**2 + _df['Vertical_Distance_To_Hydrology']**2
        )
        _df['Elev_minus_VertHydro'] = _df['Elevation'] - _df['Vertical_Distance_To_Hydrology']

# 3) Hillshade features
hill_cols = [c for c in ['Hillshade_9am','Hillshade_Noon','Hillshade_3pm'] if c in X.columns]
if len(hill_cols) == 3:
    for _df in (X, X_test):
        _df['Hillshade_Mean'] = _df[hill_cols].mean(axis=1)
        _df['Hillshade_Min'] = _df[hill_cols].min(axis=1)
        _df['Hillshade_Max'] = _df[hill_cols].max(axis=1)
        _df['Hillshade_Range'] = _df['Hillshade_Max'] - _df['Hillshade_Min']

# 4) Distance interactions commonly used in this dataset
dist_cols = ['Horizontal_Distance_To_Fire_Points','Horizontal_Distance_To_Roadways','Horizontal_Distance_To_Hydrology']
if set(dist_cols).issubset(X.columns):
    hf, rr, hh = dist_cols
    for _df in (X, X_test):
        # pairwise abs diffs
        _df['DistDiff_Fire_Road'] = (_df[hf] - _df[rr]).abs()
        _df['DistDiff_Fire_Hydro'] = (_df[hf] - _df[hh]).abs()
        _df['DistDiff_Road_Hydro'] = (_df[rr] - _df[hh]).abs()
        # aggregates
        _df['DistMean_FRH'] = (_df[hf] + _df[rr] + _df[hh]) / 3.0
        _df['DistSum_FRH'] = (_df[hf] + _df[rr] + _df[hh])
        _df['DistMin_FRH'] = _df[dist_cols].min(axis=1)
        _df['DistMax_FRH'] = _df[dist_cols].max(axis=1)

# 5) Sum of one-hot groups
# The prefix match no longer requires a trailing underscore: with the old
# 'Soil_Type_' / 'Wilderness_Area_' prefixes, columns spelled 'Soil_Type1' or
# 'Wilderness_Area1' matched nothing and the count features were silently
# skipped. NOTE(review): the Kaggle CSVs appear to use the no-underscore
# spelling - confirm against train.csv.
soil_cols = _indicator_columns(X.columns, 'Soil_Type')
wild_cols = _indicator_columns(X.columns, 'Wilderness_Area')
if soil_cols:
    for _df in (X, X_test):
        _df['Soil_Type_Count'] = _df[soil_cols].sum(axis=1)
if wild_cols:
    for _df in (X, X_test):
        _df['Wilderness_Area_Count'] = _df[wild_cols].sum(axis=1)

# 6) Aspect encoding (sin/cos)
if 'Aspect' in X.columns:
    for _df in (X, X_test):
        _aspect_rad = np.deg2rad(_df['Aspect'])
        _df['Aspect_sin'] = np.sin(_aspect_rad)
        _df['Aspect_cos'] = np.cos(_aspect_rad)

features = X.columns.tolist()
# Enforce the training column order on the test frame; this raises KeyError if
# a feature is missing from X_test instead of predicting on misaligned columns.
X_test = X_test[features]
print(f"[INFO] Final feature count: {len(features)}")

# --- Cross-Validation Training (LightGBM) ---
# Same key/value set as before, grouped by purpose.
params = dict(
    # Task
    objective='multiclass',
    num_class=7,
    metric='multi_logloss',
    first_metric_only=True,
    # Tree size and learning rate
    learning_rate=0.03,
    num_leaves=48,
    min_data_in_leaf=96,
    # Row / column subsampling
    feature_fraction=0.8,
    bagging_fraction=0.75,
    bagging_freq=1,
    # L1 / L2 penalties
    lambda_l1=1.0,
    lambda_l2=2.0,
    # Histogram binning
    max_bin=128,
    bin_construct_sample_cnt=200000,
    feature_pre_filter=False,
    # Runtime
    verbose=-1,
    seed=SEED,
    num_threads=24,
    deterministic=True,
)

# Use consistent folds if available
fold_file = 'fold_indices.npy'


def _folds_match(cached, n_splits, n_rows):
    """Return True if ``cached`` is ``n_splits`` (train, valid) index pairs
    whose validation parts together cover ``range(n_rows)`` exactly once."""
    try:
        if len(cached) != n_splits:
            return False
        if any(len(tr) + len(va) != n_rows for tr, va in cached):
            return False
        valid_idx = np.concatenate([np.asarray(va, dtype=np.int64) for _, va in cached])
    except (TypeError, ValueError):
        return False
    return valid_idx.size == n_rows and np.array_equal(np.sort(valid_idx), np.arange(n_rows))


folds = None
if os.path.exists(fold_file):
    # NOTE: allow_pickle=True unpickles arbitrary objects; only load a fold
    # file that this notebook wrote itself.
    _cached_folds = np.load(fold_file, allow_pickle=True).tolist()
    # A file saved with a different N_SPLITS or a different number of training
    # rows must not be reused: it would mis-index X or change the fold count.
    if _folds_match(_cached_folds, N_SPLITS, len(X)):
        folds = _cached_folds
    else:
        print(f"[WARN] {fold_file} does not match N_SPLITS={N_SPLITS} and {len(X)} rows; regenerating folds")
if folds is None:
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    folds = list(skf.split(X, y))
    np.save(fold_file, np.array(folds, dtype=object))
    print(f"[INFO] Saved fold indices to {fold_file}")

# Out-of-fold and test probability buffers (7 classes) plus per-fold accuracy log
oof_preds = np.zeros((X.shape[0], 7), dtype=np.float32)
test_preds = np.zeros((X_test.shape[0], 7), dtype=np.float32)
fold_acc = []

print('[INFO] Starting CV training...')
# Take the fold count from the fold list itself: `folds` may have been loaded
# from a cached file written with a different N_SPLITS, and averaging the test
# predictions with the wrong divisor would mis-scale the saved probabilities.
n_folds = len(folds)
for fold, (trn_idx, val_idx) in enumerate(folds, 1):
    f_t = time.time()
    print(f"[FOLD {fold}/{n_folds}] Train: {len(trn_idx)}, Valid: {len(val_idx)}")
    X_trn = X.iloc[trn_idx]
    y_trn = y[trn_idx]
    X_val = X.iloc[val_idx]
    y_val = y[val_idx]

    lgb_train = lgb.Dataset(X_trn, label=y_trn, free_raw_data=False)
    lgb_valid = lgb.Dataset(X_val, label=y_val, free_raw_data=False)

    model = lgb.train(
        params=params,
        train_set=lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=['train','valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=200, verbose=False),
            lgb.log_evaluation(period=100)
        ]
    )

    # Out-of-fold probabilities and accuracy for this fold
    val_pred_proba = model.predict(X_val, num_iteration=model.best_iteration)
    oof_preds[val_idx] = val_pred_proba
    val_pred = np.argmax(val_pred_proba, axis=1)
    acc = accuracy_score(y_val, val_pred)
    fold_acc.append(acc)
    print(f"[FOLD {fold}] ACC={acc:.6f} | best_iter={model.best_iteration} | elapsed={time.time()-f_t:.1f}s")

    # Accumulate the fold-averaged test probabilities
    test_fold_pred = model.predict(X_test, num_iteration=model.best_iteration)
    test_preds += test_fold_pred / n_folds

    # Free per-fold objects before the next fold
    del X_trn, X_val, y_trn, y_val, lgb_train, lgb_valid, model, val_pred_proba, test_fold_pred
    gc.collect()

oof_pred_labels = np.argmax(oof_preds, axis=1)
cv_acc = accuracy_score(y, oof_pred_labels)
print(f"[CV] Mean ACC over {n_folds} folds: {np.mean(fold_acc):.6f}; OOF ACC: {cv_acc:.6f}")

# Save preds for ensembling
for _path, _array in (('lgb_oof_preds.npy', oof_preds), ('lgb_test_preds.npy', test_preds)):
    np.save(_path, _array)
print('[INFO] Saved lgb_oof_preds.npy and lgb_test_preds.npy')

# --- Submission ---
# Use the test Id column when present, otherwise a 0-based row index.
if 'Id' in test.columns:
    _submission_ids = test['Id'].values
else:
    _submission_ids = np.arange(len(test))
_predicted_cover_type = np.argmax(test_preds, axis=1) + 1  # back to 1..7
sub = pd.DataFrame({'Id': _submission_ids, 'Cover_Type': _predicted_cover_type})
sub.to_csv('submission.csv', index=False)
print('[INFO] Saved submission.csv')
print(f"[DONE] Total elapsed: {time.time()-t0:.1f}s")

In [12]:
# Ultra-fast sanity run on a small head subset to validate pipeline and get quick CV
import sys, time, gc
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Fast-run settings: rows read from the head of train.csv, RNG seed, CV fold count.
N_ROWS, SEED_FAST, N_SPLITS_FAST = 120_000, 2021, 5

import importlib, subprocess, sys
def ensure_pkg(name):
    """Import module ``name``, pip-installing a package of the same name if needed.

    Returns the imported module. Raises ``subprocess.CalledProcessError`` if
    ``pip install`` fails and ``ImportError`` if the module is still not
    importable afterwards.
    """
    try:
        return importlib.import_module(name)
    except ImportError:
        print(f'[FAST] Installing {name}...')

    # Install outside the `except` block so a failure here is not reported as
    # chained to the original ImportError.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', name])
    # The package appeared while the interpreter was running; clear the import
    # system's cached directory listings before retrying.
    importlib.invalidate_caches()
    return importlib.import_module(name)

lgb_fast = ensure_pkg('lightgbm')

t0 = time.time()  # wall-clock start for the total-elapsed report at the end of the cell
print('[FAST] Loading head subset and test...', flush=True)
# Only the first N_ROWS rows of train.csv are read (a head slice, not a random sample).
train = pd.read_csv('train.csv', nrows=N_ROWS)
test_full = pd.read_csv('test.csv')
print(f"[FAST] train_head shape: {train.shape} | test shape: {test_full.shape}", flush=True)

y = train['Cover_Type'].values - 1  # classes 1..7 -> 0..6
feature_cols = [c for c in train.columns if c not in ('Cover_Type', 'Id')]

# Fail early, with a readable message, if test lacks any training feature.
_missing_fast = [c for c in feature_cols if c not in test_full.columns]
if _missing_fast:
    raise ValueError(f'test.csv is missing feature columns: {_missing_fast}')

X = train[feature_cols].copy()
# Select test columns by the training feature list so both frames share the
# same columns in the same order (extra test-only columns are dropped).
X_test = test_full[feature_cols].copy()

# --- Feature Engineering (same as main) ---
print('[FAST] Feature engineering...', flush=True)


def _indicator_columns_fast(columns, prefix):
    """Return the columns named ``prefix`` + optional underscores + digits.

    Accepts both ``Soil_Type_12`` and ``Soil_Type12`` spellings and skips
    derived columns such as ``Soil_Type_Count``.
    """
    return [
        c for c in columns
        if c.startswith(prefix) and c[len(prefix):].lstrip('_').isdigit()
    ]


# Every feature is added to X and X_test in the same order, so the two frames
# keep identical column layouts. Presence checks are done on X only.

# Elevation flag relative to the test-set median
elev_threshold = X_test['Elevation'].median()
for _df in (X, X_test):
    _df['is_high_elevation'] = (_df['Elevation'] > elev_threshold).astype(np.int8)

# Hydrology features
if {'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology'}.issubset(X.columns):
    for _df in (X, X_test):
        _df['Hydrology_Euclid'] = np.sqrt(
            _df['Horizontal_Distance_To_Hydrology']**2 + _df['Vertical_Distance_To_Hydrology']**2
        )
        _df['Elev_minus_VertHydro'] = _df['Elevation'] - _df['Vertical_Distance_To_Hydrology']

# Hillshade aggregates
hill_cols = [c for c in ['Hillshade_9am','Hillshade_Noon','Hillshade_3pm'] if c in X.columns]
if len(hill_cols) == 3:
    for _df in (X, X_test):
        _df['Hillshade_Mean'] = _df[hill_cols].mean(axis=1)
        _df['Hillshade_Min'] = _df[hill_cols].min(axis=1)
        _df['Hillshade_Max'] = _df[hill_cols].max(axis=1)
        _df['Hillshade_Range'] = _df['Hillshade_Max'] - _df['Hillshade_Min']

# Distance interactions
dist_cols = ['Horizontal_Distance_To_Fire_Points','Horizontal_Distance_To_Roadways','Horizontal_Distance_To_Hydrology']
if set(dist_cols).issubset(X.columns):
    hf, rr, hh = dist_cols
    for _df in (X, X_test):
        _df['DistDiff_Fire_Road'] = (_df[hf] - _df[rr]).abs()
        _df['DistDiff_Fire_Hydro'] = (_df[hf] - _df[hh]).abs()
        _df['DistDiff_Road_Hydro'] = (_df[rr] - _df[hh]).abs()
        _df['DistMean_FRH'] = (_df[hf] + _df[rr] + _df[hh]) / 3.0
        _df['DistSum_FRH'] = (_df[hf] + _df[rr] + _df[hh])
        _df['DistMin_FRH'] = _df[dist_cols].min(axis=1)
        _df['DistMax_FRH'] = _df[dist_cols].max(axis=1)

# Sum of one-hot groups
# The prefix match no longer requires a trailing underscore: with the old
# 'Soil_Type_' / 'Wilderness_Area_' prefixes, columns spelled 'Soil_Type1' or
# 'Wilderness_Area1' matched nothing and the count features were silently
# skipped. NOTE(review): the Kaggle CSVs appear to use the no-underscore
# spelling - confirm against train.csv.
soil_cols = _indicator_columns_fast(X.columns, 'Soil_Type')
wild_cols = _indicator_columns_fast(X.columns, 'Wilderness_Area')
if soil_cols:
    for _df in (X, X_test):
        _df['Soil_Type_Count'] = _df[soil_cols].sum(axis=1)
if wild_cols:
    for _df in (X, X_test):
        _df['Wilderness_Area_Count'] = _df[wild_cols].sum(axis=1)

# Aspect encoding (sin/cos)
if 'Aspect' in X.columns:
    for _df in (X, X_test):
        _aspect_rad = np.deg2rad(_df['Aspect'])
        _df['Aspect_sin'] = np.sin(_aspect_rad)
        _df['Aspect_cos'] = np.cos(_aspect_rad)

# Enforce the training column order on the test frame; this raises KeyError if
# a feature is missing from X_test instead of predicting on misaligned columns.
X_test = X_test[X.columns.tolist()]

# LightGBM parameters for the fast run; same key/value set as before, grouped by purpose.
params_fast = dict(
    # Task
    objective='multiclass',
    num_class=7,
    metric='multi_logloss',
    first_metric_only=True,
    # Tree size and learning rate
    learning_rate=0.05,
    num_leaves=48,
    min_data_in_leaf=96,
    # Row / column subsampling
    feature_fraction=0.8,
    bagging_fraction=0.75,
    bagging_freq=1,
    # L1 / L2 penalties
    lambda_l1=1.0,
    lambda_l2=2.0,
    # Histogram binning
    max_bin=128,
    bin_construct_sample_cnt=200000,
    feature_pre_filter=False,
    # Runtime
    verbose=-1,
    seed=SEED_FAST,
    num_threads=16,
    deterministic=True,
)

# Stratified K-fold LightGBM on the head subset: collect out-of-fold and
# fold-averaged test probabilities, then report accuracy.
skf = StratifiedKFold(n_splits=N_SPLITS_FAST, shuffle=True, random_state=SEED_FAST)
oof = np.zeros((len(X), 7), dtype=np.float32)
tst = np.zeros((len(X_test), 7), dtype=np.float32)
accs = []
print('[FAST] Starting 5-fold LGBM on head subset...', flush=True)
for i, (tr, va) in enumerate(skf.split(X, y), 1):
    fts = time.time()
    print(f'[FAST][FOLD {i}] tr={len(tr)} va={len(va)}', flush=True)

    X_va = X.iloc[va]
    dtr = lgb_fast.Dataset(X.iloc[tr], label=y[tr], free_raw_data=False)
    dva = lgb_fast.Dataset(X_va, label=y[va], free_raw_data=False)
    fold_callbacks = [
        lgb_fast.early_stopping(stopping_rounds=100, verbose=False),
        lgb_fast.log_evaluation(period=100),
    ]
    model = lgb_fast.train(
        params_fast,
        dtr,
        num_boost_round=2000,
        valid_sets=[dtr, dva],
        valid_names=['train','valid'],
        callbacks=fold_callbacks,
    )
    best_iter = model.best_iteration

    # Out-of-fold probabilities and accuracy for this fold
    pva = model.predict(X_va, num_iteration=best_iter)
    oof[va] = pva
    pred = pva.argmax(axis=1)
    acc = accuracy_score(y[va], pred)
    accs.append(acc)
    print(f'[FAST][FOLD {i}] acc={acc:.6f} best_iter={best_iter} elapsed={time.time()-fts:.1f}s', flush=True)

    # Accumulate the fold-averaged test probabilities
    pt = model.predict(X_test, num_iteration=best_iter)
    tst += pt / N_SPLITS_FAST

    del dtr, dva, model, pva, pt, X_va, fold_callbacks
    gc.collect()

oof_lbl = oof.argmax(axis=1)
cv_acc = accuracy_score(y, oof_lbl)
print(f'[FAST][CV] mean_acc={np.mean(accs):.6f} | OOF={cv_acc:.6f} | total_elapsed={time.time()-t0:.1f}s', flush=True)

# Build the fast-run submission. Fall back to a 0-based row index when test.csv
# has no Id column, matching how the feature selection above tolerates a
# missing Id instead of raising KeyError here.
if 'Id' in test_full.columns:
    _fast_ids = test_full['Id'].values
else:
    _fast_ids = np.arange(len(test_full))
sub_fast = pd.DataFrame({'Id': _fast_ids, 'Cover_Type': np.argmax(tst, axis=1) + 1})  # back to 1..7
sub_fast.to_csv('submission_fast.csv', index=False)
print('[FAST] Saved submission_fast.csv')