In [3]:
# CatBoost full-data training with categorical indices and robust validation
import os, sys, time, json, logging, importlib, subprocess, gc, traceback
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Wall-clock start of the whole cell; the final log line reports elapsed time from here.
t0_total = time.time()
os.environ['PYTHONUNBUFFERED'] = '1'

# Send every record both to a fresh log file (truncated on each run) and to stdout.
_log_handlers = [
    logging.FileHandler('run_catboost.log', mode='w'),
    logging.StreamHandler(sys.stdout),
]
# force=True replaces any handlers already attached to the root logger.
logging.basicConfig(
    handlers=_log_handlers,
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.INFO,
    force=True,
)

def ensure_package(pkg: str, import_name: str = None):
    """Import a module, installing it with pip if the first import fails.

    Args:
        pkg: Name passed to ``pip install`` when the import fails.
        import_name: Module name to import; falls back to ``pkg`` when
            omitted or empty.

    Returns:
        The imported module object.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
        ImportError: If the module still cannot be imported after installing.
    """
    name = import_name or pkg
    try:
        return importlib.import_module(name)
    except ImportError:
        logging.info(f'Installing {pkg}...')
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg])
        # The import system caches directory listings; without invalidating
        # them a package installed during this process may not be found.
        importlib.invalidate_caches()
        return importlib.import_module(name)

# End-to-end CatBoost pipeline: load cached features, derive integer categorical
# columns, train with a 10% hold-out for early stopping, predict the test set and
# write submission.csv. On any failure the error is logged, the traceback is
# written to catboost_error.log, and the exception is re-raised.
try:
    logging.info('Importing CatBoost...')
    cb = ensure_package('catboost', 'catboost')
    from catboost import CatBoostClassifier, Pool
    logging.info('CatBoost imported.')

    # Load processed features and target
    logging.info('Loading cached Feather/NumPy artifacts...')
    X = pd.read_feather('X.feather')
    X_test = pd.read_feather('X_test.feather')
    y = np.load('y.npy')  # 0..6
    with open('features.json','r') as f:
        features = json.load(f)
    # Enforce same base feature order
    X = X[features].copy()
    X_test = X_test[features].copy()
    logging.info(f'Data shapes: X={X.shape}, X_test={X_test.shape}, y={y.shape}')

    def numeric_suffix(col):
        """Return all digits found in *col* joined as an int (0 if there are none)."""
        digits = ''.join(ch for ch in col if ch.isdigit())
        return int(digits) if digits else 0

    def _onehot_to_index(frame, cols):
        """Return the 0-based position of the per-row maximum over *cols* as int16.

        Rows where every listed column is equal (e.g. all zeros) map to 0,
        because argmax returns the first maximum.
        """
        return frame[cols].to_numpy().argmax(axis=1).astype(np.int16)

    # Reconstruct categorical indices from one-hot columns; if missing in X, read minimal columns from raw CSVs
    wild_cols = [c for c in X.columns if c.startswith('Wilderness_Area')]
    soil_cols = [c for c in X.columns if c.startswith('Soil_Type')]
    if not wild_cols or not soil_cols:
        logging.info('One-hot Wilderness/Soil columns not found in X; reconstructing from raw CSV...')
        # Determine column names from train.csv header (support names with/without underscore before number)
        header = pd.read_csv('train.csv', nrows=0).columns.tolist()
        wild_cols_all = [c for c in header if c.startswith('Wilderness_Area')]
        soil_cols_all = [c for c in header if c.startswith('Soil_Type')]
        if not wild_cols_all or not soil_cols_all:
            raise RuntimeError('Could not find Wilderness_Area* or Soil_Type* in raw CSV header.')
        usecols_train = wild_cols_all + soil_cols_all
        usecols_test = usecols_train.copy()
        logging.info('Reading minimal one-hot columns from train/test CSVs to build indices...')
        tr_onehot = pd.read_csv('train.csv', usecols=usecols_train)
        te_onehot = pd.read_csv('test.csv', usecols=usecols_test)
        # Sort by the digits in the column name for stable argmax order
        wild_cols_sorted = sorted(wild_cols_all, key=numeric_suffix)
        soil_cols_sorted = sorted(soil_cols_all, key=numeric_suffix)
        # NOTE(review): assumes the CSV rows are in the same order (and count) as
        # the cached Feather frames; a length mismatch raises on assignment,
        # but a reordering would not be detected -- confirm upstream.
        X['Wilderness_Area_Index'] = _onehot_to_index(tr_onehot, wild_cols_sorted)
        X_test['Wilderness_Area_Index'] = _onehot_to_index(te_onehot, wild_cols_sorted)
        X['Soil_Type_Index'] = _onehot_to_index(tr_onehot, soil_cols_sorted)
        X_test['Soil_Type_Index'] = _onehot_to_index(te_onehot, soil_cols_sorted)
        del tr_onehot, te_onehot
        gc.collect()
    else:
        wild_cols_sorted = sorted(wild_cols, key=numeric_suffix)
        soil_cols_sorted = sorted(soil_cols, key=numeric_suffix)
        # Argmax over one-hot to get integer category indices (0-based)
        X['Wilderness_Area_Index'] = _onehot_to_index(X, wild_cols_sorted)
        X_test['Wilderness_Area_Index'] = _onehot_to_index(X_test, wild_cols_sorted)
        X['Soil_Type_Index'] = _onehot_to_index(X, soil_cols_sorted)
        X_test['Soil_Type_Index'] = _onehot_to_index(X_test, soil_cols_sorted)

    # Elevation band (categorical integer bins); values outside the bin range
    # come back as missing from pd.cut and are mapped to band 0 by fillna.
    elev_bins = [0, 2400, 2800, 3200, 10000]
    X['ElevationBand'] = pd.cut(X['Elevation'], bins=elev_bins, labels=False, include_lowest=True).astype('Int8').fillna(0).astype(np.int16)
    X_test['ElevationBand'] = pd.cut(X_test['Elevation'], bins=elev_bins, labels=False, include_lowest=True).astype('Int8').fillna(0).astype(np.int16)

    # Build CatBoost feature lists
    cat_feature_names = ['Wilderness_Area_Index', 'Soil_Type_Index', 'ElevationBand']
    for c in cat_feature_names:
        if c not in X.columns:
            raise RuntimeError(f'Missing categorical feature {c}')
    all_features = X.columns.tolist()
    cat_features_idx = [all_features.index(c) for c in cat_feature_names]
    logging.info(f'Categorical features indices: {cat_features_idx}')

    # Validation split: keep the rare class (index 4) out of the hold-out so
    # that all of its rows stay in training
    singleton_class = 4
    singleton_mask = (y == singleton_class)
    singleton_idx = np.where(singleton_mask)[0]
    excl_mask = ~singleton_mask
    X_excl = X.loc[excl_mask].reset_index(drop=True)
    y_excl = y[excl_mask]
    # Stratify only when every remaining class has at least two samples
    do_strat = True
    try:
        _, strat_counts = np.unique(y_excl, return_counts=True)
        do_strat = (strat_counts.min() >= 2)
    except Exception:
        do_strat = False
    logging.info(f'Splitting hold-out (stratify={do_strat})...')
    X_trn_ex, X_val, y_trn_ex, y_val = train_test_split(
        X_excl, y_excl, test_size=0.10, random_state=42, shuffle=True, stratify=(y_excl if do_strat else None)
    )
    # Put every held-out rare-class row back into the training set. Checking for
    # "exactly one" row here would silently drop the class from both training
    # and validation whenever it has two or more samples.
    if singleton_idx.size > 0:
        X_trn = pd.concat([X_trn_ex, X.iloc[singleton_idx]], axis=0, ignore_index=True)
        y_trn = np.concatenate([y_trn_ex, y[singleton_idx]], axis=0)
    else:
        X_trn, y_trn = X_trn_ex, y_trn_ex
    logging.info(f'Train/Valid shapes: {X_trn.shape}/{X_val.shape}')

    # Pools for CatBoost
    train_pool = Pool(data=X_trn, label=y_trn, cat_features=cat_features_idx)
    valid_pool = Pool(data=X_val, label=y_val, cat_features=cat_features_idx)

    # CatBoost parameters (CPU)
    params = {
        'loss_function': 'MultiClass',
        'iterations': 3000,
        'depth': 10,
        'learning_rate': 0.03,
        'l2_leaf_reg': 3.0,
        'random_strength': 1.0,
        'bootstrap_type': 'Bayesian',
        'bagging_temperature': 1.0,
        'eval_metric': 'Accuracy',
        'od_type': 'Iter',
        'od_wait': 200,
        'random_seed': 42,
        'task_type': 'CPU',
        'thread_count': min(24, os.cpu_count() or 24),
        'verbose': 100
    }

    logging.info('Training CatBoost (full data, early stopping)...')
    t0 = time.time()
    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
    logging.info(f'Training complete in {time.time()-t0:.1f}s. Best iters: {model.tree_count_}')

    # Predict test
    logging.info('Predicting on test...')
    test_pool = Pool(data=X_test, cat_features=cat_features_idx)
    test_proba = model.predict_proba(test_pool)
    test_pred = np.argmax(test_proba, axis=1).astype(np.int32)  # 0..6

    # Post-processing override for class 5 (index 4): Wilderness_Area == 4th and Elevation > 3300
    # Best-effort step: a failure here is logged and the raw predictions are kept.
    try:
        wild_idx_test = X_test['Wilderness_Area_Index'].to_numpy()
        elev_test = X_test['Elevation'].to_numpy()
        override_mask = (wild_idx_test == 3) & (elev_test > 3300)
        num_over = int(override_mask.sum())
        if num_over > 0:
            test_pred[override_mask] = 4  # zero-based index for class 5
        logging.info(f'Post-processing overrides applied: {num_over}')
    except Exception as e:
        logging.warning(f'Post-processing step skipped due to error: {e}')

    # Save submission (labels are shifted back from 0..6 to 1..7)
    test_ids = np.load('test_ids.npy')
    sub = pd.DataFrame({'Id': test_ids, 'Cover_Type': test_pred + 1})
    sub.to_csv('submission.csv', index=False)
    logging.info('Saved submission.csv')
    logging.info(f'Done. Total elapsed: {time.time()-t0_total:.1f}s')

except Exception as e:
    # Record the failure in the run log and keep the full traceback on disk,
    # then re-raise so the notebook cell still fails visibly.
    logging.error('Exception in CatBoost pipeline: ' + str(e))
    with open('catboost_error.log','w') as f:
        f.write(traceback.format_exc())
    raise

2025-09-08 17:12:37,234 [INFO] Importing CatBoost...


2025-09-08 17:12:37,238 [INFO] CatBoost imported.


2025-09-08 17:12:37,238 [INFO] Loading cached Feather/NumPy artifacts...


2025-09-08 17:12:39,215 [INFO] Data shapes: X=(3600000, 70), X_test=(400000, 70), y=(3600000,)


2025-09-08 17:12:40,405 [INFO] Categorical features indices: [70, 71, 72]


2025-09-08 17:12:41,465 [INFO] Splitting hold-out (stratify=True)...
