# In [2]:
import os, sys, time, gc, warnings, subprocess, importlib
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Reproducibility and cross-validation settings used throughout the script.
SEED = 42
RANDOM_STATE = 42
N_SPLITS = 5

# Silence library warnings and make printed arrays / frames easier to read:
# no scientific notation for NumPy, up to 200 columns shown by pandas.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 200)
np.set_printoptions(suppress=True)

def set_seed(seed: int = 42):
    """Seed Python's ``random`` module and NumPy's global RNG.

    Args:
        seed: Value passed to both ``random.seed`` and ``np.random.seed``.
    """
    from random import seed as seed_python_rng

    for seed_rng in (seed_python_rng, np.random.seed):
        seed_rng(seed)

def ensure_package(pkg_name: str, import_name: str = None):
    """Import a module, installing its distribution with pip if it is missing.

    Args:
        pkg_name: Name handed to ``pip install`` when the import fails.
        import_name: Module name to import; falls back to ``pkg_name`` when
            omitted or empty.

    Returns:
        The imported module object.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
        ImportError: If the module still cannot be imported after installing.
    """
    name = import_name or pkg_name
    try:
        return importlib.import_module(name)
    except ImportError:
        print(f"[INFO] Installing {pkg_name}...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg_name])
        # The import system caches directory listings; without this call a
        # package installed while the interpreter is running may not be found.
        importlib.invalidate_caches()
        return importlib.import_module(name)

# Seed the global RNGs before any further work.
set_seed(SEED)

# Import XGBoost (installing it on demand) and bind the two entry points used
# below to short module-level names.
xgb = ensure_package('xgboost', 'xgboost')
DMatrix = xgb.DMatrix
xgb_train = xgb.train

# Start of the wall-clock timer reported at the end of the script.
t0 = time.time()
print('[INFO] Loading data...')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print(f"[INFO] train shape: {train.shape}, test shape: {test.shape}")
# Explicit check rather than `assert`, which is stripped under `python -O`.
if 'Cover_Type' not in train.columns:
    raise KeyError("train.csv is missing the required 'Cover_Type' column")

# Target and base features
# Labels are shifted down by one here; the submission step adds the 1 back.
y = train['Cover_Type'].values - 1
feature_cols = [c for c in train.columns if c not in ['Cover_Type', 'Id']]
missing_in_test = [c for c in feature_cols if c not in test.columns]
if missing_in_test:
    raise KeyError(f"test.csv is missing feature columns: {missing_in_test}")
X = train[feature_cols].copy()
# Select test columns by the training feature list so both frames share the
# same feature names in the same order (a mismatch would otherwise only
# surface at prediction time, after training).
X_test = test[feature_cols].copy()

# --- Feature Engineering (match main/catboost) ---
# Every derived column is added to the train and test frames in the same
# order. Availability checks look at the training frame only.
print('[INFO] Feature engineering...')
frames = (X, X_test)

# Binary elevation flag; the cut-off is the median elevation of the TEST frame.
elev_threshold = X_test['Elevation'].median()
for df in frames:
    df['is_high_elevation'] = (df['Elevation'] > elev_threshold).astype(np.int8)

# Straight-line distance to hydrology and elevation relative to it.
hydro_h = 'Horizontal_Distance_To_Hydrology'
hydro_v = 'Vertical_Distance_To_Hydrology'
if {hydro_h, hydro_v} <= set(X.columns):
    for df in frames:
        df['Hydrology_Euclid'] = np.sqrt(df[hydro_h]**2 + df[hydro_v]**2)
        df['Elev_minus_VertHydro'] = df['Elevation'] - df[hydro_v]

# Row-wise summary statistics over the three hillshade readings.
hill_cols = [c for c in ['Hillshade_9am','Hillshade_Noon','Hillshade_3pm'] if c in X.columns]
if len(hill_cols) == 3:
    for df in frames:
        shade = df[hill_cols]
        df['Hillshade_Mean'] = shade.mean(axis=1)
        df['Hillshade_Min'] = shade.min(axis=1)
        df['Hillshade_Max'] = shade.max(axis=1)
        df['Hillshade_Range'] = df['Hillshade_Max'] - df['Hillshade_Min']

# Pairwise gaps and aggregates of the fire / road / hydrology distances.
dist_cols = ['Horizontal_Distance_To_Fire_Points','Horizontal_Distance_To_Roadways','Horizontal_Distance_To_Hydrology']
if set(dist_cols).issubset(X.columns):
    hf, rr, hh = dist_cols
    for df in frames:
        fire, road, hydro = df[hf], df[rr], df[hh]
        df['DistDiff_Fire_Road'] = (fire - road).abs()
        df['DistDiff_Fire_Hydro'] = (fire - hydro).abs()
        df['DistDiff_Road_Hydro'] = (road - hydro).abs()
        dist_total = fire + road + hydro
        df['DistMean_FRH'] = dist_total / 3.0
        df['DistSum_FRH'] = dist_total
        dist_trio = df[dist_cols]
        df['DistMin_FRH'] = dist_trio.min(axis=1)
        df['DistMax_FRH'] = dist_trio.max(axis=1)

# Row-wise sums over the soil-type and wilderness-area columns. Both column
# lists are collected BEFORE the count columns exist, so the counts never
# include themselves.
# NOTE(review): the prefixes end with an underscore, so names such as
# 'Soil_Type1' would not match and the counts would be skipped silently -
# confirm against the train.csv header.
soil_cols = [c for c in X.columns if c.startswith('Soil_Type_')]
wild_cols = [c for c in X.columns if c.startswith('Wilderness_Area_')]
for count_name, indicator_cols in (
    ('Soil_Type_Count', soil_cols),
    ('Wilderness_Area_Count', wild_cols),
):
    if indicator_cols:
        for df in frames:
            df[count_name] = df[indicator_cols].sum(axis=1)

# Sine / cosine encoding of the aspect, which is converted from degrees.
if 'Aspect' in X.columns:
    for df in frames:
        aspect_rad = np.deg2rad(df['Aspect'])
        df['Aspect_sin'] = np.sin(aspect_rad)
        df['Aspect_cos'] = np.cos(aspect_rad)

features = list(X.columns)
print(f"[INFO] Final feature count: {len(features)}")

# Load consistent folds
# The fold file is reused when present and created (then saved) otherwise.
fold_file = 'fold_indices.npy'
if os.path.exists(fold_file):
    # NOTE: allow_pickle=True unpickles arbitrary objects - only load a fold
    # file that this project wrote itself.
    folds = np.load(fold_file, allow_pickle=True).tolist()
    # A stale fold file (different split count or dataset size) would
    # silently corrupt the out-of-fold matrix and the test averaging, so fail
    # loudly instead.
    if len(folds) != N_SPLITS:
        raise ValueError(
            f"{fold_file} holds {len(folds)} folds but N_SPLITS={N_SPLITS}; "
            "delete the file or fix N_SPLITS"
        )
    val_rows = np.sort(np.concatenate(
        [np.asarray(val_idx, dtype=np.int64) for _, val_idx in folds]
    ))
    if not np.array_equal(val_rows, np.arange(len(X))):
        raise ValueError(
            f"validation indices in {fold_file} do not cover each of the "
            f"{len(X)} training rows exactly once"
        )
else:
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    folds = list(skf.split(X, y))
    # Train / validation index arrays have different lengths, hence dtype=object.
    np.save(fold_file, np.array(folds, dtype=object))
    print(f"[INFO] Saved fold indices to {fold_file}")

# XGBoost params with GPU fallback
def _gpu_hist_available() -> bool:
    """Return True if one boosting round with ``tree_method='gpu_hist'`` succeeds.

    The probe trains on a 4x1 all-zero matrix, so it is cheap; any
    ``XGBoostError`` raised by the training call is treated as
    "GPU training not usable here".
    """
    probe = DMatrix(
        np.zeros((4, 1), dtype=np.float32),
        label=np.array([0, 1, 0, 1], dtype=np.float32),
    )
    try:
        xgb_train({'tree_method': 'gpu_hist', 'verbosity': 0}, probe, num_boost_round=1)
    except xgb.core.XGBoostError:
        return False
    return True


tree_method = 'gpu_hist'
use_gpu = _gpu_hist_available()
if not use_gpu:
    print("[WARN] tree_method='gpu_hist' is not usable here; falling back to 'hist'.")
    tree_method = 'hist'

# Learning task: 'multi:softprob' objective over 7 classes, scored by mlogloss.
_task_params = {
    'objective': 'multi:softprob',
    'num_class': 7,
    'eval_metric': 'mlogloss',
}
# Booster settings: learning rate, tree depth, sampling and regularisation.
_booster_params = {
    'eta': 0.03,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'lambda': 2.0,
    'alpha': 0.1,
}
# Runtime settings: tree builder chosen above, threads, verbosity and seed.
_runtime_params = {
    'tree_method': tree_method,
    'nthread': 8,
    'verbosity': 1,
    'seed': SEED,
}
# Merged in this order so the key order is task, booster, runtime.
base_params = {**_task_params, **_booster_params, **_runtime_params}

# Fold and class counts come from the objects actually used below, so that
# the averaging and array shapes stay correct if `folds` was loaded from a
# file or `num_class` changes.
n_folds = len(folds)
n_classes = base_params['num_class']

# Out-of-fold class probabilities for every training row, the fold-averaged
# test probabilities, and the per-fold validation accuracies.
oof_preds = np.zeros((X.shape[0], n_classes), dtype=np.float32)
test_preds = np.zeros((X_test.shape[0], n_classes), dtype=np.float32)
fold_acc = []

print(f"[INFO] Starting XGBoost {n_folds}-fold CV... (tree_method={tree_method})")
for fold, (trn_idx, val_idx) in enumerate(folds, 1):
    f_t = time.time()
    print(f"[FOLD {fold}/{n_folds}] Train: {len(trn_idx)}, Valid: {len(val_idx)}")
    X_trn = X.iloc[trn_idx]
    y_trn = y[trn_idx]
    X_val = X.iloc[val_idx]
    y_val = y[val_idx]

    dtrain = DMatrix(X_trn, label=y_trn)
    dvalid = DMatrix(X_val, label=y_val)
    # Rebuilt every fold so it can be released together with the other
    # per-fold objects at the end of the iteration.
    dtest = DMatrix(X_test)

    # 'valid' is listed last in `evals`, so early stopping monitors it.
    model = xgb_train(
        params=base_params,
        dtrain=dtrain,
        num_boost_round=5000,
        evals=[(dtrain, 'train'), (dvalid, 'valid')],
        early_stopping_rounds=200,
        verbose_eval=100
    )

    # Predict with the trees up to and including the best iteration only.
    best_range = (0, model.best_iteration + 1)
    val_pred_proba = model.predict(dvalid, iteration_range=best_range)
    oof_preds[val_idx] = val_pred_proba
    val_pred = np.argmax(val_pred_proba, axis=1)
    acc = accuracy_score(y_val, val_pred)
    fold_acc.append(acc)
    print(f"[FOLD {fold}] ACC={acc:.6f} | best_iter={model.best_iteration} | elapsed={time.time()-f_t:.1f}s")

    test_fold_pred = model.predict(dtest, iteration_range=best_range)
    # Equal-weight average over the folds actually iterated.
    test_preds += test_fold_pred / n_folds

    # Drop per-fold objects before the next fold allocates its own.
    del X_trn, X_val, y_trn, y_val, dtrain, dvalid, dtest, model, val_pred_proba, test_fold_pred
    gc.collect()

# Accuracy of the stitched out-of-fold predictions next to the mean of the
# per-fold accuracies.
oof_pred_labels = oof_preds.argmax(axis=1)
cv_acc = accuracy_score(y, oof_pred_labels)
mean_fold_acc = np.mean(fold_acc)
print(f"[CV] XGBoost Mean ACC: {mean_fold_acc:.6f}; OOF ACC: {cv_acc:.6f}")

# Save both probability matrices as .npy files.
for out_path, proba in (
    ('xgb_oof_preds.npy', oof_preds),
    ('xgb_test_preds.npy', test_preds),
):
    np.save(out_path, proba)
print('[INFO] Saved xgb_oof_preds.npy and xgb_test_preds.npy')

# Submission: most probable class per test row, shifted back up by one to
# undo the label shift applied to the target. Row positions stand in for ids
# when the test file has no 'Id' column.
if 'Id' in test.columns:
    submission_ids = test['Id'].values
else:
    submission_ids = np.arange(len(test))
sub = pd.DataFrame({
    'Id': submission_ids,
    'Cover_Type': test_preds.argmax(axis=1) + 1,
})
sub.to_csv('submission_xgb.csv', index=False)
print('[INFO] Saved submission_xgb.csv')
print(f"[DONE] Total elapsed: {time.time()-t0:.1f}s")