In [8]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import joblib
from lifelines.utils import concordance_index
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings('ignore')

# ==============================
# Data Loading and Preprocessing
# ==============================

# Load datasets (all three files are indexed by their 'ID' column)
train = pd.read_csv('/content/train.csv', index_col='ID')
test = pd.read_csv('/content/test.csv', index_col='ID')
sub = pd.read_csv('/content/sample_submission.csv', index_col='ID')

# One-hot encode categorical features.
# Only the training frame drops the first level of each categorical column.
# `drop_first` chooses the dropped (reference) level from the values present
# in the frame being encoded, so applying it to `test` independently can drop
# a different level than it did for `train`. Rows holding that level would
# then be zero-filled by the alignment below and silently read as train's
# reference category. Encoding `test` in full avoids that; the left join
# discards the dummy columns that `train` does not have.
train = pd.get_dummies(train, drop_first=True)
test = pd.get_dummies(test)

# Align test to train's columns: columns only in test are dropped, columns
# only in train (including the targets) are added to test filled with 0.
train, test = train.align(test, join='left', axis=1, fill_value=0)

# Replace characters in column names that the boosting libraries reject
# ('[', ']', '<', '>' and ',') with underscores.
bad_name_chars = r'[\[\]<>,]'
train.columns = train.columns.str.replace(bad_name_chars, '_', regex=True)
test.columns = test.columns.str.replace(bad_name_chars, '_', regex=True)

# Target / label columns: never used as features, and removed from the test
# frame (where the alignment above created them as all-zero columns).
drop_cols = ['efs', 'efs_time', 'naf_label']
test = test.drop(columns=drop_cols, errors='ignore')

# ====================
# XGBoost Training
# ====================

# Native `xgb.train` takes the number of trees via `num_boost_round`;
# `n_estimators` belongs to the scikit-learn wrapper and is ignored here,
# so it is not part of the parameter dict.
xgb_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.01,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# Upper bound on boosting rounds for both libraries; early stopping on the
# validation fold normally ends training sooner.
NUM_BOOST_ROUND = 2500
EARLY_STOPPING_ROUNDS = 100

# Feature matrix and regression target shared by both models.
features = train.drop(columns=drop_cols, errors='ignore')
target = train['efs_time']

# The split is computed once and reused so XGBoost and LightGBM are trained
# and validated on exactly the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
folds = list(kf.split(features))

xgb_predictions = np.zeros(test.shape[0])
xgb_oof = np.zeros(train.shape[0])
xgb_models = []  # one booster per fold, kept so they can be persisted below

# The test matrix does not depend on the fold, so build it once.
dtest = xgb.DMatrix(test)

for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f"XGBoost Fold {fold + 1}")

    X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
    y_train, y_valid = target.iloc[train_idx], target.iloc[valid_idx]

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    # Early stopping monitors the last entry of `evals`, i.e. 'valid'.
    model = xgb.train(
        xgb_params, dtrain, num_boost_round=NUM_BOOST_ROUND,
        evals=[(dtrain, 'train'), (dvalid, 'valid')],
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose_eval=100
    )

    # `xgb.train` returns the booster from the last round, not the best one,
    # so restrict prediction to the trees up to the best validation round.
    best_rounds = (0, model.best_iteration + 1)
    xgb_oof[valid_idx] = model.predict(dvalid, iteration_range=best_rounds)
    xgb_predictions += model.predict(dtest, iteration_range=best_rounds) / kf.n_splits
    xgb_models.append(model)

# ==============
# LightGBM Model
# ==============

# `subsample` (bagging_fraction) only takes effect when bagging is enabled
# through a non-zero `subsample_freq` (bagging_freq); without it LightGBM
# trains on all rows every iteration. The number of trees is passed via
# `num_boost_round` rather than the `n_estimators` alias.
lgb_params = {
    'learning_rate': 0.01,
    'max_depth': -1,
    'num_leaves': 40,
    'subsample': 0.9,
    'subsample_freq': 1,
    'colsample_bytree': 0.9,
    'lambda_l1': 2,
    'lambda_l2': 5,
    'objective': 'regression',
    'metric': 'rmse',
    'seed': 42
}

lgb_predictions = np.zeros(test.shape[0])
lgb_oof = np.zeros(train.shape[0])
lgb_models = []  # one booster per fold, kept so they can be persisted below

for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f"LightGBM Fold {fold + 1}")

    X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
    y_train, y_valid = target.iloc[train_idx], target.iloc[valid_idx]

    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid, reference=dtrain)

    model = lgb.train(
        lgb_params,
        dtrain,
        num_boost_round=NUM_BOOST_ROUND,
        valid_sets=[dtrain, dvalid],
        valid_names=['train', 'valid'],
        callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS)]
    )

    # Booster.predict defaults to the best iteration when one was recorded.
    lgb_oof[valid_idx] = model.predict(X_valid)
    lgb_predictions += model.predict(test) / kf.n_splits
    lgb_models.append(model)

# ==================
# Model Evaluation
# ==================

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Compute RMSE for the out-of-fold (OOF) predictions
xgb_rmse = rmse(target, xgb_oof)
lgb_rmse = rmse(target, lgb_oof)
ensemble_oof = (xgb_oof + lgb_oof) / 2
ensemble_rmse = rmse(target, ensemble_oof)

print(f"XGBoost OOF RMSE: {xgb_rmse:.4f}")
print(f"LightGBM OOF RMSE: {lgb_rmse:.4f}")
print(f"Ensemble OOF RMSE: {ensemble_rmse:.4f}")

# Compute C-Index on the OOF ensemble.
# lifelines scores a pair as concordant when the larger predicted value goes
# with the longer observed time, which matches predicting `efs_time` directly.
# NOTE(review): if the competition scorer expects a *risk* score (higher =
# earlier event) or stratifies the index by group, this number and the sign
# of the submitted column will not match the leaderboard - confirm against
# the competition's metric definition.
c_index = concordance_index(target, ensemble_oof, event_observed=train['efs'])
print(f"Validation C-Index (Kaggle Metric): {c_index:.4f}")

# ========================
# Ensemble and Submission
# ========================

# Simple average ensemble
ensemble_predictions = (xgb_predictions + lgb_predictions) / 2

# Save predictions
sub['prediction'] = ensemble_predictions
sub.to_csv('submission.csv')
print("Submission file saved.")

# Save models: persist every fold's booster for both libraries together with
# the ensemble OOF RMSE (previously only that single float was written, so
# the file contained no model at all).
joblib.dump(
    {
        'xgb_models': xgb_models,
        'lgb_models': lgb_models,
        'ensemble_oof_rmse': ensemble_rmse,
    },
    "ensemble_models.pkl",
)
print("Training complete. Models saved.")


XGBoost Fold 1
[0]	train-rmse:24.71988	valid-rmse:24.93222
[100]	train-rmse:22.66098	valid-rmse:23.28225
[200]	train-rmse:21.84184	valid-rmse:22.83994
[300]	train-rmse:21.32428	valid-rmse:22.64229
[400]	train-rmse:20.94332	valid-rmse:22.54027
[500]	train-rmse:20.59936	valid-rmse:22.47052
[600]	train-rmse:20.27395	valid-rmse:22.42088
[700]	train-rmse:20.00177	valid-rmse:22.37906
[800]	train-rmse:19.73281	valid-rmse:22.35463
[900]	train-rmse:19.48731	valid-rmse:22.33267
[1000]	train-rmse:19.24639	valid-rmse:22.31992
[1100]	train-rmse:19.02005	valid-rmse:22.30254
[1200]	train-rmse:18.80832	valid-rmse:22.29848
[1300]	train-rmse:18.59787	valid-rmse:22.29437
[1400]	train-rmse:18.39955	valid-rmse:22.28865
[1500]	train-rmse:18.21450	valid-rmse:22.28816
[1534]	train-rmse:18.14826	valid-rmse:22.29044
XGBoost Fold 2
[0]	train-rmse:24.86602	valid-rmse:24.33690
[100]	train-rmse:22.75989	valid-rmse:22.87231
[200]	train-rmse:21.93385	valid-rmse:22.49350
[300]	train-rmse:21.42177	valid-rmse:22.34347
[