In [12]:
# Library Setup
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from datetime import datetime
import time

# Data Setup
train = pd.read_csv('/workspace/competitions/playground-series-s5e12/data/input/train.csv')
test = pd.read_csv('/workspace/competitions/playground-series-s5e12/data/input/test.csv')

# X,y Setup
X = train.drop("diagnosed_diabetes", axis=1)
y = train["diagnosed_diabetes"]

In [None]:
# --- Feature Engineering based on Chris Deotte's 1st Place Solution ---

def create_groupby_features(train, test, cat_cols, num_cols):
    """
    Create groupby aggregation features: groupby(COL1)[COL2].agg(STAT)
    """
    print("Creating groupby aggregation features...")
    
    # Combine train and test for groupby calculations
    train['is_train'] = 1
    test['is_train'] = 0
    df = pd.concat([train, test], ignore_index=True)
    
    stats = ['mean', 'std', 'count', 'min', 'max', 'nunique']
    
    for cat_col in cat_cols:
        for num_col in num_cols:
            for stat in stats:
                try:
                    grouped = df.groupby(cat_col)[num_col].agg(stat)
                    new_col = f'{cat_col}_{num_col}_{stat}'
                    train[new_col] = train[cat_col].map(grouped)
                    test[new_col] = test[cat_col].map(grouped)
                except:
                    pass
    
    # Remove temporary column
    train = train.drop('is_train', axis=1)
    test = test.drop('is_train', axis=1)
    
    return train, test


def create_histogram_binning_features(train, test, cat_cols, target_col='diagnosed_diabetes', n_bins=10):
    """
    Create histogram binning features: groupby(COL1)[target].agg(HISTOGRAM BINS)
    """
    print("Creating histogram binning features...")
    
    # Combine train and test
    train['is_train'] = 1
    test['is_train'] = 0
    df = pd.concat([train, test], ignore_index=True)
    
    for cat_col in cat_cols:
        try:
            # Get target values for each category
            grouped = df.groupby(cat_col)[target_col]
            
            # Create bins for each group
            for bin_idx in range(n_bins):
                def get_bin_count(group):
                    if len(group) == 0:
                        return 0
                    hist, _ = np.histogram(group, bins=n_bins, range=(0, 1))
                    return hist[bin_idx] if bin_idx < len(hist) else 0
                
                bin_counts = grouped.agg(get_bin_count)
                new_col = f'{cat_col}_{target_col}_hist_bin_{bin_idx}'
                train[new_col] = train[cat_col].map(bin_counts).fillna(0)
                test[new_col] = test[cat_col].map(bin_counts).fillna(0)
        except:
            pass
    
    train = train.drop('is_train', axis=1)
    test = test.drop('is_train', axis=1)
    
    return train, test


def create_quantile_features(train, test, cat_cols, num_cols, quantiles=[5, 10, 40, 45, 55, 60, 90, 95]):
    """
    Create quantile features: groupby(COL1)[COL2].agg(QUANTILES)
    """
    print("Creating quantile features...")
    
    train['is_train'] = 1
    test['is_train'] = 0
    df = pd.concat([train, test], ignore_index=True)
    
    for cat_col in cat_cols:
        for num_col in num_cols:
            try:
                grouped = df.groupby(cat_col)[num_col]
                for q in quantiles:
                    quantile_vals = grouped.quantile(q / 100.0)
                    new_col = f'{cat_col}_{num_col}_q{q}'
                    train[new_col] = train[cat_col].map(quantile_vals).fillna(0)
                    test[new_col] = test[cat_col].map(quantile_vals).fillna(0)
            except:
                pass
    
    train = train.drop('is_train', axis=1)
    test = test.drop('is_train', axis=1)
    
    return train, test


def create_nan_combination_features(train, test, cols_with_nan):
    """
    Create a single base-2 column from all NANs across multiple columns
    """
    print("Creating NAN combination features...")
    
    # Create binary representation of NAN pattern
    train_nan = train[cols_with_nan].isna().astype(int)
    test_nan = test[cols_with_nan].isna().astype(int)
    
    # Convert to base-2 number
    train['nan_pattern'] = 0
    test['nan_pattern'] = 0
    
    for i, col in enumerate(cols_with_nan):
        train['nan_pattern'] += train_nan[col] * (2 ** i)
        test['nan_pattern'] += test_nan[col] * (2 ** i)
    
    return train, test


def create_binning_features(train, test, num_cols, bins=[1, 2, 5, 10, 20, 50, 100]):
    """
    Put numerical columns into bins using rounding
    """
    print("Creating binning features...")
    
    for num_col in num_cols:
        for bin_size in bins:
            try:
                new_col = f'{num_col}_bin_{bin_size}'
                train[new_col] = (train[num_col] / bin_size).round() * bin_size
                test[new_col] = (test[num_col] / bin_size).round() * bin_size
            except:
                pass
    
    return train, test


def create_digit_extraction_features(train, test, num_cols, n_digits=5):
    """
    Extract digits from float32 numbers
    """
    print("Creating digit extraction features...")
    
    for num_col in num_cols:
        for digit_pos in range(1, n_digits + 1):
            try:
                # Extract digit at position
                new_col = f'{num_col}_digit_{digit_pos}'
                train[new_col] = (train[num_col] * (10 ** digit_pos)).astype(int) % 10
                test[new_col] = (test[num_col] * (10 ** digit_pos)).astype(int) % 10
            except:
                pass
    
    return train, test


def create_categorical_combinations(train, test, cat_cols, max_combinations=28):
    """
    Create combinations of categorical columns
    """
    print("Creating categorical combinations...")
    
    from itertools import combinations
    
    comb_count = 0
    for r in range(2, min(len(cat_cols) + 1, 4)):  # 2 or 3 column combinations
        for combo in combinations(cat_cols, r):
            if comb_count >= max_combinations:
                break
            try:
                new_col = '_'.join(combo) + '_combined'
                train[new_col] = train[list(combo)].apply(lambda x: '_'.join(x.astype(str)), axis=1)
                test[new_col] = test[list(combo)].apply(lambda x: '_'.join(x.astype(str)), axis=1)
                comb_count += 1
            except:
                pass
        if comb_count >= max_combinations:
            break
    
    return train, test


def create_division_features(train, test, feature_cols):
    """
    Create division features from existing engineered features
    """
    print("Creating division features...")
    
    # Select some important features for division
    if len(feature_cols) >= 2:
        for i, col1 in enumerate(feature_cols[:10]):  # Limit to avoid too many features
            for col2 in feature_cols[i+1:min(i+6, len(feature_cols))]:
                try:
                    new_col = f'{col1}_div_{col2}'
                    train[new_col] = train[col1] / (train[col2] + 1e-8)
                    test[new_col] = test[col1] / (test[col2] + 1e-8)
                except:
                    pass
    
    return train, test

In [13]:
# --- Feature Engineering (Chris Deotte Style) ---

# Identify column types
cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print(f"Original features: {len(X.columns)}")
print(f"Categorical columns: {len(cat_cols)}")
print(f"Numerical columns: {len(num_cols)}")

# Prepare dataframes for feature engineering
train_fe = train.copy()
test_fe = test.copy()

# 1. Basic categorical encoding
for col in cat_cols:
    train_fe[col] = train_fe[col].astype("category")
    test_fe[col] = test_fe[col].astype("category")
    X[col] = X[col].astype("category")
    test[col] = test[col].astype("category")

# 2. Groupby aggregation features (limited to avoid too many features)
print("\n" + "="*70)
print("Step 1: Groupby Aggregations")
print("="*70)
train_fe, test_fe = create_groupby_features(train_fe, test_fe, cat_cols[:5], num_cols[:5])  # Limit columns

# 3. Quantile features
print("\n" + "="*70)
print("Step 2: Quantile Features")
print("="*70)
train_fe, test_fe = create_quantile_features(train_fe, test_fe, cat_cols[:3], num_cols[:3])

# 4. Binning features
print("\n" + "="*70)
print("Step 3: Binning Features")
print("="*70)
train_fe, test_fe = create_binning_features(train_fe, test_fe, num_cols[:5], bins=[1, 2, 5, 10])

# 5. Digit extraction (for most important numerical column)
print("\n" + "="*70)
print("Step 4: Digit Extraction")
print("="*70)
if len(num_cols) > 0:
    train_fe, test_fe = create_digit_extraction_features(train_fe, test_fe, num_cols[:2], n_digits=3)

# 6. Categorical combinations
print("\n" + "="*70)
print("Step 5: Categorical Combinations")
print("="*70)
train_fe, test_fe = create_categorical_combinations(train_fe, test_fe, cat_cols, max_combinations=10)

# 7. NAN combination features
print("\n" + "="*70)
print("Step 6: NAN Combination Features")
print("="*70)
cols_with_nan = [col for col in train_fe.columns if train_fe[col].isna().any()]
if len(cols_with_nan) > 0:
    train_fe, test_fe = create_nan_combination_features(train_fe, test_fe, cols_with_nan[:10])

# Update X and test with new features
X = train_fe.drop("diagnosed_diabetes", axis=1)
test = test_fe.copy()

# Handle new categorical columns
new_cat_cols = X.select_dtypes(include="object").columns.tolist()
for col in new_cat_cols:
    if col not in cat_cols:  # New categorical columns from combinations
        X[col] = X[col].astype("category")
        test[col] = test[col].astype("category")

# Fill NaN values
X = X.fillna(0)
test = test.fillna(0)

print("\n" + "="*70)
print("Feature Engineering Complete!")
print("="*70)
print(f"Final features: {len(X.columns)}")
print(f"New features created: {len(X.columns) - len(num_cols) - len(cat_cols)}")
print("="*70 + "\n")

In [14]:
# CV Setup 
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

In [15]:
# Model Setup with Progress Tracking and AUC Monitoring
def train_model(model_type, X, y, test_data, params, cat_features=None):
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(test_data))
    models = []
    
    # Track AUC for train and validation
    train_aucs = []
    val_aucs = []
    
    print("=" * 70)
    print(f"--- Training {model_type.upper()} ---")
    print("=" * 70)
    
    fold_times = []
    start_time = time.time()
    
    for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y), 1):
        fold_start = time.time()
        print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Fold {fold}/{FOLDS} - Starting...")
        
        X_tr, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[trn_idx], y.iloc[val_idx]
        
        if model_type == 'lgb':
            # LGBM handles categories natively if dtype is 'category'
            model = lgb.LGBMClassifier(**params)
            # Reduced stopping_rounds from 100 to 50 for faster training
            callbacks = [lgb.early_stopping(stopping_rounds=50, verbose=False), lgb.log_evaluation(0)]
            model.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_val, y_val)], callbacks=callbacks)
            
        elif model_type == 'xgb':
            # XGBoost needs enable_categorical=True for category dtypes
            model = xgb.XGBClassifier(**params, enable_categorical=True)
            model.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_val, y_val)], verbose=False)
        
        # Predictions for both train and validation
        train_preds = model.predict_proba(X_tr)[:, 1]
        val_preds = model.predict_proba(X_val)[:, 1]
        
        oof_preds[val_idx] = val_preds
        test_preds += model.predict_proba(test_data)[:, 1] / FOLDS
        models.append(model)
        
        # Calculate AUC for train and validation
        train_auc = roc_auc_score(y_tr, train_preds)
        val_auc = roc_auc_score(y_val, val_preds)
        train_aucs.append(train_auc)
        val_aucs.append(val_auc)
        
        # Progress tracking
        fold_time = time.time() - fold_start
        fold_times.append(fold_time)
        avg_time = np.mean(fold_times)
        remaining_folds = FOLDS - fold
        estimated_remaining = avg_time * remaining_folds
        
        elapsed_total = time.time() - start_time
        
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Fold {fold}/{FOLDS} - Completed!")
        print(f"  Fold Time: {fold_time:.1f}s")
        print(f"  Train AUC: {train_auc:.5f} | Val AUC: {val_auc:.5f} | Gap: {train_auc - val_auc:.5f}")
        print(f"  Progress: {fold}/{FOLDS} ({fold/FOLDS*100:.1f}%)")
        if remaining_folds > 0:
            print(f"  Estimated Remaining: {estimated_remaining/60:.1f} minutes")
        print(f"  Total Elapsed: {elapsed_total/60:.1f} minutes")
        
    score = roc_auc_score(y, oof_preds)
    total_time = time.time() - start_time
    
    print("\n" + "=" * 70)
    print(f"{model_type.upper()} Training Complete!")
    print(f"OOF AUC: {score:.5f}")
    print(f"Total Time: {total_time/60:.1f} minutes ({total_time:.1f} seconds)")
    print(f"Average Time per Fold: {np.mean(fold_times):.1f} seconds")
    print("\n" + "-" * 70)
    print("AUC Summary:")
    print("-" * 70)
    print(f"{'Fold':<6} {'Train AUC':<12} {'Val AUC':<12} {'Gap':<12}")
    print("-" * 70)
    for i in range(FOLDS):
        gap = train_aucs[i] - val_aucs[i]
        print(f"{i+1:<6} {train_aucs[i]:<12.5f} {val_aucs[i]:<12.5f} {gap:<12.5f}")
    print("-" * 70)
    print(f"{'Mean':<6} {np.mean(train_aucs):<12.5f} {np.mean(val_aucs):<12.5f} {np.mean(train_aucs) - np.mean(val_aucs):<12.5f}")
    print(f"{'Std':<6} {np.std(train_aucs):<12.5f} {np.std(val_aucs):<12.5f}")
    print("=" * 70 + "\n")
    
    return oof_preds, test_preds

In [16]:
# --- Configuration (Optimized for Speed) ---

# LightGBM: Optimized for faster training
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,  # Increased from 0.02 for faster convergence (2.5x faster)
    'num_leaves': 31,       # Slightly smaller leaves to prevent Overfitting
    'n_estimators': 2000,   # Reduced from 5000 (controlled by Early Stopping)
    'colsample_bytree': 0.7,
    'subsample': 0.7,
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': -1
}

# XGBoost: Optimized for faster training
xgb_params = {
    'n_estimators': 2000,   # Reduced from 5000
    'learning_rate': 0.05,  # Increased from 0.02 for faster convergence (2.5x faster)
    'max_depth': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'eval_metric': 'auc',
    'random_state': 42,
    'n_jobs': -1,
    'tree_method': 'hist',  # Faster training
    'early_stopping_rounds': 50  # Reduced from 100 for faster stopping
}

# --- Execution ---
oof_lgb, pred_lgb = train_model('lgb', X, y, test, lgb_params)
oof_xgb, pred_xgb = train_model('xgb', X, y, test, xgb_params)

--- Training LGB ---

[11:25:43] Fold 1/10 - Starting...
[11:26:35] Fold 1/10 - Completed!
  Fold Time: 51.3s
  Train AUC: 0.74519 | Val AUC: 0.72731 | Gap: 0.01787
  Progress: 1/10 (10.0%)
  Estimated Remaining: 7.7 minutes
  Total Elapsed: 0.9 minutes

[11:26:35] Fold 2/10 - Starting...
[11:27:31] Fold 2/10 - Completed!
  Fold Time: 56.4s
  Train AUC: 0.74461 | Val AUC: 0.72810 | Gap: 0.01650
  Progress: 2/10 (20.0%)
  Estimated Remaining: 7.2 minutes
  Total Elapsed: 1.8 minutes

[11:27:31] Fold 3/10 - Starting...
[11:28:49] Fold 3/10 - Completed!
  Fold Time: 78.3s
  Train AUC: 0.75633 | Val AUC: 0.72625 | Gap: 0.03008
  Progress: 3/10 (30.0%)
  Estimated Remaining: 7.2 minutes
  Total Elapsed: 3.1 minutes

[11:28:49] Fold 4/10 - Starting...
[11:30:00] Fold 4/10 - Completed!
  Fold Time: 70.2s
  Train AUC: 0.74880 | Val AUC: 0.72640 | Gap: 0.02240
  Progress: 4/10 (40.0%)
  Estimated Remaining: 6.4 minutes
  Total Elapsed: 4.3 minutes

[11:30:00] Fold 5/10 - Starting...
[11:31:23] 

In [18]:
# --- Ensemble: Find Best Combination and Create Submission ---

print("=" * 70)
print("Ensembling LightGBM and XGBoost")
print("=" * 70)

# Calculate OOF AUC for each model
lgb_oof_auc = roc_auc_score(y, oof_lgb)
xgb_oof_auc = roc_auc_score(y, oof_xgb)

print(f"\nIndividual Model Performance:")
print(f"  LightGBM OOF AUC: {lgb_oof_auc:.5f}")
print(f"  XGBoost OOF AUC:  {xgb_oof_auc:.5f}")

# Test different weight combinations to find the best
print(f"\n{'='*70}")
print("Testing Different Weight Combinations")
print(f"{'='*70}")

best_auc = max(lgb_oof_auc, xgb_oof_auc)
best_weights = None
best_method = "Single Model (Best Individual)"

# Test different weight combinations (more granular search)
weight_combinations = []
for lgb_w in np.arange(0.0, 1.01, 0.05):  # 0.0 to 1.0 in steps of 0.05
    xgb_w = 1.0 - lgb_w
    ensemble_oof_test = lgb_w * oof_lgb + xgb_w * oof_xgb
    ensemble_auc_test = roc_auc_score(y, ensemble_oof_test)
    weight_combinations.append({
        'lgb_weight': lgb_w,
        'xgb_weight': xgb_w,
        'auc': ensemble_auc_test
    })
    
    if ensemble_auc_test > best_auc:
        best_auc = ensemble_auc_test
        best_weights = (lgb_w, xgb_w)
        best_method = f"Ensemble (LGB: {lgb_w:.2f}, XGB: {xgb_w:.2f})"

# Sort by AUC to show top combinations
weight_combinations.sort(key=lambda x: x['auc'], reverse=True)

print(f"\nTop 5 Weight Combinations:")
print(f"{'Rank':<6} {'LGB Weight':<12} {'XGB Weight':<12} {'OOF AUC':<12}")
print("-" * 50)
for i, combo in enumerate(weight_combinations[:5], 1):
    marker = " <-- BEST" if combo['auc'] == best_auc else ""
    print(f"{i:<6} {combo['lgb_weight']:<12.2f} {combo['xgb_weight']:<12.2f} {combo['auc']:<12.5f}{marker}")

# Use the best combination for final predictions
if best_weights:
    final_pred = best_weights[0] * pred_lgb + best_weights[1] * pred_xgb
    final_oof = best_weights[0] * oof_lgb + best_weights[1] * oof_xgb
    final_auc = best_auc
else:
    # Use the best single model if no improvement
    if lgb_oof_auc > xgb_oof_auc:
        final_pred = pred_lgb
        final_oof = oof_lgb
        final_auc = lgb_oof_auc
        best_method = "LightGBM (Best Single)"
    else:
        final_pred = pred_xgb
        final_oof = oof_xgb
        final_auc = xgb_oof_auc
        best_method = "XGBoost (Best Single)"

print(f"\n{'='*70}")
print("Final Results")
print(f"{'='*70}")
print(f"Best Method: {best_method}")
print(f"Final OOF AUC: {final_auc:.5f}")
print(f"Best Single Model AUC: {max(lgb_oof_auc, xgb_oof_auc):.5f}")
if best_weights:
    print(f"Improvement: {final_auc - max(lgb_oof_auc, xgb_oof_auc):.5f}")
print(f"{'='*70}\n")

# Create submission file
import os
submission_dir = '/workspace/competitions/playground-series-s5e12/submissions'
os.makedirs(submission_dir, exist_ok=True)

submission = pd.DataFrame({
    'id': test['id'],
    'diagnosed_diabetes': final_pred
})

submission_path = os.path.join(submission_dir, 'ensemble_submission.csv')
submission.to_csv(submission_path, index=False)

print("=" * 70)
print("Submission File Created")
print("=" * 70)
print(f"File saved to: {submission_path}")
print(f"Shape: {submission.shape}")
print(f"Prediction range: [{submission['diagnosed_diabetes'].min():.5f}, {submission['diagnosed_diabetes'].max():.5f}]")
print(f"Mean prediction: {submission['diagnosed_diabetes'].mean():.5f}")
print(f"Method used: {best_method}")
print(f"OOF AUC: {final_auc:.5f}")
print("=" * 70)

Ensembling LightGBM and XGBoost

Individual Model Performance:
  LightGBM OOF AUC: 0.72755
  XGBoost OOF AUC:  0.72671

Testing Different Weight Combinations

Top 5 Weight Combinations:
Rank   LGB Weight   XGB Weight   OOF AUC     
--------------------------------------------------
1      0.65         0.35         0.72792      <-- BEST
2      0.60         0.40         0.72792     
3      0.70         0.30         0.72791     
4      0.55         0.45         0.72790     
5      0.75         0.25         0.72789     

Final Results
Best Method: Ensemble (LGB: 0.65, XGB: 0.35)
Final OOF AUC: 0.72792
Best Single Model AUC: 0.72755
Improvement: 0.00037

Submission File Created
File saved to: /workspace/competitions/playground-series-s5e12/submissions/ensemble_submission.csv
Shape: (300000, 2)
Prediction range: [0.06720, 0.99037]
Mean prediction: 0.62253
Method used: Ensemble (LGB: 0.65, XGB: 0.35)
OOF AUC: 0.72792
