In [10]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, Lasso, ElasticNet, BayesianRidge, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb
import optuna
import torch
import torch.nn as nn

# Optional dependencies: TabPFN and TabM are imported on a best-effort basis.
# Each import sets a *_AVAILABLE flag that the rest of the script checks
# before adding the corresponding model to the Level 1 roster.
try:
    from tabpfn import TabPFNRegressor
    TABPFN_AVAILABLE = True
except ImportError:
    # A missing package is not fatal: print an install hint and carry on.
    print("⚠️  TabPFN not available. Install with: pip install tabpfn")
    TABPFN_AVAILABLE = False

# TabM: same pattern, but success is also reported on stdout.
try:
    from tabm import TabM
    TABM_AVAILABLE = True
    print("✅ TabM imported successfully")
except ImportError:
    # A missing package is not fatal: print an install hint and carry on.
    print("⚠️  TabM not available. Install from GitHub: pip install git+https://github.com/yandex-research/tabm.git")
    TABM_AVAILABLE = False

# ---- Data loading ----------------------------------------------------------
print("Loading data...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# The regression targets are the columns BlendProperty1 .. BlendProperty10;
# every remaining training column is treated as a feature.
target_columns = [f'BlendProperty{i}' for i in range(1, 11)]
y = train[target_columns]
X = train.drop(target_columns, axis=1)
X_test = test.drop(['ID'], axis=1)

# A single shuffled 5-fold splitter with a fixed seed, so every call to
# kf.split(...) on data of the same length yields the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

print("🚀 Starting Level 1 training...")

# ---- Level 1 roster --------------------------------------------------------
# XGBoost always comes first; TabM and TabPFN are added only if their imports
# succeeded; the remaining tree ensembles are appended unconditionally.
optional_level1 = [('TabM', TABM_AVAILABLE), ('TabPFN', TABPFN_AVAILABLE)]
level1_names = ['XGB']
level1_names += [model_name for model_name, available in optional_level1 if available]
level1_names += ['LGBM', 'RandomForest', 'ExtraTrees']

print(f"📊 Level 1 models: {', '.join(level1_names)}")

# Per-model buffers: out-of-fold predictions (train rows x targets) and
# fold-averaged test predictions (test rows x targets), both zero-initialised.
n_train_rows, n_targets = y.shape
n_test_rows = X_test.shape[0]
level1_oof = {}
level1_test = {}
for model_name in level1_names:
    level1_oof[model_name] = np.zeros((n_train_rows, n_targets))
    level1_test[model_name] = np.zeros((n_test_rows, n_targets))

# Torch device used by TabM: GPU when one is visible, otherwise CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device for TabM: {device}")

def _tabm_heads(preds):
    """Reshape a TabM output to ``(batch, n_heads)``.

    TabM is documented to return one prediction per ensemble member, shaped
    ``(batch, k, d_out)``; with ``d_out=1`` (as used below) this collapses to
    ``(batch, k)``. 1-D and 2-D outputs are handled too, so the helper does
    not depend on the exact shape the installed version returns.
    """
    return preds.reshape(preds.shape[0], -1)


def _tabm_training_loss(criterion, preds, target):
    """Return ``criterion`` averaged over every ensemble head.

    The 1-D ``target`` is expanded to the ``(batch, n_heads)`` shape of the
    predictions so each head is trained against the same target. Reducing the
    heads first and comparing a ``(batch, 1)`` tensor with a ``(batch,)``
    target would make ``MSELoss`` broadcast to ``(batch, batch)`` and compare
    every prediction with every target.
    """
    heads = _tabm_heads(preds)
    return criterion(heads, target.unsqueeze(1).expand_as(heads))


def _tabm_predict(model, features):
    """Return one prediction per row: the mean over TabM's ensemble heads."""
    return _tabm_heads(model(features)).mean(dim=1)


# Level 1: for every target and every fold, fit each base model on the training
# part, store its predictions for the held-out part (out-of-fold matrix) and
# accumulate its test predictions averaged over the folds.
for t in range(y.shape[1]):
    print(f"🎯 Training Level 1 for target BlendProperty{t+1}...")
    for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
        print(f"  Fold {fold+1}/5")
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx, t], y.iloc[val_idx, t]

        # XGBoost - ALWAYS included with optimized parameters
        xgb_model = xgb.XGBRegressor(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbosity=0,
            n_jobs=-1
        )
        xgb_model.fit(X_tr, y_tr)
        level1_oof['XGB'][val_idx, t] = xgb_model.predict(X_val)
        level1_test['XGB'][:, t] += xgb_model.predict(X_test) / kf.n_splits

        # LightGBM with optimized parameters
        lgb_model = lgb.LGBMRegressor(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
            n_jobs=-1
        )
        lgb_model.fit(X_tr, y_tr)
        level1_oof['LGBM'][val_idx, t] = lgb_model.predict(X_val)
        level1_test['LGBM'][:, t] += lgb_model.predict(X_test) / kf.n_splits

        # RandomForest with increased estimators
        rf = RandomForestRegressor(
            n_estimators=200,
            max_depth=10,
            random_state=42,
            n_jobs=-1
        )
        rf.fit(X_tr, y_tr)
        level1_oof['RandomForest'][val_idx, t] = rf.predict(X_val)
        level1_test['RandomForest'][:, t] += rf.predict(X_test) / kf.n_splits

        # ExtraTrees with increased estimators
        et = ExtraTreesRegressor(
            n_estimators=200,
            max_depth=10,
            random_state=42,
            n_jobs=-1
        )
        et.fit(X_tr, y_tr)
        level1_oof['ExtraTrees'][val_idx, t] = et.predict(X_val)
        level1_test['ExtraTrees'][:, t] += et.predict(X_test) / kf.n_splits

        # TabM (if available and not already dropped after an earlier failure)
        if 'TabM' in level1_names and TABM_AVAILABLE:
            try:
                # Standardise features and target on the training part only,
                # then apply the same transform to validation and test rows.
                scaler_X = StandardScaler()
                scaler_y = StandardScaler()

                X_tr_scaled = scaler_X.fit_transform(X_tr.values)
                y_tr_scaled = scaler_y.fit_transform(y_tr.values.reshape(-1, 1)).flatten()
                X_val_scaled = scaler_X.transform(X_val.values)
                X_test_scaled = scaler_X.transform(X_test.values)

                # Fresh TabM model per fold/target
                tabm_model = TabM.make(
                    n_num_features=X_tr_scaled.shape[1],
                    cat_cardinalities=None,
                    d_out=1,  # Single target for regression
                    k=32  # Ensemble size
                )
                tabm_model.to(device)

                # Convert to tensors
                X_tr_tensor = torch.tensor(X_tr_scaled, dtype=torch.float32).to(device)
                y_tr_tensor = torch.tensor(y_tr_scaled, dtype=torch.float32).to(device)
                X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
                X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

                optimizer = torch.optim.AdamW(tabm_model.parameters(), lr=1e-3, weight_decay=1e-4)
                criterion = nn.MSELoss()

                # Full-batch training for a fixed, small number of epochs.
                tabm_model.train()
                for epoch in range(50):  # Limited epochs for efficiency
                    optimizer.zero_grad()
                    # Every head is trained against the target; shapes are
                    # aligned explicitly so MSELoss never broadcasts.
                    loss = _tabm_training_loss(criterion, tabm_model(X_tr_tensor), y_tr_tensor)
                    loss.backward()
                    optimizer.step()

                # Prediction: average the ensemble heads into one value per row.
                tabm_model.eval()
                with torch.no_grad():
                    val_preds = _tabm_predict(tabm_model, X_val_tensor)
                    test_preds = _tabm_predict(tabm_model, X_test_tensor)

                # Map predictions back from standardised to original target units.
                val_preds_scaled = scaler_y.inverse_transform(val_preds.cpu().numpy().reshape(-1, 1)).flatten()
                test_preds_scaled = scaler_y.inverse_transform(test_preds.cpu().numpy().reshape(-1, 1)).flatten()

                level1_oof['TabM'][val_idx, t] = val_preds_scaled
                level1_test['TabM'][:, t] += test_preds_scaled / kf.n_splits

                print(f"    TabM completed successfully for fold {fold+1}")

            except Exception as e:
                # Best effort: a failing TabM must not abort the whole run.
                print(f"    Error with TabM: {e}")
                # Dropping the name disables TabM for all remaining folds and
                # targets and excludes its partial predictions from stacking.
                if 'TabM' in level1_names:
                    level1_names.remove('TabM')
                    print(f"    TabM removed from ensemble due to error")

        # TabPFN (if available and not already dropped after an earlier failure)
        if 'TabPFN' in level1_names and TABPFN_AVAILABLE:
            try:
                tabpfn = TabPFNRegressor(device='cuda' if torch.cuda.is_available() else 'cpu')
                tabpfn.fit(X_tr, y_tr.values)
                level1_oof['TabPFN'][val_idx, t] = tabpfn.predict(X_val)
                level1_test['TabPFN'][:, t] += tabpfn.predict(X_test) / kf.n_splits
            except Exception as e:
                # Best effort, same policy as TabM: report and drop the model.
                print(f"    Error with TabPFN: {e}")
                if 'TabPFN' in level1_names:
                    level1_names.remove('TabPFN')

# Keep a Level 1 model only if it is still registered (models that raised
# during training were removed from level1_names) and its out-of-fold
# matrix does not sum to zero, i.e. it was actually filled in.
surviving_names = []
for name in level1_names:
    if name not in level1_oof:
        continue
    if level1_oof[name].sum() == 0:
        continue
    surviving_names.append(name)
level1_names = surviving_names
level1_oof = {name: level1_oof[name] for name in level1_names}
level1_test = {name: level1_test[name] for name in level1_names}

# Report the out-of-fold MAPE of every surviving base model.
print(f"\n📊 Level 1 MAPE Scores (using {len(level1_names)} models):")
for name in level1_names:
    mape = mean_absolute_percentage_error(y, level1_oof[name])
    print(f"  {name}: {mape:.6f}")

# Level 2 features: the per-target predictions of all surviving models placed
# side by side (columns are grouped by model, in level1_names order).
print("\n🔄 Preparing Level 2 inputs from Level 1 outputs...")
oof_blocks = [level1_oof[name] for name in level1_names]
test_blocks = [level1_test[name] for name in level1_names]
stack_X = np.concatenate(oof_blocks, axis=1)
stack_X_test = np.concatenate(test_blocks, axis=1)

print("\n🚀 Starting Level 2 stacking (Traditional ML Models)...")

# Level 2 roster, grouped by family. The "_L2" suffix distinguishes the
# meta-learners from the Level 1 models of the same type.
level2_names = (
    ['Ridge', 'Lasso', 'ElasticNet', 'BayesianRidge', 'Huber']
    + ['RandomForest_L2', 'ExtraTrees_L2', 'AdaBoost', 'GradientBoost', 'Bagging']
    + ['KNN', 'SVR', 'XGB_L2', 'LGBM_L2']
)

# Zero-initialised buffers per meta-learner: out-of-fold predictions
# (same shape as y) and fold-averaged test predictions.
level2_oof = {}
level2_test = {}
for name in level2_names:
    level2_oof[name] = np.zeros(y.shape)
    level2_test[name] = np.zeros((X_test.shape[0], y.shape[1]))

# Constructors for the Level 2 meta-learners, keyed by the names used in
# level2_oof / level2_test. Each call builds a fresh, unfitted estimator, so
# no state is shared between folds or targets. Insertion order is the order in
# which the models are fitted inside every fold.
level2_builders = {
    'Ridge': lambda: Ridge(alpha=1.0),
    'Lasso': lambda: Lasso(alpha=0.1, max_iter=1000),
    'ElasticNet': lambda: ElasticNet(alpha=0.1, max_iter=1000),
    'BayesianRidge': lambda: BayesianRidge(),
    'Huber': lambda: HuberRegressor(),
    'RandomForest_L2': lambda: RandomForestRegressor(n_estimators=50, random_state=42),
    'ExtraTrees_L2': lambda: ExtraTreesRegressor(n_estimators=50, random_state=42),
    'AdaBoost': lambda: AdaBoostRegressor(random_state=42, n_estimators=50),
    'GradientBoost': lambda: GradientBoostingRegressor(random_state=42, n_estimators=100),
    'Bagging': lambda: BaggingRegressor(random_state=42, n_estimators=50),
    'KNN': lambda: KNeighborsRegressor(n_neighbors=5),
    'SVR': lambda: SVR(kernel='rbf', gamma='scale'),
    'XGB_L2': lambda: xgb.XGBRegressor(random_state=42, verbosity=0),
    'LGBM_L2': lambda: lgb.LGBMRegressor(random_state=42, verbose=-1),
}

# For every target and fold: fit each meta-learner on the training rows of the
# stacked Level 1 predictions, store its held-out predictions, and add its
# test predictions divided by the number of folds.
for t in range(y.shape[1]):
    print(f"🎯 Level 2 stacking for BlendProperty{t+1}...")
    for fold, (tr_idx, val_idx) in enumerate(kf.split(stack_X)):
        print(f"  Fold {fold+1}/5")
        X_tr, X_val = stack_X[tr_idx], stack_X[val_idx]
        y_tr, y_val = y.iloc[tr_idx, t], y.iloc[val_idx, t]

        for meta_name, build_model in level2_builders.items():
            meta_model = build_model()
            meta_model.fit(X_tr, y_tr)
            level2_oof[meta_name][val_idx, t] = meta_model.predict(X_val)
            level2_test[meta_name][:, t] += meta_model.predict(stack_X_test) / kf.n_splits

# Score every Level 2 model on its out-of-fold predictions.
print("\n📊 Level 2 MAPE Scores:")
level2_scores = {}
for name in level2_names:
    mape = mean_absolute_percentage_error(y, level2_oof[name])
    print(f"  {name}: {mape:.6f}")
    level2_scores[name] = mape

print("\n🔍 Starting Dynamic Level 3 optimization...")

# Keep the TOP_K best (lowest-MAPE) Level 2 models; if fewer than six models
# exist, keep them all. The sort is stable, so ties keep level2_names order.
TOP_K = min(6, len(level2_names))
sorted_models = sorted(level2_scores.items(), key=lambda item: item[1])
top_models = [model_name for model_name, _ in sorted_models[:TOP_K]]

print(f"\n🎯 Selected top {len(top_models)} models for Level 3:")
for i, model in enumerate(top_models):
    print(f"  {i+1}. {model}: {level2_scores[model]:.6f}")

# Side-by-side prediction matrices of the selected models.
selected_oof = [level2_oof[model] for model in top_models]
selected_test = [level2_test[model] for model in top_models]
level3_oof = np.concatenate(selected_oof, axis=1)
level3_test = np.concatenate(selected_test, axis=1)

def objective(trial):
    """Optuna objective: MAPE of a weighted blend of the selected models.

    One weight in [0, 1] is suggested per entry of ``top_models`` (parameter
    name ``w_<model>``). The weights are rescaled to sum to one and used to
    blend the out-of-fold predictions in ``level2_oof``; the blend is scored
    against ``y``.

    Returns:
        The blend's mean absolute percentage error, or ``inf`` when all
        suggested weights are zero and no blend can be formed.
    """
    raw_weights = {
        model: trial.suggest_float(f'w_{model}', 0.0, 1.0)
        for model in top_models
    }

    total_weight = sum(raw_weights.values())
    if total_weight == 0:
        # Degenerate trial: nothing to normalise, mark it as the worst score.
        return float('inf')

    # Normalise each weight, then blend the out-of-fold predictions.
    ensemble_pred = sum(
        (raw_weights[model] / total_weight) * level2_oof[model]
        for model in top_models
    )
    return mean_absolute_percentage_error(y, ensemble_pred)

# Optimize the blend weights with Optuna.
print("🔧 Optimizing ensemble weights...")
# Seed the sampler so the search is reproducible, in line with the fixed
# random_state=42 used by the splitter and the models. TPESampler is Optuna's
# default sampler, so only the seeding changes, not the search algorithm.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=200, show_progress_bar=True)

print(f"\n✅ Best MAPE: {study.best_value:.6f}")
print("🎯 Best weights:")
best_params = study.best_params
for param, value in best_params.items():
    print(f"  {param}: {value:.4f}")

# The study stores raw weights (keys are 'w_<model>'); rescale them to sum
# to one, as the objective does before blending.
total_weight = sum(best_params.values())
normalized_weights = {k: v/total_weight for k, v in best_params.items()}

print("\n📊 Normalized weights:")
for param, value in normalized_weights.items():
    print(f"  {param}: {value:.4f}")

# Final test predictions: weighted sum of the selected models' test predictions.
final_test = sum(normalized_weights[f'w_{model}'] * level2_test[model] 
                 for model in top_models)

# Same blend on the out-of-fold predictions, used for the validation score.
final_oof = sum(normalized_weights[f'w_{model}'] * level2_oof[model] 
                for model in top_models)

final_mape = mean_absolute_percentage_error(y, final_oof)
print(f"\n🎉 Final ensemble MAPE: {final_mape:.6f}")

# Per-target breakdown of the final out-of-fold MAPE.
print("\n📊 Individual target MAPE scores:")
per_target_mape = [
    mean_absolute_percentage_error(y.iloc[:, i], final_oof[:, i])
    for i in range(y.shape[1])
]
for i, target_mape in enumerate(per_target_mape):
    print(f"  BlendProperty{i+1}: {target_mape:.6f}")

# Build the submission: ID first, then the ten BlendProperty columns.
# The file name records how many Level 2 models were blended.
submission_columns = [f'BlendProperty{i}' for i in range(1, 11)]
submission = pd.DataFrame(final_test, columns=submission_columns)
submission.insert(0, 'ID', test['ID'])
submission_filename = f"submission_xgb_level1_ensemble_top{TOP_K}.csv"
submission.to_csv(submission_filename, index=False)
print(f"\n💾 Submission file saved as '{submission_filename}'")

# Recap of the three ensemble levels and the final score.
summary_lines = [
    f"\n🎯 Advanced Dynamic Ensemble Summary:",
    f"  Level 1: {len(level1_names)} strong models including XGBoost: {', '.join(level1_names)}",
    f"  Level 2: {len(level2_names)} traditional ML models stacking on Level 1 outputs",
    f"  Level 3: Dynamic optimization with top {len(top_models)} performers: {', '.join(top_models)}",
    f"  Final MAPE: {final_mape:.6f}",
    f"  Submission saved as: {submission_filename}",
]
for line in summary_lines:
    print(line)

# Install hints, printed only for the optional packages that failed to import.
print("\n📝 Installation Instructions:")
install_hints = [
    (TABM_AVAILABLE, "  For TabM: pip install git+https://github.com/yandex-research/tabm.git"),
    (TABPFN_AVAILABLE, "  For TabPFN: pip install tabpfn"),
]
for available, hint in install_hints:
    if not available:
        print(hint)

✅ TabM imported successfully
Loading data...
🚀 Starting Level 1 training...
📊 Level 1 models: XGB, TabM, TabPFN, LGBM, RandomForest, ExtraTrees
Using device for TabM: cuda
🎯 Training Level 1 for target BlendProperty1...
  Fold 1/5
    TabM completed successfully for fold 1
  Fold 2/5
    TabM completed successfully for fold 2
  Fold 3/5
    TabM completed successfully for fold 3
  Fold 4/5
    TabM completed successfully for fold 4
  Fold 5/5
    TabM completed successfully for fold 5
🎯 Training Level 1 for target BlendProperty2...
  Fold 1/5
    TabM completed successfully for fold 1
  Fold 2/5
    TabM completed successfully for fold 2
  Fold 3/5
    TabM completed successfully for fold 3
  Fold 4/5
    TabM completed successfully for fold 4
  Fold 5/5
    TabM completed successfully for fold 5
🎯 Training Level 1 for target BlendProperty3...
  Fold 1/5
    TabM completed successfully for fold 1
  Fold 2/5
    TabM completed successfully for fold 2
  Fold 3/5
    TabM completed succes

KeyboardInterrupt: 