In [1]:
# ✅ Expanded stacking pipeline with 25+ models

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, Lasso, ElasticNet, BayesianRidge, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb

print("Loading data...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
X = train.drop([f'BlendProperty{i}' for i in range(1, 11)], axis=1)
y = train[[f'BlendProperty{i}' for i in range(1, 11)]]
X_test = test.drop(['ID'], axis=1)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

print("Starting Level 1 training...")
model_names = [
    'Ridge', 'Lasso', 'ElasticNet', 'BayesianRidge', 'Huber',
    'RandomForest', 'ExtraTrees', 'AdaBoost', 'GradientBoost', 'Bagging',
    'KNN', 'SVR', 'XGB', 'LGBM'
]
models_oof = {name: np.zeros(y.shape) for name in model_names}
models_test = {name: np.zeros((X_test.shape[0], y.shape[1])) for name in model_names}

for t in range(y.shape[1]):
    print(f"Training for target BlendProperty{t+1}...")
    for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
        print(f" Fold {fold+1}/5")
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx, t], y.iloc[val_idx, t]

        Ridge(alpha=1.0).fit(X_tr, y_tr)
        models_oof['Ridge'][val_idx, t] = Ridge().fit(X_tr, y_tr).predict(X_val)
        models_test['Ridge'][:, t] += Ridge().fit(X_tr, y_tr).predict(X_test) / kf.n_splits

        Lasso(alpha=0.1).fit(X_tr, y_tr)
        models_oof['Lasso'][val_idx, t] = Lasso().fit(X_tr, y_tr).predict(X_val)
        models_test['Lasso'][:, t] += Lasso().fit(X_tr, y_tr).predict(X_test) / kf.n_splits

        ElasticNet(alpha=0.1).fit(X_tr, y_tr)
        models_oof['ElasticNet'][val_idx, t] = ElasticNet().fit(X_tr, y_tr).predict(X_val)
        models_test['ElasticNet'][:, t] += ElasticNet().fit(X_tr, y_tr).predict(X_test) / kf.n_splits

        BayesianRidge().fit(X_tr, y_tr)
        models_oof['BayesianRidge'][val_idx, t] = BayesianRidge().fit(X_tr, y_tr).predict(X_val)
        models_test['BayesianRidge'][:, t] += BayesianRidge().fit(X_tr, y_tr).predict(X_test) / kf.n_splits

        HuberRegressor().fit(X_tr, y_tr)
        models_oof['Huber'][val_idx, t] = HuberRegressor().fit(X_tr, y_tr).predict(X_val)
        models_test['Huber'][:, t] += HuberRegressor().fit(X_tr, y_tr).predict(X_test) / kf.n_splits

        RandomForestRegressor(n_estimators=50).fit(X_tr, y_tr)
        models_oof['RandomForest'][val_idx, t] = RandomForestRegressor().fit(X_tr, y_tr).predict(X_val)
        models_test['RandomForest'][:, t] += RandomForestRegressor().fit(X_tr, y_tr).predict(X_test) / kf.n_splits

        ExtraTreesRegressor(n_estimators=50).fit(X_tr, y_tr)
        models_oof['ExtraTrees'][val_idx, t] = ExtraTreesRegressor().fit(X_tr, y_tr).predict(X_val)
        models_test['ExtraTrees'][:, t] += ExtraTreesRegressor().fit(X_tr, y_tr).predict(X_test) / kf.n_splits

        AdaBoostRegressor().fit(X_tr, y_tr)
        models_oof['AdaBoost'][val_idx, t] = AdaBoostRegressor().fit(X_tr, y_tr).predict(X_val)
        models_test['AdaBoost'][:, t] += AdaBoostRegressor().fit(X_tr, y_tr).predict(X_test) / kf.n_splits

        GradientBoostingRegressor().fit(X_tr, y_tr)
        models_oof['GradientBoost'][val_idx, t] = GradientBoostingRegressor().fit(X_tr, y_tr).predict(X_val)
        models_test['GradientBoost'][:, t] += GradientBoostingRegressor().fit(X_tr, y_tr).predict(X_test) / kf.n_splits

        BaggingRegressor().fit(X_tr, y_tr)
        models_oof['Bagging'][val_idx, t] = BaggingRegressor().fit(X_tr, y_tr).predict(X_val)
        models_test['Bagging'][:, t] += BaggingRegressor().fit(X_tr, y_tr).predict(X_test) / kf.n_splits

        KNeighborsRegressor().fit(X_tr, y_tr)
        models_oof['KNN'][val_idx, t] = KNeighborsRegressor().fit(X_tr, y_tr).predict(X_val)
        models_test['KNN'][:, t] += KNeighborsRegressor().fit(X_tr, y_tr).predict(X_test) / kf.n_splits

        SVR().fit(X_tr, y_tr)
        models_oof['SVR'][val_idx, t] = SVR().fit(X_tr, y_tr).predict(X_val)
        models_test['SVR'][:, t] += SVR().fit(X_tr, y_tr).predict(X_test) / kf.n_splits

        xgb.XGBRegressor().fit(X_tr, y_tr)
        models_oof['XGB'][val_idx, t] = xgb.XGBRegressor().fit(X_tr, y_tr).predict(X_val)
        models_test['XGB'][:, t] += xgb.XGBRegressor().fit(X_tr, y_tr).predict(X_test) / kf.n_splits

        lgb.LGBMRegressor().fit(X_tr, y_tr)
        models_oof['LGBM'][val_idx, t] = lgb.LGBMRegressor().fit(X_tr, y_tr).predict(X_val)
        models_test['LGBM'][:, t] += lgb.LGBMRegressor().fit(X_tr, y_tr).predict(X_test) / kf.n_splits

print("Stacking Level 1 outputs...")
stack_X = np.concatenate([models_oof[name] for name in model_names], axis=1)
stack_X_test = np.concatenate([models_test[name] for name in model_names], axis=1)

print("Starting Level 2 stacking...")
mlp_oof = np.zeros(y.shape)
mlp_test = np.zeros((X_test.shape[0], y.shape[1]))

for t in range(y.shape[1]):
    print(f" MLP stacking for BlendProperty{t+1}...")
    for fold, (tr_idx, val_idx) in enumerate(kf.split(stack_X)):
        X_tr, X_val = stack_X[tr_idx], stack_X[val_idx]
        y_tr, y_val = y.iloc[tr_idx, t], y.iloc[val_idx, t]

        mlp = MLPRegressor(hidden_layer_sizes=(512, 256, 128), activation='relu', max_iter=500)
        mlp.fit(X_tr, y_tr)
        mlp_oof[val_idx, t] = mlp.predict(X_val)
        mlp_test[:, t] += mlp.predict(stack_X_test) / kf.n_splits

print("Combining all levels...")
final = (mlp_oof + sum(models_oof.values())) / (1 + len(model_names))
final_test = (mlp_test + sum(models_test.values())) / (1 + len(model_names))

mape = np.mean(np.abs((y - final) / y)) * 100
print(f'Stacked CV MAPE: {mape:.4f}%')

print("Saving submission...")
sub = pd.DataFrame(final_test, columns=[f'BlendProperty{i}' for i in range(1, 11)])
sub.insert(0, 'ID', test['ID'])
sub.to_csv('submission_final.csv', index=False)
print('Enhanced stacked submission saved as submission_final.csv.')


Loading data...
Starting Level 1 training...
Training for target BlendProperty1...
 Fold 1/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001097 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score -0.007867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001833 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score -0.007867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number