In [3]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor, VotingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

train_df.drop(columns=['uid'], errors='ignore', inplace=True)
test_df.drop(columns=['uid'], errors='ignore', inplace=True)

train_df = pd.get_dummies(train_df, columns=['day'], dummy_na=True)
test_df = pd.get_dummies(test_df, columns=['day'], dummy_na=True)

X_train = train_df.drop(columns=['output_electricity_generation'], errors='ignore')
y_train = train_df['output_electricity_generation']
X_test = test_df.copy()

for col in set(X_train.columns) - set(X_test.columns):
    X_test[col] = 0
X_test = X_test[X_train.columns]

imputer = SimpleImputer(strategy='median')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

lgb_model = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.03, max_depth=7, subsample=0.8, colsample_bytree=0.8, random_state=42)
xgb_model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.03, max_depth=7, subsample=0.8, colsample_bytree=0.8, random_state=42, tree_method='hist')
cat_model = CatBoostRegressor(n_estimators=1000, learning_rate=0.03, depth=7, random_state=42, verbose=0)
rf_model = RandomForestRegressor(n_estimators=500, max_depth=10, random_state=42)
ext_model = ExtraTreesRegressor(n_estimators=500, max_depth=10, random_state=42)

meta_learner = Ridge(alpha=1.0)

stacked_model = StackingRegressor(
    estimators=[('lgb', lgb_model), ('xgb', xgb_model), ('cat', cat_model), ('rf', rf_model), ('ext', ext_model)],
    final_estimator=meta_learner, n_jobs=-1
)

voting_model = VotingRegressor([('lgb', lgb_model), ('xgb', xgb_model), ('cat', cat_model), ('rf', rf_model), ('ext', ext_model)])

rmse_scores = {}

for model, name in zip([stacked_model, voting_model], ["Stacking", "Voting"]):
    model.fit(X_train_split, y_train_split)
    y_pred = model.predict(X_val)
    rmse_scores[name] = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f"{name} RMSE: {rmse_scores[name]}")

base_models = [lgb_model, xgb_model, cat_model, rf_model, ext_model]
for base in base_models:
    base.fit(X_train_split, y_train_split)
y_train_blend = np.column_stack([model.predict(X_val) for model in base_models])
meta_learner.fit(y_train_blend, y_val)
y_pred_blend = meta_learner.predict(y_train_blend)
rmse_scores["Blending"] = np.sqrt(mean_squared_error(y_val, y_pred_blend))
print(f"Blending RMSE: {rmse_scores['Blending']}")

best_model = min(rmse_scores, key=rmse_scores.get)
print(f"Best Model: {best_model} with RMSE: {rmse_scores[best_model]}")

Stacking RMSE: 4.172440638526773
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003279 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2385
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 14
[LightGBM] [Info] Start training from score 832.300201
Voting RMSE: 3.713324832023432
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2385
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 14
[LightGBM] [Info] Start training from score 832.300201
Blending RMSE: 3.0553029782303596
Best Model: Blending with RMSE: 3.0553029782303596
