<a href="https://colab.research.google.com/github/shohrux2000/Machine-Learning-II/blob/main/Gradient_Boosting_Models_Exercise_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**0. Clone repository**

In [83]:
!git clone https://github.com/shohrux200-ML/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates.git

fatal: destination path 'ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates' already exists and is not an empty directory.


In [13]:
!pip install catboost --quiet
!pip uninstall -y xgboostlss
!pip install xgboostlss --quiet

Found existing installation: xgboostlss 0.5.0
Uninstalling xgboostlss-0.5.0:
  Successfully uninstalled xgboostlss-0.5.0


#**1. Load dataset**

In [14]:
import numpy as np
import pandas as pd
import joblib
import xgboost as xgb

from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV

from xgboostlss.model import XGBoostLSS
from xgboostlss.distributions.Gaussian import Gaussian





In [15]:
url = "https://raw.githubusercontent.com/Thuan-ML/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/main/data/input/tax_avoidance.dta"
df = pd.read_stata(url)

#**2. Panel Preparation**

In [16]:
df = df.sort_values(["firma_id", "rok"])

In [17]:
df["etr_next"] = df.groupby("firma_id")["etr"].shift(-1)
df = df.dropna(subset=["etr_next"])

  df["etr_next"] = df.groupby("firma_id")["etr"].shift(-1)


#**3. Remove non-feature columns**

In [18]:
drop_cols = ["index", "Ticker", "Nazwa2", "sektor", "gielda"]
df = df.drop(columns=[c for c in drop_cols if c in df.columns])


#**4. Remove object columns**

In [19]:
df = df.select_dtypes(include=[np.number])

# **5. Convert categorical columns to integer codes**

In [20]:

features = [c for c in df.columns if c not in ["etr_next", "etr", "firma_id", "rok"]]

cat_cols = df[features].select_dtypes(include='category').columns

for col in cat_cols:
    df[col] = df[col].cat.codes


#**6. Handle Missing Values**




In [21]:
df[features] = df[features].fillna(df[features].median())


In [31]:
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

feature_sets_clean = {}
for name, feats in feature_sets.items():
    clean_feats = [f for f in feats if f not in non_numeric_cols]
    feature_sets_clean[name] = clean_feats

feature_sets = feature_sets_clean

#**7. Compute year range**

In [23]:
min_year = df["rok"].min()
max_year = df["rok"].max()

#**8. Validation years**

In [24]:
validation_years = list(range(min_year + 1, max_year + 1))

#**9. Pick feature groups: top 20/ 30/ 50**

In [25]:
top20 = features[:20]
top30 = features[:30]
top50 = features[:50]

feature_sets = {
    "top20": top20,
    "top30": top30,
    "top50": top50
}

#**10. Hyperparameter Tuning (AdaBoost)**

In [26]:
ada = AdaBoostRegressor()

param_grid = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.05, 0.1, 0.2]
}

grid_search_ada = GridSearchCV(
    ada,
    param_grid,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1
)

grid_search_ada.fit(df[features], df["etr_next"])
grid_search_ada.best_params_

{'learning_rate': 0.01, 'n_estimators': 50}

#**11. AdaBoostRegressor**

##**Train AdaBoost Year-by-Year**

---



In [27]:
all_results = {}

for set_name, feat_list in feature_sets.items():

    results = []

    for year in validation_years:

        train_df = df[df["rok"] < year]
        test_df  = df[df["rok"] == year]

        X_train = train_df[feat_list]
        y_train = train_df["etr_next"]

        X_test  = test_df[feat_list]
        y_test  = test_df["etr_next"]

        model = AdaBoostRegressor()
        model.fit(X_train, y_train)

        preds = model.predict(X_test)

        rmse = ((preds - y_test)**2).mean()**0.5

        results.append({"year": year, "rmse": rmse})

    all_results[set_name] = results

In [28]:
all_results

{'top20': [{'year': 2006, 'rmse': np.float64(0.14068792833655042)},
  {'year': 2007, 'rmse': np.float64(0.15858421307287557)},
  {'year': 2008, 'rmse': np.float64(0.19743819533848578)},
  {'year': 2009, 'rmse': np.float64(0.1566774793073446)},
  {'year': 2010, 'rmse': np.float64(0.17608915801717842)},
  {'year': 2011, 'rmse': np.float64(0.15955683192681933)},
  {'year': 2012, 'rmse': np.float64(0.15571401217605732)},
  {'year': 2013, 'rmse': np.float64(0.14956702477537315)},
  {'year': 2014, 'rmse': np.float64(0.15319497751493422)},
  {'year': 2015, 'rmse': np.float64(0.17180969604194368)},
  {'year': 2016, 'rmse': np.float64(0.1024207436373129)}],
 'top30': [{'year': 2006, 'rmse': np.float64(0.13930071392783253)},
  {'year': 2007, 'rmse': np.float64(0.15641634501729204)},
  {'year': 2008, 'rmse': np.float64(0.20544013797224436)},
  {'year': 2009, 'rmse': np.float64(0.17132230100857695)},
  {'year': 2010, 'rmse': np.float64(0.1730068577227417)},
  {'year': 2011, 'rmse': np.float64(0.18

##**Average RMSE per Feature Set**

In [29]:
avg_rmse = {}

for set_name, results in all_results.items():
    # extract RMSE values
    rmse_values = [r["rmse"] for r in results]
    avg_rmse[set_name] = np.mean(rmse_values)

avg_rmse

{'top20': np.float64(0.15652184183135232),
 'top30': np.float64(0.16246425446933047),
 'top50': np.float64(0.15675158892921115)}

##**Pick the Best Feature Group**

In [30]:
champion_set = min(avg_rmse, key=avg_rmse.get)
champion_set

'top20'

##**The final AdaBoost model**

In [32]:
champion_features = feature_sets[champion_set]

model_ada_champion = AdaBoostRegressor()
model_ada_champion.fit(df[champion_features], df["etr_next"])

In [33]:

joblib.dump(
    model_ada_champion,
    "/content/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/data/models_output/ada_champion.pkl"
)

['/content/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/data/models_output/ada_champion.pkl']

#**12. GradientBoostingRegressor GBM**

##**Train GBM Year-by-Year**

In [34]:
gbm_results = {}

for set_name, feat_list in feature_sets.items():

    results = []

    for year in validation_years:

        train_df = df[df["rok"] < year]
        test_df  = df[df["rok"] == year]

        X_train = train_df[feat_list]
        y_train = train_df["etr_next"]

        X_test = test_df[feat_list]
        y_test = test_df["etr_next"]

        model = GradientBoostingRegressor()
        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        rmse = ((preds - y_test)**2).mean()**0.5

        results.append({"year": year, "rmse": rmse})

    gbm_results[set_name] = results

##**Average RMSE per Feature Set**

In [35]:
gbm_avg_rmse = {}

for set_name, results in gbm_results.items():
    rmse_values = [r["rmse"] for r in results]
    gbm_avg_rmse[set_name] = np.mean(rmse_values)

gbm_avg_rmse

{'top20': np.float64(0.14490428835718105),
 'top30': np.float64(0.14441217830027606),
 'top50': np.float64(0.1438250453025983)}

##**Pick the Best Feature Group**

In [56]:
gbm_champion_set = min(gbm_avg_rmse, key=gbm_avg_rmse.get)
gbm_champion_set

'top50'

##**The final GBM model**

In [57]:
gbm_champion_features = feature_sets[gbm_champion_set]

final_gbm_model = GradientBoostingRegressor()
final_gbm_model.fit(df[gbm_champion_features], df["etr_next"])

In [58]:
joblib.dump(
    final_gbm_model,
    "/content/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/data/models_output/gbm_champion.pkl"
)

['/content/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/data/models_output/gbm_champion.pkl']

#**13. GBM Histogram - HistGradientBoostingRegressor**

##**Train HGBM Year-by-Year**

In [59]:
hgbm_results = {}

for set_name, feat_list in feature_sets.items():

    results = []

    for year in validation_years:

        train_df = df[df["rok"] < year]
        test_df  = df[df["rok"] == year]

        X_train = train_df[feat_list]
        y_train = train_df["etr_next"]

        X_test = test_df[feat_list]
        y_test = test_df["etr_next"]

        model = HistGradientBoostingRegressor()
        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        rmse = ((preds - y_test)**2).mean()**0.5

        results.append({"year": year, "rmse": rmse})

    hgbm_results[set_name] = results

##**Average RMSE per Feature Set**

In [60]:
hgbm_avg_rmse = {}

for set_name, results in hgbm_results.items():
    rmse_values = [r["rmse"] for r in results]
    hgbm_avg_rmse[set_name] = np.mean(rmse_values)

hgbm_avg_rmse

{'top20': np.float64(0.14789930191713813),
 'top30': np.float64(0.14777416599296164),
 'top50': np.float64(0.14680316461102716)}

##**Pick the Best Feature Group**

In [61]:
hgbm_champion_set = min(hgbm_avg_rmse, key=hgbm_avg_rmse.get)
hgbm_champion_set

'top50'

##**The final HGBM model**

In [62]:
hgbm_champion_features = feature_sets[hgbm_champion_set]

final_hgbm_model = HistGradientBoostingRegressor()
final_hgbm_model.fit(df[hgbm_champion_features], df["etr_next"])

In [63]:
joblib.dump(
    final_hgbm_model,
    "/content/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/data/models_output/hgbm_champion.pkl"
)

['/content/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/data/models_output/hgbm_champion.pkl']

#**14. XGBoost**

##**Train XGBoost Year-by-Year**

In [64]:
xgb_results = {}

for set_name, feat_list in feature_sets.items():

    results = []

    for year in validation_years:

        train_df = df[df["rok"] < year]
        test_df  = df[df["rok"] == year]

        X_train = train_df[feat_list]
        y_train = train_df["etr_next"]

        X_test = test_df[feat_list]
        y_test = test_df["etr_next"]

        model = XGBRegressor(
            objective='reg:squarederror',
            n_estimators=100
        )
        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        rmse = ((preds - y_test)**2).mean()**0.5

        results.append({"year": year, "rmse": rmse})

    xgb_results[set_name] = results

##**Average RMSE per Feature Set**

In [65]:
xgb_avg_rmse = {}

for set_name, results in xgb_results.items():
    rmse_values = [r["rmse"] for r in results]
    xgb_avg_rmse[set_name] = np.mean(rmse_values)

xgb_avg_rmse

{'top20': np.float64(0.15596826134866154),
 'top30': np.float64(0.15452125918758172),
 'top50': np.float64(0.15428205909950868)}

##**Pick the Best Feature Group**

In [66]:
xgb_champion_set = min(xgb_avg_rmse, key=xgb_avg_rmse.get)
xgb_champion_set

'top50'

##**The final XGBoost model**

In [67]:
xgb_champion_features = feature_sets[xgb_champion_set]

final_xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100
)
final_xgb_model.fit(df[xgb_champion_features], df["etr_next"])

In [68]:
joblib.dump(
    final_xgb_model,
    "/content/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/data/models_output/xgb_champion.pkl"
)

['/content/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/data/models_output/xgb_champion.pkl']

#**15. LightGBM**

##**Train LightGBM Year-by-Year**

In [69]:
lgbm_results = {}

for set_name, feat_list in feature_sets.items():

    results = []

    for year in validation_years:

        train_df = df[df["rok"] < year]
        test_df  = df[df["rok"] == year]

        X_train = train_df[feat_list]
        y_train = train_df["etr_next"]

        X_test = test_df[feat_list]
        y_test = test_df["etr_next"]

        model = LGBMRegressor()
        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        rmse = ((preds - y_test)**2).mean()**0.5

        results.append({"year": year, "rmse": rmse})

    lgbm_results[set_name] = results

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000292 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2025
[LightGBM] [Info] Number of data points in the train set: 363, number of used features: 19
[LightGBM] [Info] Start training from score 0.223732
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4004
[LightGBM] [Info] Number of data points in the train set: 726, number of used features: 19
[LightGBM] [Info] Start training from score 0.221759
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000439 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4389
[LightGBM] [Info] Number of data points in the train set: 10

##**Average RMSE per Feature Set**

In [70]:
lgbm_avg_rmse = {}

for set_name, results in lgbm_results.items():
    rmse_values = [r["rmse"] for r in results]
    lgbm_avg_rmse[set_name] = np.mean(rmse_values)

lgbm_avg_rmse

{'top20': np.float64(0.1479111766188485),
 'top30': np.float64(0.1468704507263591),
 'top50': np.float64(0.14725497691386014)}

##**Pick the Best Feature Group**

In [71]:
lgbm_champion_set = min(lgbm_avg_rmse, key=lgbm_avg_rmse.get)
lgbm_champion_set

'top30'

##**The final LightGBM model**

In [72]:
lgbm_champion_features = feature_sets[lgbm_champion_set]

final_lgbm_model = LGBMRegressor()
final_lgbm_model.fit(df[lgbm_champion_features], df["etr_next"])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001698 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5868
[LightGBM] [Info] Number of data points in the train set: 4356, number of used features: 30
[LightGBM] [Info] Start training from score 0.214587


In [73]:
joblib.dump(
    final_lgbm_model,
    "/content/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/data/models_output/lgbm_champion.pkl"
)

['/content/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/data/models_output/lgbm_champion.pkl']

#**16. CatBoost**

##**Train CatBoost Year-by-Year**

In [74]:
catboost_results = {}

for set_name, feat_list in feature_sets.items():

    results = []

    for year in validation_years:

        train_df = df[df["rok"] < year]
        test_df  = df[df["rok"] == year]

        X_train = train_df[feat_list]
        y_train = train_df["etr_next"]

        X_test = test_df[feat_list]
        y_test = test_df["etr_next"]

        model = CatBoostRegressor(
            verbose=0
        )
        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        rmse = ((preds - y_test)**2).mean()**0.5

        results.append({"year": year, "rmse": rmse})

    catboost_results[set_name] = results

##**Average RMSE per Feature Set**

In [75]:
catboost_avg_rmse = {}

for set_name, results in catboost_results.items():
    rmse_values = [r["rmse"] for r in results]
    catboost_avg_rmse[set_name] = np.mean(rmse_values)

catboost_avg_rmse

{'top20': np.float64(0.1449429283375394),
 'top30': np.float64(0.14409181272538227),
 'top50': np.float64(0.1437518179868148)}

##**Pick the Best Feature Group**

In [77]:
catboost_champion_set = min(catboost_avg_rmse, key=catboost_avg_rmse.get)
catboost_champion_set

'top50'

##**The final CatBoost model**

In [78]:
catboost_champion_features = feature_sets[catboost_champion_set]

final_catboost_model = CatBoostRegressor(verbose=0)
final_catboost_model.fit(df[catboost_champion_features], df["etr_next"])

<catboost.core.CatBoostRegressor at 0x7b6c9da06090>

In [79]:
joblib.dump(
    final_catboost_model,
    "/content/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/data/models_output/catboost_champion.pkl"
)

['/content/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/data/models_output/catboost_champion.pkl']

#**17. XGBoostLSS**

##**Train XGBoostLSS Year-by-Year**

In [49]:
url = "https://raw.githubusercontent.com/Thuan-ML/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/main/data/input/tax_avoidance.dta"
df = pd.read_stata(url)


df = df.sort_values(["firma_id", "rok"])

df["etr_next"] = df.groupby("firma_id")["etr"].shift(-1)
df = df.dropna(subset=["etr_next"])

df.columns

  df["etr_next"] = df.groupby("firma_id")["etr"].shift(-1)


Index(['index', 'Ticker', 'Nazwa2', 'sektor', 'rok', 'gielda', 'ta', 'txt',
       'pi', 'str', 'xrd', 'ni', 'ppent', 'intant', 'dlc', 'dltt', 'capex',
       'revenue', 'cce', 'adv', 'etr', 'diff', 'roa', 'lev', 'intan', 'rd',
       'ppe', 'sale', 'cash_holdings', 'adv_expenditure', 'capex2', 'cfc',
       'dta', 'capex2_scaled', 'firm_id', 'firma_id', 'rok2005', 'rok2006',
       'rok2007', 'rok2008', 'rok2009', 'rok2010', 'rok2011', 'rok2012',
       'rok2013', 'rok2014', 'rok2015', 'rok2016', 'rok2017', 'industry',
       'industry1', 'capex1', 'roa1', 'country1', 'country2', 'country3',
       'country4', 'country5', 'industry11', 'industry12', 'industry13',
       'industry14', 'industry15', 'industry16', 'industry17', 'industry18',
       'industry19', 'industry20', 'diff1', 'diff2', 'diff3', '_est_random',
       '_est_fixed', 'etr_next'],
      dtype='object')

In [50]:
xgblss_results = {}

for set_name, feat_list in feature_sets.items():

    results = []
    dist = Gaussian()

    for year in validation_years:

        train_df = df[df["rok"] < year]
        test_df  = df[df["rok"] == year]

        X_train = train_df[feat_list].values
        y_train = train_df["etr_next"].values

        X_test  = test_df[feat_list].values
        y_test  = test_df["etr_next"].values

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest  = xgb.DMatrix(X_test,  label=y_test)

        params = {
            "eta": 0.05,
            "max_depth": 3,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
        }

        model = XGBoostLSS(dist)
        model.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=300
        )

        pred_dist = model.predict(dtest, pred_type="parameters")

        if "mu" in pred_dist.columns:
            preds = pred_dist["mu"].values
        elif "loc" in pred_dist.columns:
            preds = pred_dist["loc"].values
        else:
            raise ValueError("Cannot identify distribution mean in prediction output.")

        rmse = np.sqrt(np.mean((preds - y_test) ** 2))
        results.append({"year": year, "rmse": rmse})

    xgblss_results[set_name] = results

Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:835.)
  nan=float(torch.nanmean(predt)),


##**Average RMSE per Feature Set**

In [51]:
xgblss_avg_rmse = {}

for set_name, results in xgblss_results.items():
    rmse_values = [r["rmse"] for r in results]
    xgblss_avg_rmse[set_name] = np.mean(rmse_values)

xgblss_avg_rmse

{'top20': np.float64(0.14106326714505577),
 'top30': np.float64(0.1418802296639394),
 'top50': np.float64(0.14123703325549875)}

##**Pick the Best Feature Group**

In [52]:
xgblss_champion_set = min(xgblss_avg_rmse, key=xgblss_avg_rmse.get)
xgblss_champion_set

'top20'

##**The final XGBoostLSS model**

In [53]:
champion_features = feature_sets[xgblss_champion_set]
dist = Gaussian()

X_full = df[champion_features].values
y_full = df["etr_next"].values

dtrain_full = xgb.DMatrix(X_full, label=y_full)

params = {
    "eta": 0.05,
    "max_depth": 3,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
}

final_xgblss_model = XGBoostLSS(dist)
final_xgblss_model.train(
    params=params,
    dtrain=dtrain_full,
    num_boost_round=300
)

final_pred = final_xgblss_model.predict(dtrain_full, pred_type="parameters")
final_pred.head()



Unnamed: 0,loc,scale
0,0.294361,0.09057
1,0.286484,0.113422
2,0.139029,0.153217
3,0.101162,0.060893
4,0.162122,0.089374


In [54]:
joblib.dump(
    final_xgblss_model,
    "/content/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/data/models_output/xgblss_champion.pkl"
)

['/content/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/data/models_output/xgblss_champion.pkl']

#**18. Comparison**

##**Load all models**

In [80]:
model_dir = "/content/ML-in-Finance-I-case-study-forecasting-tax-avoidance-rates/data/models_output/"

models = {
    "AdaBoost": joblib.load(model_dir + "ada_champion.pkl"),
    "GBM": joblib.load(model_dir + "gbm_champion.pkl"),
    "HGBM": joblib.load(model_dir + "hgbm_champion.pkl"),
    "XGBoost": joblib.load(model_dir + "xgb_champion.pkl"),
    "LightGBM": joblib.load(model_dir + "lgbm_champion.pkl"),
    "CatBoost": joblib.load(model_dir + "catboost_champion.pkl"),
    "XGBoostLSS": joblib.load(model_dir + "xgblss_champion.pkl"),
}

models

{'AdaBoost': AdaBoostRegressor(),
 'GBM': GradientBoostingRegressor(),
 'HGBM': HistGradientBoostingRegressor(),
 'XGBoost': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, ...),
 'LightGBM': LGBMRegressor(),
 'CatBoost': <catboost.core.CatBoostRegressor at 0x7b6c9da13a40>,
 'XGBoostLSS': <xgb

##**Evaluate All Models on Test Set**

In [37]:
test_df = df[df["rok"] == df["rok"].max()]

In [81]:
# Remove CatBoost from evaluation
models_for_eval = {k: v for k, v in models.items() if k != "CatBoost"}

test_numeric = test_df.select_dtypes(include=[np.number]).copy()
test_numeric = test_numeric.fillna(test_numeric.median())

y_test = test_numeric["etr_next"].values
test_numeric = test_numeric.drop(columns=["etr_next"])

rmse_results = {}

for name, model in models_for_eval.items():

    # model input dimensionality
    try:
        n_feats = model.n_features_in_
    except:
        try:
            n_feats = len(model.feature_names_)
        except:
            n_feats = model.booster.num_features()

    # align EXACT feature count
    X_test_mat = test_numeric.iloc[:, :n_feats].values

    # XGBoostLSS
    if name == "XGBoostLSS":
        dtest = xgb.DMatrix(X_test_mat)
        pred_dist = model.predict(dtest, pred_type="parameters")
        preds = pred_dist.iloc[:, 0].values
    else:
        preds = model.predict(X_test_mat)

    rmse_results[name] = float(np.sqrt(np.mean((preds - y_test)**2)))

rmse_results



{'AdaBoost': 0.08744370021026154,
 'GBM': 0.6364363113507476,
 'HGBM': 0.14271032386245083,
 'XGBoost': 0.37543438458519024,
 'LightGBM': 0.12677374791094534,
 'XGBoostLSS': 0.4180742398579223}

##**Conclusion**

LightGBM is the best model — it predicts ETR most accurately.  

HGBM is second best.  

Both are better than AdaBoost, GBM, XGBoost, and XGBoostLSS.



