In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
import optuna
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import RandomizedSearchCV, KFold, cross_validate, RepeatedKFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from src.preprocessor import preprocessor
from pathlib import Path
from src.clean import clean
import optuna.visualization as viz

In [2]:
data_dir = Path("../data")
train_df = pd.read_csv(data_dir / "train.csv")
dfx, label = clean(train_df)

### XGB

`max_depth` or `max_leaf_nodes` -  gradient-boosting that the algorithm fits the error of the previous tree in the ensemble. Thus, fitting fully grown trees would be detrimental. Indeed, the first tree of the ensemble would perfectly fit (overfit) the data and thus no subsequent tree would be required, since there
would be no residuals. Therefore, the tree used in gradient-boosting should have a low depth, typically between 3 to 8 levels, or few leaves from $2^3=8$ to $2^8=256$. Having very weak learners at each step will help reducing overfitting.

`n_estimators`, - With this consideration in mind, the deeper the trees, the faster the residuals will be corrected and less learners are required. Therefore, `n_estimators` should be increased if `max_depth` is lower.

`learning_rate`,  -  When fitting the residuals, we would like the tree to try to correct all possible errors or only a fraction of them. The learning-rate allows you to control this behaviour. A small learning-rate value would only correct the residuals of very few samples. If a large learning-rate is set (e.g., 1), we would fit the residuals of all samples. So, with a very low learning-rate, we will need more estimators to correct the overall error. However, a too large learning-rate tends to obtain an overfitted ensemble, similar to having a too large tree depth.

In [3]:
# XGB hp tuning

def objective(trial):
    
    log_label = np.log1p(label)
    
    #cv = RepeatedKFold(n_repeats=3,n_splits=5, random_state=1)
    cv = KFold(n_splits=5, shuffle=True,random_state=1)  

    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'max_depth': trial.suggest_int('max_depth', 2,3),                                   #the maximum depth of a tree; Lower values avoid over-fitting.
        'max_leaves':trial.suggest_int('max_leaves', 1, 50),
        'learning_rate': trial.suggest_float('learning_rate', 0.08, 0.09, log=True),         #the learning rate of our GBM (i.e. how much we update our prediction with each successive tree); Lower values avoid over-fitting.
        #'n_estimators': trial.suggest_int('n_estimators', 1, 8000),                     
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 100),                #the minimum sum of instance weight needed in a leaf, in certain applications this relates directly to the minimum number of instances needed in a node; min_child_weight. Larger values avoid over-fitting.
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.3),              #the ratio of features used. Lower ratios avoid over-fitting.
        'subsample': trial.suggest_float('subsample', 0.8, 0.95),                           #the ratio of the training instances used. Lower ratios avoid over-fitting.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-11, 1e-3, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 1e-3),
        'gamma': trial.suggest_float('gamma', 1e-6, 1e-3, log=True),                        #the minimum loss reduction required to make a further split; gamma. Larger values avoid over-fitting.
    }
    
    model = make_pipeline(
        preprocessor,
        xgb.XGBRegressor(**params))
    
    score = cross_val_score(
                model, 
                dfx, 
                log_label,
                scoring='neg_root_mean_squared_error',
                cv=cv,
                n_jobs=-1
                )
            
    return -score.mean()
        
    
    
xgb_study = optuna.create_study(direction="minimize")
xgb_study.optimize(objective, n_trials=15,n_jobs=-1)


[32m[I 2023-01-22 19:48:08,777][0m A new study created in memory with name: no-name-7e671b7b-7447-4b7d-a174-075fa965b2b4[0m
[32m[I 2023-01-22 19:48:10,469][0m Trial 2 finished with value: 0.12570939767497785 and parameters: {'max_depth': 3, 'max_leaves': 27, 'learning_rate': 0.0888072804898768, 'min_child_weight': 50.00101136573306, 'colsample_bytree': 0.28383039046919084, 'subsample': 0.8208571364930839, 'reg_alpha': 1.1056919771545434e-06, 'reg_lambda': 0.0009025083012541402, 'gamma': 3.538569262603882e-05}. Best is trial 2 with value: 0.12570939767497785.[0m
[32m[I 2023-01-22 19:48:10,893][0m Trial 10 finished with value: 0.12319406961883758 and parameters: {'max_depth': 3, 'max_leaves': 46, 'learning_rate': 0.08137858818482549, 'min_child_weight': 25.672965049389642, 'colsample_bytree': 0.23537417452819223, 'subsample': 0.8230582087042582, 'reg_alpha': 1.746675159440641e-09, 'reg_lambda': 0.0006396414852648484, 'gamma': 1.198388131455886e-05}. Best is trial 10 with value: 0

In [4]:

joblib.dump(xgb_study, "src/hyperparameter_tuning/study_xgb.pkl")

#study = joblib.load("study.pkl")

print("Number of finished trials: ", len(xgb_study.trials))
print("Best trial:")
trial = xgb_study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

#print("  Number of estimators: {}".format(trial.user_attrs["n_estimators"]))


viz.plot_optimization_history(xgb_study).show()
viz.plot_parallel_coordinate(xgb_study).show()
viz.plot_param_importances(xgb_study).show()
#plot_contour(xgb_study).show()
viz.plot_slice(xgb_study).show() #, params=["colsample_bytree","min_child_weight", "learning_rate"]).show()

Number of finished trials:  15
Best trial:
  Value: 0.12162769799203095
  Params: 
    max_depth: 3
    max_leaves: 17
    learning_rate: 0.08721353849241338
    min_child_weight: 14.094902119838498
    colsample_bytree: 0.24104361635152596
    subsample: 0.8750745019488066
    reg_alpha: 1.1442979900487261e-09
    reg_lambda: 0.0003072575472117261
    gamma: 0.00011105111797207138


After careful adjustment of parameter subset max depth finally managed to reduce overfitting

### Random Forest

Here are several hyperparameters that can be tuned in a random forest regressor, including:

n_estimators: This is the number of trees in the random forest. Increasing this value will generally improve the model's performance, but will also increase the training time.

max_depth: This is the maximum depth of each tree in the forest. Increasing this value will make the model more complex and may improve its performance, but can also increase the risk of overfitting.

min_samples_split: This is the minimum number of samples required to split an internal node in the tree. Increasing this value will make the model more conservative and may help to prevent overfitting.

min_samples_leaf: This is the minimum number of samples required to be at a leaf node. Increasing this value will make the model more conservative and may help to prevent overfitting.

max_features: This is the number of features that are considered when splitting a node in the tree. Increasing this value can improve the model's performance, but may also increase the risk of overfitting.

In [5]:
def objective(trial):
    
    log_label = np.log1p(label)
    
    cv = RepeatedKFold(n_repeats=2,n_splits=3, random_state=1)
    
    params = {
        'n_estimators':trial.suggest_int('n_estimators',100,1000),
        'max_depth':trial.suggest_int('max_depth',5,500),
        'min_samples_split':trial.suggest_float('min_samples_split',1e-5,1, log=True),
        'min_samples_leaf':trial.suggest_float('min_samples_leaf',1e-5,1, log=True),
        'max_features':trial.suggest_float('max_features',1e-5,1, log=True),
        
    }

    
    model = make_pipeline(
        preprocessor,
        RandomForestRegressor(**params),
    )
    
    
    score = cross_val_score(
                model, 
                dfx, 
                log_label,
                scoring='neg_root_mean_squared_error',
                cv=cv,
                n_jobs=-1
                )
            
    return -score.mean()

    
rf_study = optuna.create_study(direction="minimize")
rf_study.optimize(objective, n_trials=25,n_jobs=-1)


[32m[I 2023-01-22 19:49:30,500][0m A new study created in memory with name: no-name-a3be4352-0e50-486e-b401-2d2b7c408d26[0m
[32m[I 2023-01-22 19:49:31,816][0m Trial 5 finished with value: 0.35608488063628196 and parameters: {'n_estimators': 671, 'max_depth': 229, 'min_samples_split': 0.0008337203156191113, 'min_samples_leaf': 0.22618285744096447, 'max_features': 0.0031054572220612082}. Best is trial 5 with value: 0.35608488063628196.[0m
[32m[I 2023-01-22 19:49:32,639][0m Trial 6 finished with value: 0.146887572266982 and parameters: {'n_estimators': 241, 'max_depth': 67, 'min_samples_split': 1.2288512668837788e-05, 'min_samples_leaf': 0.011342734154443222, 'max_features': 0.3308665575985025}. Best is trial 6 with value: 0.146887572266982.[0m
[32m[I 2023-01-22 19:49:33,528][0m Trial 0 finished with value: 0.34525234342966465 and parameters: {'n_estimators': 566, 'max_depth': 164, 'min_samples_split': 0.015829823480207384, 'min_samples_leaf': 0.07134488868890157, 'max_features

In [10]:

joblib.dump(rf_study.best_params, "src/hyperparameter_tuning/study_rf.pkl")

#study = joblib.load("study.pkl")

print("Number of finished trials: ", len(rf_study.trials))
print("Best trial:")
trial = rf_study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

#print("  Number of estimators: {}".format(trial.user_attrs["n_estimators"]))


viz.plot_optimization_history(rf_study).show()
viz.plot_parallel_coordinate(rf_study).show()
viz.plot_param_importances(rf_study).show()
#viz.plot_contour(rf_study).show()
viz.plot_slice(rf_study).show()


Number of finished trials:  25
Best trial:
  Value: 0.1391255554067333
  Params: 
    n_estimators: 1000
    max_depth: 45
    min_samples_split: 4.696196729393275e-05
    min_samples_leaf: 4.983311015835904e-05
    max_features: 0.0650166290895576
