# Model Selection and Hyperparameter Tuning

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pickle

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd

from hyperopt import hp, fmin, tpe, space_eval, STATUS_OK, STATUS_FAIL, Trials
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost.sklearn import XGBRegressor

  from collections import Mapping, Set, Iterable


In [3]:
# Load the pickled feature data that is later passed to train_test_split as
# the feature matrix.
filename = './data/3_FtSet_XGImportance.pkl'
# The context manager closes the handle even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code stored in the file —
# only load artifacts produced by this project's own pipeline.
with open(filename, 'rb') as infile:
    data_set1 = pickle.load(infile)

In [4]:
# Load the pickled target values that are later passed to train_test_split as
# the labels.
filename = './data/3_Target.pkl'
# The context manager closes the handle even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code stored in the file —
# only load artifacts produced by this project's own pipeline.
with open(filename, 'rb') as infile:
    target = pickle.load(infile)

## Split Data

In [5]:
# Reserve 20% of the samples as a held-out test split; the fixed random_state
# keeps the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_set1,
    target,
    test_size=0.2,
    random_state=0,
)

## XGBoost, Hyperparameter Tuning

In [21]:
# Candidate values explored for each XGBoost hyperparameter.
candidate_values = {
    'learning_rate': np.array([0.01, 0.05, 0.1]),
    'max_depth':     np.array([3, 5, 7]),
    'subsample':     np.array([0.8, 1]),
    'n_estimators':  np.array([300, 500, 1000, 2000], dtype=int),
}

# Search space: one hp.choice node per hyperparameter, labelled with the same
# name as its dictionary key.
xgb_reg_params = {
    name: hp.choice(name, options)
    for name, options in candidate_values.items()
}

In [22]:
def objective(space):
    """Score one hyperparameter combination for the hyperopt search.

    Parameters
    ----------
    space : dict
        One sample drawn from the search space; must provide the keys
        ``'n_estimators'``, ``'learning_rate'``, ``'max_depth'`` and
        ``'subsample'``.

    Returns
    -------
    dict
        ``{'loss': <mean absolute error>, 'status': STATUS_OK}``, where the
        loss is the mean 3-fold cross-validated MAE on ``X_train``/``y_train``.

    Notes
    -----
    A previous version also fitted the model once with the test split as an
    early-stopping evaluation set and computed test predictions. Neither
    result fed into the returned loss, because ``cross_val_score`` clones the
    estimator and refits it for every fold. That step only added an extra fit
    per trial and touched the held-out data during tuning, so it was removed;
    the returned loss is unchanged.
    """
    # The search space is built from numpy arrays, so cast the sampled values
    # to built-in ints/floats before handing them to the estimator.
    reg = XGBRegressor(
        n_estimators=int(space['n_estimators']),
        learning_rate=float(space['learning_rate']),
        max_depth=int(space['max_depth']),
        subsample=float(space['subsample']),
        n_jobs=-1,
    )

    # The scorer returns negated MAE (higher is better); hyperopt minimises,
    # so report the magnitude.
    cv_scores = cross_val_score(reg, X_train, y_train, cv=3, scoring='neg_mean_absolute_error')
    loss = abs(cv_scores.mean())

    return {'loss': loss, 'status': STATUS_OK}

# Container that records every evaluation performed during the search.
trials = Trials()

# Minimise `objective` over the search space with the TPE algorithm for 50
# evaluations.
# NOTE(review): no `rstate` is passed, so repeated runs may explore different
# samples — confirm whether reproducibility is required.
best = fmin(
    objective,
    xgb_reg_params,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)

100%|███████████████████████████████████████████████████| 50/50 [05:50<00:00,  7.32s/it, best loss: 1.8880251393493432]


In [28]:
# Mean absolute error per model. The optimised figure matches the best loss
# reported by the search above, rounded to three decimals.
# NOTE(review): the baseline figure is hard-coded and its origin is not shown
# in this notebook — confirm where it comes from.
mae_by_model = {
    'XGBoost Baseline': 2.214,
    'XGBoost Optimizado': 1.888,
}

# One column per model, one row holding the metric.
results_summary = pd.DataFrame(columns=list(mae_by_model))
results_summary.loc['Error medio absoluto'] = list(mae_by_model.values())
results_summary

Unnamed: 0,XGBoost Baseline,XGBoost Optimizado
Error medio absoluto,2.214,1.888


In [34]:
# Display the optimum returned by fmin. For hp.choice dimensions each value is
# the position of the selected option within its candidate list, not the
# hyperparameter value itself (e.g. 'max_depth': 2 selects the third
# candidate, 7).
best

{'learning_rate': 0, 'max_depth': 2, 'n_estimators': 2, 'subsample': 0}

In [35]:
# fmin reports hp.choice results as option indices; space_eval maps them back
# to the actual hyperparameter values, so the model always reflects the
# current search result instead of hand-copied numbers.
best_params = space_eval(xgb_reg_params, best)

# Build the estimator from the decoded optimum, casting the numpy scalars from
# the search space to built-in types.
# NOTE(review): the estimator is only constructed here and is exported below
# without being fitted — confirm that the downstream step trains it.
best_model = XGBRegressor(
    n_estimators=int(best_params['n_estimators']),
    learning_rate=float(best_params['learning_rate']),
    max_depth=int(best_params['max_depth']),
    subsample=float(best_params['subsample']),
    n_jobs=-1,
)

## Export Exit Point

In [36]:
# Serialise the configured estimator for the next stage of the pipeline.
filename = './data/5_XGB_Reg_Model.pkl'
# The context manager flushes and closes the file even if pickling raises, so
# a failure cannot leave the handle open.
with open(filename, 'wb') as outfile:
    pickle.dump(best_model, outfile)