# Model training

In [1]:
import warnings

# Silence every warning category for the rest of the session so cell outputs
# stay compact.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used further down — consider narrowing the filter.
warnings.simplefilter('ignore')

In [3]:
import pandas as pd

# Location of the HELOC CSV, relative to the notebook's working directory.
DATASET_CSV = "../../datasets/FICO/heloc_dataset_v1.csv"

# Read the whole file into a DataFrame; later cells encode, summarise and
# split this frame.
df = pd.read_csv(DATASET_CSV)

In [4]:
# Name of the label column; every other column is treated as a feature.
TARGET: str = "RiskPerformance"

In [5]:
# Replace the label column with integer codes. factorize numbers the distinct
# values in order of first appearance, so which class becomes 0 depends on the
# row order of the loaded file.
target_codes, _target_labels = pd.factorize(df[TARGET])
df[TARGET] = target_codes

In [6]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
RiskPerformance,10459.0,0.478057,0.499542,0.0,0.0,0.0,1.0,1.0
ExternalRiskEstimate,10459.0,67.425758,21.121621,-9.0,63.0,71.0,79.0,94.0
MSinceOldestTradeOpen,10459.0,184.205373,109.683816,-9.0,118.0,178.0,249.5,803.0
MSinceMostRecentTradeOpen,10459.0,8.543455,13.301745,-9.0,3.0,5.0,11.0,383.0
AverageMInFile,10459.0,73.843293,38.782803,-9.0,52.0,74.0,95.0,383.0
NumSatisfactoryTrades,10459.0,19.428052,13.004327,-9.0,12.0,19.0,27.0,79.0
NumTrades60Ever2DerogPubRec,10459.0,0.042738,2.51391,-9.0,0.0,0.0,1.0,19.0
NumTrades90Ever2DerogPubRec,10459.0,-0.142843,2.367397,-9.0,0.0,0.0,0.0,19.0
PercentTradesNeverDelq,10459.0,86.661536,25.999584,-9.0,87.0,96.0,100.0,100.0
MSinceMostRecentDelq,10459.0,6.762406,20.50125,-9.0,-7.0,-7.0,14.0,83.0


# XGBoost model

In [7]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows as a test set. The fixed random_state keeps
# the shuffle, and therefore the split, the same on every run.
train, test = train_test_split(
    df,
    test_size=0.25,
    random_state=42,
)

In [8]:
# Features: every column of the training split except the label. Dropping by
# name keeps the selection tied to `train` itself instead of indexing
# `df.columns` with a mask computed from `train.columns`.
train_x = train.drop(columns=[TARGET])
# Labels, aligned row-for-row with train_x.
train_y = train[TARGET]

In [9]:
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
from typing import Tuple

def find_best_xgboost_model(train_x: pd.DataFrame, train_y: pd.Series) -> Tuple[dict, float]:
    """Grid-search XGBoost hyperparameters and return the best combination.

    Every combination of ``max_depth``, ``learning_rate`` and ``n_estimators``
    in the grid below is evaluated with 8-fold cross-validation and ranked by
    ROC AUC. Positive samples are re-weighted by the negative/positive count
    ratio of ``train_y``.

    Args:
        train_x: Feature matrix used for the search.
        train_y: Labels aligned with ``train_x``. Assumed to be encoded as
            0/1, since the positive count is taken as ``train_y.sum()``.

    Returns:
        ``(best_params, best_score)``: the ``best_params_`` dict and the
        ``best_score_`` value reported by the fitted ``GridSearchCV``.

    Raises:
        ValueError: If ``train_y`` has no positive or no negative samples, in
            which case the class-weight ratio is undefined or zero.
    """
    positives = train_y.sum()
    negatives = len(train_y) - positives
    # Fail loudly instead of handing XGBoost an inf/zero weight produced by a
    # division by zero (which would otherwise only emit a suppressed warning).
    if positives <= 0 or negatives <= 0:
        raise ValueError(
            "train_y must contain both positive and negative samples "
            "to compute scale_pos_weight"
        )
    scale_pos_weight = negatives / positives

    param_test = {
        'max_depth': [1, 2, 4, 8],
        'learning_rate': [0.05, 0.06, 0.07],
        'n_estimators': [10, 100, 200],
    }

    # NOTE(review): 'binary:logistic' is the objective XGBoost documents for
    # binary classification; 'reg:logistic' is kept here to match the
    # objective used when the final model is built — confirm before changing.
    estimator = XGBClassifier(
        verbosity=0,  # quiet training output; the obsolete `silent` flag is not needed
        objective='reg:logistic',
        scale_pos_weight=scale_pos_weight,
        random_state=27,  # documented seed argument of the scikit-learn wrapper
    )

    gsearch = GridSearchCV(
        estimator=estimator,
        param_grid=param_test,
        scoring='roc_auc',
        n_jobs=-1,  # use all available cores
        cv=8,
    )
    gsearch.fit(train_x, train_y)

    return gsearch.best_params_, gsearch.best_score_

In [10]:
# The grid search is expensive, so it is skipped by default and pinned values
# are used instead (presumably taken from an earlier run — TODO confirm).
# Set RUN_GRID_SEARCH to True to recompute them.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    best_params, best_score = find_best_xgboost_model(train_x, train_y)
else:
    best_params = {'learning_rate': 0.07, 'max_depth': 2, 'n_estimators': 200}
    # NOTE(review): 0.8 looks like a rounded stand-in rather than a measured
    # cross-validation score — confirm against an actual search run.
    best_score = 0.8

In [11]:
# Display the hyperparameters that the final model is built from.
best_params

{'learning_rate': 0.07, 'max_depth': 2, 'n_estimators': 200}

In [12]:
# Display the score stored alongside the chosen hyperparameters.
best_score

0.8

In [13]:
# Weight positive samples by the negative/positive count ratio of the training
# labels (the same formula find_best_xgboost_model uses during the search).
scale_pos_weight = (len(train_y) - train_y.sum()) / train_y.sum()

# Final classifier, configured from the selected hyperparameters.
# `random_state` is the documented seed argument of the scikit-learn wrapper.
model = XGBClassifier(
    objective='reg:logistic',
    random_state=42,
    max_depth=best_params['max_depth'],
    learning_rate=best_params['learning_rate'],
    n_estimators=best_params['n_estimators'],
    scale_pos_weight=scale_pos_weight,
)

In [16]:
# Train on the training split; fit returns the estimator, so the cell output
# is the fitted model's repr.
model.fit(X=train_x, y=train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.07, max_delta_step=0,
              max_depth=2, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=12,
              num_parallel_tree=1, objective='reg:logistic', predictor='auto',
              random_state=42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1.0917333333333332, seed=42, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [17]:
# Destination for the trained model, relative to the notebook's working
# directory; the .json suffix is part of the path passed to save_model.
MODEL_JSON = "../../models/xgboost.json"
model.save_model(MODEL_JSON)