# 003: Hyperparameter Tuning

This notebook takes our optimal model, as selected during model selection, and performs a grid search across three key hyperparameters to understand the best combination of hyperparameters.

In [18]:
import pandas as pd
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import numpy as np
import joblib
from sklearn.metrics import classification_report

In [2]:
dict_types = {
    'Customer ID': 'category',
    'Age': 'int64',
    'Professional experience': 'int64',
    'Salary': 'int64',
    'ZIP code': 'category',
    'Family size': 'float64',
    'Average credit cards spend': 'float64',
    'Education level': 'category',
    'Mortgage': 'float64',
    'Loan': 'bool',
    'Securities account': 'bool',
    'Certificate of Deposit account': 'bool',
    'Online Banking': 'bool',
    'Credit Card': 'bool'
    }
data = pd.read_csv('../data/preprocessed/preprocessed_data.csv', dtype=dict_types)

In [3]:
data.dtypes

Customer ID                       category
Age                                  int64
Professional experience              int64
Salary                               int64
ZIP code                          category
Family size                        float64
Average credit cards spend         float64
Education level                   category
Mortgage                           float64
Loan                                  bool
Securities account                    bool
Certificate of Deposit account        bool
Online Banking                        bool
Credit Card                           bool
dtype: object

In [4]:
# load the dataset
def load_dataset(full_path):
    dict_types = {
    'Customer ID': 'category',
    'Age': 'int64',
    'Professional experience': 'int64',
    'Salary': 'int64',
    'ZIP code': 'category',
    'Family size': 'float64',
    'Average credit cards spend': 'float64',
    'Education level': 'category',
    'Mortgage': 'float64',
    'Loan': 'bool',
    'Securities account': 'bool',
    'Certificate of Deposit account': 'bool',
    'Online Banking': 'bool',
    'Credit Card': 'bool'
    }
    dataframe = pd.read_csv(full_path, na_values='?', dtype=dict_types)
    dataframe = dataframe.drop(columns=['Customer ID', 'ZIP code'], axis=1)
    y = dataframe['Loan']
    X = dataframe.drop('Loan', axis=1)
    cat_ix = X.select_dtypes(include=['category']).columns
    num_ix = X.select_dtypes(include=['int64', 'float64', 'bool']).columns
    return X, y, cat_ix, num_ix

In [5]:
# evaluate a model
def evaluate_model(X, y, model):
 # define evaluation procedure
 cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
 # evaluate model
 scores = cross_validate(model, X, y, scoring=['accuracy', 'f1', 'precision', 'recall', 'roc_auc'], cv=cv, n_jobs=-1)
 return scores

In [6]:
# load the dataset
X, y, cat_ix, num_ix = load_dataset('../data/preprocessed/preprocessed_data.csv')

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
accuracies = list()
f1_scores = list()
# evaluate each model

numeric_transformer = Pipeline(
		steps=[("num_imputer", SimpleImputer(strategy="median")), ("scaler", MinMaxScaler())]
		)
categorical_transformer = Pipeline(
		steps=[("cat_imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore"))]
	)

ct = ColumnTransformer(
transformers=[
	("num", numeric_transformer, num_ix),
	("cat", categorical_transformer, cat_ix),
]
)
# wrap the model in a pipeline
pipeline = Pipeline(steps=[('t',ct), ('over', SMOTE()), ('m', XGBClassifier())])

grid_params = {
	'm__gamma': [0, 0.1, 1, 10, 100],
	'm__max_depth': [None, 10, 100],
	'm__n_estimators': [10, 100, 1000]
}

clf = GridSearchCV(pipeline, grid_params, verbose=5, scoring='f1')
clf.fit(X_train, y_train)

print("Best Score: ", clf.best_score_)
print("Best Params: ", clf.best_params_)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV 1/5] END m__gamma=0, m__max_depth=None, m__n_estimators=10;, score=0.941 total time=   0.1s
[CV 2/5] END m__gamma=0, m__max_depth=None, m__n_estimators=10;, score=0.943 total time=   0.1s
[CV 3/5] END m__gamma=0, m__max_depth=None, m__n_estimators=10;, score=0.910 total time=   0.1s
[CV 4/5] END m__gamma=0, m__max_depth=None, m__n_estimators=10;, score=0.918 total time=   0.1s
[CV 5/5] END m__gamma=0, m__max_depth=None, m__n_estimators=10;, score=0.879 total time=   0.1s
[CV 1/5] END m__gamma=0, m__max_depth=None, m__n_estimators=100;, score=0.941 total time=   0.7s
[CV 2/5] END m__gamma=0, m__max_depth=None, m__n_estimators=100;, score=0.960 total time=   0.6s
[CV 3/5] END m__gamma=0, m__max_depth=None, m__n_estimators=100;, score=0.914 total time=   0.6s
[CV 4/5] END m__gamma=0, m__max_depth=None, m__n_estimators=100;, score=0.953 total time=   0.6s
[CV 5/5] END m__gamma=0, m__max_depth=None, m__n_estimators=100;, scor

In [10]:
# show optimal hyperparameters following tuning
clf.best_params_

{'m__gamma': 0.1, 'm__max_depth': 10, 'm__n_estimators': 1000}

In [14]:
# create pipeline with optimal parameters
pipeline = Pipeline(
    steps=[
        ('t',ct),
        ('over', SMOTE()),
        ('m', XGBClassifier(
            gamma=clf.best_params_['m__gamma'],
            max_depth=clf.best_params_['m__max_depth'],
            n_estimators=clf.best_params_['m__n_estimators']
            )
        )
    ]
)

In [15]:
# fit pipeline with training data
pipeline.fit(X_train, y_train)

In [16]:
# use fitted pipeline to make predictions on test data
y_preds = pipeline.predict(X_test)

In [19]:
# create confusion matrix
confmat = confusion_matrix(y_pred=y_preds, y_true=y_test)
confmat

array([[892,   7],
       [  9,  89]])

In [20]:
# create classification report
print(classification_report(y_true=y_test, y_pred=y_preds))

              precision    recall  f1-score   support

       False       0.99      0.99      0.99       899
        True       0.93      0.91      0.92        98

    accuracy                           0.98       997
   macro avg       0.96      0.95      0.95       997
weighted avg       0.98      0.98      0.98       997



In [22]:
# save trained, tuned model
joblib.dump(pipeline, '../models/tuned_pipeline.pkl')

['../models/tuned_pipeline.pkl']