# Model Selection and Tuning

This notebook tries several ML algorithms for classification, focusing exclusively on predictive power and moving from the simplest model to the most complicated. We will use the processed data created by the data_preparation.ipynb notebook, which the code below loads from the CSV file 'aggregated_train_data.csv'.

The objective of the models is to accurately predict whether a client with certain characteristics will default on a loan at the time of applying for it. As we know neither the costs associated with a client failing to repay nor the benefits of a client repaying, we will focus on the Area Under the ROC Curve (AUC) as the performance metric. Recall and accuracy are recorded alongside it, and the grid searches below select hyper-parameters by recall.

Every model will be trained, tuned and cross-validated using the scikit-learn library.

In [68]:
import pandas as pd
import numpy as np
import os
import time
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

# Number of parallel workers: leave four cores free for the rest of the system.
# os.cpu_count() may return None, and on machines with four cores or fewer the raw
# subtraction yields 0 or a negative number (n_jobs=0 is rejected by joblib), so the
# value is clamped to at least one worker.
jobs = max(1, (os.cpu_count() or 1) - 4)

In [2]:
# Read the aggregated training table; the first CSV column becomes the row index.
df = pd.read_csv(
    'aggregated_train_data.csv',
    index_col=0,
)

# Preview the first rows.
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,CC_SK_DPD_MEAN,CC_SK_DPD_DEF_MEAN,CC_NAME_CONTRACT_STATUS_Active_MEAN,CC_NAME_CONTRACT_STATUS_Approved_MEAN,CC_NAME_CONTRACT_STATUS_Completed_MEAN,CC_NAME_CONTRACT_STATUS_Demand_MEAN,CC_NAME_CONTRACT_STATUS_Refused_MEAN,CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_nan_MEAN
0,100002,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,...,,,,,,,,,,
1,100003,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,...,,,,,,,,,,
2,100004,0,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,...,,,,,,,,,,
3,100006,0,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007,0,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038,...,,,,,,,,,,


In [3]:
# Normalise every column label to lower case (in place, so `df` itself is updated).
df.rename(columns=str.lower, inplace=True)

# Label vector: an independent copy of the `target` column.
y = df['target'].copy()

# Feature matrix: everything except the label and the client identifier.
X = df.drop(columns=['target', 'sk_id_curr'])

# Filled in by the model cells below: model name -> {metric name: value}.
performance_metrics = {}

### First Model: **Logistic Regression**

For this model we will first fit a logistic regression on all the features and take the mean AUC score from a 5-fold cross-validation. We will then run a Lasso-regularized (L1) logistic regression to perform feature selection. Finally, we will fit the regression without the penalty, using only the features selected by the regularized regression.

In [4]:
from sklearn.linear_model import SGDClassifier

In [6]:
## Logistic Regression and other models do not accept NaN values. Will use sklearn's preprocessing to impute mean where necessary.

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Treat +/-infinity as missing so the imputer can fill those cells as well.
X.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

# Mean-impute missing values, then standardise every feature.
preprocessing_steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
]
transformations = Pipeline(preprocessing_steps)

# NOTE(review): the imputer and scaler are fitted on all rows here, i.e. before the
# cross-validation splits made in later cells, so validation folds contribute to the
# imputation means and scaling statistics.
transformed_data = transformations.fit_transform(X)

All variables

In [46]:
# Logistic regression (log-loss SGD) on every feature, scored with 5-fold CV.
# The clock starts before the estimator is built, so 'runtime' covers construction + CV.
start = time.time()
logistic_regression_full = SGDClassifier(
    loss='log_loss',
    # NOTE(review): `penalty` is left at its default here; per the scikit-learn docs
    # l1_ratio is only used with penalty='elasticnet' — confirm the intended penalty.
    l1_ratio=1,
    n_jobs=jobs,
    random_state=0,
    early_stopping=True,
    class_weight='balanced',
)
res = cross_validate(
    logistic_regression_full,
    transformed_data,
    y,
    cv=5,
    scoring=['recall', 'roc_auc', 'accuracy'],
)
end = time.time()

# Average each test metric over the folds and store it next to the wall-clock time.
fold_means = {metric: np.mean(res['test_' + metric]) for metric in ('recall', 'roc_auc', 'accuracy')}
fold_means['runtime'] = end - start
performance_metrics['logistic_regression_full'] = fold_means

Regularized Logistic regression

In [48]:
# Candidate regularisation strengths for the L1-penalised logistic regression.
param_grid = [
    {'alpha': [0.0001, 0.0005, 0.001]},
]
logistic_regression_l1 = SGDClassifier(
    loss='log_loss',
    penalty='l1',
    l1_ratio=1,
    n_jobs=jobs,
    random_state=0,
    early_stopping=True,
    class_weight='balanced',
)

# Multi-metric search; the final refit (and therefore best_params_) is driven by recall.
grid_search = GridSearchCV(
    logistic_regression_l1,
    param_grid,
    scoring=['recall', 'roc_auc', 'accuracy'],
    refit='recall',
)
grid_search.fit(transformed_data, y)
grid_search.best_params_

{'alpha': 0.0001}

In [49]:
# Mean cross-validated recall of the best alpha (recall is the refit metric of the
# multi-metric search fitted in the previous cell). `grid_search` is re-used by later
# sections, so this reads whichever search was fitted most recently.
grid_search.best_score_

0.6283987915407855

In [50]:
# L1-penalised logistic regression with alpha fixed at 0.0001, scored with 5-fold CV.
# The clock starts before the estimator is built, so 'runtime' covers construction + CV.
start = time.time()
logistic_regression_l1 = SGDClassifier(
    loss='log_loss',
    alpha=0.0001,
    penalty='l1',
    l1_ratio=1,
    n_jobs=jobs,
    random_state=0,
    early_stopping=True,
    class_weight='balanced',
)
res = cross_validate(
    logistic_regression_l1,
    transformed_data,
    y,
    cv=5,
    scoring=['recall', 'roc_auc', 'accuracy'],
)
end = time.time()

# Average each test metric over the folds and store it next to the wall-clock time.
fold_means = {metric: np.mean(res['test_' + metric]) for metric in ('recall', 'roc_auc', 'accuracy')}
fold_means['runtime'] = end - start
performance_metrics['logistic_regression_l1'] = fold_means

### Second Model: **Support Vector Machines**

In [12]:
from sklearn.linear_model import SGDClassifier

In [52]:
# Linear SVM trained with SGD (hinge loss); class weights compensate for the imbalance.
svc = SGDClassifier(
    loss='hinge',
    random_state=0,
    max_iter=10000,
    early_stopping=True,
    class_weight='balanced',
)

# Candidate regularisation strengths.
param_grid = [
    {'alpha': [0.01, 0.05, 0.1]},
]

# 5-fold grid search run in parallel, selecting alpha by recall.
grid_search = GridSearchCV(
    svc,
    param_grid,
    n_jobs=jobs,
    cv=5,
    scoring='recall',
)
grid_search.fit(transformed_data, y)
grid_search.best_params_

{'alpha': 0.05}

In [53]:
# Mean cross-validated recall of the best alpha found by the SVM grid search
# (scoring='recall' in the previous cell).
grid_search.best_score_

0.6960322255790533

In [57]:
# Linear SVM (hinge-loss SGD) with alpha fixed at 0.05, scored with 5-fold CV.
# The clock starts before the estimator is built, so 'runtime' covers construction + CV.
start = time.time()
svc = SGDClassifier(
    loss='hinge',
    alpha=0.05,
    random_state=0,
    max_iter=10000,
    early_stopping=True,
    class_weight='balanced',
)
res = cross_validate(
    svc,
    transformed_data,
    y,
    cv=5,
    scoring=['recall', 'roc_auc', 'accuracy'],
)
end = time.time()

# Average each test metric over the folds and store it next to the wall-clock time.
fold_means = {metric: np.mean(res['test_' + metric]) for metric in ('recall', 'roc_auc', 'accuracy')}
fold_means['runtime'] = end - start
performance_metrics['SVC'] = fold_means

### Third Model: **Decision Tree**

In [19]:
from sklearn.tree import DecisionTreeClassifier

In [42]:
# Class-weighted decision tree; only the maximum depth is tuned.
clf = DecisionTreeClassifier(
    random_state=0,
    class_weight='balanced',
)
param_grid = [
    {'max_depth': [11, 13, 15]},
]

# 5-fold grid search run in parallel, selecting the depth by recall.
grid_search = GridSearchCV(
    clf,
    param_grid,
    n_jobs=jobs,
    cv=5,
    scoring='recall',
)
grid_search.fit(transformed_data, y)
grid_search.best_params_

{'max_depth': 11}

In [43]:
# Mean cross-validated recall of the best max_depth found by the decision-tree grid
# search (scoring='recall' in the previous cell).
grid_search.best_score_

0.6298489425981872

In [44]:
# Decision tree with max_depth fixed at 11, scored with 5-fold CV run in parallel.
# The clock starts before the estimator is built, so 'runtime' covers construction + CV.
start = time.time()
dec_tree = DecisionTreeClassifier(
    max_depth=11,
    random_state=0,
    class_weight='balanced',
)
res = cross_validate(
    dec_tree,
    transformed_data,
    y,
    cv=5,
    n_jobs=jobs,
    scoring=['recall', 'roc_auc', 'accuracy'],
)
end = time.time()

# Average each test metric over the folds and store it next to the wall-clock time.
fold_means = {metric: np.mean(res['test_' + metric]) for metric in ('recall', 'roc_auc', 'accuracy')}
fold_means['runtime'] = end - start
performance_metrics['decision_tree'] = fold_means

### Fourth Model: **Random Forest**

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [72]:
# Random forest with fixed hyper-parameters; no `cv` is passed, so cross_validate
# falls back to its default splitter. Parallelism comes from the forest itself.
# The clock starts before the estimator is built, so 'runtime' covers construction + CV.
start = time.time()
clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=12,
    min_samples_leaf=7,
    n_jobs=jobs,
    random_state=0,
    class_weight='balanced',
)
res = cross_validate(
    clf,
    transformed_data,
    y,
    scoring=['recall', 'roc_auc', 'accuracy'],
)
end = time.time()

# Average each test metric over the folds and store it next to the wall-clock time.
fold_means = {metric: np.mean(res['test_' + metric]) for metric in ('recall', 'roc_auc', 'accuracy')}
fold_means['runtime'] = end - start
performance_metrics['random_forest'] = fold_means

### Fifth Model: **XGBoost**

In [74]:
from xgboost import XGBClassifier

In [78]:
# Ratio of negative (label 0) to positive (label 1) samples in the target.
class_counts = y.value_counts()
class_counts[0] / class_counts[1]

11.387150050352467

In [81]:
# Gradient-boosted trees, evaluated with cross_validate's default CV splitter.
# X (NaNs kept) is passed instead of transformed_data — presumably relying on XGBoost's
# native handling of missing values; confirm this is intended.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt. 10000 trees of
# depth 12 per fold with no early stopping is the likely cost driver — consider lowering
# n_estimators before re-running.

# Weight the positive class by the actual negative/positive ratio of `y` instead of a
# hand-rounded constant.
class_counts = y.value_counts()
imbalance_ratio = float(class_counts[0] / class_counts[1])

# The clock starts before the estimator is built, so 'runtime' covers construction + CV.
start = time.time()
xgbclf = XGBClassifier(
    learning_rate=0.02,
    n_estimators=10000,
    max_depth=12,
    min_child_weight=30,
    gamma=0.2,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.05,
    reg_lambda=0.05,
    verbosity=0,
    objective='binary:logistic',
    # scikit-learn-style parameter names (replacing the legacy `nthread` / `seed`
    # aliases), consistent with the other estimators in this notebook.
    n_jobs=jobs,
    scale_pos_weight=imbalance_ratio,
    random_state=0,
)
res = cross_validate(
    xgbclf,
    X,
    y,
    scoring=['recall', 'roc_auc', 'accuracy'],
)
end = time.time()

# Average each test metric over the folds and store it next to the wall-clock time.
performance_metrics['xgboost'] = {
    'recall': np.mean(res['test_recall']),
    'roc_auc': np.mean(res['test_roc_auc']),
    'accuracy': np.mean(res['test_accuracy']),
    'runtime': end - start,
}

KeyboardInterrupt: 

### Comparing model performance with runtime:

All of the ROC AUC scores were obtained with k-fold cross-validation, so they are more robust than regular training scores.

In [None]:
# One row per model, one column per recorded metric (plus runtime).
results = pd.DataFrame(performance_metrics).T

# Show the models ranked from highest to lowest mean recall.
results.sort_values(by='recall', ascending=False)