## Home Credit Default Risk - Team 3 (Kahsai, Nichols, Pellerito)

### Import packages

In [None]:
# standard Python tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# special tools for working in Kaggle
import joblib   # save and load ML models
import gc       # garbage collection
import os 
import sklearn

# preprocessing steps
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# machine learning models and tools
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

# cross validation and metrics - remember this competition is scored as area under curve
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold

# surely there will be a lot more packages loaded by the time we are done!

# First look at training data set

### Read the training data

In [None]:
# Directory holding the competition files; list its contents as a quick check.
MainDir = "../input/../input/home-credit-default-risk"
print(os.listdir(MainDir))

# Main table: load the application-level training file into a DataFrame.
train_csv = f'{MainDir}/application_train.csv'
train = pd.read_csv(train_csv)

# Preview the first five rows (head() defaults to 5).
train.head()

### Training data - select smaller sample and limited features for first run

In [None]:
# Create a stratified sample: reduce number of observations to 10,000, but keep
# the original proportion of 0s and 1s in TARGET.
print(1 - train['TARGET'].mean())   # full data set is 91.93% zero and 8.07% one.

n = 10000         # set sample size
SAMPLE_SEED = 1   # fixed seed so Restart & Run All always draws the same sample

# Sample each TARGET class separately, sized by that class's share of the full data.
# Iterating over the groups (instead of groupby.apply) keeps the TARGET column in the
# result regardless of how the installed pandas version treats grouping columns in apply.
class_samples = [
    group.sample(n=int(np.rint(n * len(group) / len(train))), random_state=SAMPLE_SEED)
    for _, group in train.groupby('TARGET')
]

# Stack the per-class samples, shuffle the rows so the classes are interleaved,
# and renumber the index from 0.
train10K = (
    pd.concat(class_samples)
    .sample(frac=1, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

# did that work? Yes, 9193 out of 10,000 in our sample are zeroes.
(train10K['TARGET'].value_counts() / len(train10K)).to_frame()

### Feature selection (just a few variables for now) and pipeline

In [None]:
# Selected features:
num_features = ['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'OWN_CAR_AGE']
cat_features = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
features = num_features + cat_features

# Numeric pipeline: fill missing values with the column median, then standardize.
Pipe_num = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

# Categorical pipeline: fill missing values with the literal 'Unknown', then one-hot encode.
Pipe_cat = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
        # handle_unknown='ignore': the encoder is fit on a small sample and reused later,
        # so a category level that was absent at fit time must not raise in transform();
        # such a level is encoded as all zeros for that feature.
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

# ColumnTransformer: route each feature list through its matching pipeline.
# Columns not named in either list are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipe_num, num_features),
        ('cat', Pipe_cat, cat_features),
    ])

### Build Model Scoreboard

In [None]:
# Start an empty scoreboard; each model cell below adds one row to it.
scoreboard_columns = ['Model Type', 'AreaUnderCurve', 'Accuracy', 'Hyperparameters']
results = pd.DataFrame(columns=scoreboard_columns)
results

In [None]:
# Fit the preprocessing steps on the sampled features and transform them in one call.
# NOTE(review): the preprocessor is fit on the whole sample before the grid searches
# cross-validate on X_train, so imputer/scaler statistics include the validation folds.
X_train = preprocessor.fit_transform(train10K[features])

# Target labels as a plain array, aligned row-for-row with X_train.
y_train = train10K['TARGET'].values

print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)

# Models

### Logistic Regression

In [None]:
%%time

# sorted(sklearn.metrics.SCORERS.keys())    <--- need metric roc_auc

lr_clf = LogisticRegression(max_iter=1000, solver='saga', penalty = 'elasticnet')

lr_parameters = {'l1_ratio':[0, 0.3, 0.6, 1], 'C': [0.01, 0.1, 0.3, 1, 3]}

lr_grid = GridSearchCV(lr_clf, lr_parameters, cv=10, refit='True', n_jobs=-1, verbose=10, scoring='roc_auc')
lr_grid.fit(X_train, y_train)

lr_model = lr_grid.best_estimator_

# update model scoreboard
results = results.append({'Model Type' : 'Logistic Regression',
                          'AreaUnderCurve' : lr_grid.best_score_,
                          'Accuracy' : lr_model.score(X_train, y_train),
                          'Hyperparameters' : lr_grid.best_params_},
                        ignore_index=True)
results

### Decision Tree

In [None]:
%%time

dt_clf = DecisionTreeClassifier(random_state=1)

dt_parameters = {
    'max_depth': [4, 8, 12, 16, 20, 24],
    'min_samples_leaf': [2, 4, 6, 8]
}

dt_grid = GridSearchCV(dt_clf, dt_parameters, cv=10, refit='True', n_jobs=-1, verbose=0, scoring='roc_auc')
dt_grid.fit(X_train, y_train)

dt_model = dt_grid.best_estimator_

# update model scoreboard
results = results.append({'Model Type' : 'Decision Tree',
                          'AreaUnderCurve' : dt_grid.best_score_,
                          'Accuracy' : dt_model.score(X_train, y_train),
                          'Hyperparameters' : dt_grid.best_params_},
                        ignore_index=True)
results

### Random Forest

In [None]:
%%time

rf_clf = RandomForestClassifier(random_state=1, n_estimators=100)

rf_parameters = {'max_depth': [4, 8, 12, 16, 20],  'min_samples_leaf': [4, 6, 8, 10, 12]}

rf_grid = GridSearchCV(rf_clf, rf_parameters, cv=10, refit='True', n_jobs=-1, verbose=0, scoring='roc_auc')
rf_grid.fit(X_train, y_train)

rf_model = rf_grid.best_estimator_

# update model scoreboard
results = results.append({'Model Type' : 'Random Forest',
                          'AreaUnderCurve' : rf_grid.best_score_,
                          'Accuracy' : rf_model.score(X_train, y_train),
                          'Hyperparameters' : rf_grid.best_params_},
                        ignore_index=True)
results

### Model plots

In [None]:
# One row of three panels, one per grid search, showing mean CV score per candidate.
fig, axes = plt.subplots(1, 3, figsize=[18, 4])
lr_ax, dt_ax, rf_ax = axes

# Panel 1 - logistic regression: CV score against C, one line per l1_ratio.
lr_summary = pd.DataFrame(lr_grid.cv_results_['params'])
lr_summary['cv_score'] = lr_grid.cv_results_['mean_test_score']

for ratio in lr_parameters['l1_ratio']:
    subset = lr_summary.query(f'l1_ratio == {ratio}')
    lr_ax.plot(subset.C, subset.cv_score, label=ratio)
lr_ax.set_xscale('log')   # the C grid spans orders of magnitude
lr_ax.set_xlabel('Regularization Parameter (C)')
lr_ax.set_ylabel('CV Score')
lr_ax.legend(title='L1 Ratio', loc='lower right')
lr_ax.grid()

# Panel 2 - decision tree: CV score against max_depth, one line per min_samples_leaf.
dt_summary = pd.DataFrame(dt_grid.cv_results_['params'])
dt_summary['cv_score'] = dt_grid.cv_results_['mean_test_score']

for leaf_size in dt_parameters['min_samples_leaf']:
    subset = dt_summary.query(f'min_samples_leaf == {leaf_size}')
    dt_ax.plot(subset.max_depth, subset.cv_score, label=leaf_size)
dt_ax.set_xlabel('Maximum Depth')
dt_ax.set_ylabel('CV Score')
dt_ax.legend(title='Min Samples')
dt_ax.grid()

# Panel 3 - random forest: CV score against max_depth, one line per min_samples_leaf.
rf_summary = pd.DataFrame(rf_grid.cv_results_['params'])
rf_summary['cv_score'] = rf_grid.cv_results_['mean_test_score']

for leaf_size in rf_parameters['min_samples_leaf']:
    subset = rf_summary.query(f'min_samples_leaf == {leaf_size}')
    rf_ax.plot(subset.max_depth, subset.cv_score, label=leaf_size)
rf_ax.set_xlabel('Maximum Depth')
rf_ax.set_ylabel('CV Score')
rf_ax.legend(title='Min Samples')
rf_ax.grid()

plt.show()

### Sidebar - confusion matrix

In [None]:
# The decision tree model shows a handful of people as having a >50% chance of default.
# But we are scored on AUC, not on accuracy.

# Confusion matrix of the tuned decision tree's predictions on the training sample
# (rows are true labels, columns are predicted labels).
predictions = dt_model.predict(X_train)
matrix = pd.DataFrame(confusion_matrix(train10K['TARGET'], predictions))

# Draw the counts as an annotated seaborn heatmap on an explicit axes.
fig, ax = plt.subplots()
sns.heatmap(matrix, annot=True, cmap="BuPu", fmt='g', cbar=False, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()

### Final Model Selection - save model and preprocessor

In [None]:
# Train the model that gets saved.
# NOTE(review): these hyperparameters are hard-coded rather than taken from the
# random-forest grid search (n_estimators=25 and min_samples_leaf=2 are not among the
# values searched there) - confirm this choice is intentional.
final_params = dict(random_state=1, n_estimators=25, max_depth=4, min_samples_leaf=2)
final_model = RandomForestClassifier(**final_params).fit(X_train, y_train)

# Save the fitted preprocessor and the model so they can be reloaded together later.
joblib.dump(preprocessor, 'default_preprocessor.joblib')
joblib.dump(final_model, 'default_model_01.joblib')