In [6]:
import os
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
import graphviz

# User-defined functions
def model_summary(y_test, dtc_predict, dtc_cv_score):
    print("=== Confusion Matrix ===")
    print(confusion_matrix(y_test, dtc_predict))
    print('\n')
    print("=== Classification Report ===")
    print(classification_report(y_test, dtc_predict))
    print('\n')
    print("=== All AUC Scores ===")
    print(dtc_cv_score)
    print('\n')
    print("=== Mean AUC Score ===")
    print("Mean AUC Score - Random Forest: ", dtc_cv_score.mean())

In [3]:
# Import and prepare data

# FICO Home Equity Line of Credit (HELOC) Dataset (https://community.fico.com/s/explainable-machine-learning-challenge)
heloc = pd.read_csv('heloc_dataset_v1.csv')

# The target variable to predict is a binary variable called RiskPerformance. 
heloc.loc[heloc['RiskPerformance']=='Good','Risk'] = 0
heloc.loc[heloc['RiskPerformance']=='Bad','Risk'] = 1
heloc.drop(columns=['RiskPerformance'], inplace=True)

# Define predcitors and target variable
X = heloc.drop(columns=['Risk'])
y = heloc['Risk']

# Implement train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=66)

In [19]:
# Create a simple tree model
dtc = tree.DecisionTreeClassifier()
dtc = dtc.fit(X_train, y_train)
dtc_predict = dtc.predict(X_test) # predictions
dtc_cv_score = cross_val_score(dtc, X, y, cv=10, scoring='roc_auc') #scores

# Model summary
model_summary(y_test, dtc_predict, dtc_cv_score)

=== Confusion Matrix ===
[[ 968  694]
 [ 615 1175]]


=== Classification Report ===
              precision    recall  f1-score   support

         0.0       0.61      0.58      0.60      1662
         1.0       0.63      0.66      0.64      1790

    accuracy                           0.62      3452
   macro avg       0.62      0.62      0.62      3452
weighted avg       0.62      0.62      0.62      3452



=== All AUC Scores ===
[0.64215568 0.58665934 0.61470147 0.60341392 0.63930403 0.67995238
 0.62679487 0.61712271 0.61697985 0.64800183]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.6275086083946635


In [22]:
dot_data = tree.export_graphviz(dtc, out_file=None, filled = True)
graph = graphviz.Source(dot_data)
graph.render("image",view=True)

'digraph Tree {\nnode [shape=box, style="filled", color="black"] ;\n0 [label="X[0] <= 73.5\\ngini = 0.499\\nsamples = 7007\\nvalue = [3338, 3669]", fillcolor="#edf6fd"] ;\n1 [label="X[0] <= 67.5\\ngini = 0.425\\nsamples = 4138\\nvalue = [1270, 2868]", fillcolor="#91c8f1"] ;\n0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;\n2 [label="X[22] <= 47.5\\ngini = 0.373\\nsamples = 2742\\nvalue = [680, 2062]", fillcolor="#7abdee"] ;\n1 -> 2 ;\n3 [label="X[14] <= 0.5\\ngini = 0.488\\nsamples = 546\\nvalue = [231, 315]", fillcolor="#cae5f8"] ;\n2 -> 3 ;\n4 [label="X[8] <= 10.0\\ngini = 0.483\\nsamples = 504\\nvalue = [205, 299]", fillcolor="#c1e0f7"] ;\n3 -> 4 ;\n5 [label="X[1] <= 388.5\\ngini = 0.489\\nsamples = 470\\nvalue = [200, 270]", fillcolor="#cce6f8"] ;\n4 -> 5 ;\n6 [label="X[17] <= 50.5\\ngini = 0.487\\nsamples = 466\\nvalue = [196, 270]", fillcolor="#c9e4f8"] ;\n5 -> 6 ;\n7 [label="X[4] <= 32.5\\ngini = 0.489\\nsamples = 459\\nvalue = [196, 263]", fillcolor="#cde6f8"] ;\n

In [20]:
# Create a Random Forest Model

rfc = RandomForestClassifier()
rfc.fit(X_train,y_train) #fit
rfc_predict = rfc.predict(X_test) # predictions
rfc_cv_score = cross_val_score(rfc, X, y, cv=10, scoring='roc_auc') #scores

# Model summary
model_summary(y_test, rfc_predict, rfc_cv_score)

=== Confusion Matrix ===
[[1088  574]
 [ 404 1386]]


=== Classification Report ===
              precision    recall  f1-score   support

         0.0       0.73      0.65      0.69      1662
         1.0       0.71      0.77      0.74      1790

    accuracy                           0.72      3452
   macro avg       0.72      0.71      0.71      3452
weighted avg       0.72      0.72      0.72      3452



=== All AUC Scores ===
[0.78328938 0.62908608 0.80142491 0.77339194 0.79255678 0.82602747
 0.79245788 0.73740476 0.80682784 0.82320183]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.7765668867829418


In [None]:
# Tuning Hyperparameters

# Optimize select hyperparamaters
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)] # number of trees in random forest
max_features = ['auto', 'sqrt'] # number of features at every split
max_depth = [int(x) for x in np.linspace(100, 500, num = 11)] # max depth
max_depth.append(None)

# Create random grid
random_grid = {
 'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth
 }

# Random search of parameters
rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, 
                                cv = 3, verbose=2, random_state=42, n_jobs = -1)
rfc_random.fit(X_train, y_train) # Fit the model
print(rfc_random.best_params_) # Print results


In [None]:
# Final Model
# Refit the model with tuned parameters
rfc = RandomForestClassifier(n_estimators=1400, max_depth=100, max_features='auto')
rfc.fit(X_train,y_train) #fit
rfc_predict = rfc.predict(X_test) # predictions
rfc_cv_score = cross_val_score(rfc, X, y, cv=10, scoring='roc_auc') #scores

# Model summary
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())


In [None]:
# Feature Importance

importances = rfc.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfc.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(X.columns, rfc.feature_importances_):
    feats[feature] = importance 
importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})

importances.sort_values(by='Gini-importance').plot(kind='bar', rot=90) #Plot
importances.head() # Print the feature ranking