# Table of Contents
* [Target Exploration](#1)
* [Numerical Features](#2)
* [Categorical Features](#3)
* [Target vs Features](#4)
* [Fit Model](#5)
* [Partial Dependence Plots](#6)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

# ML tools
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [None]:
# load the loan data set from the input directory
data_path = '../input/loan-data/loan_data.csv'
df = pd.read_csv(data_path)

# preview of the first rows (shown as the cell output)
df.head()

In [None]:
# data overview: print the column list with non-null counts and dtypes
df.info()

<a id='1'></a>
# Target Exploration

In [None]:
# class balance of the target column
target = 'not.fully.paid'
target_counts = df[target].value_counts()
print(target_counts)

# the same counts as a bar chart
target_counts.plot(kind='bar')
plt.grid()
plt.show()

<a id='2'></a>
# Numerical Features

In [None]:
# columns treated as numerical features throughout the notebook
features_num = [
    'int.rate',
    'installment',
    'log.annual.inc',
    'dti',
    'fico',
    'days.with.cr.line',
    'revol.bal',
    'revol.util',
    'inq.last.6mths',
    'delinq.2yrs',
]

In [None]:
# summary statistics of the numerical features
# (the resulting table is the cell's displayed output)
df[features_num].describe()

In [None]:
# one 30-bin histogram per numerical feature, each on its own figure
for f in features_num:
    fig, ax = plt.subplots(figsize=(8,4))
    df[f].plot(kind='hist', bins=30, ax=ax)
    ax.set_title(f)
    ax.grid()
    plt.show()

### More details: distribution of `revol.bal`

In [None]:
# horizontal boxplot of revol.bal on its original scale
fig, ax = plt.subplots(figsize=(8,2))
ax.boxplot(df['revol.bal'], vert=False)
ax.set_title('revol.bal - Boxplot')
ax.grid()
plt.show()

In [None]:
# same boxplot after a log10(1 + x) transform; the +1 keeps zero values defined
revol_bal_log = np.log10(1+df['revol.bal'])
fig, ax = plt.subplots(figsize=(8,2))
ax.boxplot(revol_bal_log, vert=False)
ax.set_title('revol.bal - Boxplot / log-transformed')
ax.grid()
plt.show()

### Correlations:

In [None]:
# pairwise correlations of the numerical features (linear and rank-based)
corr_pearson = df[features_num].corr(method='pearson')
corr_spearman = df[features_num].corr(method='spearman')

# one annotated heatmap per correlation matrix, colour scale fixed to [-1, +1]
for corr_title, corr_matrix in (('Pearson Correlation', corr_pearson),
                                ('Spearman Correlation', corr_spearman)):
    fig = plt.figure(figsize = (9,7))
    sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1)
    plt.title(corr_title)
    plt.show()

### Scatter Plots:

In [None]:
# example of scatter plot: hexbin joint plot of 'fico' (x) against 'int.rate' (y)
sns.jointplot(data=df, x='fico', y='int.rate', kind='hex')
plt.show()

In [None]:
# example of scatter plot: hexbin joint plot of 'revol.util' (x) against 'int.rate' (y)
sns.jointplot(data=df, x='revol.util', y='int.rate', kind='hex')
plt.show()

<a id='3'></a>
# Categorical Features

In [None]:
# columns treated as categorical features throughout the notebook
features_cat = [
    'credit.policy',
    'purpose',
    'pub.rec',
]

In [None]:
# one bar chart of level frequencies per categorical feature
for f in features_cat:
    fig, ax = plt.subplots(figsize=(14,4))
    level_counts = df[f].value_counts()
    level_counts.plot(kind='bar', ax=ax)
    ax.set_title(f)
    ax.grid()
    plt.show()

<a id='4'></a>
# Target vs Features

### Numerical Features

In [None]:
# plot target vs binned numerical features using mosaic plot
# NOTE: as a side effect every numerical feature gets a '<feature>_bin' column in df.
#
# rc_context enlarges the figure size for the mosaics only and restores the
# previous plot settings when the block is left, even if a plot raises.
with plt.rc_context({'figure.figsize': (16,5)}):
    for f in features_num:

        # add binned version of each numerical feature first
        # (10 quantile bins; duplicates='drop' merges bins with identical edges,
        #  so heavily tied features may end up with fewer than 10 bins)
        new_var = f + '_bin'
        df[new_var] = pd.qcut(df[f], 10, duplicates='drop')

        # then create mosaic plot
        mosaic(df, [new_var, target], title='Target vs ' + f + ' [binned]')
        plt.show()

### Categorical Features

In [None]:
# plot target vs categorical features using mosaic plot
# The features are plotted with their original levels (no binning), so the
# title does not carry a ' [binned]' suffix.
#
# rc_context enlarges the figure size for the mosaics only and restores the
# previous plot settings when the block is left, even if a plot raises.
with plt.rc_context({'figure.figsize': (16,5)}):
    for f in features_cat:
        mosaic(df, [f, target], title='Target vs ' + f,
               label_rotation=90)
        plt.show()

<a id='5'></a>
# Fit Model

In [None]:
# model inputs: numerical features first, then categorical features
predictors = [*features_num, *features_cat]
n_predictors = len(predictors)
print('Number of predictors: ', n_predictors)
print(predictors)

In [None]:
# start H2O (connects to or launches an H2O instance) with explicit resource limits
h2o.init(max_mem_size='12G', nthreads=4) # Use maximum of 12 GB RAM and 4 cores

In [None]:
# copy the pandas data frame into the H2O environment
df_hex = h2o.H2OFrame(df)

# convert the target to a factor so it is handled as categorical
# NOTE(review): only the target is converted here; any integer-coded columns in
# features_cat keep whatever type H2O inferred on upload - confirm this is intended
target_factor = df_hex[target].asfactor()
df_hex[target] = target_factor

# train / test split (70/30) with a fixed seed for reproducibility
train_perc = 0.7
split_frames = df_hex.split_frame(ratios=[train_perc], seed=999)
train_hex, test_hex = split_frames

In [None]:
# define Linear Model: binomial GLM with 5-fold cross validation
glm_params = dict(
    family='binomial',
    nfolds=5,
    alpha=0, # 0:Ridge (L2), 1:LASSO (L1)
    lambda_search=True,
    score_each_iteration=True,
    seed=12345,
)
fit_1 = H2OGeneralizedLinearEstimator(**glm_params)

In [None]:
# train model and report the elapsed time
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; unlike time.time() it is not affected by system clock adjustments.
t1 = time.perf_counter()
fit_1.train(x=predictors,
            y=target,
            training_frame=train_hex)
t2 = time.perf_counter()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# show cross validation metrics (summary table is the cell's displayed output)
fit_1.cross_validation_metrics_summary()

In [None]:
# show the coefficients of the fitted GLM (displayed as the cell output)
fit_1.coef()

### Variable Importance

In [None]:
# variable importance plot
# NOTE(review): -1 is passed as the first positional argument (presumably the number
# of features to plot) - confirm the installed H2O version treats it as "all features"
fit_1.varimp_plot(-1)

### Training/CV - Performance

In [None]:
# confusion matrix on the training data with a manual threshold;
# the threshold was chosen by hand to try to achieve a symmetric outcome
# and is reused below for the cross validation and test set matrices
tt = 0.2419
conf_train_man = fit_1.confusion_matrix(train=True, thresholds=tt)
conf_train_man.show()

In [None]:
# confusion matrix on cross validation, using the same manual threshold tt
conf_cv_man = fit_1.confusion_matrix(xval=True, thresholds=tt)
conf_cv_man.show()

In [None]:
# training performance - AUC
# plot() draws the default performance curve (presumably the ROC curve - confirm)
perf_train = fit_1.model_performance(train=True)
perf_train.plot()

In [None]:
# cross validation performance - AUC
# plot() draws the default performance curve (presumably the ROC curve - confirm)
perf_cv = fit_1.model_performance(xval=True)
perf_cv.plot()

### Test Set - Performance

In [None]:
# calc performance on test set (data held out from training)
perf_test = fit_1.model_performance(test_hex)

# ROC Curve - Test Set
perf_test.plot()

In [None]:
# confusion matrix on the test set using our manual threshold tt
conf_test_man = perf_test.confusion_matrix(thresholds=tt)
conf_test_man.show()

In [None]:
# accuracy at the manual threshold: diagonal of the confusion matrix
# divided by the total of its first two rows
conf_list_temp = conf_test_man.to_list()
n_matrix = sum(sum(row) for row in conf_list_temp[:2])
n_correct = conf_list_temp[0][0] + conf_list_temp[1][1]
acc_t1_test = n_correct / n_matrix
print('Accuracy:', np.round(acc_t1_test,6))

In [None]:
# score the test set and keep only the 'p1' probability column as a pandas Series
pred_frame = fit_1.predict(test_hex)
pred_test = pred_frame['p1'].as_data_frame()['p1']

In [None]:
# distribution of the predicted probabilities on the test set (100 bins)
fig, ax = plt.subplots(figsize=(7,5))
ax.hist(pred_test, bins=100)
ax.set_title('Predictions on Test Set')
ax.grid()
plt.show()

<a id='6'></a>
# Partial Dependence Plots

In [None]:
# partial dependence plot for 'credit.policy' (on training data);
# the trailing semicolon suppresses the returned object in the cell output
fit_1.pd_plot(train_hex, column='credit.policy');

In [None]:
# partial dependence plot for 'int.rate' (on training data);
# the trailing semicolon suppresses the returned object in the cell output
fit_1.pd_plot(train_hex, column='int.rate');

In [None]:
# partial dependence plot for 'fico' (on training data);
# the trailing semicolon suppresses the returned object in the cell output
fit_1.pd_plot(train_hex, column='fico');

In [None]:
# partial dependence plot for 'purpose' (on training data);
# the trailing semicolon suppresses the returned object in the cell output
fit_1.pd_plot(train_hex, column='purpose');