# Data loading

_This part was adapted from a notebook by @Xiao Song_

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd    
import matplotlib.pyplot as plt
from tabulate import tabulate
# Raise the pandas display limits so that large intermediate tables remain inspectable.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 100

# Read the two raw tables: the applicants' records and their monthly credit records.
application = pd.read_csv('../input/credit-card-approval-prediction/application_record.csv')
credit = pd.read_csv('../input/credit-card-approval-prediction/credit_record.csv')

# Cell output: the credit records table.
credit

In [None]:
application

In [None]:
# Compare the ID populations of the two tables: size of each and size of their overlap.
application_ids = set(application['ID'])
credit_ids = set(credit['ID'])
shared_ids = application_ids.intersection(credit_ids)

print(f"how many unique ID in application record? {len(application_ids)}")
print(f"how many unique ID in credit record? {len(credit_ids)}")
print(f"how many IDs do two tables share? {len(shared_ids)}")

# Definition of "Bad" client

A detailed explanation can be found [here](https://www.listendata.com/2019/09/credit-risk-vintage-analysis.html).


_This part was adapted from a notebook by @Xiao Song_

In [None]:
# Build the per-account observation summary and the skeleton of the vintage table.
#
# Names produced by this cell:
#   pivot_tb    - one row per ID with open_month, end_month and window
#   credit0     - every credit record joined with pivot_tb (no window filter)
#   credit      - credit records of accounts observed for more than 20 months,
#                 with the 0/1 `status` flag and `month_on_book`
#   denominator - number of accounts per open_month
#   vintage     - one row per (open_month, month_on_book); due_count is filled in later
grouped = credit.groupby('ID')

# Smallest MONTHS_BALANCE is the month the loan was granted; the biggest one is the last
# observed month (observation over, or account cancelled). Both come straight from the
# groupby, so the long table never has to be pivoted to wide format.
pivot_tb = grouped['MONTHS_BALANCE'].agg(open_month='min', end_month='max').reset_index()
pivot_tb['window'] = pivot_tb['end_month'] - pivot_tb['open_month']  # observe window

credit = pd.merge(credit, pivot_tb, on='ID', how='left')  # join calculated information
credit0 = credit.copy()  # unfiltered snapshot

# Drop users whose observe window is 20 or less. `.copy()` makes the filtered frame
# independent, so the column assignments below do not write into a slice.
credit = credit[credit['window'] > 20].copy()
# 1: more than 60 days past due (STATUS '2' to '5'), 0: not
credit['status'] = credit['STATUS'].isin(['2', '3', '4', '5']).astype(np.int8)
# month on book: how many months after opening the account
credit['month_on_book'] = credit['MONTHS_BALANCE'] - credit['open_month']
credit = credit.sort_values(by=['ID', 'month_on_book'])

##### denominator
# How many accounts were opened in every month (counted over all accounts in pivot_tb).
denominator = (
    pivot_tb.groupby('open_month', as_index=False)['ID']
    .count()
    .rename(columns={'ID': 'sta_sum'})
)

##### vintage table
# One row per observed (open_month, month_on_book) pair; due_count starts empty.
vintage = (
    credit.groupby(['open_month', 'month_on_book'], as_index=False)['ID']
    .count()
    .drop(columns='ID')
    .assign(due_count=np.nan)
)
vintage = pd.merge(vintage, denominator, on='open_month', how='left')  # add sta_sum
vintage

In [None]:
# Fill vintage['due_count'] with the cumulative number of distinct overdue accounts
# per opening month, then derive the cumulative bad rate.
larger_window = abs(vintage['open_month'].min())

# Only overdue records matter; filter them once instead of on every loop iteration.
overdue = credit.loc[credit['status'] == 1, ['ID', 'open_month', 'month_on_book']]

for j in range(-larger_window, 1):  # outer loop: month in which the account was opened
    cohort = overdue[overdue['open_month'] == j]
    bad_ids = set()  # grows with time on book: once bad, an account stays counted
    for i in range(0, larger_window + 1):  # inner loop: months since the card was granted
        bad_ids.update(cohort.loc[cohort['month_on_book'] == i, 'ID'])
        vintage.loc[(vintage['month_on_book'] == i) & (vintage['open_month'] == j), 'due_count'] = len(bad_ids)

vintage['sta_rate'] = vintage['due_count'] / vintage['sta_sum']  # cumulative % of bad customers
vintage

Using `pivot` to convert long data to wide data:

In [None]:
### Vintage wide table: one row per open_month, one column per month_on_book
vintage_wide = vintage.pivot(index='open_month', columns='month_on_book', values='sta_rate')

# Copy of the wide table with zero rates blanked out (the plot below uses vintage_wide)
vintage0 = vintage_wide.replace(0, np.nan)

# Vintage line chart: one line per opening month, x axis is the month on book
lst = list(range(0, larger_window + 1))
ax = vintage_wide[lst].T.plot(
    legend=False,
    grid=True,
    title='Cumulative % of Bad Customers (> 60 Days Past Due)',
)
ax.set_xlabel('Months on Books')
ax.set_ylabel('Cumulative % > 60 Days Past Due')
plt.show()

## Observe Window Analysis

The observation of an account is truncated for one of two reasons: the account is cancelled, or the observation period ends. The observe window is therefore an important parameter. If it is too short, users' behaviour does not have time to show fully, which adds unnecessary noise to the data.

To see how the share of accounts grows as the observe window lengthens, we plot, for each window length, the ratio of accounts whose observe window is shorter than that length.

In [None]:
# For every candidate window length, the share of accounts whose observe window is shorter.
account_total = len(set(pivot_tb['ID']))
lst = [
    len(pivot_tb[pivot_tb['window'] < months]) / account_total
    for months in range(0, larger_window + 1)
]

ax = pd.Series(lst).plot(legend=False, grid=True, title=' ')
ax.set_xlabel('Observe Window')
ax.set_ylabel('account ratio')
plt.show()

We can see that a 60-month observe window covers all applicants, while a 20-month window contains about 52% of the records.

In [None]:
def calculate_observe(credit, command):
    '''Cumulative share of accounts that have turned "bad" by each month on book.

    Parameters
    ----------
    credit : DataFrame
        Credit records with `ID`, `STATUS`, `MONTHS_BALANCE` and `open_month` columns.
        The frame is copied; the caller's data is left untouched.
    command : iterable of str, or str
        The STATUS codes that count as "bad". For backward compatibility a string is
        still accepted: it is executed as a Python statement that must set
        `credit['status'] = 1` on the bad rows (trusted input only).

    Returns
    -------
    Series named 'rate', indexed 0..larger_window (months on book). Each value is the
    number of accounts whose first bad month is <= that month, divided by the number
    of distinct IDs in the global `pivot_tb`.
    '''
    credit = credit.copy()  # do not clobber the caller's `status` column
    larger_window = abs(credit['MONTHS_BALANCE'].min())
    id_sum = len(set(pivot_tb['ID']))
    credit['status'] = 0
    if isinstance(command, str):
        # Legacy form: the statement refers to the frame by the name `credit`.
        exec(command, globals(), {'credit': credit})
    else:
        credit.loc[credit['STATUS'].isin(list(command)), 'status'] = 1
    credit['month_on_book'] = credit['MONTHS_BALANCE'] - credit['open_month']

    # First month on book at which each account is flagged (one entry per bad ID).
    first_bad = credit[credit['status'] == 1].groupby('ID')['month_on_book'].min()

    rates = []
    bad_so_far = 0
    for month in range(0, larger_window + 1):
        bad_so_far += int((first_bad == month).sum())
        rates.append(bad_so_far / id_sum)
    return pd.Series(rates, name='rate')


# STATUS codes ordered by increasing delinquency; overdue_codes[k:] selects every
# status at least as severe as code k.
overdue_codes = ['0', '1', '2', '3', '4', '5']
morethan1 = calculate_observe(credit, overdue_codes)
morethan30 = calculate_observe(credit, overdue_codes[1:])
morethan60 = calculate_observe(credit, overdue_codes[2:])
morethan90 = calculate_observe(credit, overdue_codes[3:])
morethan120 = calculate_observe(credit, overdue_codes[4:])
morethan150 = calculate_observe(credit, overdue_codes[5:])

In [None]:
# Gather the cumulative bad-customer curves, one column per past-due threshold, and plot them.
curve_labels = ['past due more than 30 days',
                'past due more than 60 days',
                'past due more than 90 days',
                'past due more than 120 days',
                'past due more than 150 days']
curves = [morethan30, morethan60, morethan90, morethan120, morethan150]
obslst = pd.DataFrame(dict(zip(curve_labels, curves)))

ax = obslst.plot(grid=True, title='Cumulative % of Bad Customers Analysis')
ax.set_xlabel('Months on Books')
ax.set_ylabel('Cumulative %')
plt.show()

This plot can be seen as an average (across open months) version of the vintage plot.
A longer past-due threshold needs a longer observe window. For example, more than 150 days past due needs at least 5 months before the first *bad customer* appears. In most situations, a 20-month observe window covers most *bad customers*. However, for 30 days past due, we can see that after 30 months on books new *bad customers* are still joining the list. So a 20 MOB observe window is appropriate. Accounts that exist for less than the observe window should be excluded from our analysis, which is why I deleted users whose observe window is less than 20 in the previous section (the window can be changed).

## Overall Past-due Ratio

Here we calculate the overall past-due rate for each threshold: 1 day, 30 days, 60 days, 90 days, 120 days and 150 days past due. This analysis helps us define who the *bad customers* are. We can see that almost 87% of users have been past due for more than 1 day, which is too common to be a useful standard. What about 150 days overdue? Only 0.4% of accounts are past due that long. If we used that, we would leave many *bad customers* out of our scrutiny. A table like this helps you determine the most suitable standard for *bad customers*.

In [None]:
def calculate_rate(pivot_tb, command):
    '''Share of accounts that were "bad" at least once.

    Parameters
    ----------
    pivot_tb : DataFrame
        One row per account, with an `ID` column. Not modified.
    command : iterable of str, or str
        The STATUS codes that count as "bad". For backward compatibility a string is
        still accepted: it is executed as a Python statement that must set
        `credit0['status'] = 1` on the bad rows (trusted input only).

    Returns
    -------
    Ratio of accounts in `pivot_tb` with at least one bad record, rounded to 5 decimals.

    Side effect: the `status` column of the global `credit0` is overwritten with the
    0/1 flag for this threshold.
    '''
    credit0['status'] = 0
    if isinstance(command, str):
        exec(command)  # legacy form
    else:
        credit0.loc[credit0['STATUS'].isin(list(command)), 'status'] = 1
    # 1 if the account has at least one flagged record, else 0
    ever_bad = credit0.groupby('ID')['status'].max()
    pivot_tb = pd.merge(pivot_tb, ever_bad, on='ID', how='left')
    rate = pivot_tb['status'].sum() / len(pivot_tb)
    return round(rate, 5)


# STATUS codes ordered by increasing delinquency; overdue_codes[k:] selects every
# status at least as severe as code k.
overdue_codes = ['0', '1', '2', '3', '4', '5']
morethan1 = calculate_rate(pivot_tb, overdue_codes)
morethan30 = calculate_rate(pivot_tb, overdue_codes[1:])
morethan60 = calculate_rate(pivot_tb, overdue_codes[2:])
morethan90 = calculate_rate(pivot_tb, overdue_codes[3:])
morethan120 = calculate_rate(pivot_tb, overdue_codes[4:])
morethan150 = calculate_rate(pivot_tb, overdue_codes[5:])

summary_dt = pd.DataFrame({'situation': ['past due more than 1 day',
                                         'past due more than 30 days',
                                         'past due more than 60 days',
                                         'past due more than 90 days',
                                         'past due more than 120 days',
                                         'past due more than 150 days'],
                           'bad customer ratio': [morethan1,
                                                  morethan30,
                                                  morethan60,
                                                  morethan90,
                                                  morethan120,
                                                  morethan150]})
summary_dt

## Definition of targets

Possibilities : 
1. Past due more than X days
2. Past more than Y% of dues

In [None]:
#"Bad" client are identified as client that past due more than 30 days
# One label per client: target = 1 ("bad") when at least one monthly record has
# STATUS '1' to '5', otherwise 0 ("good").
# The label is aggregated per ID on purpose. With one row per monthly record, the later
# merge on ID would repeat every applicant once per month, with conflicting labels, and
# the same client would land in both the train and the test set.
y = (
    credit0.assign(target=credit0['STATUS'].isin(['1', '2', '3', '4', '5']).astype(int))
    .groupby('ID', as_index=False)['target']
    .max()
)
y

## Features engineering

In [None]:
# Re-read the application table (the same file is already loaded in the first cell).
# The cells below reassign `application`, so re-running from here starts from the raw file.
application = pd.read_csv('../input/credit-card-approval-prediction/application_record.csv') 

In [None]:
#Numerical features
# The replacements are value-based and therefore apply to every column of the table.
# pandas has deprecated the implicit downcast of object columns after `replace`, so
# `infer_objects()` makes the conversion to numeric dtypes explicit. Without it the
# encoded columns could stay `object` and be one-hot encoded by `get_dummies` later.
application = (
    application
    .replace(['N', 'Y'], [0, 1])  # Converts Yes/No to 1/0
    .rename(columns={"CODE_GENDER": "F", "NAME_EDUCATION_TYPE": "EDUCATION"})
    .replace(['F', 'M'], [1, 0])  # Converts Female/Male to 1/0
    .replace(['Academic degree', 'Higher education', 'Incomplete higher',
              'Secondary / secondary special', 'Lower secondary'],
             [4, 3, 2, 1, 0])  # Converts education level to an ordinal feature
    .infer_objects()
)

In [None]:
#Convert categorical variables
# Missing occupations become their own 'Unknown' category before one-hot encoding.
application['OCCUPATION_TYPE'] = application['OCCUPATION_TYPE'].fillna('Unknown')
application = pd.get_dummies(application)

# Machine Learning

In [None]:
#Merge both datasets
# Keep only the IDs present in both tables, then drop the identifier so that it is not used as a feature.
df = pd.merge(application, y, on="ID", how="inner").drop(columns=["ID"])
#df = df[:10000] #Test only

In [None]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.model_selection import cross_validate

n_tree = 200

#Create a random forest classifier
# `max_samples` is not set: it only applies to bootstrap sampling, and scikit-learn
# raises a ValueError at fit time when it is combined with bootstrap=False.
RandomForest = RandomForestClassifier(n_estimators=n_tree,
                              max_depth=15,
                              min_samples_leaf=5,
                              bootstrap=False, #Dataset is large enough (bootstrap adds some variability)
                              class_weight="balanced", #Give a larger weight to the under-represented class
                              random_state=42 #Fixed seed so that results are reproducible
                              )

In [None]:
#Split into train and test set
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,balanced_accuracy_score,confusion_matrix


def _report_scores(y_true, y_pred, accuracy_label, balanced_label):
    '''Print accuracy and balanced accuracy, then display the confusion matrix
    normalised over all samples (each cell is a share of the whole set).'''
    print('{} is {:.2%}'.format(accuracy_label, accuracy_score(y_true, y_pred)))
    print('{} is {:.2%}'.format(balanced_label, balanced_accuracy_score(y_true, y_pred)))
    display(pd.DataFrame(confusion_matrix(y_true, y_pred, normalize="all")).style.format('{:.2%}'))


# Stratify on the target so both sets keep the same class proportions, and fix the
# seed so the split (and therefore the scores) can be reproduced.
train, test = train_test_split(df, shuffle=True, test_size=0.2,
                               stratify=df['target'], random_state=42)

X_train, y_train = train.drop(columns=['target']), train.target
X_test, y_test = test.drop(columns=['target']), test.target

RandomForest.fit(X_train, y_train)
y_predict_train = RandomForest.predict(X_train)
y_predict = RandomForest.predict(X_test)

_report_scores(y_train, y_predict_train, 'Training Score', 'Balanced training Score')
_report_scores(y_test, y_predict, 'Accuracy Score', 'Balanced accuracy Score')

_Extra trees perform worse than the random forest. The following code is commented out._

<code>
ExtraTree = ExtraTreesClassifier(n_estimators=200,
                              max_depth=15,
                              min_samples_leaf=16, 
                              class_weight="balanced" #Give a larger weight to unrepresented class
                              )
</code><code>
#Split into train and test set
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,balanced_accuracy_score,confusion_matrix
</code><code>
train, test = train_test_split(df, shuffle=True, test_size=0.1)
</code><code>
X_train, y_train = train.drop(columns=['target']), train.target
X_test, y_test = test.drop(columns=['target']), test.target
</code><code>
ExtraTree.fit(X_train, y_train)
y_predict_train = ExtraTree.predict(X_train)
y_predict = ExtraTree.predict(X_test)
</code><code>
print('Training Score is {:.2%}'.format(accuracy_score(y_train, y_predict_train)))
print('Balanced training Score is {:.2%}'.format(balanced_accuracy_score(y_train, y_predict_train)))
display(pd.DataFrame(confusion_matrix(y_train,y_predict_train,normalize="all")).style.format('{:.2%}'))
</code><code>
print('Accuracy Score is {:.2%}'.format(accuracy_score(y_test, y_predict)))
print('Balanced accuracy Score is {:.2%}'.format(balanced_accuracy_score(y_test, y_predict)))
display(pd.DataFrame(confusion_matrix(y_test,y_predict,normalize="all")).style.format('{:.2%}'))
</code>

# Explicability

In [None]:
#Parameters
# Column labels of the training matrix, used to turn feature indices into names.
features = X_train.columns
# Number of features / rules listed in each of the tables printed below.
nb_features = 5

### General information

In [None]:
#Plot one complete decision tree

estimator = RandomForest.estimators_[5] #5 is an arbitrary choice

from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(estimator, out_file='tree.dot',
                feature_names = list(X_train.columns), # plain list of names rather than a pandas Index
                class_names = ['0','1'],
                rounded = True, proportion = False,
                precision = 2, filled = True)

# Convert to png using system command (requires Graphviz)
from subprocess import call
try:
    exit_code = call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
except FileNotFoundError:
    exit_code = None  # the `dot` executable is not installed / not on PATH
if exit_code != 0:
    print('Graphviz `dot` is missing or failed (exit code: %s); tree.png was not produced.' % exit_code)

In [None]:
# Print the `nb_features` most and least important features, with importances in %
disp_df = pd.DataFrame({'gini': RandomForest.feature_importances_}, index=features)
disp_df['gini'] = disp_df.gini.apply(lambda share: round(share*100,2))

for heading, pick in (('Top %s most important features', pd.Series.nlargest),
                      ('\n%s least important features', pd.Series.nsmallest)):
    print(heading % nb_features)
    disp_tab = pick(disp_df.gini, nb_features)
    print(tabulate(np.array([disp_tab.index, disp_tab.values]).T, headers=['Features','%']))

### Focusing on specific candidates

In [None]:
#Select samples
nb_samples = 5
target = 1 #0 loan accepted, 1 refused

# Test-set rows whose predicted class equals `target`. y_predict is positional, so
# the index of this frame can be used directly with X_test.iloc.
y_predict_df = pd.DataFrame(y_predict, columns=['target'])
candidates = y_predict_df[y_predict_df.target==target]
if candidates.empty:
    raise ValueError('No test sample was predicted as class %s' % target)

# Never ask for more samples than are available; fixed seed for a reproducible choice.
index = candidates.sample(n=min(nb_samples, len(candidates)), random_state=42).index
df_accepted = X_test.iloc[index] # samples predicted as `target` (historical name)
samples = df_accepted.to_numpy()

In [None]:
#Print the confidence across samples
import matplotlib.pyplot as plt

# Rows of `confidence`: class 0 and class 1 probabilities; columns: the selected samples.
confidence = RandomForest.predict_proba(df_accepted).T
configs = confidence[0]
N = len(configs)
ind = np.arange(N)

width = 0.4

# Class probabilities scaled by the number of trees, stacked per sample.
accepted_height = confidence[0]*n_tree
refused_height = confidence[1]*n_tree

fig, ax = plt.subplots()
p1 = ax.bar(ind, accepted_height, width, color='g')
p2 = ax.bar(ind, refused_height, width, bottom=accepted_height, color='r')

ax.set_ylim([0,1.2*n_tree])
ax.set_ylabel('Number of estimators', fontsize=12)
ax.set_xlabel('Samples', fontsize=12)
ax.legend((p1[0], p2[0]), ('Accepted', 'Refused'), fontsize=12, ncol=2, framealpha=0, fancybox=True)
plt.show()

In [None]:
from collections import Counter

#Code to display the most important rules for each sample
# dict_feat[sample_id] maps (rule, value) -> score accumulated over all trees.
# Rule and value are kept as a tuple, so no separator has to be parsed back later.
dict_feat = {sample_id: Counter() for sample_id in range(len(samples))}

for estimator in RandomForest.estimators_ :

    # Using those arrays, we can parse the tree structure:
    node_indicator = estimator.decision_path(samples)
    feature = estimator.tree_.feature
    threshold = estimator.tree_.threshold
    impurity = estimator.tree_.impurity

    # Similarly, we can also have the leaves ids reached by each sample.
    leave_id = estimator.apply(samples)

    for sample_id in range(len(samples)) :
        # Nodes visited by this sample, from the root down to its leaf
        node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                        node_indicator.indptr[sample_id + 1]]

        for node_id in node_index:
            if leave_id[sample_id] == node_id:
                continue # a leaf holds no split rule

            value = samples[sample_id, feature[node_id]]
            if value <= threshold[node_id]:
                threshold_sign = " - lower"
            else:
                threshold_sign = " - larger"

            key = (features[feature[node_id]] + threshold_sign, str(value))

            # The lower the Gini impurity of the node, the more decisive the rule
            dict_feat[sample_id][key] += (0.5-impurity[node_id])


#Print rules
for sample_id in range(len(samples)) :
    print('\nDeterminant rules to predict sample %s: ' % sample_id)
    disp_tab = []
    for (rule, value), score in dict_feat[sample_id].most_common(nb_features):
        disp_tab.append([rule, value, '{:.2}'.format(score/n_tree)])
    print(tabulate(disp_tab, headers=['Reasons','Values', 'Gini metrics']))