In [233]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score

import warnings
def ignore_warn(*args, **kwargs):
    pass

from IPython.display import Image
import pickle
import os 

# Fetures /  Target

In [235]:
# import clients data
df = pd.read_csv('./dataset/data_prepared.csv', index_col=0)



# Normalize data
X = df.drop(labels = ["_will_pay"],axis = 1)
y = df['_will_pay']

X = X.drop(['_is_account_recent','_is_common_type','log_income'], axis=1,inplace = False)

scaler = StandardScaler()
X_SS = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns) 

X_train, X_test, y_train, y_test = train_test_split(X_SS, y, test_size=0.2, random_state=42)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Mined / Engineered Features

- monthly_income:                
- monthly_outcome: 
- ratio_outcome_income: monthly_outcome/monthly_income
- sqrt_income:  square-root of monthly income
- sqrt_outco:                  
- log_income: natural log of montly income                    
- log_outcome: 
- total_credit_payments:  Total number of payments        
- payments_per_year: Number of payment per year             
- loan_term:    Lengh of payment in years                    
- loan_amount:  Total amount lent                
                  
- worst_previous_delinquency: Worst registered loan delinquency (amount) before account opening
- worst_previous_fraction: Worst registered loan delinquency (amount) before account opening       
- _is_account_recent: Open after 2012            
- _is_common_type: Belong to the most typical operations (75 percentile)               
         


Bolean Target Feature: 

- _will_pay = (worst_delinquency_past_due_estimated / loan_amount) < 0.15

Important: Class good/bad client was completely ignored 
because it had not clear meaning.

This model was choosen such that use the provided data to simulate the 
lending business: predict if a given client with this data ask us for 
a certain {loan_amount, loan_term, payments_per_year} and predict 
if this will pay



#### Important notes

Important: The only users kept were those that provided enough
information such that loan_amount and term could be estimated. 
These are essencial features we need to reach the objectives of this
project. See data_understanding.ipyn for details.

In particular, the loan_amount was estimated as follows:

- If loan IS delinquent:

loan_amount_estimated = total_credit_payments * past_due_balance / number_of_payments_due 
            = total_credit_payments * worst_delinquency_past_due_balance / worst_delinquency 

- If loan is NOT delinquent and current_balance $\ne$ 0

$\text{loan_amount} = \text{total_credit_payments} \times \text{amount_to_pay_next_payment}$

- If loan is NOT delinquent and current_balance $=$ 0

loan_amount =? maximum_credit_amount

where =? here means the maximum estimate of the loan.

# Load models

In [236]:
filenames_models = os.listdir('./trained_models/')
filenames_tables = [i for i in os.listdir('./plots_tables/') if '.csv' in i]
filenames_plots = [i for i in os.listdir('./plots_tables/') if '.pdf' in i]

In [237]:
trained_models_dic={}
for mol in filenames_models:
    trained_models_dic[mol]=pickle.load(open('./trained_models/'+mol, 'rb'))

In [238]:
# Get training scores 

scores_train = [] ; std_train = [] ; scores_test = [] ; models_names = []

for mod in trained_models_dic:
    models_names.append(mod)
    
    acc = cross_val_score(trained_models_dic[mod], X_train, y_train, scoring = "precision", cv = 4)
    scores_train.append(acc.mean())
    
    std_train.append(acc.std())

    acc = precision_score(y_test, trained_models_dic[mod].predict(X_test))
    scores_test.append(acc)



# Evaluation of Clasification: will_pay 

The classification metric needed is without a doubt precision, as we are trying to 
minimize the false positive because this cost us money! The racionaly behind this choise is 
that in absence of expertise I aim to minimize possible loosses instead of maximizing possible gains. 

- The precision reach by the models used is similar: ~ 89% +- 4% . 
- The std of the blended is very small because results from combining the 6 best models through soft votes.

The details on modeling.ipyn, can be summarize by saying that some models
certainly have a variance problem and there is an imbalace of will_pay 




In [239]:
# Creating a table of results, ranked highest to lowest
results = pd.DataFrame({
    'Model': models_names,
    'Train-Precision Score': scores_train,
    'training std': std_train,
    'Test-Precision Score': scores_test})

result_df = results.sort_values(by='Test-Precision Score', ascending=False).reset_index(drop=True)

result_df.head(12)

Unnamed: 0,Model,Train-Precision Score,training std,Test-Precision Score
0,K_Nearest_Neighbour,0.903034,0.011705,0.93133
1,blended,0.896947,0.004886,0.910112
2,XGBoost,0.904422,0.003665,0.906367
3,Random_Forest,0.892319,0.005902,0.904059
4,Gradient_Boosting,0.904191,0.007341,0.901515
5,Extra_Trees,0.877564,0.006634,0.89781
6,AdaBoost,0.889859,0.006729,0.886861
7,SVC,0.876788,0.008946,0.882784
8,Bagging_Classifier,0.873307,0.014143,0.877698
9,Gaussian_Process,0.845644,0.003902,0.863014


# Answer 1er question: Pick the best clients 

Given some new clients (say our test set), that provide the feature of our model, I 
select those clients with the classifier:

In [240]:
df_clients_report = pd.DataFrame(scaler.inverse_transform(X_test), index=X_test.index, columns=X_test.columns)

df_clients_report.reset_index(level=0, inplace=True)

df_clients_report['_will_pay_predicted'] = trained_models_dic['blended'].predict(X_test)


In [241]:
df_clients_report.head(1)

Unnamed: 0,index,monthly_income,monthly_outcome,total_credit_payments,payments_per_year,loan_term,loan_amount,worst_previous_delinquency,worst_previous_fraction,ratio_outcome_income,sqrt_income,sqrt_outcome,log_outcome,_will_pay_predicted
0,11124,3500.0,7444.0,19.0,24.0,0.791667,46588.0,25.0,0.0125,0.470177,59.160798,86.278618,8.915164,1


# Answer 2nd question: Propose amount and term

Notice that, our (business) model allows, for each new client, that request centain

- total_credit_payments
- payments_per_year 
- loan_term 
- loan_amount = (money_requested)*(1+interest)

whether this client is or not likely to pay back. More on the interest below. 

In fact, if our client does not classify, we can even modify this information
so that our prediction system give us a high probabily of payback.

Example, let us take a bad client:

In [232]:
df_clients_report[df_clients_report.index==1176]

Unnamed: 0,index,monthly_income,monthly_outcome,total_credit_payments,payments_per_year,loan_term,loan_amount,worst_previous_delinquency,worst_previous_fraction,ratio_outcome_income,sqrt_income,sqrt_outcome,log_outcome,_will_pay_predicted


In [192]:
df_clients_report[df_clients_report.user_id==1176] 
scaler().fit_transform(X)

Unnamed: 0,user_id,monthly_income,monthly_outcome,total_credit_payments,payments_per_year,loan_term,loan_amount,worst_previous_delinquency,worst_previous_fraction,ratio_outcome_income,sqrt_income,sqrt_outcome,log_outcome,_will_pay_predicted
118,1176,750.0,37.0,12.0,12.0,1.0,4800.0,0.0,0.0,20.27027,27.386128,6.082763,3.610918,0


In [None]:
Image(filename='./plots_tables/The_most_important_features_in_predicting_client_class:_Gaussian_Bayes.png')