In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn import metrics
import pyreadr

In [None]:
# Read the three raw CSV extracts, in order: loan classification, borrower
# information and loan payment information.
loan, borrower, payment = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Loan Data/Loan Classification Information.csv',
        './Loan Data/Borrower Information.csv',
        './Loan Data/Loan Payment Information.csv',
    )
)

In [None]:
# Write the payment table back to the CSV it was loaded from.
# index=False keeps the DataFrame index out of the file: without it, every run
# of this cell adds one more unnamed index column, which the next read_csv
# loads as an extra 'Unnamed: ...' column.
payment.to_csv('./Loan Data/Loan Payment Information.csv', index=False)

In [None]:
# Borrower feature preparation: drops incomplete rows, turns text columns into
# numbers and one-hot encodes the categorical columns.

# transform emp_length to int
# '< 1 year' -> 0, '1 year' -> 1, 'N years' -> N, '10+ years' -> 10.
# .copy() makes the filtered frame independent, so later column assignments do
# not write into a slice of the frame returned by read_csv.
borrower = borrower[borrower['emp_length'].notna()].copy()
emp_length_label = borrower['emp_length'].astype(str)
# A label without any digit yields NaN here and makes astype(int) raise, so
# unexpected values fail loudly instead of being coerced.
emp_length_years = emp_length_label.str.extract(r'(\d+)', expand=False).astype(int)
borrower['emp_length'] = emp_length_years.mask(emp_length_label == '< 1 year', 0)

borrower = borrower[borrower['open_acc'].notna() & borrower['addr_state'].notna()]

# transform earliest_cr_line from date to number of month between the date and today
borrower = borrower.assign(earliest_cr_line=pd.to_datetime(borrower['earliest_cr_line']))
# Rows whose date could not be parsed cannot be converted to a month count.
borrower = borrower[borrower['earliest_cr_line'].notna()]
# A calendar month is not a fixed duration, so np.timedelta64(1, 'M') is an
# ambiguous divisor for nanosecond timedeltas. Use the average Gregorian month
# (365.2425 / 12 = 30.436875 days) as an explicit fixed-length unit.
average_month = pd.Timedelta(days=365.2425 / 12)
# NOTE(review): the value depends on the day the notebook is run; pin a
# reference date if results must be reproducible across runs.
credit_age = pd.Timestamp.today() - borrower['earliest_cr_line']
borrower = borrower.assign(earliest_cr_line=(credit_age / average_month).astype(int))

borrower = borrower.drop(columns=[
    'Unnamed: 0',  # remove the line number
    'zip_code',    # the state will be used, the zip code is too specific
    'emp_title',   # too many values to be used
])

# remove rows that do not conform to the business rule
borrower = borrower[borrower['home_ownership'].isin(('RENT', 'OWN', 'MORTGAGE', 'OTHER'))]

# use dummies for categories; dummy columns are named after the category value
borrower = pd.concat(
    [
        borrower,
        pd.get_dummies(borrower['home_ownership']),
        pd.get_dummies(borrower['addr_state']),
    ],
    axis=1,
).drop(columns=['home_ownership', 'addr_state'])

In [None]:
# Loan feature preparation: keeps loans with a known outcome, derives the
# `risk` target and removes columns that must not be used as predictors.

# remove loans on which we do not know the outcome from the training set
loan_status = loan['loan_status']
# .copy() so the assignments below do not write into a slice of the raw frame.
loan = loan[loan_status.notna() & ~loan_status.isin(['Issued', 'Current'])].copy()

# clean the loan status; regex=False because the prefix is a literal string
# (the '.' must not be treated as a regex wildcard, and the default of `regex`
# differs between pandas versions).
loan['loan_status'] = loan['loan_status'].str.replace(
    'Does not meet the credit policy. Status:', '', regex=False)

# risk will be our prediction. The higher, the more likely the loan is going to default.
# 0 = fully paid, 2 = charged off / default, 1 = any other status.
risk_by_status = {'Fully Paid': 0, 'Charged Off': 2, 'Default': 2}
loan['risk'] = loan['loan_status'].map(risk_by_status).fillna(1).astype(int)
loan = loan.drop(columns='loan_status')

# cast duration from str to int
# Extract the digits instead of slicing fixed positions: the slice x[1:3] only
# works when the value has exactly one leading space (' 36 months').
loan = loan[loan['term'].notna()].assign(
    term=lambda frame: frame['term'].astype(str).str.extract(r'(\d+)', expand=False).astype(int))

# cast pymnt_plan from str to bool
loan = loan[loan['pymnt_plan'].notna()].assign(
    pymnt_plan=lambda frame: frame['pymnt_plan'] == 'y')

# dummies for categories
loan = pd.concat([loan, pd.get_dummies(loan['purpose'])], axis=1).drop(columns='purpose')

# focus only on individual applications, too few values for the joint
loan = loan[loan['application_type'] == 'INDIVIDUAL']

loan = loan.drop(columns=[
    'application_type',
    # correlated to loan_amount
    'funded_amnt',
    'funded_amnt_inv',
    # close to what we want to predict
    'grade',
    'sub_grade',
    # no valuable information
    'Unnamed: 0',
    'issue_d',
    # unusable information and similar to purpose
    'desc',
    'title',
])

In [None]:
# Attach borrower attributes to each loan (default inner join on member_id),
# then attach the payment records (default inner join on id).
loan_borrower = loan.merge(borrower, on='member_id')
payment_loan_borrower = payment.merge(loan_borrower, on='id')

# `df` is another name for the merged frame, not a copy.
df = payment_loan_borrower

In [None]:

# Build the modelling frame from the merged table in one non-mutating chain.
# The merged frame is left untouched, so this cell can be re-run without
# raising KeyError on already-deleted columns, and no column is assigned onto
# a filtered slice.

# too few values
sparse_columns = [
    "open_acc_6m",
    "open_il_12m",
    "open_il_24m",
    "open_il_6m",
    "mths_since_rcnt_il",
    "total_bal_il",
    "il_util",
    "open_rv_12m",
    "open_rv_24m",
    "max_bal_bc",
    "all_util",
    "inq_fi",
    "total_cu_tl",
    "inq_last_12m",
    "mths_since_last_delinq",
    "mths_since_last_record",
    "mths_since_last_major_derog",
]

# related to joint application
joint_application_columns = [
    "dti_joint",
    "annual_inc_joint",
]

# information not available at loan approval
post_approval_columns = [
    "total_pymnt_inv",
    "total_pymnt",
    "total_rec_prncp",
    "total_rec_late_fee",
    "total_rec_int",
    "last_pymnt_amnt",
    "last_credit_pull_d",
    "last_pymnt_d",
    "next_pymnt_d",
    "out_prncp",
    "out_prncp_inv",
    "recoveries",
    "collection_recovery_fee",
]

# rows missing any of these values are discarded
required_columns = [
    "collections_12_mths_ex_med",
    "tot_coll_amt",
    "tot_cur_bal",
    "revol_util",
]

df = (
    payment_loan_borrower
    .drop(columns=sparse_columns + joint_application_columns + post_approval_columns)
    .dropna(subset=required_columns)
    # strong correlation between loan_amnt and installment (>85%):
    # merge the two into a single ratio feature
    .assign(tm_full_paid=lambda frame: frame["loan_amnt"] / frame["installment"])
    .drop(columns=["loan_amnt", "installment"])
)

In [None]:
# Name of the target column.
outcome = 'risk'

# using random forest for its handling of binary values and large amount of features.
# random_state is fixed so that bootstrap sampling, and therefore the fitted
# forest and every score printed below, is the same on each run.
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=50,
    max_features=None,
    n_jobs=-1,
    random_state=42,
)

# the feature vector has been resized after analysis of their importance
predictors = ['dti',
 'inq_last_6mths',
 'revol_bal',
 'revol_util',
 'tot_coll_amt',
 'tot_cur_bal',
 'term',
 'int_rate',
 'credit_card',
 'debt_consolidation',
 'small_business',
 'emp_length',
 'annual_inc',
 'open_acc',
 'pub_rec',
 'total_acc',
 'delinq_2yrs',
 'earliest_cr_line',
 'MORTGAGE',
 'OWN',
 'RENT',
 'CA',
 'NY',
 'tm_full_paid']

In [None]:
# `data` is another name for the prepared frame `df` (no copy is made); the
# modelling cells below read from `data`.
data = df

In [None]:
#Fit the model on the full data set:
model.fit(data[predictors], data[outcome])

#Make predictions on the training set. The accuracy below is therefore
#measured on the same rows the model was fitted on.
predictions = model.predict(data[predictors])

#Print accuracy; accuracy_score expects (y_true, y_pred) in that order.
accuracy = metrics.accuracy_score(data[outcome], predictions)
print("Accuracy :" + "{0:.3%}".format(accuracy))

In [None]:
#Perform k-fold cross-validation with 5 folds
kf = KFold(n_splits=5)

# Select the predictor and target columns once instead of once per fold.
cv_features = data[predictors]
cv_target = data[outcome]

# NOTE: despite its name, `error` collects the value returned by model.score
# for each held-out fold (higher is better), not an error rate.
error = []
for train, test in kf.split(data):
    # Rows used to fit the model for this fold.
    train_predictors = cv_features.iloc[train, :]
    train_target = cv_target.iloc[train]

    # Training the algorithm using the predictors and target.
    model.fit(train_predictors, train_target)

    # Record the score on the held-out fold.
    error.append(model.score(cv_features.iloc[test, :], cv_target.iloc[test]))

print("Cross-Validation Score : " + "{0:.3%}".format(np.mean(error)))

#Fit the model again on all rows so that later cells use a model trained on
#the full data set rather than on the last fold's training split:
model.fit(cv_features, cv_target)