In [15]:
import os
import pandas as pd
import numpy as np
import datetime as dt
from collections import Counter
import random
import warnings
warnings.filterwarnings('ignore')

In [39]:
# Load the raw loan table from ./files relative to the notebook's working directory.
loans = pd.read_csv(os.path.join(os.getcwd(), 'files', 'loan.csv'))

<h1>Remove outliers</h1>

In [40]:
# Keep only loans with a final outcome, and drop rows whose interest rate or
# loan amount exceeds the cut-offs (25 and 35000 respectively).
loans = loans[
    loans['loan_status'].isin(['Fully Paid', 'Default', 'Charged Off'])
    & (loans['int_rate'] <= 25)
    & (loans['loan_amnt'] <= 35000)
]

# Sanity check: remaining shape and the largest surviving values.
print(loans.shape)
print(max(loans['int_rate']))
print(max(loans['loan_amnt']))

(1270094, 145)
24.99
35000


<h1> Managing the loan status and loan term </h1>

In [41]:
# Report how many rows fall under each loan status (first-seen order).
status_counts = Counter(loans['loan_status'])
for status, n_rows in status_counts.items():
    print(status + f' : {n_rows} rows')

Fully Paid : 1022121 rows
Charged Off : 247948 rows
Default : 25 rows


In [42]:
# Recode three columns in a single pass instead of one whole-frame
# `DataFrame.replace` call per value (each of those copied every column).
#
# * loan_status: 'Charged Off' is folded into 'Default'.
# * term:        ' 36 months' / ' 60 months' -> 36 / 60.
# * emp_length:  textual bucket -> number of years; '< 1 year' becomes 0.5 and
#                '10+ years' becomes 12 (same values as the original recoding).
#
# `pd.to_numeric` makes the numeric dtype explicit rather than relying on
# `replace` to downcast an object column, so the later `.mean()` / NaN checks
# operate on real numeric columns. Missing values stay NaN. A label that is not
# in the mappings raises here instead of failing later in an unrelated cell.
TERM_MONTHS = {' 36 months': 36, ' 60 months': 60}
EMP_LENGTH_YEARS = {
    '< 1 year': 0.5,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 12,
}

loans = loans.assign(
    loan_status=loans['loan_status'].replace('Charged Off', 'Default'),
    term=pd.to_numeric(loans['term'].replace(TERM_MONTHS)),
    emp_length=pd.to_numeric(loans['emp_length'].replace(EMP_LENGTH_YEARS)),
)

<h1>Prepare datasets for borrowers and investors</h1>

In [43]:
# Borrower-side view: loan terms plus the applicant's employment, income and
# credit-history attributes. Each column is listed exactly once
# ('acc_now_delinq' used to appear twice, which duplicated the column).
borrower_columns = [
    'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade',
    'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'dti', 'delinq_2yrs',
    'earliest_cr_line', 'mths_since_last_record', 'open_acc', 'acc_now_delinq', 'total_acc',
    'mths_since_last_major_derog', 'annual_inc_joint', 'dti_joint', 'tot_cur_bal', 'open_acc_6m', 'open_act_il',
    'open_il_12m', 'open_il_24m',
]

# Investor-side view: loan terms plus payment, recovery and collection amounts.
investor_columns = [
    'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'grade', 'out_prncp', 'out_prncp_inv',
    'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries',
    'collection_recovery_fee', 'tot_coll_amt',
]

# Both frames are modified in later cells, so take explicit copies; this makes
# it unambiguous that those writes never target `loans`.
loans_borrowers = loans[borrower_columns].copy()
loans_investors = loans[investor_columns].copy()

<h1> Handling Null Values </h1>

In [44]:
# Names of the borrower columns that contain at least one missing value.
loans_borrowers.columns[loans_borrowers.isna().any(axis=0)]

Index(['member_id', 'emp_length', 'dti', 'mths_since_last_record',
       'mths_since_last_major_derog', 'annual_inc_joint', 'dti_joint',
       'tot_cur_bal', 'open_acc_6m', 'open_act_il', 'open_il_12m',
       'open_il_24m'],
      dtype='object')

In [45]:
# --- member_id: give every row without an id a sequential surrogate id -------
# Only the rows that are actually missing receive a value, so the assignment
# cannot fail on a length mismatch when some ids are present. Numbering starts
# at 1 when no ids exist (as before), otherwise just above the largest
# existing id so new ids cannot collide with real ones.
missing_member_id = loans_borrowers['member_id'].isna()
if missing_member_id.any():
    highest_existing_id = loans_borrowers['member_id'].max()
    first_new_id = 1 if pd.isna(highest_existing_id) else int(highest_existing_id) + 1
    loans_borrowers.loc[missing_member_id, 'member_id'] = np.arange(
        first_new_id, first_new_id + missing_member_id.sum()
    )

# --- emp_length: fill gaps with the column mean truncated to whole years -----
# to_numeric guards against the column still being stored as `object`.
loans_borrowers['emp_length'] = pd.to_numeric(loans_borrowers['emp_length'])
emp_length_fill = int(loans_borrowers['emp_length'].mean())
loans_borrowers['emp_length'] = loans_borrowers['emp_length'].fillna(emp_length_fill)

# --- columns filled with their own mean (NaNs are skipped by .mean()) --------
mean_filled_columns = [
    'dti',
    'mths_since_last_record',
    'mths_since_last_major_derog',
    'annual_inc_joint',
    'dti_joint',
    'tot_cur_bal',
]
for column in mean_filled_columns:
    loans_borrowers[column] = loans_borrowers[column].fillna(loans_borrowers[column].mean())

# --- account-count columns: a missing value is recorded as 0 -----------------
zero_filled_columns = ['open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m']
for column in zero_filled_columns:
    loans_borrowers[column] = loans_borrowers[column].fillna(0)

# Verification: rows that still contain a null (expected to be empty).
loans_borrowers[loans_borrowers.isna().any(axis=1)]

Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,emp_length,home_ownership,...,total_acc,mths_since_last_major_derog,annual_inc_joint,dti_joint,acc_now_delinq,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m


In [46]:
# Names of the investor columns that contain at least one missing value.
loans_investors.columns[loans_investors.isna().any(axis=0)]

Index(['tot_coll_amt'], dtype='object')

In [47]:
# Fill missing collection amounts with the mean of the observed ones
# (.mean() ignores NaN by default).
total_coll_amt = loans_investors['tot_coll_amt'].mean()
loans_investors['tot_coll_amt'] = loans_investors['tot_coll_amt'].fillna(total_coll_amt)

# Verification: rows that still contain a null (expected to be empty).
loans_investors[loans_investors.isna().any(axis=1)]

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,grade,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,tot_coll_amt


<h1> Feature Engineering </h1>
<h3> Annualized return for the investor, in percent: ((p - f) / f) * (12 / term) * 100, where p = total payment received, f = funded amount and term = loan term in months </h3>

In [68]:
# Annualized percentage return per loan:
#   gain relative to the funded amount, scaled by 12/term, times 100.
loans_investors['return'] = (
    (loans_investors['total_pymnt'] - loans_investors['funded_amnt'])
    .div(loans_investors['funded_amnt'])
    .mul(12 / loans_investors['term'])
    .mul(100)
)

In [71]:
# Persist both prepared frames under ./data relative to the working directory.
loans_borrowers.to_csv(os.path.join(os.getcwd(), 'data', 'borrowers.csv'))
loans_investors.to_csv(os.path.join(os.getcwd(), 'data', 'investors.csv'))