# Import libraries

In [39]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [40]:
import pandas as pd
import numpy as np
import time
import datetime
import matplotlib.pyplot as plt

# Load data

In [41]:
# My computer has some memory issues, so the data will be saved and loaded regularly during this notebook

In [42]:
start = time.time()
df = pd.read_csv('./rawData/accepted_2007_to_2018Q4.csv')
print(f'Load time : {time.time() - start}')

  interactivity=interactivity, compiler=compiler, result=result)


Load time : 21.69942331314087


# Data cleaning

In [43]:
def is_integer(x):
    '''
    This function returns True if x is an integer, and False otherwise
    '''
    try:
        return (int(x) == float(x))
    except:
        return False

In [44]:
date_cols = ['issue_d', 'earliest_cr_line', 'last_pymnt_d']

def clean_date(x):
    if pd.isnull(x):
        return None
    else:
        return datetime.datetime.strptime(x, "%b-%Y").date()

start = time.time()
for i in date_cols:
    try:
        df[i] = df[i].apply( clean_date )
    except:
        pass
print(f'Time : {time.time() - start}')

Time : 47.096351146698


In [45]:
df = df[~df.term.isna()]
df.shape

(2260668, 151)

# Data selection

In [47]:
relevant_cols = [
# ------------ loan application and LC terms                 
#     'id',  # not a predictor
    'desc', # not a predictor, mapped to new var _has_desc
    'title', # not a predictor
    'application_type',   # binary  {'Individual':0, 'Joint App':1}
    'purpose',  # cardinal
    'grade',    # ordinal, map to N
    'sub_grade', # ordinal, map to N
    'term',      # binary  {'36 months':0, '60 months':1}
    'loan_amnt', # numeric
    'int_rate',  # numeric
    'installment',  # numeric
    'issue_d',   # not a predictor
    'initial_list_status',  # binary  {'w':0, 'f':1}
    'disbursement_method',  # binary  {'Cash':0, 'DirectPay':1}

# ------------ borrower (self-declared except for verification status)
    'member_id', # not a predictor
    'addr_state', # cardinal
    'zip_code',   # not a predictor, b/c 956 zip_codes, too granular
    'home_ownership',  # cardinal, leave 4 categories: MORTGAGE, RENT, OWN, OTHER(+ANY,+NONE)
    'emp_length', # ordinal, map to N
    'emp_title',  # not a predictor
    'annual_inc', # numeric
    'annual_inc_joint',  # not a predictor (mapped to annual_inc)
    'verification_status',  # ordinal: {'Not Verified':0, 'Source Verified':1, 'Verified':2}
    'verification_status_joint',  # not a predictor (mapped to verification_status)

# ------------ hard credit inquiry at origination
    'fico_range_high',  # remove. fico_high=fico_low+4 or, very rarely, fico_high=fico_low+5
    'fico_range_low',  # numeric
    'delinq_2yrs',# numeric
    'dti', # numeric
    'dti_joint',  # not a predictor (mapped to dti)
    'earliest_cr_line', # not a predictor (mapped to new var _credit_hist)
    'open_acc', # numeric
    'pub_rec', # numeric
    'revol_bal', # numeric
    'revol_util', # numeric

# ------------ targets / target components 
    'loan_status',
    'funded_amnt',
    'total_pymnt',  # includes recoveries
    'last_pymnt_d',
    'last_credit_pull_d', # proxy if last_pymnt_d is NaN
    'recoveries',
]

In [48]:
print(f'{len(relevant_cols)} features selected')

# new dataframe of features
df = df[relevant_cols]

39 features selected


In [50]:
print(df[df.last_pymnt_d.isna()]['loan_status'].value_counts())
df.last_pymnt_d = np.where(df.last_pymnt_d.isna(), df.last_credit_pull_d, df.last_pymnt_d)
df.drop('last_credit_pull_d', inplace=True, axis=1)

Charged Off                                            2313
Late (31-120 days)                                      102
Does not meet the credit policy. Status:Charged Off      12
Name: loan_status, dtype: int64


In [51]:
# Joint Applications
# when the loan is joint (legally, a joint obligation), the joint numbers supercede the single numbers :
#    dti_joint  --> if exists, replace dti  
#    annual_inc_joint --> if exists, replace annual_inc
#    verification_status_joint --> if exists AND is 'verified', replace verification_status 

df.dti = np.where(df.application_type=='Joint App', df.dti_joint, df.dti)
df.annual_inc = np.where(df.application_type=='Joint App', df.annual_inc_joint, df.annual_inc)
df.verification_status = np.where(df.verification_status_joint=='Verified', 
                                  df.verification_status_joint, df.verification_status)

# drop the joint numbers, they will not be used anymore
df.drop(['dti_joint', 'annual_inc_joint', 'verification_status_joint'], inplace=True, axis=1)

In [53]:
df.loan_status.value_counts()

Fully Paid                                             1076751
Current                                                 878317
Charged Off                                             268559
Late (31-120 days)                                       21467
In Grace Period                                           8436
Late (16-30 days)                                         4349
Does not meet the credit policy. Status:Fully Paid        1988
Does not meet the credit policy. Status:Charged Off        761
Default                                                     40
Name: loan_status, dtype: int64

In [54]:
remove_loans = ['Current','Late (31-120 days)','Late (16-30 days)','In Grace Period']
df = df[~df.loan_status.isin(remove_loans)]

In [55]:
df.shape

(1348099, 35)

In [56]:
df.loan_status.value_counts(dropna=False, normalize=True)

Fully Paid                                             0.798718
Charged Off                                            0.199213
Does not meet the credit policy. Status:Fully Paid     0.001475
Does not meet the credit policy. Status:Charged Off    0.000564
Default                                                0.000030
Name: loan_status, dtype: float64

# Data imputation

In [57]:
na = df.isna().sum().sort_values(ascending=False)
na = na[na>0].reset_index()
na

Unnamed: 0,index,0
0,member_id,1348099
1,desc,1222144
2,emp_title,85950
3,emp_length,78550
4,title,16661
5,revol_util,897
6,pub_rec,29
7,open_acc,29
8,earliest_cr_line,29
9,delinq_2yrs,29


In [68]:
df.drop(columns=['member_id'], inplace=True)

In [58]:
df.zip_code.fillna('355xx', inplace=True)

In [73]:
df.revol_util.fillna(0, inplace=True)

In [74]:
df.desc.fillna('', inplace=True)
df.emp_title.fillna('', inplace=True)
df.title.fillna('', inplace=True)

In [86]:
df['earliest_cr_line'].dropna(inplace=True)

In [94]:
na = df.isna().sum().sort_values(ascending=False)
na = na[na>0].reset_index()
na

Unnamed: 0,index,0
0,emp_length,78550
1,pub_rec,29
2,delinq_2yrs,29
3,earliest_cr_line,29
4,open_acc,29
5,annual_inc,4
6,dti,3


# Targets

## Classification target

In [15]:
# do not produce numpy targets yet b/c dataframe rows will be modified
good_loans = ['Fully Paid', 'Does not meet the credit policy. Status:Fully Paid']
df['_good'] = df.loan_status.isin(good_loans).astype(int)

In [16]:
df._good.value_counts(normalize=True)

1    0.800193
0    0.199807
Name: _good, dtype: float64

## Regression target

## Returns

Returns are computed with a fixed time horizon (T months), such that the cash flow is reinvested each month at the yearly rate i%, compounded monthly.\

$$
     R = \frac{12}{T} \frac{1}{f} \left( \left( \frac{p}{m} \left( \frac{1-(1+i)^m}{1-(1+i)} \right) \right)(1+i)^{T-m} - f \right)
$$

- T : horizon of the investment
- f : total amount invested in the loan
- p : total amount repaid and recovered by the loan
- m : actual lenght of the loan
- i : yearly rate

In [17]:
df['loan_length'] = (df['last_pymnt_d'] - df['issue_d']) / np.timedelta64(1, 'M')

In [18]:
def return_reinvest_compound(T, i):
    # Assuming that the total amount paid back was paid at equal
    # intervals during the duration of the loan, calculate the
    # size of each of these installment
    actual_installment = (df.total_pymnt - df.recoveries) / df['loan_length']

    # Assuming the amount is immediately re-invested at the prime
    # rate, find the total amount of money we'll have by the end
    # of the loan
    cash_by_end_of_loan = actual_installment * (1 - pow(1 + i, df.loan_length)) / ( 1 - (1 + i) )
    
    cash_by_end_of_loan = cash_by_end_of_loan + df.recoveries
    
    # Assuming that cash is then re-invested at the prime rate,
    # with monthly re-investment, until T months from the start
    # of the loan
    remaining_months = T - df['loan_length']
    final_return = cash_by_end_of_loan * pow(1 + i, remaining_months)

    # Find the percentage return
    return( (12/T) * ( ( final_return - df['funded_amnt'] ) / df['funded_amnt'] ) )

df['returns1'] = return_reinvest_compound(5*12, .001)
df['returns2'] = return_reinvest_compound(5*12, .0025)
df['returns3'] = return_reinvest_compound(5*12, .005)

# Feature engineering

In [None]:
# create a new field 'has description' (proxy for extra transparency?)
df['_has_desc'] = np.vectorize(len)(df.desc)>0
df._has_desc = df._has_desc.astype(int)
# create a new field for credit history in Years
df['_credit_hist'] = credit_history

# Visualize variables

In [29]:
ordinal = [
    'application_type',
    'grade',
    'sub_grade',
    'term',
    'initial_list_status',  # LC claims this is purely random: chi2 test!
    'emp_length',
    'verification_status',
    '_has_desc',
]

df.home_ownership.replace({'ANY':'OTHER', 'NONE':'OTHER'}, inplace=True)

nominal = [
  'purpose',
  'addr_state', # if linear models do not pick up this feature, revisit amd remomve dummies (probably too much noise)
  'home_ownership',
  'disbursement_method', #highly imbalanced and probably irrelevant
]

numeric = [   
    'loan_amnt', # numeric
    'int_rate',  # numeric
    'installment',  # numeric
    'annual_inc', # numeric
    'fico_range_low', 
    'delinq_2yrs',# numeric
    'dti', # numeric
    'open_acc', # numeric
    'pub_rec', # numeric
    'revol_bal', # numeric
    'revol_util', # numeric
    '_credit_hist', # new numeric
]

In [None]:
def visualize_feature:
    

In [None]:
for col in df.columns:
    