In [15]:
# This notebook handles filtering, encoding and standardisation for the Lending Club dataset
import pandas as pd
import datetime as dt
from numpy.random import random

# Fraction of data rows to load: 1 reads the whole file, a smaller value
# keeps a random subsample of the rows.
p = 1
if p >= 1:
    accept = pd.read_csv('../rawData/accepted_2007_to_2018Q4.csv')
else:
    # Row 0 (the line read_csv uses as the header by default) is never skipped;
    # every other row is skipped when a fresh uniform draw exceeds p.
    # No seed is set, so the sample differs from run to run.
    accept = pd.read_csv(
        '../rawData/accepted_2007_to_2018Q4.csv',
        skiprows=lambda row_idx: row_idx > 0 and random() > p,
    )

In [16]:
# Columns kept for modelling: those from the Guetta paper, plus fields mentioned by Aiko.
# The order of this list is the column order of the resulting frame.
cols = [
    'id', 'loan_amnt', 'funded_amnt', 'term', 'int_rate',
    'grade', 'sub_grade', 'emp_length', 'home_ownership', 'annual_inc',
    'verification_status', 'issue_d', 'loan_status', 'purpose', 'dti',
    'delinq_2yrs', 'earliest_cr_line', 'open_acc', 'pub_rec', 'fico_range_high',
    'fico_range_low', 'revol_bal', 'revol_util', 'total_pymnt', 'recoveries',
    'last_pymnt_d',
]
accept = accept.loc[:, cols]

In [17]:
# Remove subtotal rows
def is_int(x):
    """Return True when ``x`` can be converted with ``int()``, otherwise False.

    Used to separate rows with an integer-like ``id`` from rows whose ``id``
    cell holds something else (free text, NaN, None, ...).

    Only the errors that ``int()`` raises for unconvertible values are
    treated as "not an int"; anything else (e.g. KeyboardInterrupt)
    propagates instead of being silently swallowed.
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or unsupported types; ValueError: non-numeric text
        # or NaN; OverflowError: +/- infinity.
        return False
    return True

# Keep only the rows whose id converts to an integer, then store the id as an int.
int_id_rows = accept['id'].apply(is_int)
accept = accept[int_id_rows]
accept['id'] = accept['id'].apply(int)

In [18]:
# Reject loans with missing last_pymnt_d
has_last_payment = ~accept['last_pymnt_d'].isna()
accept = accept[has_last_payment]

In [19]:
# Reject loans pre 2009
def make_dateval(s, default=20070101):
    """Convert a ``'Mon-YYYY'`` string (e.g. ``'Dec-2015'``) to a YYYYMMDD integer.

    The day is always the 1st of the month, so ``'Dec-2015'`` becomes
    ``20151201``.

    Parameters
    ----------
    s : str
        Month abbreviation and four-digit year separated by ``'-'``.
    default : int, optional
        Value returned when ``s`` cannot be parsed (missing value, wrong type
        or malformed text). The default ``20070101`` lies before the
        ``20090101`` cutoff this notebook applies to ``issue_d``, so loans
        with an unparseable issue date are dropped by that filter.

    Returns
    -------
    int
        ``year * 10000 + month * 100 + day``, or ``default`` on failure.
    """
    try:
        parts = s.split('-')
        # NOTE: %b matches month abbreviations of the current locale.
        d = dt.datetime.strptime(parts[1] + parts[0] + '01', '%Y%b%d')
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError: non-string input such as NaN or None;
        # IndexError: no '-' separator; TypeError: e.g. bytes input;
        # ValueError: text that does not match the expected format.
        return default
    return int(d.day + d.month * 100 + d.year * 10000)

# Parse issue_d into YYYYMMDD integers and keep loans issued on or after 2009-01-01.
# Dates that cannot be parsed come back from make_dateval as 20070101 and are dropped here.
issue_vals = accept['issue_d'].apply(make_dateval)
from_2009 = issue_vals >= 20090101
accept = accept[from_2009]
accept['issue_d'] = issue_vals[from_2009].astype('uint32')

In [20]:
# Remove loans that have not terminated yet.
# Statuses carrying the "Does not meet the credit policy" prefix are first
# folded into the corresponding plain status.
status_aliases = {
    'Does not meet the credit policy. Status:Fully Paid': 'Fully Paid',
    'Does not meet the credit policy. Status:Charged Off': 'Charged Off',
}
for flagged_status, plain_status in status_aliases.items():
    accept.loc[accept['loan_status'] == flagged_status, 'loan_status'] = plain_status

is_paid = accept['loan_status'] == 'Fully Paid'
is_charged_off = accept['loan_status'] == 'Charged Off'
accept = accept[is_paid | is_charged_off]

In [21]:
# Number of rows remaining after the filters above, minus 269020.
# NOTE(review): the meaning of the constant 269020 is not explained here — confirm and name it.
accept.shape[0] - 269020

1073721

In [22]:
# Encode the remaining date columns as YYYYMMDD integers (not datetime objects);
# values make_dateval cannot parse take its fallback value.
for date_col in ['earliest_cr_line', 'last_pymnt_d']:
    accept[date_col] = accept[date_col].apply(make_dateval).astype('uint32')

In [24]:
# Encode all non numeric columns to numeric, and compress data size
# (starting with id, stored as an unsigned 32-bit integer)
accept = accept.astype({'id': 'uint32'})

In [25]:
import re

# Replace the term text by the first two consecutive digits it contains
# (presumably the number of months — confirm against the raw data) and store
# the result as uint8.
# The pattern is a raw string so that \d reaches the regex engine unchanged
# instead of being an invalid string escape, which newer Python versions warn about.
accept['term'] = accept['term'].apply(lambda x: int(re.search(r'\d\d', x).group(0)))
accept['term'] = accept['term'].astype('uint8')

In [26]:
# Each letter grade is encoded as the median of the five sub_grade codes it
# summarizes: A -> 2, B -> 7, ..., G -> 32.
grade_codes = [2 + 5 * k for k in range(7)]
grades = pd.Series(grade_codes, index=['A', 'B', 'C', 'D', 'E', 'F', 'G'])
accept['grade'] = accept['grade'].apply(lambda letter: grades[letter]).astype('uint8')

In [27]:
# Number the 35 sub-grades A1, A2, ..., G5 consecutively from 0 to 34.
sub_grade_labels = [
    letter + digit
    for letter in ['A', 'B', 'C', 'D', 'E', 'F', 'G']
    for digit in ['1', '2', '3', '4', '5']
]
sub_grades = pd.Series(range(35), index=sub_grade_labels)
accept['sub_grade'] = accept['sub_grade'].apply(lambda label: sub_grades[label]).astype('uint8')

In [28]:
def emp_to_int(s):
    """Convert an employment-length string to an integer.

    Strings starting with ``'<'`` (e.g. ``'< 1 year'``) map to 0; otherwise
    the first run of digits in the string is returned (``'10+ years'`` -> 10).
    Values that cannot be interpreted (NaN, None, empty string, text without
    digits) map to the sentinel -1.

    Only the errors expected from such values are caught, so unrelated
    exceptions such as KeyboardInterrupt are no longer swallowed.
    """
    try:
        if s[0] == '<':
            return 0
        # Raw string: \d must reach the regex engine, not the string parser.
        return int(re.search(r'\d+', s).group(0))
    except (AttributeError, IndexError, TypeError, ValueError):
        # TypeError: NaN/None are not subscriptable; IndexError: empty string;
        # AttributeError: no digits found (re.search returned None).
        return -1

# Encode employment length; a signed int8 is used because emp_to_int returns -1
# for values it cannot interpret.
accept['emp_length'] = accept['emp_length'].apply(emp_to_int).astype('int8')

In [29]:
# Integer encoding of home ownership; ANY, OTHER and NONE share code 0 with RENT.
# arguably this should be dummified
home_dict = {
    'RENT': 0,
    'MORTGAGE': 1,
    'OWN': 2,
    'ANY': 0,
    'OTHER': 0,
    'NONE': 0,
}
accept['home_ownership'] = accept['home_ownership'].apply(lambda status: home_dict[status]).astype('uint8')

In [30]:
# Binary encoding: both 'Source Verified' and 'Verified' count as verified (1).
verify_dict = {
    'Source Verified': 1,
    'Verified': 1,
    'Not Verified': 0,
}
accept['verification_status'] = accept['verification_status'].apply(lambda status: verify_dict[status]).astype('uint8')

In [31]:
# Binary outcome: 1 for 'Fully Paid', 0 for any other status.
accept['loan_status'] = (accept['loan_status'] == 'Fully Paid').astype('uint8')

In [32]:
# One-hot encode 'purpose' (first category dropped) and append the indicator
# columns in place of the original column.
dummies = pd.get_dummies(accept['purpose'], prefix='purp', drop_first=True, dtype='uint8')
accept = pd.concat([accept.drop(columns='purpose'), dummies], axis=1)

In [33]:
# Missing dti / revol_util values are replaced by the sentinel -1
for col in ['dti', 'revol_util']:
    accept[col] = accept[col].fillna(-1)

In [34]:
# Inspect the dtype of every column after the encoding steps above
accept.dtypes

id                          uint32
loan_amnt                  float64
funded_amnt                float64
term                         uint8
int_rate                   float64
grade                        uint8
sub_grade                    uint8
emp_length                    int8
home_ownership               uint8
annual_inc                 float64
verification_status          uint8
issue_d                     uint32
loan_status                  uint8
dti                        float64
delinq_2yrs                float64
earliest_cr_line            uint32
open_acc                   float64
pub_rec                    float64
fico_range_high            float64
fico_range_low             float64
revol_bal                  float64
revol_util                 float64
total_pymnt                float64
recoveries                 float64
last_pymnt_d                uint32
purp_credit_card             uint8
purp_debt_consolidation      uint8
purp_educational             uint8
purp_home_improvemen

In [35]:
# Per-column check for remaining missing values (True = the column still contains NaN)
accept.isna().any()

id                         False
loan_amnt                  False
funded_amnt                False
term                       False
int_rate                   False
grade                      False
sub_grade                  False
emp_length                 False
home_ownership             False
annual_inc                 False
verification_status        False
issue_d                    False
loan_status                False
dti                        False
delinq_2yrs                False
earliest_cr_line           False
open_acc                   False
pub_rec                    False
fico_range_high            False
fico_range_low             False
revol_bal                  False
revol_util                 False
total_pymnt                False
recoveries                 False
last_pymnt_d               False
purp_credit_card           False
purp_debt_consolidation    False
purp_educational           False
purp_home_improvement      False
purp_house                 False
purp_major

In [36]:
# Split into train and test chronologically: after sorting by issue_d, the
# first 80% of rows form the training set and the remainder the test set.
accept = accept.sort_values('issue_d')
N = accept.shape[0]
split_at = int(N * .8)
train, test = accept.iloc[:split_at], accept.iloc[split_at:]

In [37]:
# Persist the training split for later use
pd.to_pickle(train, '../derivedData/train.pkl')

In [38]:
# Persist the test split for later use
pd.to_pickle(test, '../derivedData/test.pkl')