In [1]:
# Initial setup by importing needed modules
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
# Load the raw loan export. header=1 takes the column names from the second
# line of the file (the line before it is skipped); low_memory=False makes
# pandas read the file in one pass instead of in chunks.
test = pd.read_csv(
    'data/LoanStats3b.csv',
    header=1,
    low_memory=False,
)

In [3]:
# Repeat the data-wrangling steps that were applied to the training data.

# Keep columns with at least 40,000 non-null values, then drop rows without a
# loan amount. `how` and `thresh` cannot be combined in dropna (pandas >= 2.0
# raises TypeError); older versions ignored `how` whenever `thresh` was given,
# so passing `thresh` alone keeps the same rule.
test_drop = test.dropna(axis=1, thresh=40000).dropna(subset=['loan_amnt'])

# Zero-fill missing values in the float columns. The result must be assigned
# back: select_dtypes(...).fillna(0) on its own returns a new frame and leaves
# test_drop untouched.
float_cols = test_drop.select_dtypes(include='float').columns
test_drop = test_drop.fillna(dict.fromkeys(float_cols, 0))

# Parse the issue month (e.g. 'Dec-2013') into a timestamp column.
test_drop = test_drop.assign(loan_start_d=pd.to_datetime(test_drop['issue_d'], format='%b-%Y'))

# Years employed: the first two characters of emp_length as a number, with the
# '< ' prefix counted as 0.
test_drop['yr_emp'] = test_drop['emp_length'].str[0:2].replace('< ', 0).astype('float')

# Length of credit history in years, measured against 2013; rows with no
# earliest credit line get 0.
test_drop['earliest_cr_line'] = pd.to_datetime(test_drop['earliest_cr_line'], format='%b-%Y')
test_drop['yr_credit'] = (2013 - test_drop['earliest_cr_line'].dt.year).fillna(0)

# Convert a percentage string to a fraction. The '%' is removed, not replaced
# by '0': appending a zero is only harmless when the value already has a
# decimal part ('45.3%' -> '45.30'); for '21%' it would give '210', i.e. 2.1
# instead of 0.21.
# NOTE(review): confirm the training-side notebook parses revol_util the same
# way so both feature sets stay aligned.
test_drop['revol_util_dec'] = (
    test_drop['revol_util'].str.replace('%', '', regex=False).astype('float') / 100
)

# Binary target: 1 = fully paid, 0 = charged off. Any loan_status not listed
# here maps to NaN.
loan_dict = {
    'Fully Paid': 1,
    'Charged Off': 0,
    'Does not meet the credit policy. Status:Fully Paid': 1,
    'Does not meet the credit policy. Status:Charged Off': 0,
}
test_drop = test_drop.assign(target=test_drop['loan_status'].map(loan_dict))

In [4]:
# Keep only the columns used for the machine-learning validation: the
# predictors plus the 'target' label.
test_ml = test_drop.loc[:, [
    'purpose',
    'yr_credit',
    'dti',
    'revol_util_dec',
    'total_acc',
    'addr_state',
    'target',
]]

In [5]:
# Transform the data the same way as the training set: one indicator column
# per loan purpose (the first category is dropped), appended to the frame.
purpose_ml = pd.get_dummies(test_ml.purpose, drop_first=True)
test_ml = pd.concat((test_ml, purpose_ml), axis='columns')

In [6]:
# Load the state list produced from the training set so that the same states
# are grouped under 'SML' here, keeping both data sets consistent.
SML = pd.read_csv('data/SML.csv', header=0)
SML.columns = ['state']
SML_list = SML['state'].tolist()

# Flag rows whose state is on the list, then relabel those rows as 'SML' and
# keep the original state code everywhere else.
test_ml['helper_col'] = test_ml.addr_state.isin(SML_list)
test_ml['state'] = np.where(test_ml['helper_col'], 'SML', test_ml['addr_state'])

In [7]:
# One indicator column per (bucketed) state, with the first category dropped,
# appended to the feature frame.
state = pd.get_dummies(test_ml.state, drop_first=True)
test_ml = pd.concat((test_ml, state), axis='columns')

In [8]:
def dti(data):
    """Map a row's 'dti' value to a coarse group label.

    Parameters
    ----------
    data : mapping or pandas.Series
        Row-like object with a numeric ``'dti'`` entry.

    Returns
    -------
    int
        The smallest of the edges 5, 10, 15, 20, 30 that is greater than or
        equal to the value, or 40 when the value exceeds 30. A NaN value
        fails every comparison and therefore also yields 40.
    """
    value = data['dti']
    # Edges are tried in ascending order, so the first edge that is >= value
    # identifies the group.
    for upper_edge in (5, 10, 15, 20, 30):
        if value <= upper_edge:
            return upper_edge
    return 40
    
# Label every row with its dti group, then append one indicator column per
# group that occurs in the data.
test_ml['dti_gp'] = test_ml.apply(dti, axis='columns')
dti_gp = pd.get_dummies(test_ml.dti_gp)
test_ml = pd.concat((test_ml, dti_gp), axis='columns')

In [9]:
# Final revolving-utilisation feature: the parsed fraction with missing
# values replaced by 0.
test_ml['revol'] = test_ml.revol_util_dec.fillna(value=0)

In [10]:
# Remove the raw and helper columns that have been replaced by derived
# features, leaving the final model-ready frame.
test_ml_fin = test_ml.drop(
    columns=[
        'addr_state',
        'helper_col',
        'dti_gp',
        'state',
        'dti',
        'purpose',
        'revol_util_dec',
    ]
)

In [11]:
# Write the prepared test features to CSV, leaving out the row index.
test_ml_fin.to_csv(
    'data/lending_test.csv',
    index=False,
)