In [1]:
# import relevant libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.linear_model
import sklearn.tree
import sklearn.metrics
import sklearn.feature_selection
import sklearn.ensemble
import sklearn.model_selection

%matplotlib inline

# *Load in the data downloaded from Lending Club* 
*(source: https://www.lendingclub.com/info/download-data.action)*

*There are two datasets: one for rejected loans and one for approved loans (including how the approved loans performed).*

- *Approved loans are for the period from 2007-2011*
- *Rejected loans are for the period from 2007-2012*

In [22]:
# Load the rejected-application records from the local CSV download.
reject_df = pd.read_csv(
    '/Users/joce/learning/data_science/lending-club/RejectStatsA.csv',
    skiprows=1,        # drop the first line of the file; the header is the line after it
    low_memory=False,  # parse in a single pass rather than in chunks
)

In [23]:
# Load the approved-loan records from the local CSV download.
loan_df = pd.read_csv(
    '/Users/joce/learning/data_science/lending-club/LoanStats3a.csv',
    skiprows=1,        # drop the first line of the file; the header is the line after it
    low_memory=False,  # parse in a single pass rather than in chunks
)

In [24]:
# Preview the first five rejected applications to check that the columns parsed as expected.
reject_df.head(n=5)

Unnamed: 0,Amount Requested,Application Date,Loan Title,Risk_Score,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code
0,1000.0,2007-05-26,Wedding Covered but No Honeymoon,693.0,10%,481xx,NM,4 years,0
1,1000.0,2007-05-26,Consolidating Debt,703.0,10%,010xx,MA,< 1 year,0
2,11000.0,2007-05-27,Want to consolidate my debt,715.0,10%,212xx,MD,1 year,0
3,6000.0,2007-05-27,waksman,698.0,38.64%,017xx,MA,< 1 year,0
4,1500.0,2007-05-27,mdrigo,509.0,9.43%,209xx,MD,< 1 year,0


In [5]:
# Preview the first five approved loans to check that the columns parsed as expected.
loan_df.head(n=5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,...,,,,,0.0,0.0,,,,
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,...,,,,,0.0,0.0,,,,
2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,...,,,,,0.0,0.0,,,,
3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,...,,,,,0.0,0.0,,,,
4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,...,,,,,0.0,0.0,,,,


*Lending Club also provided us with a data dictionary for the approved loans dataset*

In [21]:
# Load the data dictionary spreadsheet from the local download.
data_dict = pd.read_excel(
    '/Users/joce/learning/data_science/lending-club/LCDataDictionary.xlsx'
)

In [11]:
# Display the data dictionary loaded from LCDataDictionary.xlsx for reference.
data_dict

Unnamed: 0,LoanStatNew,Description
0,acc_now_delinq,The number of accounts on which the borrower i...
1,acc_open_past_24mths,Number of trades opened in past 24 months.
2,addr_state,The state provided by the borrower in the loan...
3,all_util,Balance to credit limit on all trades
4,annual_inc,The self-reported annual income provided by th...
5,annual_inc_joint,The combined self-reported annual income provi...
6,application_type,Indicates whether the loan is an individual ap...
7,avg_cur_bal,Average current balance of all accounts
8,bc_open_to_buy,Total open to buy on revolving bankcards.
9,bc_util,Ratio of total current balance to high credit/...


# *Problem Statement*
1. *Can we predict whether a loan application will be approved?*
2. *How well did the approved loans do?*

*To predict loan application approval, we'll need a dataset that has both rejected applications and approved applications.*  

*However, the approved loan dataset has more columns than the rejected loan dataset.*
*We'll need to look at extracting relevant columns from the approved loan dataset so that we can merge it with the rejected loan dataset.*

In [14]:
# Check column names for each of our datasets and look for overlaps.
# Build the whole line first and print it with a single call: the
# statement form `print i, '|',` is Python 2-only syntax, whereas
# print(<one string>) behaves the same on Python 2 and Python 3.
# str() mirrors the implicit conversion the print statement performed.
print(' | '.join(str(col) for col in reject_df.columns) + ' |')

Amount Requested | Application Date | Loan Title | Risk_Score | Debt-To-Income Ratio | Zip Code | State | Employment Length | Policy Code |


In [15]:
# List the approved-loan column names on one line, separated by ' | '.
# Build the whole line first and print it with a single call: the
# statement form `print i, '|',` is Python 2-only syntax, whereas
# print(<one string>) behaves the same on Python 2 and Python 3.
# str() mirrors the implicit conversion the print statement performed.
print(' | '.join(str(col) for col in loan_df.columns) + ' |')

id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | pymnt_plan | url | desc | purpose | title | zip_code | addr_state | dti | delinq_2yrs | earliest_cr_line | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | out_prncp | out_prncp_inv | total_pymnt | total_pymnt_inv | total_rec_prncp | total_rec_int | total_rec_late_fee | recoveries | collection_recovery_fee | last_pymnt_d | last_pymnt_amnt | next_pymnt_d | last_credit_pull_d | collections_12_mths_ex_med | mths_since_last_major_derog | policy_code | application_type | annual_inc_joint | dti_joint | verification_status_joint | acc_now_delinq | tot_coll_amt | tot_cur_bal | open_acc_6m | open_il_6m | open_il_12m | open_il_24m | mths_since_rcnt_il | total_bal_il | il_util | open_rv_12m

*Comparing the two datasets*

|Rejected|Approved|
|---|---
|Amount Requested| loan_amnt
|Application Date | N.A.
| Loan Title | title
| Risk_Score |N.A.
| Debt-To-Income Ratio| dti 
| Zip Code |zip_code
| State | addr_state
| Employment Length| emp_length
| Policy Code |policy_code

*N.A. denotes that there is no comparable column*