# LENDING CLUB CASE STUDY

## Import Libraries

In [293]:
import pandas as pd
import numpy as np

## Data Sourcing

#### Load `Data_Dictionary.xlsx`

In [294]:
#pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

pd.read_excel("Data_Dictionary.xlsx", header=0)

Unnamed: 0,LoanStatNew,Description
0,acc_now_delinq,The number of accounts on which the borrower is now delinquent.
1,acc_open_past_24mths,Number of trades opened in past 24 months.
2,addr_state,The state provided by the borrower in the loan application
3,all_util,Balance to credit limit on all trades
4,annual_inc,The self-reported annual income provided by the borrower during registration.
5,annual_inc_joint,The combined self-reported annual income provided by the co-borrowers during registration
6,application_type,Indicates whether the loan is an individual application or a joint application with two co-borrowers
7,avg_cur_bal,Average current balance of all accounts
8,bc_open_to_buy,Total open to buy on revolving bankcards.
9,bc_util,Ratio of total current balance to high credit/credit limit for all bankcard accounts.


#### Load -  `loan.csv`

In [295]:
loan = pd.read_csv('loan.csv', dtype={'next_pymnt_d': object})

## Data Understanding

In [296]:
print(loan.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39717 entries, 0 to 39716
Columns: 111 entries, id to total_il_high_credit_limit
dtypes: float64(74), int64(13), object(24)
memory usage: 33.6+ MB
None


In [297]:
print(loan.shape)

(39717, 111)


In [298]:
print(loan.columns)

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       ...
       'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 'pct_tl_nvr_dlq',
       'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tax_liens',
       'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit',
       'total_il_high_credit_limit'],
      dtype='object', length=111)


## Data Cleaning

#### Step 1 - Remove Nulls columns over 90%

In [299]:
#Checking Null Values in percentage across the columns
loan.isna().sum()*100/len(loan)

id                                  0.000000
member_id                           0.000000
loan_amnt                           0.000000
funded_amnt                         0.000000
funded_amnt_inv                     0.000000
term                                0.000000
int_rate                            0.000000
installment                         0.000000
grade                               0.000000
sub_grade                           0.000000
emp_title                           6.191303
emp_length                          2.706650
home_ownership                      0.000000
annual_inc                          0.000000
verification_status                 0.000000
issue_d                             0.000000
loan_status                         0.000000
pymnt_plan                          0.000000
url                                 0.000000
desc                               32.580507
purpose                             0.000000
title                               0.027696
zip_code  

In [300]:
loan = loan.loc[:, loan.isna().sum() < len(loan)*.9]

In [301]:
loan.shape

(39717, 55)

In [302]:
loan.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
       'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'policy_code', 'application_type',
       'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt',
       'pub_rec_bankruptcies', 'tax_liens'],
      dtype='object')

#### Step 2: Remove duplicates based on ID column

In [303]:
loan['id'].duplicated().any() # No Duplicates

False

#### Step 3 - Dropping Column with constant or unexplained values.

In [304]:
loan.drop(["id", "member_id", "grade", "sub_grade", "emp_title", "pymnt_plan", "url",
          "desc", "purpose", "title", "delinq_2yrs", "earliest_cr_line", "mths_since_last_delinq",
          "revol_bal", "revol_util", "total_pymnt_inv", "initial_list_status", "out_prncp", "out_prncp_inv",
          "total_pymnt", "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee", 
          "recoveries", "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt", "last_credit_pull_d",
          "collections_12_mths_ex_med", "policy_code", "application_type", "acc_now_delinq", "chargeoff_within_12_mths",
          "delinq_amnt", "tax_liens", "zip_code"],axis = 1, inplace=True)

In [305]:
loan.shape

(39717, 19)

In [306]:
loan.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'issue_d', 'loan_status', 'addr_state', 'dti',
       'inq_last_6mths', 'open_acc', 'pub_rec', 'total_acc',
       'pub_rec_bankruptcies'],
      dtype='object')

In [307]:
# "funded_amnt", "funded_amnt_inv" coluns to "loan_amnt"
loan.drop(["funded_amnt", "funded_amnt_inv"] ,axis = 1, inplace=True)

In [308]:
loan.shape

(39717, 17)

#### Step 4 - Removing rows `loan_status` == `Current`

In [309]:
print(loan["loan_status"].value_counts())

Fully Paid     32950
Charged Off     5627
Current         1140
Name: loan_status, dtype: int64


In [310]:
loan = loan[loan["loan_status"] != 'Current']

In [311]:
loan.shape

(38577, 17)

#### Step 5 - Fixing `term` column (Removing `month`)

In [312]:
loan.term.unique()

array([' 36 months', ' 60 months'], dtype=object)

In [313]:
loan.term = loan.term.apply(lambda x: int(x.replace(" months", "")))

In [314]:
loan.term.value_counts()

36    29096
60     9481
Name: term, dtype: int64

#### Step 5 - Fixing `int_rate` column (Removing `%`)

In [315]:
loan.int_rate = loan.int_rate.apply(lambda x: float(x[:-1]))

In [316]:
loan.int_rate.head()

0    10.65
1    15.27
2    15.96
3    13.49
5     7.90
Name: int_rate, dtype: float64

#### Step 6 - Fixing column `emp_length` (removing `years` and `+`)  

In [317]:
loan.emp_length.unique()

array(['10+ years', '< 1 year', '3 years', '8 years', '9 years',
       '4 years', '5 years', '1 year', '6 years', '2 years', '7 years',
       nan], dtype=object)

In [318]:
loan.emp_length = loan.emp_length.astype('str')

In [319]:
loan.emp_length = loan.emp_length.apply(lambda x: x.strip(" years"))

In [320]:
loan.emp_length.unique()

array(['10+', '< 1', '3', '8', '9', '4', '5', '1', '6', '2', '7', 'nan'],
      dtype=object)

In [321]:
def clean(input):
    if "nan" in input:
        return int(-1)
    elif "+" in input:
        return int(input.replace("+", ""))
    elif "<" in input:
        return int(input.replace("< ", ""))
    else:
        return int(input)

loan.emp_length = loan.emp_length.apply(clean)

In [322]:
loan.emp_length.unique()

array([10,  1,  3,  8,  9,  4,  5,  6,  2,  7, -1])

In [323]:
loan.verification_status.unique()

array(['Verified', 'Source Verified', 'Not Verified'], dtype=object)

In [324]:
loan.home_ownership.unique()

array(['RENT', 'OWN', 'MORTGAGE', 'OTHER', 'NONE'], dtype=object)

In [325]:
loan.loan_status.unique()

array(['Fully Paid', 'Charged Off'], dtype=object)

#### Step 7 - Deriving `issue_month` and `issue_year` from `issue_d`

In [332]:
from datetime import datetime
loan["issued_month"] = pd.DatetimeIndex(loan.issue_d.apply(lambda x : datetime.strptime(x, '%b-%y'))).month

In [333]:
loan.issued_month.unique()

Int64Index([12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1], dtype='int64', name='issue_d')

In [335]:
loan["issued_year"] = pd.DatetimeIndex(loan.issue_d.apply(lambda x : datetime.strptime(x, '%b-%y'))).year

In [336]:
loan.issued_year.unique()

array([2011, 2010, 2009, 2008, 2007])

In [338]:
loan.drop(["issue_d"] ,axis = 1, inplace=True)

In [339]:
loan.columns

Index(['loan_amnt', 'term', 'int_rate', 'installment', 'emp_length',
       'home_ownership', 'annual_inc', 'verification_status', 'loan_status',
       'addr_state', 'dti', 'inq_last_6mths', 'open_acc', 'pub_rec',
       'total_acc', 'pub_rec_bankruptcies', 'issued_month', 'issued_year'],
      dtype='object')