# Lending Club Case Study

In [58]:
# Importing core libraries required for the case study
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
import seaborn as sea

debug = True


# Utility function to take a snapshot of the csv locally just to validate the outputs
def snapshot_data(df, snapshot_name):
    """Print the frame's shape and write it to a local snapshot CSV.

    Nothing happens when the module-level ``debug`` flag is falsy.

    Parameters
    ----------
    df : object exposing ``shape`` and ``to_csv`` (a pandas DataFrame in this notebook)
        The data to snapshot.
    snapshot_name : str
        Label inserted into the file name, giving
        ``./data/snapshot.<snapshot_name>.loan.csv``.

    Returns
    -------
    None
    """
    # Guard clause: snapshots are a debugging aid only.
    if not debug:
        return
    print(df.shape)
    df.to_csv('./data/snapshot.' + snapshot_name + '.loan.csv')

## Loading Data

In [59]:
# Read the complete loan dataset from the local CSV into df_loan
df_loan = pd.read_csv(filepath_or_buffer='data/loan.csv', low_memory=False)

## Step1 - Dropping redundant columns

In [60]:
# Remove columns that do not contribute to the overall analysis because they are
# either transactional or descriptive in nature.
# sub_grade is removed as well; the grade column is not in the drop list and stays.
df_clean = df_loan.drop(
    columns=['id', 'member_id', 'url', 'emp_title', 'desc', 'title', 'sub_grade'],
)
snapshot_data(df_clean, 'step1')

(39717, 104)


## Step2 - Dropping columns representing customer behaviour

In [61]:

# Remove every column that holds behavioural data.
# Customer behaviour is recorded after the loan has been approved, so it is not
# available at the time of loan approval and cannot be used for calculations.
df_clean = df_clean.drop(
    columns=[
        'delinq_2yrs',
        'earliest_cr_line',
        'inq_last_6mths',
        'open_acc',
        'pub_rec',
        'revol_bal',
        'revol_util',
        'total_acc',
        'out_prncp',
        'out_prncp_inv',
        'total_pymnt',
        'total_pymnt_inv',
        'total_rec_prncp',
        'total_rec_int',
        'total_rec_late_fee',
        'recoveries',
        'collection_recovery_fee',
        'last_pymnt_d',
        'last_pymnt_amnt',
        'last_credit_pull_d',
        'application_type',
    ],
)
snapshot_data(df_clean, 'step2')

(39717, 83)


## Step3 - Removing all rows where loan_status = Current

In [62]:
# Rows with loan_status == "Current" are loans whose repayment is still in progress.
# Loans in progress do not help decide between default and pass, because their
# outcome is difficult to predict.
#
# The rows are removed early: once all "Current" rows are gone, some columns become
# entirely NA and can then be dropped easily.
df_clean = df_clean.loc[df_clean['loan_status'].ne("Current")]
snapshot_data(df_clean, 'step3')

(38577, 83)


## Step4 - Dropping all columns having all NA values

In [63]:
# List the columns in which every value is NA, for verification
print("Columns with all values as NA", df_clean.columns[df_clean.isnull().all(axis=0)].tolist())

# Remove the columns whose records are all NaN or Null
df_clean = df_clean.dropna(axis=1, how="all")
snapshot_data(df_clean, 'step4')

Columns with all values as NA ['next_pymnt_d', 'mths_since_last_major_derog', 'annual_inc_joint', 'dti_joint', 'verification_status_joint', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_il_6m', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'tot_hi_cred_lim', 'total_bal_ex_m

## Step5 - Dropping all columns having all Zero values

In [64]:
# Keep only the columns that contain at least one value different from zero
df_clean = df_clean.loc[:, df_clean.ne(0).any(axis=0)]
snapshot_data(df_clean, 'step5')

(38577, 26)


## Step6 - Dropping all columns with constant values

In [65]:
# Function to drop all columns that hold a single constant value (NA values are ignored)
def drop_constant_columns(df):
    """Return ``df`` without the columns that contain exactly one distinct non-NA value.

    The name of every dropped column is printed, in column order. Columns that are
    entirely NA have zero distinct values and are therefore kept. When no column
    qualifies, ``df`` itself is returned.
    """
    single_valued = [
        name for name in df.columns
        if df[name].nunique(dropna=True) == 1
    ]
    for name in single_valued:
        print(name)
    return df.drop(single_valued, axis=1) if single_valued else df

# Drop all constant columns from df_clean (a column is constant when it holds a single
# value across the rows; NA values are ignored)
df_clean = df_clean.pipe(drop_constant_columns)
snapshot_data(df=df_clean, snapshot_name='step6')

pymnt_plan
initial_list_status
collections_12_mths_ex_med
policy_code
chargeoff_within_12_mths
tax_liens
(38577, 20)


## Step7 - Dropping Columns whose empty values exceed 65%

In [66]:
# Function which checks the share of empty values in each column of a dataframe and
# drops the column when that share reaches the threshold (65% by default).
# The threshold is the percentage which decides imputing vs dropping.
def drop_mostly_empty_columns(df, threshold=0.65):
    """Return ``df`` without the columns whose NA share is at or above ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    threshold : float, default 0.65
        Fraction of NA values (0..1) at which a column is dropped.

    Returns
    -------
    pandas.DataFrame
        A frame without the mostly-empty columns. The name of every dropped
        column is printed, in column order.
    """
    # The NA share is rounded to two decimals before the comparison, so a column
    # with e.g. 64.6% NA values counts as 65% and is dropped at the default threshold.
    na_share = df.isna().mean().round(2)
    sparse_columns = na_share.index[na_share >= threshold].tolist()
    for column in sparse_columns:
        print(column)
    return df.drop(columns=sparse_columns)
# Drop the mostly-empty columns from df_clean and snapshot the result
df_clean = df_clean.pipe(drop_mostly_empty_columns)
snapshot_data(df=df_clean, snapshot_name='step7')

mths_since_last_delinq
mths_since_last_record
(38577, 18)


In [67]:
# Printing column info to analyse missing values, empty values in a column.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38577 entries, 0 to 39716
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   loan_amnt             38577 non-null  int64  
 1   funded_amnt           38577 non-null  int64  
 2   funded_amnt_inv       38577 non-null  float64
 3   term                  38577 non-null  object 
 4   int_rate              38577 non-null  object 
 5   installment           38577 non-null  float64
 6   grade                 38577 non-null  object 
 7   emp_length            37544 non-null  object 
 8   home_ownership        38577 non-null  object 
 9   annual_inc            38577 non-null  float64
 10  verification_status   38577 non-null  object 
 11  issue_d               38577 non-null  object 
 12  loan_status           38577 non-null  object 
 13  purpose               38577 non-null  object 
 14  zip_code              38577 non-null  object 
 15  addr_state         