# Lending Club Case Study

In [282]:
# Importing core libraries required for the case study
import os

import matplotlib.pyplot as plot
import numpy as np
import pandas as pd
import seaborn as sea

# Loan data is hosted on a public Google Drive share.
# Approach referenced from https://stackoverflow.com/questions/56611698/pandas-how-to-read-csv-file-from-google-drive-public
share_link = "https://drive.google.com/file/d/1gHUGDYuGFd3paXvypwvDzGTe_HZSjILf/view?usp=sharing"
# The file id is the second-to-last path segment of the share link.
file_id = share_link.rsplit('/', 2)[1]
# Direct-download form of the link, which pandas can read as a CSV.
url = 'https://drive.google.com/uc?id=' + file_id

debug = False


# Utility function to take a snapshot of the csv locally just to validate the outputs
def snapshot_data(df, snapshot_name):
    """Write a debugging snapshot of ``df`` as a CSV file under ``./data``.

    Does nothing unless the module-level ``debug`` flag is truthy.

    Args:
        df: DataFrame to snapshot; its shape is printed before writing.
        snapshot_name: Label embedded in the file name
            (``./data/snapshot.<snapshot_name>.loan.csv``).
    """
    if not debug:
        return
    print(df.shape)
    # to_csv does not create missing folders, so make sure ./data exists first.
    os.makedirs('./data', exist_ok=True)
    df.to_csv('./data/snapshot.' + snapshot_name + '.loan.csv')

## Loading Data

In [283]:
# Read the complete dataset from the download link into df_loan
df_loan = pd.read_csv(filepath_or_buffer=url, low_memory=False)

## Step1 - Dropping Rows - where loan_status = "Current"

In [284]:
# The rows where loan_stats=Current are the data where the loan repayment is currently in progress
# The loans which are currently in progress will not contribute to decisions 
# of default or pass as it's difficult to predict the outcome
#
# Dropping the rwos early as, dropping all Currrent rows introduces NA columns which can be easily dropped
df_clean = df_clean[df_clean['loan_status'] != "Current"]

In [285]:
snapshot_data(df_clean,'step1')

## Step2 - Dropping Columns

In [286]:
# Dropping columns which is unique id in nature. They dont contribute to loan analysis
df_clean = df_loan.drop(['id','member_id'],  axis=1)

# Columns removed in this step:
#  - url, emp_title, desc, title: text/description columns (names of establishments etc.)
#    which will not contribute to loan pass or failure. The url column is a static link
#    with the id as its attribute, so it is redundant.
#  - sub_grade: the current analysis is limited to Grade only.
df_clean = df_clean.drop(columns=['url', 'emp_title', 'desc', 'title', 'sub_grade'])

In [287]:
# Columns treated as behavioural data of the customer, captured after loan approval.
# That data is not available at the time of loan approval and thus cannot be used
# for calculations, so the columns are removed.
post_approval_columns = [
    'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths',
    'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
    'out_prncp', 'out_prncp_inv',
    'total_pymnt', 'total_pymnt_inv',
    'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
    'recoveries', 'collection_recovery_fee',
    'last_pymnt_d', 'last_pymnt_amnt', 'last_credit_pull_d',
    'application_type',
]
df_clean = df_clean.drop(columns=post_approval_columns)

In [288]:
# Remove the columns in which every single value is NA.
# In debug mode, list those columns first for verification.
if debug == True:
    all_na_mask = df_clean.isna().all()
    print("Columns with all values as NA", df_clean.columns[all_na_mask].tolist())

# Keep only the columns that have at least one non-NaN / non-Null record
df_clean = df_clean.dropna(axis=1, how="all")

In [289]:
# Keep only the columns that contain at least one value different from zero
has_non_zero = df_clean.ne(0).any(axis=0)
df_clean = df_clean.loc[:, has_non_zero]

In [290]:
# Function to drop all columns which hold a constant value (NA values are ignored).
# Example: if a column contains only 1 and NA, the column is dropped.
# If it contains 1, 2 and NA, the column is kept.
def drop_constant_columns(df):
    """Return ``df`` without the columns that have exactly one distinct non-NA value.

    Each dropped column name is printed when the module-level ``debug`` flag
    is on. When no column qualifies, ``df`` itself is returned.
    """
    constant_columns = [
        name for name in df.columns
        if df[name].nunique(dropna=True) == 1
    ]
    if not constant_columns:
        return df
    for name in constant_columns:
        if debug == True:
            print(name)
    return df.drop(constant_columns, axis=1)

# Remove the constant columns from df_clean (constant = one single value across the rows, NA values ignored)
df_clean = df_clean.pipe(drop_constant_columns)

In [291]:
# Function which checks the share of empty (NA) values in each column of a dataframe
# and drops the column when that share reaches the threshold (65% by default).
# The threshold is the percentage which decides imputing vs dropping.
def drop_mostly_empty_columns(df, threshold=0.65):
    """Return ``df`` without the columns that are mostly empty.

    Args:
        df: DataFrame to clean.
        threshold: Fraction of NA values (0..1) at or above which a column
            is dropped. Defaults to 0.65.

    Returns:
        A DataFrame without the mostly-empty columns; ``df`` itself when no
        column qualifies.

    Each dropped column name is printed when the module-level ``debug`` flag
    is on.
    """
    mostly_empty = [
        name for name in df.columns
        # The NA share is rounded to two decimals before the comparison, so a
        # share that rounds up to the threshold also counts as reaching it.
        if round(df[name].isna().mean(), 2) >= threshold
    ]
    if not mostly_empty:
        return df
    if debug:
        for name in mostly_empty:
            print(name)
    return df.drop(mostly_empty, axis=1)
# Remove the mostly-empty columns from df_clean
df_clean = df_clean.pipe(drop_mostly_empty_columns)

In [292]:
# Debug snapshot of the data after the column drops of Step2
snapshot_data(df_clean, snapshot_name='step2')

## Step3 - Convert the data types

In [293]:
# Cast the amount columns loan_amnt and funded_amnt to float
amount_dtypes = dict.fromkeys(['loan_amnt', 'funded_amnt'], 'float')
df_clean = df_clean.astype(amount_dtypes)

In [294]:
# Convert the term column into an integer from a string.
# The leading run of digits is extracted instead of slicing off a fixed number of
# trailing characters, so the conversion does not depend on the exact width of
# the text suffix (presumably " months" - the old slice removed 7 characters).
df_clean['term'] = df_clean['term'].str.extract(r'(\d+)', expand=False).astype('int64')

In [295]:
# Convert int_rate to float by removing the trailing "%" character.
# rstrip only removes an actual "%" sign (the old slice removed the last character
# whatever it was), and the conversion runs vectorized over the whole column.
df_clean['int_rate'] = df_clean['int_rate'].str.rstrip('%').astype(float)

In [296]:
# Debug snapshot of the data after the type conversions of Step3
snapshot_data(df_clean, snapshot_name='step3')

## Step 4 - Identify columns with blank values which need to be imputed

In [297]:
# List the columns which have blank values together with the percentage of
# rows that are blank. These values may need to be imputed.
row_count = len(df_clean)
columns_with_na = [name for name in df_clean.columns if df_clean[name].isna().any()]
for name in columns_with_na:
    blank_rows = int(df_clean[name].isna().sum())
    print(name, round(blank_rows / row_count * 100, 2), "%")

emp_length 2.71 %
pub_rec_bankruptcies 1.75 %


In [298]:
# The share of affected rows is very small, so the rows with a missing
# emp_length or pub_rec_bankruptcies are dropped instead of being imputed
df_clean = df_clean.dropna(subset=['emp_length', 'pub_rec_bankruptcies'])

In [299]:
# Debug snapshot of the data after the NA rows were dropped in Step4
snapshot_data(df_clean, snapshot_name='step4')

## Step 5 - Converting the loan_status to boolean column

In [300]:
# Turn loan_status into a boolean column: True for "Fully Paid", False for every other status
df_clean['loan_status'] = df_clean['loan_status'].eq('Fully Paid')

In [301]:
# Debug snapshot of the data after the loan_status conversion of Step5
snapshot_data(df_clean, snapshot_name='step5')

In [302]:
# Printing column info to analyse missing values, empty values in a column.
# DataFrame.info() writes the summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37945 entries, 0 to 39680
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   loan_amnt             37945 non-null  float64
 1   funded_amnt           37945 non-null  float64
 2   funded_amnt_inv       37945 non-null  float64
 3   term                  37945 non-null  int64  
 4   int_rate              37945 non-null  float64
 5   installment           37945 non-null  float64
 6   grade                 37945 non-null  object 
 7   emp_length            37945 non-null  object 
 8   home_ownership        37945 non-null  object 
 9   annual_inc            37945 non-null  float64
 10  verification_status   37945 non-null  object 
 11  issue_d               37945 non-null  object 
 12  loan_status           37945 non-null  bool   
 13  purpose               37945 non-null  object 
 14  zip_code              37945 non-null  object 
 15  addr_state         

In [303]:
# Always take one final snapshot - Temp file
# Create the output folder first: to_csv does not create missing directories.
os.makedirs('./data', exist_ok=True)
df_clean.to_csv('./data/snapshot.clean.loan.csv')