# Lending Club Case Study

In [54]:
# Importing core libraries required for the case study
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
import seaborn as sea

## Loading Data

In [78]:
# Read the complete loan dataset from disk into df_loan
df_loan = pd.read_csv('data/loan.csv', low_memory=False)

# Keep the untouched column labels and row count for later reference
original_columns = df_loan.columns
rowcount = df_loan.shape[0]
df_loan.shape

(39717, 111)

## Dropping redundant columns w.r.t. the case study

In [79]:
# Remove columns that do not contribute to the analysis because they are either
# transactional or descriptive in nature (id, member_id, url, emp_title, desc, title).
# sub_grade is removed as well - the original note explaining why was cut off;
# presumably the analysis stays at the grade level (TODO confirm).
# NOTE(review): the recorded outputs look stale - the shape shown for this cell has 105
# columns (111 - 7 would be 104) and sub_grade still appears in the later info() listing.
# Re-run the notebook top to bottom to refresh them.
df_clean = df_loan.drop(
    columns=['id', 'member_id', 'url', 'emp_title', 'desc', 'title', 'sub_grade']
)
df_clean.shape

(39717, 105)

## Dropping columns representing customer behaviour

In [80]:

# Remove every column that describes customer behaviour.
# Behavioural data is captured only after a loan has been approved, so it is
# not available at approval time and cannot be used in the calculations.
df_clean = df_clean.drop(columns=[
    'delinq_2yrs',
    'earliest_cr_line',
    'inq_last_6mths',
    'open_acc',
    'pub_rec',
    'revol_bal',
    'revol_util',
    'total_acc',
    'out_prncp',
    'out_prncp_inv',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
    'last_pymnt_d',
    'last_pymnt_amnt',
    'last_credit_pull_d',
    'application_type',
])
df_clean.shape

(39717, 84)

## Dropping all columns having all NA values

In [81]:
# Discard each column in which every single record is NaN / null
df_clean = df_clean.dropna(axis=1, how='all')
df_clean.shape

(39717, 30)

## Dropping all columns having all Zero values

In [82]:
# Keep only the columns that contain at least one value different from 0
# (a NaN compares as "not equal to 0", so it counts as a non-zero value here)
df_clean = df_clean.loc[:, df_clean.ne(0).any(axis=0)]
df_clean.shape

(39717, 28)

## Dropping all columns with constant values

In [83]:
# Function to drop all columns that hold a constant value (NA values are ignored)
def drop_constant_columns(df):
    """Return ``df`` without the columns that contain a single distinct value.

    A column is treated as constant when it has exactly one distinct non-NA
    value. Columns made up entirely of NA values have zero distinct values and
    are therefore kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the non-constant columns, in their original order.
    """
    # Count distinct non-NA values for every column in a single pass instead of
    # dropping the columns one by one (each drop would rebuild the whole frame).
    distinct_counts = df.nunique(dropna=True)
    # Select by position with a plain boolean array so that frames with
    # duplicate column labels are handled as well.
    keep = (distinct_counts != 1).to_numpy()
    return df.loc[:, keep]

# Remove the constant columns from df_clean. A column is constant when it holds the
# same value in every row; NA values are ignored when making that decision.
df_clean = df_clean.pipe(drop_constant_columns)
df_clean.shape

(39717, 22)

## Removing all rows where loan_status = Current

In [84]:
# Rows with loan_status == "Current" are loans whose repayment is still in progress.
# Their final outcome is not known yet, so they cannot inform a default / fully-paid
# decision and are filtered out.
df_clean = df_clean.loc[df_clean['loan_status'].ne("Current")]
df_clean.shape

(38577, 22)

In [85]:
# Show column dtypes and non-null counts to analyse missing / empty values.
# DataFrame.info() prints the report itself and returns None, so it must not be
# wrapped in print() - that would append a stray "None" line to the output.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38577 entries, 0 to 39716
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   loan_amnt               38577 non-null  int64  
 1   funded_amnt             38577 non-null  int64  
 2   funded_amnt_inv         38577 non-null  float64
 3   term                    38577 non-null  object 
 4   int_rate                38577 non-null  object 
 5   installment             38577 non-null  float64
 6   grade                   38577 non-null  object 
 7   sub_grade               38577 non-null  object 
 8   emp_length              37544 non-null  object 
 9   home_ownership          38577 non-null  object 
 10  annual_inc              38577 non-null  float64
 11  verification_status     38577 non-null  object 
 12  issue_d                 38577 non-null  object 
 13  loan_status             38577 non-null  object 
 14  purpose                 38577 non-null

## Saving Data Snapshot

In [86]:
# Write the cleaned frame to disk as CSV so it can be inspected manually.
# index=True is the to_csv default, spelled out here: the row index is written
# as the first column of the file.
df_clean.to_csv('./data/loan_clensed.csv', index=True)