# `LENDING CLUB CASE STUDY`

**`AIM`** : To identify the significant attributes/drivers that influence loan default

### Importing libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# Pandas display preferences: show up to 500 columns/rows and print floats with 5 decimals
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
pd.options.display.float_format = '{:.5f}'.format

## `Part 1: Importing Data`

In [None]:
# Load the loan data from the CSV file into `df`.
# low_memory=False turns off pandas' chunked parsing of the file (see the read_csv documentation).
df=pd.read_csv(r'../input/lending-club-data/loan.csv',low_memory=False)

In [None]:
df.head()

In [None]:
df.shape #nearly 40000 records and 111 attributes to analyse

## `Part 2: DATA WRANGLING`

In [None]:
# Compare the number of rows with the number of distinct loan ids and member ids
record_count = len(df.id)
loan_count = df.id.nunique()
member_count = df.member_id.nunique()
print("Total records:", record_count, "\n\nTotal unique loan records:", loan_count, "\nTotal unique members:", member_count)

#When the three numbers match, every borrower has only one loan record

### Handling missing values

In [None]:
# Share (in %) of all cells in the frame that are missing

missing_cells = df.isnull().sum().sum()
missing_cells / df.size * 100

#Nearly 50% of the individual cell entries have a missing value in the dataset

In [None]:
# Percentage of missing values per column, sorted from most to least missing.
# The column is named through to_frame(): passing a set literal as `columns=` to the
# DataFrame constructor is rejected by recent pandas versions.
missing_columns = (
    df.isnull()
    .mean()
    .mul(100)
    .to_frame(name="% missing")
    .sort_values(by="% missing", ascending=False)
)
#We will drop columns with % missing more than 7%

missing_columns

In [None]:
# Keep only the columns whose missing share is below 7%.
# The boolean mask is put into df's own column order before it is applied.
keep_column = missing_columns['% missing'].lt(7).reindex(df.columns)
df = df.loc[:, keep_column]

In [None]:
# Per-column missing percentage for the columns that remain

df.isnull().mean() * 100

In [None]:
df.head()


### Dropping columns

In [None]:
df.purpose.value_counts()

In [None]:
#since 'purpose' column has a much better capture than 'title' column we chose to drop 'title'

df.title.value_counts()

In [None]:
df.pymnt_plan.value_counts() #only one category present, so we drop this column

#all loans have no payment plan in place

In [None]:
df.initial_list_status.value_counts() #only one category present, so we drop this column

#all loans were initially listed as fractional loans to be disbursed and no loan was listed as a whole

In [None]:
df.policy_code.value_counts() #only one category present, so we drop this column

#only publicly available policies are availed

In [None]:
df.delinq_amnt.value_counts() #only one category present, so we drop this column

In [None]:
df.chargeoff_within_12_mths.value_counts() #only one category present, so we drop this column

In [None]:
df.application_type.value_counts() #only one category present, so we drop this column

#there is no joint loan application

In [None]:
df.acc_now_delinq.value_counts() #only one category present, so we drop this column

In [None]:
df.tax_liens.value_counts() #only one category present, so we drop this column

In [None]:
df.collections_12_mths_ex_med.value_counts() #only one category present, so we drop this column

In [None]:
#since the recovery collection fee is about 1%-2% of the total recoveries made, we drop this column 

(df.collection_recovery_fee/df.recoveries).describe()

In [None]:
# Columns removed before the analysis, grouped by the reason for dropping them.

# identifiers, free text and other columns not used in this analysis
unused_columns = ['id', 'member_id', 'url', 'emp_title', 'zip_code', 'title', 'delinq_2yrs']

# columns found to hold a single category in the value_counts checks above
single_valued_columns = ['pymnt_plan', 'initial_list_status', 'policy_code', 'delinq_amnt',
                         'chargeoff_within_12_mths', 'application_type', 'acc_now_delinq',
                         'tax_liens', 'collections_12_mths_ex_med']

# payment / recovery columns (e.g. total_pymnt, recoveries) that only have significance
# after the approval of the loan and so do not influence the decision of providing it
post_approval_columns = ['collection_recovery_fee', 'total_pymnt', 'total_pymnt_inv',
                         'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries',
                         'out_prncp', 'out_prncp_inv', 'last_pymnt_d', 'last_pymnt_amnt',
                         'last_credit_pull_d']

df = df.drop(columns=unused_columns + single_valued_columns + post_approval_columns)

#Now we have only features that are significant on or before approval of a loan (baseline features)

In [None]:
df.shape

In [None]:
# Split issue_d on '-' into its month and year parts.
# expand=True returns the parts as separate columns; unpacking the `.str` accessor itself
# is no longer supported by current pandas.
df[['issued_month', 'issued_year']] = df['issue_d'].str.split('-', expand=True)

In [None]:
# Prefix the year text with the century, then store it as an integer
df['issued_year'] = '20' + df['issued_year']

columns = ['issued_year']
# The builtin int replaces the np.int alias, which was removed from NumPy (same dtype)
df[columns] = df[columns].astype(int)

In [None]:
df.head()

### Correcting column types

In [None]:
df.head(3)

In [None]:
# Strip the '%' character from int_rate and revol_util and store both as float64

for pct_col in ('int_rate', 'revol_util'):
    df[pct_col] = df[pct_col].map(lambda raw: str(raw).replace("%", "")).astype('float64')

In [None]:
df.info()

In [None]:
df.head()

### Converting all cases to uppercase

In [None]:
# Text columns to normalise: upper-case and trim surrounding whitespace
col_names = [
    'term', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status',
    'issue_d', 'loan_status', 'purpose', 'addr_state', 'earliest_cr_line',
]
df[col_names] = df[col_names].apply(lambda text_col: text_col.str.upper().str.strip())

In [None]:
df.head() #checking if operation is successful

In [None]:
# Extract the number of months from the term column into term_months
# (term itself is kept, it is still used by later plots).
# str.rstrip('MONTHS') strips a set of characters rather than a suffix and leaves the
# separating whitespace behind, so the digits are extracted explicitly and stored as numbers.

df['term_months'] = pd.to_numeric(df['term'].str.extract(r'(\d+)', expand=False))
df.head()

### Validating internal rules

In [None]:
df[df.funded_amnt>df.loan_amnt] #no loan where funded amount > listed loan amount

In [None]:
df[df.funded_amnt_inv>df.funded_amnt] #no loan where funded amount by investor > total funded amount

In [None]:
# total_acc below open_acc does not seem to be possible, so we remove such records
df[df.total_acc < df.open_acc]  # the offending rows (not displayed, as this is not the cell's last expression)

accounts_consistent = df.total_acc.ge(df.open_acc)
df = df.loc[accounts_consistent]

### Creating additional features

In [None]:
# Revolving credit limit derived as 100 * revol_bal / revol_util, rounded to 2 decimals.
# A utilisation of 0 would make the division produce inf, so it is treated as unknown (NaN).
utilisation = df['revol_util'].replace(0, np.nan)
df['revol_credit_limit'] = (100 * df['revol_bal'] / utilisation).round(2)

df.drop(columns=['revol_bal'], inplace=True)
df.head()

In [None]:
# Year of loan issue: the part of issue_d after the '-', kept as text
df['loan_issued_year'] = [issued.split('-')[1] for issued in df['issue_d']]

In [None]:
#tagging states to a region
midwest=['IA','IL','IN','KS','MI','MN','MO','ND','NE','OH','SD','WI']
northeast=['CT','MA','ME','NH','NJ','NY','PA','RI','VT']
south=['AL','AR','DC','DE','FL','GA','KY','LA','MD','MS','NC','OK','SC','TN','TX','VA','WV']
west=['AK','AZ','CA','CO','HI','ID','MT','NM','NV','OR','UT','WA','WY']

# (region label, member states) pairs, checked in this order
_REGION_TABLE = (
    ('MIDWEST', midwest),
    ('NORTHEAST', northeast),
    ('SOUTH', south),
    ('WEST', west),
)

def states_to_region(state_addr):
    """Return the region label for a state code, or 'MISSING' if it is in none of the lists."""
    for region_label, member_states in _REGION_TABLE:
        if state_addr in member_states:
            return region_label
    return 'MISSING'
    
# Tag every loan with the region of the borrower's state
df['region'] = df['addr_state'].map(states_to_region)

### Binning data

In [None]:
# The three amount columns share the same bin edges and labels (right-closed intervals)
amount_bins = [0, 11500, 23000, 35000]
amount_labels = ['<=$11500', '$11500,$23000', '>$23000']
for amount_col in ('loan_amnt', 'funded_amnt', 'funded_amnt_inv'):
    df[amount_col + '_category'] = pd.cut(df[amount_col], bins=amount_bins, labels=amount_labels, right=True)

# Interest rate and annual income get their own edges
df['int_rate_category'] = pd.cut(
    df.int_rate, bins=[0, 12, 18, 25], labels=['<=12%', '12%-18%', '>18%'], right=True
)
df['annual_inc_category'] = pd.cut(
    df.annual_inc,
    bins=[0, 10000, 50000, 6000000],
    labels=['LOW INCOME', 'AVERAGE INCOME', 'HIGH INCOME'],
    right=True,
)

### Removing unwanted characters

In [None]:
# Replace every underscore in purpose with a space
df['purpose'] = df['purpose'].map(lambda purpose: " ".join(str(purpose).split("_")))

### Removing outliers

In [None]:
#We exclude borrowers with abnormally high incomes (at or above the 99th percentile)

income_cap = np.percentile(df.annual_inc, 99)
df = df.loc[df.annual_inc.lt(income_cap)]

### Filtering data

In [None]:
#Since we are interested in loans that are either defaulted or fully paid, we exclude records with loan_status='CURRENT'

not_current = df.loan_status.ne('CURRENT')
df = df.loc[not_current]
df.loan_status.value_counts()

In [None]:
df.loan_status.value_counts(normalize=True)

In [None]:
# Split the loans by loan_status into separate frames
defaulted_loans=df[df.loan_status=='CHARGED OFF']
paid_loans=df[df.loan_status=='FULLY PAID']
# NOTE(review): rows with loan_status 'CURRENT' were already filtered out of df in an earlier
# cell, so this frame is expected to be empty — build it before that filter if it is needed.
Current_loans = df[df.loan_status=='CURRENT']

## `UNIVARIATE ANALYSIS`

#### `categorical features`

In [None]:
df.head()

In [None]:
colors = ['#f29999','#62b3ff','#99ff99','#ffcc99']
fig, (ax1,ax2) = plt.subplots(1,2,figsize=(10,10))

# One pie of loan terms per outcome. The explode list is sized from the data so the
# plot does not fail if a subset holds a number of distinct terms other than two.
for axis, loans, title in ((ax1, paid_loans, 'Term in Paid loans'),
                           (ax2, defaulted_loans, 'Term in Defaulted loans')):
    term_counts = loans.term.value_counts()
    axis.pie(term_counts.values, labels=term_counts.index, colors=colors, autopct='%1.1f%%',
             explode=[0.05]*len(term_counts), shadow=True)
    axis.set_title(title)
    axis.legend()
plt.tight_layout()
plt.show()


#long term loans tend to default more often

In [None]:
# Sunburst of grade -> sub_grade for paid loans, each sector showing its label and count
fig = px.sunburst(paid_loans, path=['grade', 'sub_grade'], title="Loan grade in paid loans")
fig.update_traces(textinfo="label+value")
# Centre the title near the top of the figure
fig.update_layout(
    title_text="Loan grade in Paid loans",
    title_y=0.9,
    title_x=0.5,
    title_xanchor='center',
    title_yanchor='top',
)

fig.show()

#paid loans have a better loan grade (that is grade A or B)

In [None]:
# Sunburst of grade -> sub_grade for defaulted loans, each sector showing its label and count
fig = px.sunburst(defaulted_loans, path=['grade', 'sub_grade'], title="Loan grade in Defaulted loans")
fig.update_traces(textinfo="label+value")
# Centre the title near the top of the figure
fig.update_layout(
    title_text="Loan grade in Defaulted loans",
    title_y=0.9,
    title_x=0.5,
    title_xanchor='center',
    title_yanchor='top',
)
fig.show()

#defaulted loans have a lower loan grade

In [None]:
colors = ['#f29999','#62b3ff','#99ff99','#ffcc99']
fig, (ax1,ax2) = plt.subplots(1,2,figsize=(20,5))

# Borrower counts per home ownership type, paid (left) vs defaulted (right)
paid_counts = paid_loans.home_ownership.value_counts()
defaulted_counts = defaulted_loans.home_ownership.value_counts()

ax1.bar(x=paid_counts.index, height=paid_counts.values, color=colors)
ax1.set_title('Home ownership in Paid loans')
ax1.set_ylabel('Number of borrowers')

ax2.bar(x=defaulted_counts.index, height=defaulted_counts.values, color=colors)
ax2.set_title('Home ownership in Defaulted loans')
ax2.set_ylabel('Number of borrowers')
plt.show()

#Nothing significant observed from home ownership type

In [None]:
colors = ['#f29999','#62b3ff','#99ff99','#ffcc99']
fig, (ax1,ax2) = plt.subplots(1,2,figsize=(10,10))

# One pie of income verification status per outcome. Each pie sizes its explode list from
# its own counts; sizing the defaulted pie from the paid subset fails when the two subsets
# hold a different number of categories.
paid_counts = paid_loans.verification_status.value_counts()
ax1.pie(paid_counts.values, labels=paid_counts.index, colors=colors, autopct='%1.1f%%',
        explode=[0.05]*len(paid_counts), shadow=True)
ax1.set_title('Income verification status in Paid loans')

defaulted_counts = defaulted_loans.verification_status.value_counts()
ax2.pie(defaulted_counts.values, labels=defaulted_counts.index, colors=colors, autopct='%1.1f%%',
        explode=[0.05]*len(defaulted_counts), shadow=True)
ax2.set_title('Income verification status in Defaulted loans')

plt.tight_layout()
plt.show()


#Nothing significant observed from Income verification status

In [None]:
colors = ['#f29999','#62b3ff','#99ff99','#ffcc99']
fig, (ax1,ax2) = plt.subplots(2,1,figsize=(10,10))

# Borrower counts per employment length, paid (top) vs defaulted (bottom)
paid_counts = paid_loans.emp_length.value_counts()
defaulted_counts = defaulted_loans.emp_length.value_counts()

ax1.barh(y=paid_counts.index, width=paid_counts.values, color=colors)
ax1.set_title('Employment length in Paid loans')
ax1.set_xlabel('Number of borrowers')

ax2.barh(y=defaulted_counts.index, width=defaulted_counts.values, color=colors)
ax2.set_title('Employment length in Defaulted loans')
ax2.set_xlabel('Number of borrowers')
plt.tight_layout()
plt.show()

#Many borrowers seem to have mostly 10+ years of work length

In [None]:
colors = ['#f29999','#62b3ff','#99ff99','#ffcc99']
fig, (ax1,ax2) = plt.subplots(2,1,figsize=(10,10))

# Borrower counts per loan purpose, paid (top) vs defaulted (bottom)
paid_counts = paid_loans.purpose.value_counts()
defaulted_counts = defaulted_loans.purpose.value_counts()

ax1.barh(y=paid_counts.index, width=paid_counts.values, color=colors)
ax1.set_title('Loan purpose in Paid loans')
ax1.set_xlabel('Number of borrowers')

ax2.barh(y=defaulted_counts.index, width=defaulted_counts.values, color=colors)
ax2.set_title('Loan purpose in Defaulted loans')
ax2.set_xlabel('Number of borrowers')
plt.tight_layout()
plt.show()

#Top common reason seems to be 'DEBT CONSOLIDATION'

In [None]:
colors = ['#f29999','#62b3ff','#99ff99','#ffcc99']
fig, (ax1,ax2) = plt.subplots(1,2,figsize=(20,5))

# Borrower counts per issue year, paid (left) vs defaulted (right)
paid_counts = paid_loans.loan_issued_year.value_counts()
defaulted_counts = defaulted_loans.loan_issued_year.value_counts()

ax1.bar(x=paid_counts.index, height=paid_counts.values, color=colors)
ax1.set_title('Year at which loans were given for Paid loans')
ax1.set_ylabel('Number of borrowers')

ax2.bar(x=defaulted_counts.index, height=defaulted_counts.values, color=colors)
ax2.set_title('Year at which loans were given for Defaulted loans')
ax2.set_ylabel('Number of borrowers')
plt.show()

#More loans are disbursed from 2007 to 2011

In [None]:
colors = ['#f29999','#62b3ff','#99ff99','#ffcc99']
fig, (ax1,ax2) = plt.subplots(2,1,figsize=(18,18))

# Borrower counts per state (addr_state), paid (top) vs defaulted (bottom).
# The titles name the plotted column; they previously read 'Loan purpose'.
paid_counts = paid_loans.addr_state.value_counts()
defaulted_counts = defaulted_loans.addr_state.value_counts()

ax1.barh(y=paid_counts.index, width=paid_counts.values, color=colors)
ax1.set_title('Borrower state in Paid loans')
ax1.set_xlabel('Number of borrowers')

ax2.barh(y=defaulted_counts.index, width=defaulted_counts.values, color=colors)
ax2.set_title('Borrower state in Defaulted loans')
ax2.set_xlabel('Number of borrowers')
plt.tight_layout()
plt.show()

#Top common state seems to be California

#### `continuous features`

In [None]:
df.head()

In [None]:
colors = ['#f29999','#62b3ff','#99ff99','#ffcc99']
# Interest rate distribution per loan status
rate_ax = sns.boxplot(data=df, x='loan_status', y='int_rate')
rate_ax.set_xlabel('STATUS OF LOAN')
rate_ax.set_ylabel('INTEREST RATE')
plt.show()

#Charged off loan have a higher interest rate

In [None]:
# Annual income distribution per loan status
income_ax = sns.boxplot(data=df, x='loan_status', y='annual_inc')
income_ax.set_xlabel('STATUS OF LOAN')
income_ax.set_ylabel('ANNUAL INCOME')
plt.show()

#Annual income of FULLY PAID borrowers seems to be higher

In [None]:
# Funded amount distribution per loan status
sns.violinplot(data=df, x='loan_status', y='funded_amnt')
plt.show()

#Both types of borrowers seem to get a similar amount funded

In [None]:
# Installment distribution per loan status
sns.boxplot(data=df, x='loan_status', y='installment')
plt.show()

#Similar installment amount per month is also observed

In [None]:
# Density of the installment amount, one curve per loan status
sns.kdeplot(data=df, x='installment', hue='loan_status')
plt.show()

#The number of borrowers who are charged off are lesser, so this difference is observed

In [None]:
# dti distribution per loan status
dti_ax = sns.boxplot(data=df, x='loan_status', y='dti')
dti_ax.set_xlabel('STATUS OF LOAN')
dti_ax.set_ylabel('DTI')
plt.show()
#As expected, the debt to income ratio of defaulted borrowers is slightly higher

## `SEGMENTED ANALYSIS`

In [None]:
# Loan counts per purpose, split by loan status

purpose_ax = sns.countplot(data=df, x='purpose', hue='loan_status')
plt.xticks(rotation=90)
purpose_ax.set_title('Loan count based on Loan Status')
plt.show()

In [None]:
# Loan counts per loan term, split by loan status

term_ax = sns.countplot(data=df, x='term_months', hue='loan_status')
term_ax.set_xlabel('Loan Term (Months)')
term_ax.set_title('No. of charged off loans based on term(Months)')
plt.show()

In [None]:
# Loans corresponding to Rent and Mortgage were mostly paid
ownership_ax = sns.countplot(data=df, x='home_ownership', hue='loan_status')
plt.xticks(rotation=90)
ownership_ax.set_title("Home Ownership")
plt.show()

## `BIVARIATE ANALYSIS`

In [None]:
# Heatmap of the pairwise correlations between the numeric columns

plt.figure(figsize=(15,15))
# Restrict to numeric columns explicitly: DataFrame.corr() raises on text columns in
# pandas >= 2.0 instead of silently skipping them
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True, center=0.5)
plt.show()

In [None]:
# Charged-off proportion per annual income category

inc_range_vs_loan = (
    df.groupby(['annual_inc_category', 'loan_status'])['loan_status']
    .count()
    .unstack()
    .fillna(0)
    .reset_index()
)
inc_range_vs_loan['Total'] = inc_range_vs_loan[['CHARGED OFF', 'FULLY PAID']].sum(axis=1)
inc_range_vs_loan['Chargedoff_Proportion'] = inc_range_vs_loan['CHARGED OFF'].div(inc_range_vs_loan['Total'])
inc_range_vs_loan.sort_values('Chargedoff_Proportion', ascending=False)

In [None]:
# Bar plot of the charged-off proportion per annual income category

fig, ax1 = plt.subplots(figsize=(12, 8))
sns.barplot(data=inc_range_vs_loan, x='annual_inc_category', y='Chargedoff_Proportion', ax=ax1)
ax1.set_title('Annual Income vs Chargedoff Proportion')
ax1.set_xlabel('Annual Income', fontsize=14)
ax1.set_ylabel('Chargedoff Proportion', fontsize=14)
plt.show()

In [None]:
# Pairwise scatter plots (lower triangle only) of the main continuous columns
pair_columns = ['int_rate', 'installment', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
                'annual_inc', 'dti', 'revol_util']
sns.pairplot(data=df[pair_columns], corner=True)
plt.show()

In [None]:
# Bar chart of the percentage of loans with a given loan_status, per value of another column

def plotLoanStatus(dataframe, by, loanstatus='CHARGED OFF'):
    """Plot, for each value of column `by`, the % of loans whose loan_status is `loanstatus`.

    dataframe  : frame holding a 'loan_status' column and the column named by `by`
    by         : name of the column whose values form the bars
    loanstatus : loan_status value to report (default 'CHARGED OFF')

    Returns the matplotlib Axes the bars were drawn on.
    """
    # counts per (status, value) laid out as status rows x value columns
    counts_by_status = dataframe.groupby(['loan_status', by])[by].count().unstack()
    # counts per value regardless of status
    counts_overall = dataframe.groupby(by)[by].count()
    share_pct = counts_by_status.mul(100).div(counts_overall)

    ax = share_pct.loc[loanstatus].plot.bar(color=sns.color_palette('husl', 16))
    ax.set_ylabel('% of loans ' + loanstatus)
    plt.margins(0.2, 0.2)
    plt.tight_layout()
    return ax

In [None]:
# Percentage of charged-off loans per number of public record bankruptcies
plt.figure(figsize=(7,5))
bankruptcy_ax = plotLoanStatus(df, 'pub_rec_bankruptcies')
bankruptcy_ax.set_title("Public recorded Bankrupcies")
plt.show()

In [None]:
#Charged-off proportion per loan purpose
purpose_vs_loan = (
    df.groupby(['purpose', 'loan_status'])['loan_status']
    .count()
    .unstack()
    .fillna(0)
    .reset_index()
)
purpose_vs_loan['Total'] = purpose_vs_loan[['CHARGED OFF', 'FULLY PAID']].sum(axis=1)
purpose_vs_loan['Chargedoff_Proportion'] = purpose_vs_loan['CHARGED OFF'].div(purpose_vs_loan['Total'])
purpose_vs_loan.sort_values('Chargedoff_Proportion', ascending=False)

In [None]:
#Horizontal bar plot of the charged-off proportion per loan purpose
fig, ax1 = plt.subplots(figsize=(14, 8))
sns.barplot(data=purpose_vs_loan, x='Chargedoff_Proportion', y='purpose', ax=ax1)
ax1.set_title('Purpose of Loans vs Chargedoff Proportion')
ax1.set_xlabel('Chargedoff Proportion', fontsize=14)
ax1.set_ylabel('Purpose of Loans', fontsize=14)
plt.show()

In [None]:
# Charged-off proportion per loan grade
grade_vs_loan = (
    df.groupby(['grade', 'loan_status'])['loan_status']
    .count()
    .unstack()
    .fillna(0)
    .reset_index()
)
grade_vs_loan['Total'] = grade_vs_loan[['CHARGED OFF', 'FULLY PAID']].sum(axis=1)
grade_vs_loan['Chargedoff_Proportion'] = grade_vs_loan['CHARGED OFF'].div(grade_vs_loan['Total'])
grade_vs_loan.sort_values('Chargedoff_Proportion', ascending=False)

In [None]:
#Bar plot of the charged-off proportion per loan grade
fig, ax1 = plt.subplots(figsize=(14, 8))
sns.barplot(data=grade_vs_loan, x='grade', y='Chargedoff_Proportion', ax=ax1)
ax1.set_title('Grades vs Chargedoff Proportion')
ax1.set_ylabel('Chargedoff Proportion', fontsize=14)
ax1.set_xlabel('Grades', fontsize=14)
plt.show()

In [None]:
# Most number of defaulters are from the state of CA, followed by FL

# Number of charged-off loans per state
charged_off = df.loc[df['loan_status'].eq('CHARGED OFF')]
g = charged_off.groupby('addr_state')['loan_status'].count().reset_index()
plt.figure(figsize=(10,10))
state_ax = sns.barplot(data=g, x='loan_status', y='addr_state')
state_ax.set_xlabel('Count of loan status to be defaulter')
state_ax.set_ylabel('State')
plt.show()

In [None]:
# Number of loans per issue year, drawn as a line with one tick per year
g = df.groupby('issued_year')['loan_status'].count()
year_ax = g.plot.line(x_compat=True)
plt.xticks(np.arange(min(g.index), max(g.index)+1, 1.0))
year_ax.set_title('Number of loan granted over the years')
year_ax.set_xlabel('Issue Year')
year_ax.set_ylabel('Count')
plt.show()

In [None]:
# Box plot of the interest rate per issue year
plt.figure(figsize=(14,8))
ax = sns.boxplot(data=df, x='issued_year', y='int_rate', palette='rainbow')
# NOTE(review): title and axis labels are drawn in white ('w'); presumably meant for a
# dark notebook theme — confirm they are visible on the background in use
ax.set_title('Year vs Interest Rate', fontsize=15, color='w')
ax.set_xlabel('Year', fontsize=14, color='w')
ax.set_ylabel('Interest Rate', fontsize=14, color='w')
plt.show()

In [None]:
# Loan amount vs interest rate for borrowers with at least one public record bankruptcy.
# x, y and data are passed by keyword: current seaborn releases reject them as positionals.
sns.jointplot(x='loan_amnt', y='int_rate', data=df.loc[df.pub_rec_bankruptcies > 0])

In [None]:
# dti vs interest rate for all loans.
# x and y are passed by keyword: current seaborn releases reject them as positionals.
sns.jointplot(x='dti', y='int_rate', data=df)