In [None]:
#import libraries
import pandas as pd
import numpy as np
import openpyxl as op
from google.colab import files
from openpyxl.utils.dataframe import dataframe_to_rows

# Upload a file from the local machine through Colab. The next cell copies
# kaggle.json into ~/.kaggle/, so the Kaggle API token is the file expected here.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [None]:
#Create a directory and copy json file in that directory
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

#permission to access kaggle data
!chmod 600 ~/.kaggle/kaggle.json

#Download kaggle data
!kaggle datasets download -d wendykan/lending-club-loan-data

#Unzip kaggle data
from zipfile import ZipFile
file_name = "lending-club-loan-data.zip"

with ZipFile(file_name, 'r') as zip:
  zip.extractall()
  print('Unzip Done')

#Create a dataframe of whole data
data = pd.read_csv('loan.csv')
total_rows_Data=len(data.axes[0])
print('Total Rows in Data:', total_rows_Data)

Downloading lending-club-loan-data.zip to /content
 99% 711M/720M [00:06<00:00, 127MB/s]
100% 720M/720M [00:06<00:00, 123MB/s]
Unzip Done


  interactivity=interactivity, compiler=compiler, result=result)


Total Rows in Data: 2260668


# Data Cleaning and Data Preparation

In [None]:
#Create working Dataframe
working_columns = ['term', 'annual_inc', 'annual_inc_joint', 'int_rate', 'grade', 'purpose', 'out_prncp', 'loan_status', 'loan_amnt', 'issue_d', 'emp_length', 'home_ownership', 'dti', 'dti_joint', 'inq_last_12m', 'inq_fi', 'all_util']
# Explicit copy: every assignment below modifies df only, never a slice of `data`.
df = data.loc[:, working_columns].copy()
total_rows = len(df)
print('Total Rows in DataFrame:', total_rows)

#Replace missing values with a per-column sentinel (not always 0)
nan_sentinels = {
    'annual_inc': 1,
    'annual_inc_joint': 0,
    'dti': 400000000,
    'dti_joint': 0,
    'inq_last_12m': 1000,
    'inq_fi': 1000,
    'all_util': 0,
    'emp_length': 'others',
}
df = df.fillna(value=nan_sentinels)
# A reported income of 0 is treated like a missing one: both become 1, which lands in the
# first income band (0, 1] defined further down.
df['annual_inc'] = df['annual_inc'].replace(0, 1)

#Calculate Total income, dti and inq
df['total_inc'] = df['annual_inc'] + df['annual_inc_joint']

# dti * income gives a debt figure per applicant; dividing the combined figure by the
# combined income yields an income-weighted DTI.
df['debt'] = df['dti'] * df['annual_inc']
df['debt_joint'] = df['dti_joint'] * df['annual_inc_joint']
df['total_dti'] = ((df['debt'] + df['debt_joint']) / df['total_inc']).fillna(0)

df['total_inq'] = df['inq_last_12m'] + df['inq_fi']

#Assigning Flag to Bad account
# Only 'Charged Off' is flagged '1'. The flags are kept as strings, exactly as before;
# any status not listed here is left unchanged.
bad_acc_flags = {
    'Charged Off': '1',
    'Current': '0',
    'Default': '0',
    'Does not meet the credit policy. Status:Charged Off': '0',
    'Does not meet the credit policy. Status:Fully Paid': '0',
    'Fully Paid': '0',
    'In Grace Period': '0',
    'Late (16-30 days)': '0',
    'Late (31-120 days)': '0',
}
# Assign the result of replace() instead of calling it with inplace=True on df['bad_acc']:
# the chained in-place form operates on a temporary under pandas Copy-on-Write.
df['bad_acc'] = df['loan_status'].replace(bad_acc_flags)

#Generate fine Bands according to Total income
inc_bands = [0, 1, 34000, 42000, 50000, 56000, 65000, 72096, 83806, 98800, 125000, 1100000000]
inc_bands_name = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
df['inc_band'] = pd.cut(df['total_inc'], inc_bands, labels=inc_bands_name)

#Generate coarse Bands according to Total income
inc_bands = [0, 1, 50000, 100000, 150000, 1100000000]
inc_bands_name = [1, 2, 3, 4, 5]
df['inc_band1'] = pd.cut(df['total_inc'], inc_bands, labels=inc_bands_name)

#Generate fine Bands according to Interest rate
int_rates = [0, 7.69, 9.17, 10.99, 11.99, 12.99, 13.99, 15.37, 16.99, 18.99, 32]
bands_name = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
df['int_rate_band'] = pd.cut(df['int_rate'], int_rates, labels=bands_name)

#Generate coarse Bands according to Interest rate
int_rate_bands1 = [4, 10, 15, 20, 25, 32]
bands_name1 = [1, 2, 3, 4, 5]
df['int_rate_band1'] = pd.cut(df['int_rate'], int_rate_bands1, labels=bands_name1)

#Generate Bands according to total_dti
# Values outside the outer edges (e.g. rows carrying the 400000000 dti sentinel) get no band (NaN).
dti_band = [-2, 10, 15, 20, 30, 100, 65000]
dti_bands_name = [1, 2, 3, 4, 5, 6]
df['dti_band'] = pd.cut(df['total_dti'], dti_band, labels=dti_bands_name)

#Generate Bands according to total_inquiry
inq_band = [-1, 1.99, 2.99, 4.99, 5.99, 100, 25000]
inq_bands_name = [1, 2, 3, 4, 5, 6]
df['inq_band'] = pd.cut(df['total_inq'], inq_band, labels=inq_bands_name)

#Generate Bands according to all_util
all_util_band = [-1, 40.99, 55.99, 65.99, 75.99, 250]
all_util_bands_name = [1, 2, 3, 4, 5]
df['all_util_band'] = pd.cut(df['all_util'], all_util_band, labels=all_util_bands_name)

#issue year
# issue_d looks like 'Oct-2017'; characters 4..7 are the four-digit year.
df["issue_d"] = df["issue_d"].astype(str)
df["issue_y"] = df["issue_d"].str.slice(4, 8)

#Banding of Issue Year
df['issue_y_band'] = df['issue_y'].replace(
    ['2007', '2008', '2009', '2010', '2011', '2012'], '2007-2012')

df['issue_y_band1'] = df['issue_y'].replace(
    ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014'], '2014 and Before')

#Banding of Employee Length
emp_length_bands = {
    '< 1 year': '<=3 year',
    '1 year': '<=3 year',
    '2 years': '<=3 year',
    '3 years': '<=3 year',
    '4 years': '<=6 years',
    '5 years': '<=6 years',
    '6 years': '<=6 years',
    '7 years': '<=9 years',
    '8 years': '<=9 years',
    '9 years': '<=9 years',
    '10+ years': '>10 years',
}
df['emp_len_band'] = df['emp_length'].replace(emp_length_bands)

#Finding Total number of rows after data cleaning
total_rows = len(df)
print('Total Rows in DataFrame after Cleaning:', total_rows)

df.tail()

Total Rows in DataFrame: 2260668
Total Rows in DataFrame after Cleaning: 2260668


Unnamed: 0,term,annual_inc,annual_inc_joint,int_rate,grade,purpose,out_prncp,loan_status,loan_amnt,issue_d,emp_length,home_ownership,dti,dti_joint,inq_last_12m,inq_fi,all_util,total_inc,debt,debt_joint,total_dti,total_inq,bad_acc,inc_band,inc_band1,int_rate_band,int_rate_band1,dti_band,inq_band,all_util_band,issue_y,issue_y_band,issue_y_band1,emp_len_band
2260663,60 months,58000.0,0.0,14.08,C,debt_consolidation,8687.2,Current,12000,Oct-2017,10+ years,MORTGAGE,20.88,0.0,1.0,2.0,38.0,58000.0,1211040.0,0.0,20.88,3.0,0,6,3,7,2,4,3,1,2017,2017,2017,>10 years
2260664,60 months,30000.0,135000.0,25.82,E,debt_consolidation,0.0,Fully Paid,12000,Oct-2017,< 1 year,MORTGAGE,19.28,7.36,7.0,5.0,44.0,165000.0,578400.0,993600.0,9.527273,12.0,0,11,5,10,5,1,5,2,2017,2017,2017,<=3 year
2260665,36 months,64000.0,0.0,11.99,B,debt_consolidation,5993.27,Current,10000,Oct-2017,10+ years,OWN,12.96,0.0,3.0,0.0,93.0,64000.0,829440.0,0.0,12.96,3.0,0,6,3,4,2,2,3,5,2017,2017,2017,>10 years
2260666,60 months,60000.0,0.0,21.45,D,debt_consolidation,9924.69,Current,12000,Oct-2017,others,RENT,30.82,0.0,4.0,0.0,55.0,60000.0,1849200.0,0.0,30.82,4.0,0,6,3,10,4,5,3,2,2017,2017,2017,others
2260667,60 months,60000.0,0.0,21.45,D,credit_card,13541.01,Current,16550,Oct-2017,3 years,RENT,18.4,0.0,2.0,0.0,51.0,60000.0,1104000.0,0.0,18.4,2.0,0,6,3,10,4,3,2,2,2017,2017,2017,<=3 year


**Distribution of accounts and balances**

In [None]:
#Frequency table of outstanding principal: one row per distinct balance, with the number of accounts holding it
values, count = np.unique(df['out_prncp'], return_counts=True)

dfTemp = pd.DataFrame({'Values': values, 'Count': count})

dfTemp.tail(20)

Unnamed: 0,Values,Count
364379,39487.93,3
364380,39493.2,4
364381,39501.1,2
364382,39507.57,1
364383,39507.58,1
364384,39522.94,1
364385,39522.95,2
364386,39530.36,1
364387,39541.82,1
364388,39548.64,2


In [None]:
#calculate balances
# Outstanding principal summed per coarse interest-rate band. The aggregation is passed by
# name ("sum") rather than as the built-in, so the cell does not depend on the global name
# `sum` still pointing at the built-in and avoids pandas' deprecated builtin-callable path.
x = df.groupby(['int_rate_band1']).agg({'out_prncp': 'sum'})

# reset_index() turns the band index into a column directly, with no temp.csv round trip.
data1 = x.reset_index()
data1['out_prncp'] = data1['out_prncp'].astype(int)
data1.head(20)

Unnamed: 0,int_rate_band1,out_prncp
0,1,2816165477
1,2,3950040009
2,3,2347579137
3,4,591147139
4,5,346660276


**Distribution of Bad Accounts and balances by Demographics**

In [None]:
def _excel_value(value):
    """Return a plain Python object for NumPy scalars; pass other values through unchanged."""
    return value.item() if isinstance(value, np.generic) else value


def portfolio(row_ind, demographic_val, workbook_path='MI Portfolio(8).xlsx',
              sheet_name='Bad_Account_Portfolio', frame=None):
    """Write a bad-account summary for one demographic into an Excel worksheet.

    One worksheet row is written per distinct value of ``demographic_val``,
    starting at row ``row_ind``:

    * column 2 - the demographic value
    * column 3 - total number of accounts
    * column 4 - number of bad accounts (``bad_acc`` equal to ``'1'``)
    * column 5 - total ``loan_amnt``
    * column 6 - ``loan_amnt`` of the bad accounts

    The totals are computed per value directly, so a value with no bad (or no
    good) accounts reports 0 instead of shifting the figures of later rows.

    Parameters
    ----------
    row_ind : int
        First worksheet row to write.
    demographic_val : str
        Column of the frame to segment by.
    workbook_path : str, optional
        Workbook that is loaded, updated and saved in place.
    sheet_name : str, optional
        Worksheet inside the workbook that receives the table.
    frame : pandas.DataFrame, optional
        Data to summarise; defaults to the notebook-level ``df``. It must contain
        ``demographic_val``, ``bad_acc`` and ``loan_amnt``.

    Raises
    ------
    KeyError
        If a required column or the worksheet does not exist.
    """
    source = df if frame is None else frame

    is_bad = source['bad_acc'].astype(str) == '1'
    work = pd.DataFrame({
        'segment': source[demographic_val],
        'accounts': 1,
        'bad_accounts': is_bad.astype(int),
        'amount': source['loan_amnt'],
        'bad_amount': source['loan_amnt'].where(is_bad, 0),
    })
    # observed=True: only values that actually occur get a row, also for categorical bands.
    summary = work.groupby('segment', observed=True).sum()

    first_col = 2
    wb = op.load_workbook(workbook_path)
    try:
        ws = wb[sheet_name]
        for rpos, rec in enumerate(summary.itertuples(), start=row_ind):
            ws.cell(row=rpos, column=first_col).value = _excel_value(rec.Index)
            ws.cell(row=rpos, column=first_col + 1).value = _excel_value(rec.accounts)
            ws.cell(row=rpos, column=first_col + 2).value = _excel_value(rec.bad_accounts)
            ws.cell(row=rpos, column=first_col + 3).value = _excel_value(rec.amount)
            ws.cell(row=rpos, column=first_col + 4).value = _excel_value(rec.bad_amount)
        wb.save(workbook_path)
    finally:
        # Release the workbook even if a column/sheet lookup or the save fails.
        wb.close()
    
# Each call fills one demographic table of the 'Bad_Account_Portfolio' sheet; the first
# argument is the worksheet row where that table starts. Only the income-band table is
# written by this cell, the other calls are disabled.
#portfolio(3, 'term')
#portfolio(7, 'grade') 
#portfolio(16, 'purpose') 
portfolio(32, 'inc_band') 
#portfolio(44, 'int_rate_band')
# portfolio(56, 'int_rate_band1')
# #portfolio(63, 'inc_band1')

**Distribution of Bad Accounts and balances by Demographics - Manually**

In [None]:
# Loan amount by demographic value (rows) and bad_acc flag (columns), shown in the notebook.
# Demographics used with portfolio() in this notebook: 'grade', 'purpose', 'inc_band',
# 'inc_band1', 'int_rate_band', 'int_rate_band1'.
demographic = 'int_rate_band'

uniqueValues, tot_acc = np.unique(df[demographic], return_counts=True)

# Aggregations are given by name; passing the built-in `sum` is deprecated in pandas and
# fails outright if the global name `sum` has been rebound in this session.
x = df.groupby([demographic, 'bad_acc']).agg(
    {
        'bad_acc': "count",
        'loan_amnt': "sum",
    }
)

# x has 'bad_acc' both as an index level and as a column, so reset_index() would collide.
# Writing to CSV and reading back flattens it; the duplicate column comes back as 'bad_acc.1'.
filename = "temp.csv"
x.to_csv(filename)
data1 = pd.read_csv(filename, low_memory=False)

# To pivot the account counts instead of the amounts, use values='bad_acc.1'.
data1 = data1.pivot(index=demographic, columns='bad_acc', values='loan_amnt')
data1.head(15)

bad_acc,0,1
int_rate_band,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4209677700,105416450
2,3124592100,194094525
3,4219356575,286790675
4,2655125600,248143900
5,2350697550,305096900
6,2481091325,382263800
7,2829889450,393509125
8,2555442350,513033000
9,2210500000,582087675
10,3311275075,1058032150


In [None]:
# Account count and total loan amount per (term, bad_acc flag).
# "sum" is passed by name: the built-in callable form is deprecated in pandas and breaks
# if the global name `sum` has been rebound in this session.
x = df.groupby(['term', 'bad_acc']).agg(
    {
        'bad_acc': "count",
        'loan_amnt': "sum",
    }
)
print(x)

                    bad_acc    loan_amnt
term       bad_acc                      
 36 months 0        1450605  18513427850
           1         159149   2003735025
 60 months 0         548408  11434219875
           1         102506   2064733175


In [None]:
# Outstanding principal summed per fine interest-rate band.
# "sum" is passed by name: the built-in callable form is deprecated in pandas and breaks
# if the global name `sum` has been rebound in this session.
x = df.groupby(['int_rate_band']).agg(
    {
        'out_prncp': "sum",
    }
)
print(x)

                  out_prncp
int_rate_band              
1              1.436986e+09
2              7.870345e+08
3              1.588711e+09
4              9.047969e+08
5              6.738393e+08
6              7.173873e+08
7              1.103931e+09
8              8.506686e+08
9              6.863622e+08
10             1.301876e+09


***Vintage View***

In [None]:
#Yearly Acquisition
# Writes one row per issue year into the 'Vintage View' sheet, starting at row 3:
# column 2 = year, column 3 = number of loans, column 4 = total loan amount.

report_path = r'MI report(2).xlsx'

# Count and amount come from the same groupby, so they are aligned by year label rather
# than by the position of two separately computed arrays.
loans_by_year = df.groupby('issue_y')['loan_amnt']
x = pd.DataFrame({
    'accounts': loans_by_year.size(),
    'loan_amnt': loans_by_year.sum(),
})

wb = op.load_workbook(report_path)
try:
    ws = wb['Vintage View']
    for rpos, (year, accounts, amount) in enumerate(x.itertuples(), start=3):
        ws.cell(row=rpos, column=2).value = year
        ws.cell(row=rpos, column=3).value = accounts
        ws.cell(row=rpos, column=4).value = amount
    wb.save(report_path)
finally:
    # Release the workbook even if the sheet lookup or the save fails.
    wb.close()

In [None]:
# Loans per issue year, with the earliest years collapsed into a single row.
values, count = np.unique(df['issue_y'], return_counts=True)

dfTemp = pd.DataFrame({'Values': values, 'Count': count})

# The first eight distinct issue years are merged into one '2014 and before' row, which
# stays at index 7.
# NOTE(review): this assumes those eight values are 2007-2014 - confirm if the data changes.
# The total is stored under its own name so the built-in sum() is not overwritten.
early_total = count[:8].sum()

# .loc writes into dfTemp itself; attribute-style chained assignment
# (dfTemp.Values[7] = ...) raises SettingWithCopyWarning and may write to a copy.
dfTemp.loc[7, 'Values'] = '2014 and before'
dfTemp.loc[7, 'Count'] = early_total
dfTemp = dfTemp.drop(list(range(7)))
dfTemp.tail(20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,Values,Count
7,2014 and before,466345
8,2015,421095
9,2016,434407
10,2017,443579
11,2018,495242


In [None]:
#Number of accounts per issue-year band and home ownership type

x = df.groupby(['issue_y_band1', 'home_ownership']).agg(
    {
        'home_ownership': "count",
    }
)

# x has 'home_ownership' both as an index level and as a column; the CSV round trip
# flattens it and the duplicate column comes back as 'home_ownership.1'.
# (The former `x.add_suffix('_Count').reset_index()` line was removed: its result was
# never assigned or displayed.)
filename = "temp.csv"
x.to_csv(filename)
df1 = pd.read_csv(filename, low_memory=False)

df2 = df1.pivot(index='issue_y_band1',
                columns='home_ownership',
                values='home_ownership.1')

# Combinations that never occur are missing after the pivot; count them as 0.
df2 = df2.fillna(0)

# Fold the rare ownership types into one 'OTHERS' column.
df2['OTHERS'] = df2['ANY'] + df2['OTHER'] + df2['NONE']
df2 = df2.drop(['OTHER', 'ANY', 'NONE'], axis=1)

df2.head(100)

home_ownership,MORTGAGE,OWN,RENT,OTHERS
issue_y_band1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014 and Before,235908.0,41712.0,188492.0,233.0
2015,207683.0,45766.0,167644.0,2.0
2016,211516.0,53037.0,169744.0,110.0
2017,217123.0,52414.0,173643.0,399.0
2018,239220.0,60128.0,195406.0,488.0


In [None]:
#Number of accounts per issue-year band and Employee Length Band
x = df.groupby(['issue_y_band1', 'emp_len_band']).agg(
    {
        'emp_len_band': "count",
    }
)

# x has 'emp_len_band' both as an index level and as a column; the CSV round trip
# flattens it and the duplicate column comes back as 'emp_len_band.1'.
# (The former `x.add_suffix('_Count').reset_index()` line was removed: its result was
# never assigned or displayed.)
filename = "temp.csv"
x.to_csv(filename)
df1 = pd.read_csv(filename, low_memory=False)

df2 = df1.pivot(index='issue_y_band1', columns='emp_len_band', values='emp_len_band.1')

# Combinations that never occur are missing after the pivot; count them as 0.
df2 = df2.fillna(0)
df2.head(100)

emp_len_band,<=3 year,<=6 years,<=9 years,>10 years,others
issue_y_band1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014 and Before,143864,84917,66473,150073,21018
2015,132740,66274,56743,141521,23817
2016,135409,70951,49861,149972,28214
2017,148638,72696,44317,146057,31871
2018,162170,84093,46610,160382,41987


In [None]:
# Loan-status counts per issue-year band, reduced to Charged Off / Fully Paid / Delinquent.
status_keys = ['issue_y_band1', 'loan_status']
x = df.groupby(status_keys).agg({'loan_status': "count"})

# Flatten through CSV; the duplicated 'loan_status' column is read back as 'loan_status.1'.
filename = "temp.csv"
x.to_csv(filename)

df1 = (
    pd.read_csv('temp.csv', low_memory=False)
      .pivot(index='issue_y_band1', columns='loan_status', values='loan_status.1')
      .replace(np.nan, 0)
)

# Delinquent = in default or late; computed before the source columns are dropped.
df1['Delinquent'] = df1['Default'] + df1['Late (16-30 days)'] + df1['Late (31-120 days)']

statuses_to_drop = [
    'Default',
    'Late (16-30 days)',
    'Late (31-120 days)',
    'Does not meet the credit policy. Status:Charged Off',
    'In Grace Period',
    'Does not meet the credit policy. Status:Fully Paid',
    'Current',
]
df1 = df1.drop(statuses_to_drop, axis=1)

df1.head(10)

loan_status,Charged Off,Fully Paid,Delinquent
issue_y_band1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014 and Before,76392.0,373022.0,444.0
2015,75275.0,298137.0,1808.0
2016,66667.0,208032.0,5773.0
2017,36379.0,122521.0,9799.0
2018,6942.0,40240.0,7841.0


In [None]:
#Vintage View by Demographic and Bureau details
# Counts loans per (issue-year band, demographic band, loan status) and reshapes the result
# into a Demographic x Year table of 'Delinquent' and 'Charged Off' counts.

Demographic = 'int_rate_band1'

x = df.groupby(
  ['issue_y_band1', Demographic , 'loan_status']
).agg( 
    {
        
        'loan_status': "count",
    }
)

# Move loan_status from the row index into the columns (one column per status).
x = x.unstack()

# Round trip through CSV to flatten the two-level column header. The columns are read back
# as 'Unnamed: 0', 'Unnamed: 1', 'loan_status', 'loan_status.1', ... 'loan_status.8'.
filename = "temp.csv"
x.to_csv(filename)
df1 = pd.read_csv('temp.csv', low_memory=False)
df1 = df1.replace(np.nan, '0')

# NOTE(review): the positional names below appear to rely on the status columns being written
# in alphabetical order, i.e. .2 = 'Default', .7 = 'Late (16-30 days)',
# .8 = 'Late (31-120 days)' and the unsuffixed 'loan_status' = 'Charged Off' - confirm
# against temp.csv. Non-numeric cells become NaN via errors='coerce'.
df1['loan_status.2'] = pd.to_numeric(df1['loan_status.2'], errors= 'coerce')
df1['loan_status.7'] = pd.to_numeric(df1['loan_status.7'], errors= 'coerce')
df1['loan_status.8'] = pd.to_numeric(df1['loan_status.8'], errors= 'coerce')

# Sum of the three columns above; renamed to 'Delinquent' further down.
df1['loan_status.9'] = df1['loan_status.2'] + df1['loan_status.7'] + df1['loan_status.8']

df1 = df1.drop(['loan_status.1', 'loan_status.2', 'loan_status.3','loan_status.4', 'loan_status.5', 'loan_status.6', 'loan_status.7', 'loan_status.8'], axis =1)
# Rows 0 and 1 are presumably the leftover header lines of the two-level CSV header - TODO confirm.
df1 = df1.drop([0, 1])
df1.rename(columns = {'Unnamed: 0':'Year', 'Unnamed: 1': 'Demographic', 'loan_status': 'Charged Off', 'loan_status.9': 'Delinquent'}, inplace = True) 

# One row per demographic band, one column per (measure, year).
df1 = df1.pivot(index = 'Demographic', columns = 'Year', values = ['Delinquent', 'Charged Off'])
df1 = df1.replace(np.nan, '0')
# Changes the pandas display option for the rest of the session, not just for this cell.
pd.set_option('display.max_rows', 100)
df1.head(100)

Unnamed: 0_level_0,Delinquent,Delinquent,Delinquent,Delinquent,Delinquent,Charged Off,Charged Off,Charged Off,Charged Off,Charged Off
Year,2014 and Before,2015,2016,2017,2018,2014 and Before,2015,2016,2017,2018
Demographic,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
1,6.0,186.0,892.0,1223.0,902.0,5956,10825,8689,3397,897
2,132.0,718.0,2603.0,3785.0,2611.0,27748,30288,26669,13479,2156
3,216.0,707.0,1417.0,3205.0,2581.0,29585,26610,18762,11778,2305
4,77.0,163.0,605.0,772.0,1114.0,11817,6006,8206,3359,1005
5,13.0,34.0,256.0,814.0,633.0,1286,1546,4341,4366,579


In [None]:
# Quartile cut points and maximum of total income.
quantile_levels = [0.25, 0.5, 0.75, 1]
df['total_inc'].quantile(quantile_levels)

0.25        48000.0
0.50        69996.0
0.75       100000.0
1.00    110000000.0
Name: total_inc, dtype: float64

In [None]:
# Minimum, quartiles and maximum of total income, computed with NumPy.
x = np.percentile(df['total_inc'], q=[0, 25, 50, 75, 100])
print(x)

[1.0000e+00 4.8000e+04 6.9996e+04 1.0000e+05 1.1000e+08]
