In [None]:
# This notebook handles Aiko's 3rd through 5th questions
import pandas as pd
# Load the accepted-loans extract (path is relative to the notebook's working directory).
accept = pd.read_csv(filepath_or_buffer='../rawData/accepted_2007_to_2018Q4.csv')

# Remove subtotal rows
def find_weird(x):
    """Return True when ``x`` can be converted with ``int()``, otherwise False.

    Used as a row filter on the ``id`` column: values that do not parse as an
    integer yield False so the caller can drop those rows.  Note that, despite
    the name, True means the value is a *normal* (integer-like) id.

    Only the exceptions ``int()`` raises for unparseable input are treated as
    "not an integer"; anything else (e.g. KeyboardInterrupt) propagates.
    """
    try:
        int(x)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: None / unsupported
        # objects; OverflowError: +/- infinity.
        return False
    return True

# Keep only the rows whose id parses as an integer, then store the id as an int.
accept = accept.loc[accept['id'].map(find_weird)]
accept['id'] = accept['id'].map(int)
# Show the resulting dtype as the cell output.
accept['id'].dtype

In [None]:
# Question 3
# Mean interest rate per sub-grade (rows) and loan term (columns).
df1 = accept.loc[:, ['int_rate', 'sub_grade', 'term']]
(
    df1
    .groupby(['sub_grade', 'term'])['int_rate']
    .mean()
    .unstack()
)
# This analysis uses absolute rates, not spreads to Treasuries.
# With a very flat/inverted yield curve this can be misleading.
# It appears to show little/no premium for longer lending, but spreads might
# tell a different story.

In [None]:
# Look at rates as a function of month
import datetime as dt
def make_dateval(s):
    """Convert a 'Mon-YYYY' string (e.g. 'Dec-2015') to a datetime on the 1st of that month.

    Parameters
    ----------
    s : str or datetime.datetime
        Month/year text with the month abbreviation before the dash and the
        four-digit year after it.  A value that is already a datetime
        (pandas Timestamps subclass datetime) is returned unchanged, so
        applying this function to an already-converted column - for example
        when a cell is re-run - does not fail.

    Returns
    -------
    datetime.datetime
        The first day of the given month.

    Raises
    ------
    ValueError
        If the rearranged text does not match '%Y%b%d'.
    """
    if isinstance(s, dt.datetime):
        return s
    parts = s.split('-')
    # Rebuild as '<year><month abbr>01' so the parsed date is the 1st of the month.
    # Note: %b matches month abbreviations according to the current locale.
    return dt.datetime.strptime(parts[1] + parts[0] + '01', '%Y%b%d')
# Parse the 'Mon-YYYY' issue dates in one vectorised call rather than a
# Python-level strptime per row.  pd.to_datetime returns an already-datetime
# column unchanged, so this cell can be re-run without error.
accept['issue_d'] = pd.to_datetime(accept['issue_d'], format='%b-%Y')
# Calendar month (1-12) of issuance.
accept['month'] = accept['issue_d'].dt.month

In [None]:
# Mean interest rate per grade (rows) and issue month (columns).
df2 = (
    accept.loc[:, ['int_rate', 'grade', 'month']]
    .groupby(['grade', 'month'])['int_rate']
    .mean()
    .unstack()
)
# Extra row: for each month, the plain (unweighted) average of the grade means.
df2.loc['Mean', :] = df2.mean()
df2
# September looks expensive

In [None]:
# Why loan interest rate should depend on rating
import matplotlib as mpl
import matplotlib.pyplot as plt
# Share of loans per sub-grade whose status is neither 'Fully Paid' nor 'Current'.
df3 = (
    accept.loc[:, ['sub_grade', 'loan_status']]
    .assign(fully_paid=lambda d: d['loan_status'].isin(['Fully Paid', 'Current']).astype(int))
    .groupby('sub_grade')['fully_paid']
    .mean()
)
df3 = 1 - df3
ax3 = df3.plot()
ax3.set(xlabel='Sub Grade', ylabel='Non-Perf Rate', title='Default Rate by Rating Grade')
# Show the y axis as whole percentages.
ax3.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0%}'))
plt.show()
# Default rates increase strikingly linearly as the rating worsens, so of course you need to charge for that.
# Re time-series dependence: this is because the base reference rate changes (e.g. 3- or 5-year Treasury yields).
# Re loan term: yes, of course.  The borrower can prepay at any time but the investor can't call early.
# Thus the investor is taking credit risk for a longer period (which means the credit has more chance to deteriorate)
# and the investor is also giving the borrower a longer option on prepayment.

In [None]:
# Question 4
# Most of the first section is covered by what I wrote above
# What happens to percentage rates of loans involving settlement?
# Relative frequency of each settlement status among rows that have one.
settle_counts = accept['settlement_status'].value_counts()
s4 = settle_counts / settle_counts.sum()
print(s4)

In [None]:
# Histogram of settlement percentages in 5-point bins (label = bin start).
df5 = (
    accept.loc[accept['settlement_percentage'].notna(), ['settlement_percentage']]
    .assign(sett_bin=lambda d: d['settlement_percentage'].apply(lambda pct: 5 * int((pct - .01) / 5)))
    .groupby('sett_bin')['settlement_percentage']
    .count()
)
# Drop the three highest bins before plotting.
# NOTE(review): presumably these are outliers; the cut-off of 3 is not explained - confirm.
df5 = df5.iloc[:-3]
ax5 = df5.plot.bar()
ax5.set(xlabel='Settle Percent', ylabel='Count', title='Distribution of Settlement Percentages')
ax5.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'))

In [None]:
# How does loan term impact default rates?
# Per sub-grade and term: share of loans that are neither 'Fully Paid' nor 'Current'.
df6 = (
    accept.loc[:, ['loan_status', 'sub_grade', 'term']]
    .assign(fully_paid=lambda d: d['loan_status'].isin(['Fully Paid', 'Current']).astype(int))
    .groupby(['sub_grade', 'term'])['fully_paid']
    .mean()
    .unstack()
)
df6 = 1 - df6
ax6 = df6.plot()
ax6.set(xlabel='', ylabel='Default Rate', title='Default by Grade and Term')
ax6.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:.0%}'))
# Not too much difference really

In [None]:
# last_payment_date - issuance_date
# Loans that are no longer 'Current' and have a recorded last payment date.
# total_pymnt and loan_amnt are carried along for the later return calculations.
df7 = accept.loc[
    accept['loan_status'].ne('Current') & accept['last_pymnt_d'].notna(),
    ['last_pymnt_d', 'issue_d', 'loan_status', 'grade', 'sub_grade', 'term',
     'total_pymnt', 'loan_amnt']
].copy()
# Convert the 'Mon-YYYY' last-payment text into datetimes.
df7['last_pymnt_d'] = df7['last_pymnt_d'].apply(make_dateval)

In [None]:
# Loan life in months: days between issue and last payment, scaled by 12/365.
df7['mths_to_term'] = (df7['last_pymnt_d'] - df7['issue_d']).dt.days * 12 / 365
# 1 for 'Fully Paid', 0 for every other (non-'Current') status kept in df7.
df7['fully_paid'] = (df7['loan_status'] == 'Fully Paid').astype(int)
# Split by contractual term (the raw term strings carry a leading space).
df7_3 = df7[df7['term'] == ' 36 months']
df7_5 = df7[df7['term'] == ' 60 months']

In [None]:
def _mean_life_by_sub_grade(frame):
    """Mean mths_to_term per sub_grade (rows) and fully_paid flag (columns)."""
    grouped = frame.groupby(['sub_grade', 'fully_paid'])['mths_to_term']
    return grouped.mean().unstack()


df7_3 = _mean_life_by_sub_grade(df7_3)
df7_5 = _mean_life_by_sub_grade(df7_5)

In [None]:
# Rename the two fully_paid columns for the legend
# (assumes the column order is 0 then 1 - i.e. not fully paid, then fully paid).
df7_3.columns = ['Default', 'Prepay']
ax7 = df7_3.plot()
ax7.set(xlabel='', ylabel='Months', title='Average Loan Life for 3 Year Loans')
# One decimal place on the months axis.
ax7.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:.1f}'))

In [None]:
# Rename the two fully_paid columns for the legend
# (assumes the column order is 0 then 1 - i.e. not fully paid, then fully paid).
df7_5.columns = ['Default', 'Prepay']
ax8 = df7_5.plot()
ax8.set(xlabel='', ylabel='Months', title='Average Loan Life for 5 Year Loans')
# One decimal place on the months axis.
ax8.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:.1f}'))
# 3 year loans default faster than 5 year.  This is because the monthly payments are larger

In [None]:
# Count 60-month loans by life (3-month bins, label = bin start) and outcome,
# ignoring loans whose computed life exceeds 60 months.
df8 = (
    df7.loc[(df7['term'] == ' 60 months') & (df7['mths_to_term'] <= 60)]
    .assign(life_bin=lambda d: d['mths_to_term'].apply(lambda m: 3 * int((m - 0.1) / 3)))
    .groupby(['life_bin', 'fully_paid'])['mths_to_term']
    .count()
    .unstack()
)
df8.columns = ['Default', 'Prepay']
ax9 = df8.plot.bar()
ax9.set(xlabel='Bin Start (Months)', ylabel='Count', title='Distribution of Life for 5-year Loans')
ax9.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'))
# Very few make it to full term.  The interest rates are high so prepayment
# (or default!) makes sense early.

In [None]:
# Number of loans in each status (reference for the return calculations below).
accept.loc[:, 'loan_status'].value_counts()

In [None]:
# Simple (not annualised) return on fully paid loans: (total paid - principal) / principal.
df9 = accept.loc[accept['loan_status'].eq('Fully Paid'), ['total_pymnt', 'loan_amnt', 'sub_grade']].copy()
df9['prof_ret'] = df9['total_pymnt'].sub(df9['loan_amnt']).div(df9['loan_amnt'])
print('Return on all fully paid loans: {0:.1%}'.format(df9['prof_ret'].mean()))
# Average return per sub-grade.
df9 = df9.groupby('sub_grade')['prof_ret'].mean()
ax10 = df9.plot()
# NOTE(review): 'Lendor' in the y-label looks like a typo for 'Lender'; left as is.
ax10.set(xlabel='', ylabel='Lendor Return', title='Returns on Fully Paid Loans by Rating')
ax10.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:.0%}'))

In [None]:
# Simple (not annualised) return on charged-off loans: (total paid - principal) / principal.
df10 = accept.loc[accept['loan_status'].eq('Charged Off'), ['total_pymnt', 'loan_amnt', 'sub_grade']].copy()
df10['prof_ret'] = df10['total_pymnt'].sub(df10['loan_amnt']).div(df10['loan_amnt'])
print('Return on all charged off loans: {0:.1%}'.format(df10['prof_ret'].mean()))
# Average return per sub-grade.
df10 = df10.groupby('sub_grade')['prof_ret'].mean()
ax11 = df10.plot()
# NOTE(review): 'Lendor' in the y-label looks like a typo for 'Lender'; left as is.
ax11.set(xlabel='', ylabel='Lendor Return', title='Returns on Defaulted Loans by Rating')
ax11.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:.0%}'))
# Interesting - the higher interest rate on the junk loans gets offset by the earlier default

In [None]:
# Simple return on loans flagged as settled (debt_settlement_flag == 'Y').
df11 = accept.loc[accept['debt_settlement_flag'].eq('Y'), ['total_pymnt', 'loan_amnt', 'sub_grade']].copy()
df11['prof_ret'] = df11['total_pymnt'].sub(df11['loan_amnt']).div(df11['loan_amnt'])
print('Return on all settled loans: {0:.1%}'.format(df11['prof_ret'].mean()))

In [None]:
# Simple return per loan, stored on df7 so later cells can reuse it.
df7['prof_ret'] = df7['total_pymnt'].sub(df7['loan_amnt']).div(df7['loan_amnt'])
# Mean return of fully paid loans for each observed loan life.
df12 = (
    df7[df7['loan_status'] == 'Fully Paid']
    .groupby('mths_to_term')['prof_ret']
    .mean()
)
ax12 = df12.plot()
# NOTE(review): 'Lendor' in the y-label looks like a typo for 'Lender'; left as is.
ax12.set(xlabel='Months Survived', ylabel='Lendor Return',
         title='Returns on Fully-Paid Loans by Survival Time')
ax12.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:.0%}'))

In [None]:
# Mean return of charged-off loans for each observed loan life
# (relies on the prof_ret column already being present on df7).
df13 = (
    df7[df7['loan_status'] == 'Charged Off']
    .groupby('mths_to_term')['prof_ret']
    .mean()
)
ax13 = df13.plot()
# Dashed zero line marks break-even.
ax13.axhline(y=0, linestyle='--')
# NOTE(review): 'Lendor' in the y-label looks like a typo for 'Lender'; left as is.
ax13.set(xlabel='Months Survived', ylabel='Lendor Return',
         title='Returns on Charged-Off Loans by Survival Time')
ax13.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:.0%}'))
# So you need to make it to 32 months to break even

In [None]:
# Question 5: Survival Analysis
# Empirical survival curve per grade for fully paid 36-month loans.
df14 = df7.loc[(df7['term']==' 36 months') & (df7['fully_paid']==1)].copy()
# 3-month life bins; the label is the start of the bin.
df14['life_bin'] = df14['mths_to_term'].apply(lambda x: 3 * int((x - 0.1)/3))
# fill_value=0: a (bin, grade) pair with no loans must count as zero.  Without it
# unstack() leaves NaN there, and cumsum() then leaves a NaN hole in the curve.
df14 = df14.groupby(['life_bin', 'grade'])['mths_to_term'].count().unstack(fill_value=0)
# Fraction of each grade's loans still alive after the bin:
# (total - terminated up to and including this bin) / total.
# Note: each value is plotted at its bin *start*, so x=0 shows survival past the first bin.
df14 = (df14.sum() - df14.cumsum()) / df14.sum()
ax14 = df14.loc[:36].plot()
ax14.set_xlabel('Months Survived')
ax14.set_ylabel('Survival Probability')
ax14.set_title('Survival Plot for Non-Defaulting 3-year Loans')
ax14.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:.0%}'))

In [None]:
# Empirical survival curve per grade for 36-month loans that were not fully paid.
df15 = df7.loc[(df7['term']==' 36 months') & (df7['fully_paid']==0)].copy()
# 3-month life bins; the label is the start of the bin.
df15['life_bin'] = df15['mths_to_term'].apply(lambda x: 3 * int((x - 0.1)/3))
# fill_value=0: a (bin, grade) pair with no loans must count as zero.  Without it
# unstack() leaves NaN there, and cumsum() then leaves a NaN hole in the curve.
df15 = df15.groupby(['life_bin', 'grade'])['mths_to_term'].count().unstack(fill_value=0)
# Fraction of each grade's loans still alive after the bin:
# (total - terminated up to and including this bin) / total.
# Note: each value is plotted at its bin *start*, so x=0 shows survival past the first bin.
df15 = (df15.sum() - df15.cumsum()) / df15.sum()
ax15 = df15.loc[:36].plot()
ax15.set_xlabel('Months Survived')
ax15.set_ylabel('Survival Probability')
ax15.set_title('Survival Plot for Defaulting 3-year Loans')
ax15.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:.0%}'))
# Defaults happen faster than repayments

In [None]:
# Empirical survival curve per grade for all terminated 36-month loans in df7.
df16 = df7.loc[df7['term']==' 36 months'].copy()
# 3-month life bins; the label is the start of the bin.
df16['life_bin'] = df16['mths_to_term'].apply(lambda x: 3 * int((x - 0.1)/3))
# fill_value=0: a (bin, grade) pair with no loans must count as zero.  Without it
# unstack() leaves NaN there, and cumsum() then leaves a NaN hole in the curve.
df16 = df16.groupby(['life_bin', 'grade'])['mths_to_term'].count().unstack(fill_value=0)
# Fraction of each grade's loans still alive after the bin:
# (total - terminated up to and including this bin) / total.
# Note: each value is plotted at its bin *start*, so x=0 shows survival past the first bin.
df16 = (df16.sum() - df16.cumsum()) / df16.sum()
ax16 = df16.loc[:36].plot()
ax16.set_xlabel('Months Survived')
ax16.set_ylabel('Survival Probability')
ax16.set_title('Survival Plot for All 3-year Loans')
ax16.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:.0%}'))
# A is linear (driven by prepayment), G is convex due to defaults