In [None]:
# This notebook handles Aiko's 'Warm Up' and first two questions.  
# In order to keep notebook sizes manageable, subsequent questions can be found in BasicEDA_2 and 3
import pandas as pd
# Read the accepted-loans CSV into the frame used by every cell below
raw_csv_path = '../rawData/accepted_2007_to_2018Q4.csv'
accept = pd.read_csv(raw_csv_path)

In [None]:
# Column positions that hold mixed datatypes; look up their names
mixed_dtype_positions = [0, 19, 49, 59, 118, 129, 130, 131, 134, 135, 136, 139, 145, 146, 147]
accept.columns[mixed_dtype_positions]

In [None]:
# id is worth fixing
def find_weird(x):
    """Return True when ``x`` can be converted with ``int()``, else False.

    Note the name is slightly misleading: the function answers "is this value
    integer-like?", so a *True* result marks a well-formed value and *False*
    marks a "weird" one.

    Only the conversion failures ``int()`` itself raises are treated as
    "not an integer"; anything else (e.g. ``KeyboardInterrupt``) propagates
    instead of being silently swallowed.
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported types; ValueError: bad strings and NaN;
        # OverflowError: +/- infinity
        return False
    return True

# Keep only rows whose id converts to an integer. Take an explicit copy so the
# column assignment below writes to an independent frame rather than to a
# filtered slice (which triggers pandas' SettingWithCopyWarning).
accept = accept[accept['id'].apply(find_weird)].copy()
accept['id'] = accept['id'].apply(int)
# Display the resulting dtype as a sanity check
accept['id'].dtype

In [None]:
# Peek at two randomly chosen loans, transposed so that each field is a row.
# The row display limit is raised to 155 first (presumably so that every
# field of the transposed view is shown).
pd.set_option('display.max_rows', 155)
accept.sample(n=2).transpose()

In [None]:
# What's the status with missingness?
# `show_counts` is the current name of the option formerly called `null_counts`
# (deprecated in pandas 1.2 and removed in 2.0).
accept.info(verbose=True, show_counts=True)

In [None]:
# Let's look at the distribution of the target variable
status = accept['loan_status'].value_counts()
print(status)
# value_counts() sorts by descending frequency, so these positional picks are
# the third most common status and the two rarest ones. Use .iloc explicitly:
# integer keys on a label-indexed Series are otherwise treated as labels in
# newer pandas versions.
# NOTE(review): presumably these three positions are the charged-off / default
# categories for this extract - confirm against the printed counts; selecting
# by label would be more robust to a change in ordering.
bad = status.iloc[2] + status.iloc[-2] + status.iloc[-1]
print()
print(f'Charged off and defaulted: {bad / status.sum():.1%} of all loans')

In [None]:
# Aiko's Question 1
# Variation in annual issuance
# Step 1: convert issue_d to datetime
import datetime as dt
def make_dateval(s):
    """Turn a '<Mon>-<YYYY>' string (e.g. 'Dec-2015') into a datetime on day 1 of that month."""
    pieces = s.split('-')
    month_abbr = pieces[0]
    year = pieces[1]
    # Re-assemble as '<YYYY><Mon>01' so the day is pinned to the first of the month
    stamp = f'{year}{month_abbr}01'
    return dt.datetime.strptime(stamp, '%Y%b%d')
    
# Replace the raw issue_d strings with the parsed datetime values
accept['issue_d'] = accept['issue_d'].map(make_dateval)

In [None]:
# Total funded amount per issue date, drawn as a line chart
monthly_issue = accept['funded_amnt'].groupby(accept['issue_d']).sum()
import matplotlib.pyplot as plt
monthly_issue.plot.line()
# May 2016 was when the CEO was ousted
# https://www.reuters.com/article/us-lendingclub-results/lending-club-ceo-resigns-after-internal-probe-shares-plummet-idUSKCN0Y01BK#:~:text=(Reuters%2FIFR)%20%2D%20Renaud,the%20company%20said%20on%20Monday.

In [None]:
# Number of loans recorded under each purpose
purpose_counts = accept['purpose'].value_counts()
purpose_counts

In [None]:
# Show distribution of ratings for different purposes
# (swap 'sub_grade' for 'grade' to get less granularity)
purpose_grade_counts = accept.groupby(['purpose', 'sub_grade'])['id'].count()
# Rows = sub_grade, columns = purpose
rating_by_purpose = purpose_grade_counts.unstack('purpose')
# Normalise each purpose column so that it sums to 1
rating_by_purpose = rating_by_purpose.div(rating_by_purpose.sum(), axis='columns')
import plotly.express as px
fig1_layout = {
    'title': 'Distribution of LC Rating by Purpose',
    'xaxis_title': 'Rating',
    'yaxis_title': 'Frequency',
    'yaxis_tickformat': ',.0%',
}
fig1 = px.line(rating_by_purpose)
fig1.update_layout(**fig1_layout)
fig1.show()
# educational and credit cards (the biggest segment) are higher rated; 
# small business, moving and vacation are at the lower end

In [None]:
# Calendar year of issuance. The vectorised .dt accessor replaces a Python-level
# lambda call per row; it requires issue_d to already hold datetime values.
accept['year'] = accept['issue_d'].dt.year

In [None]:
# Show distribution of ratings for different years
# (swap 'sub_grade' for 'grade' to get less granularity)
year_grade_counts = accept.groupby(['year', 'sub_grade'])['id'].count()
# Rows = sub_grade, columns = year
rating_by_year = year_grade_counts.unstack('year')
# Normalise each year column so that it sums to 1
rating_by_year = rating_by_year.div(rating_by_year.sum(), axis='columns')
fig2_layout = {
    'title': 'Distribution of LC Rating by Year',
    'xaxis_title': 'Rating',
    'yaxis_title': 'Frequency',
    'yaxis_tickformat': ',.0%',
}
fig2 = px.line(rating_by_year)
fig2.update_layout(**fig2_layout)
fig2.show()
# 2007 is a bit of an outlier
# 2007 is a bit of an outlier

In [None]:
# Stacked bar chart: x = year, y = loan counts split by purpose
year_by_purpose = (
    accept
    .groupby(['year', 'purpose'])['id']
    .count()
    .unstack()
)
fig3 = px.bar(year_by_purpose).update_layout(
    title='Count of Loans by Purpose',
    xaxis_title='',
    yaxis_title='Number of loans',
)
fig3.show()

In [None]:
# Stacked bar chart: x = year, y = loan counts split by grade
year_by_grade = (
    accept
    .groupby(['year', 'grade'])['id']
    .count()
    .unstack()
)
fig4 = px.bar(year_by_grade).update_layout(
    title='Count of Loans by Rating',
    xaxis_title='',
    yaxis_title='Number of loans',
)
fig4.show()

In [None]:
# How are loan sizes distributed? Histogram of loan_amnt with nbins=40
fig5_layout = dict(
    title='Histogram of Loan Amounts',
    xaxis_title='Loan Amount',
    yaxis_title='Number of loans',
)
fig5 = px.histogram(accept['loan_amnt'], nbins=40)
fig5.update_layout(**fig5_layout)
fig5.show()

In [None]:
# Do sizes vary by purpose?
df6 = accept.loc[:, ['purpose', 'loan_amnt']]
# Median loan amount per purpose
df6g = df6.groupby('purpose').median()
# Attach each row's group median with one vectorised lookup (Series.map)
# rather than a per-row .loc call, which is very slow on a large frame
df6['med_loan'] = df6['purpose'].map(df6g['loan_amnt'])
# Order rows so the box plot shows purposes from largest to smallest median
df6 = df6.sort_values('med_loan', ascending=False)
fig6 = px.box(df6, x='purpose', y='loan_amnt')
fig6.update_layout(title='Distribution of Loan Amount by Purpose',
                   xaxis_title='',
                   yaxis_title='Loan Amount')
fig6.show()

In [None]:
# Do sizes vary by loan grade? Box plot of loan_amnt per grade, grades in sorted order
df7 = accept.loc[:, ['grade', 'loan_amnt']].sort_values(by='grade')
fig7 = px.box(df7, x='grade', y='loan_amnt').update_layout(
    title='Distribution of Loan Amount by Rating',
    xaxis_title='',
    yaxis_title='Loan Amount',
)
fig7.show()
# Medians definitely increase as ratings get worse
# Medians definitely increase as ratings get worse

In [None]:
# Are loans with higher amounts harder to pay in full?
df8 = accept.loc[:, ['loan_amnt', 'loan_status']]
# Bucket loan sizes into $4,000-wide bins ($0 - $4,000, $4001 - $8000 etc.);
# astype(int) truncates toward zero, as int() does
df8['loan_bin'] = ((df8['loan_amnt'] - 1) / 4000).astype(int)
# 1 where the loan status is exactly 'Fully Paid', 0 otherwise
df8['fully_paid'] = (df8['loan_status'] == 'Fully Paid').astype(int)
# Share of fully paid loans within each size bin
df8 = df8.groupby('loan_bin')['fully_paid'].mean()
df8.plot()
# It looks significant - but maybe that's due to timing?

In [None]:
# Same analysis but now also including loans that are 'Current'
df9 = accept.loc[:, ['loan_amnt', 'loan_status']]
# Bucket loan sizes into $4,000-wide bins ($0 - $4,000, $4001 - $8000 etc.);
# astype(int) truncates toward zero, as int() does
df9['loan_bin'] = ((df9['loan_amnt'] - 1) / 4000).astype(int)
# Although this wasn't in the question, 'Current' loans are counted as 'good'
# alongside 'Fully Paid' ones
good_statuses = ['Fully Paid', 'Current']
df9['fully_paid'] = df9['loan_status'].isin(good_statuses).astype(int)
# Share of 'good' loans within each size bin
df9 = df9.groupby('loan_bin')['fully_paid'].mean()
df9.plot()