In [1]:
import pandas as pd
import numpy as np
from scipy import stats

From the data exploratory exercise, we identified that loan purpose appears to have influence on the loans being paid off or not. To prove that the influence is statistically significant, we will run chi square test on it

In [2]:
lend = pd.read_csv('data/lending_clean.csv')
lend_post_2008 = lend[lend['loan_start_d'] >= '2009-1-1']

Null Hypothesis - different loan purposes do not affect loan paid off rate

Alternative Hypothesis - loan purposes affect paid off rate

In [3]:
avg_loan_paid_off = lend_post_2008.target.sum()/len(lend_post_2008)
print('Average paid off rate : {:.2%}'.format(avg_loan_paid_off))

Average paid off rate : 85.29%


In [4]:
# There are 14 categories under purposes
len(lend_post_2008['purpose'].unique())

14

In [5]:
expected_rates = np.full(14, avg_loan_paid_off)

In [6]:
y = lend_post_2008.groupby(['purpose','target'])[['target']].count().unstack(level=1)
z = y.apply(lambda r: r/r.sum(), axis=1)
z.take([1],axis=1)

Unnamed: 0_level_0,target
target,1
purpose,Unnamed: 1_level_2
car,0.900214
credit_card,0.890857
debt_consolidation,0.846546
educational,0.798742
home_improvement,0.874774
house,0.837209
major_purchase,0.896518
medical,0.835726
moving,0.829268
other,0.832093


In [7]:
observed_rates = np.full(14, [0.900214, 0.890857, 0.846546, 0.798742, 0.874774, 0.837209, 0.896518, 0.835726, 0.829268, 0.832093, 0.800000, 0.725859, 0.850000, 0.894800])

In [8]:
stats.chisquare(observed_rates, expected_rates)

Power_divergenceResult(statistic=0.036663702990805, pvalue=0.9999999999999973)