In [1]:
import pandas as pd
import numpy as np
from scipy import stats

From the data exploratory exercise, we identified that loan purpose appears to have influence on the loans being paid off or not. To prove that the influence is statistically significant, we will run chi square test on it

In [2]:
lend = pd.read_csv('data/lending_clean.csv')
lend_2009on = lend[lend['loan_start_d'] >= '2009-01-01']

Null Hypothesis - different loan purposes do not affect loan paid off rate

Alternative Hypothesis - loan purposes affect paid off rate

In [3]:
table = pd.crosstab(lend_2009on['purpose'], lend_2009on['target'])
table_smell = pd.crosstab(lend_2009on['purpose'], lend_2009on['target'], normalize='index')
print(round(table_smell[0],2).sort_values(ascending=False))

purpose
small_business        0.27
renewable_energy      0.19
educational           0.19
moving                0.17
medical               0.17
other                 0.16
house                 0.16
vacation              0.15
debt_consolidation    0.15
home_improvement      0.13
credit_card           0.11
wedding               0.10
major_purchase        0.10
car                   0.10
Name: 0, dtype: float64


In [4]:
stat, p, dof, expected = stats.chi2_contingency(table)
print ('chi_statistic: {:.3f}'.format(stat))
print ('p-value: {:.2f}'.format(p))
print ('degree of freedom: {:.0f}'.format(dof))
# print ('expected freq: {}'.format(expected))

chi_statistic: 378.179
p-value: 0.00
degree of freedom: 13


In [5]:
# interpret p-value, using 95% confident level
prob = 0.95
alpha = 1.0 - prob
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (fail to reject H0)')

Dependent (reject H0)


Conclude that loan_purpose has impact on the loan paid off rates

Additionally, the following attributes will be test as well per exploratory analysis

- Credit history
- DTI
- Number of credit lines 

In [6]:
# write a function to streamline the testing steps
def chi_square(data):
    table = pd.crosstab(lend_2009on[data], lend_2009on['target'])
    stat, p, dof, expected = stats.chi2_contingency(table)
    table_smell = pd.crosstab(lend_2009on[data], lend_2009on['target'], normalize='index')
    print(round(table_smell[0],2).sort_values(ascending=False))
    print ('chi_statistic: {:.3f}'.format(stat))
    print ('p-value: {:.2f}'.format(p))
    print ('degree of freedom: {:.0f}'.format(dof))
    prob = 0.95
    alpha = 1.0 - prob
    if p <= alpha:
        print('Dependent (reject H0)')
    else:
        print('Independent (fail to reject H0)')

In [7]:
# Credit history
chi_square('yr_credit')

yr_credit
49.0    0.40
44.0    0.28
43.0    0.23
48.0    0.22
4.0     0.20
5.0     0.19
38.0    0.19
45.0    0.18
32.0    0.18
3.0     0.18
11.0    0.16
39.0    0.16
6.0     0.16
28.0    0.16
12.0    0.15
10.0    0.15
13.0    0.15
14.0    0.15
16.0    0.15
20.0    0.15
23.0    0.15
26.0    0.15
29.0    0.15
33.0    0.14
40.0    0.14
7.0     0.14
9.0     0.14
17.0    0.14
27.0    0.13
19.0    0.13
41.0    0.13
34.0    0.13
18.0    0.13
15.0    0.13
22.0    0.13
24.0    0.13
21.0    0.12
36.0    0.12
30.0    0.12
8.0     0.12
25.0    0.11
46.0    0.11
37.0    0.09
35.0    0.09
42.0    0.08
31.0    0.08
47.0    0.00
50.0    0.00
61.0    0.00
52.0    0.00
55.0    0.00
57.0    0.00
65.0    0.00
Name: 0, dtype: float64
chi_statistic: 112.650
p-value: 0.00
degree of freedom: 52
Dependent (reject H0)


In [8]:
# DTI
chi_square('dti')

dti
25.85    1.00
25.54    1.00
28.77    1.00
27.00    1.00
28.63    1.00
29.45    1.00
28.53    1.00
26.94    1.00
28.48    1.00
25.67    1.00
29.58    1.00
29.60    1.00
27.51    1.00
25.91    1.00
27.45    1.00
25.61    1.00
25.44    1.00
26.03    1.00
27.17    1.00
27.90    1.00
28.20    1.00
28.04    1.00
29.85    1.00
27.34    1.00
29.08    1.00
25.88    0.67
3.79     0.67
27.26    0.67
21.88    0.62
20.63    0.60
         ... 
5.61     0.00
5.59     0.00
21.38    0.00
5.56     0.00
21.36    0.00
27.22    0.00
5.84     0.00
5.86     0.00
6.21     0.00
27.03    0.00
26.87    0.00
21.59    0.00
6.18     0.00
26.88    0.00
26.89    0.00
26.90    0.00
26.91    0.00
26.92    0.00
26.96    0.00
6.03     0.00
26.98    0.00
6.01     0.00
5.98     0.00
26.99    0.00
5.96     0.00
5.95     0.00
27.01    0.00
5.90     0.00
27.02    0.00
29.99    0.00
Name: 0, Length: 2867, dtype: float64
chi_statistic: 2886.172
p-value: 0.39
degree of freedom: 2866
Independent (fail to reject H0)


In [9]:
# number of credit line
chi_square('total_acc')

total_acc
74.0    1.00
70.0    1.00
60.0    0.29
3.0     0.23
59.0    0.21
57.0    0.20
66.0    0.20
43.0    0.20
4.0     0.19
8.0     0.18
48.0    0.17
33.0    0.17
5.0     0.17
6.0     0.17
7.0     0.17
10.0    0.17
45.0    0.17
62.0    0.17
30.0    0.16
36.0    0.16
16.0    0.16
14.0    0.16
38.0    0.16
41.0    0.16
35.0    0.16
11.0    0.16
9.0     0.16
40.0    0.16
17.0    0.15
28.0    0.15
        ... 
18.0    0.13
51.0    0.12
26.0    0.12
34.0    0.12
53.0    0.11
37.0    0.11
32.0    0.11
47.0    0.11
54.0    0.10
55.0    0.10
52.0    0.08
50.0    0.07
58.0    0.05
75.0    0.00
76.0    0.00
87.0    0.00
81.0    0.00
80.0    0.00
79.0    0.00
64.0    0.00
65.0    0.00
78.0    0.00
67.0    0.00
68.0    0.00
69.0    0.00
77.0    0.00
71.0    0.00
72.0    0.00
73.0    0.00
90.0    0.00
Name: 0, Length: 81, dtype: float64
chi_statistic: 150.631
p-value: 0.00
degree of freedom: 80
Dependent (reject H0)


Conclude that credit history and number of credit line have impact on the loan paid off rates, while DTI does not