## Logistic Regression

#### Libraries

In [23]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

#### Data 

In [2]:
bank = pd.read_csv("data/bank.csv")
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


#### Summary Statistics

In [3]:
bank.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


#### Frequency Table

In [4]:
bank['job'].value_counts()

blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: job, dtype: int64

In [5]:
bank['education'].value_counts()

secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: education, dtype: int64

In [6]:
bank['marital'].value_counts()

married     27214
single      12790
divorced     5207
Name: marital, dtype: int64

#### Cross Tabulation

In [7]:
marital = bank['marital']
subscribed = bank['y']
pd.crosstab(marital, subscribed)

y,no,yes
marital,Unnamed: 1_level_1,Unnamed: 2_level_1
divorced,4585,622
married,24459,2755
single,10878,1912


In [8]:
education = bank['education']
pd.crosstab(education, subscribed)

y,no,yes
education,Unnamed: 1_level_1,Unnamed: 2_level_1
primary,6260,591
secondary,20752,2450
tertiary,11305,1996
unknown,1605,252


#### Regression

In [25]:
formula = 'y ~ age + education + marital + job'
model = smf.glm(formula=formula, data=bank, family=sm.families.Binomial())
result = model.fit()
print(result.summary())

                  Generalized Linear Model Regression Results                  
Dep. Variable:     ['y[no]', 'y[yes]']   No. Observations:                45211
Model:                             GLM   Df Residuals:                    45193
Model Family:                 Binomial   Df Model:                           17
Link Function:                   logit   Scale:                             1.0
Method:                           IRLS   Log-Likelihood:                -15811.
Date:                 Wed, 18 Apr 2018   Deviance:                       31622.
Time:                         10:32:18   Pearson chi2:                 4.52e+04
No. Iterations:                      6                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                  2.7543      0.113     24.276      0.000       2.532       2.977
educati

#### Confidence Intervals

In [26]:
print(result.conf_int())

                               0         1
Intercept               2.531913  2.976654
education[T.secondary] -0.309129 -0.101663
education[T.tertiary]  -0.680271 -0.441218
education[T.unknown]   -0.502840 -0.170680
marital[T.married]      0.001875  0.190134
marital[T.single]      -0.448423 -0.235634
job[T.blue-collar]      0.297255  0.533470
job[T.entrepreneur]     0.302265  0.715839
job[T.housemaid]        0.057023  0.495773
job[T.management]      -0.000004  0.236129
job[T.retired]         -0.813241 -0.513197
job[T.self-employed]   -0.010196  0.347831
job[T.services]         0.159293  0.431388
job[T.student]         -1.082313 -0.736570
job[T.technician]       0.065253  0.287645
job[T.unemployed]      -0.433560 -0.086626
job[T.unknown]         -0.254463  0.495575
age                    -0.015091 -0.008123


#### Odds Ratio

In [27]:
print(np.exp(result.params))

Intercept                 15.709781
education[T.secondary]     0.814325
education[T.tertiary]      0.570784
education[T.unknown]       0.714080
marital[T.married]         1.100764
marital[T.single]          0.710328
job[T.blue-collar]         1.514920
job[T.entrepreneur]        1.663713
job[T.housemaid]           1.318372
job[T.management]          1.125315
job[T.retired]             0.515190
job[T.self-employed]       1.183904
job[T.services]            1.343584
job[T.student]             0.402749
job[T.technician]          1.192974
job[T.unemployed]          0.770980
job[T.unknown]             1.128124
age                        0.988460
dtype: float64


#### Odds Ratio & Confidence Intervals

In [28]:
params = result.params
conf = result.conf_int()
conf['OR'] = params
conf.columns = ['2.5%', '97.5%', 'OR']
print(np.exp(conf))

                             2.5%      97.5%         OR
Intercept               12.577545  19.622050  15.709781
education[T.secondary]   0.734086   0.903334   0.814325
education[T.tertiary]    0.506480   0.643253   0.570784
education[T.unknown]     0.604810   0.843091   0.714080
marital[T.married]       1.001877   1.209412   1.100764
marital[T.single]        0.638634   0.790069   0.710328
job[T.blue-collar]       1.346159   1.704838   1.514920
job[T.entrepreneur]      1.352920   2.045903   1.663713
job[T.housemaid]         1.058680   1.641766   1.318372
job[T.management]        0.999996   1.266338   1.125315
job[T.retired]           0.443419   0.598579   0.515190
job[T.self-employed]     0.989855   1.415994   1.183904
job[T.services]          1.172682   1.539393   1.343584
job[T.student]           0.338811   0.478753   0.402749
job[T.technician]        1.067429   1.333284   1.192974
job[T.unemployed]        0.648197   0.917020   0.770980
job[T.unknown]           0.775333   1.641442   1