# Challenges 9
Topic: Poisson GLM  
Name: Adam Levin  
Date: 8/7/2016  

**Challenge 1**

In [229]:
import pandas as pd
import numpy as np
from pandas.io.stata import StataReader

In [230]:
!curl http://data.princeton.edu/wws509/datasets/ships.dta > ships.dta

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2324  100  2324    0     0   4950      0 --:--:-- --:--:-- --:--:-- 56682


In [231]:
reader = StataReader('ships.dta')
data = reader.read()

In [232]:
data.head()

Unnamed: 0,type,construction,operation,months,damage
0,A,1960-64,1960-74,127.0,0.0
1,A,1960-64,1975-79,63.0,0.0
2,A,1965-70,1960-74,1095.0,3.0
3,A,1965-70,1975-79,1095.0,4.0
4,A,1970-74,1960-74,1512.0,6.0


In [233]:
data.construction.value_counts()

1970-74    10
1965-70    10
1960-64     9
1975-79     5
Name: construction, dtype: int64

In [234]:
data = data.replace('1970-74',1972)
data = data.replace('1965-70',1967.5)
data = data.replace('1960-64',1962)
data = data.replace('1975-79',1977)

In [235]:
data.operation.value_counts()

1977       19
1960-74    15
Name: operation, dtype: int64

In [236]:
data = data.replace('1960-74',1967)
data = data.replace('1977',1977)

In [237]:
data = pd.get_dummies(data)

In [238]:
data.head()

Unnamed: 0,construction,operation,months,damage,type_A,type_B,type_C,type_D,type_E
0,1962.0,1967,127.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1962.0,1977,63.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1967.5,1967,1095.0,3.0,1.0,0.0,0.0,0.0,0.0
3,1967.5,1977,1095.0,4.0,1.0,0.0,0.0,0.0,0.0
4,1972.0,1967,1512.0,6.0,1.0,0.0,0.0,0.0,0.0


In [239]:
import statsmodels.api as sm

In [240]:
poisson_model = sm.GLM(data['damage'],data.drop('damage',axis=1),family=sm.families.Poisson())

In [241]:
sm.families.family.Poisson.links

[statsmodels.genmod.families.links.log,
 statsmodels.genmod.families.links.identity,
 statsmodels.genmod.families.links.sqrt]

In [242]:
results = poisson_model.fit()

In [243]:
results.summary()

0,1,2,3
Dep. Variable:,damage,No. Observations:,34.0
Model:,GLM,Df Residuals:,26.0
Model Family:,Poisson,Df Model:,7.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-108.37
Date:,"Sun, 07 Aug 2016",Deviance:,118.88
Time:,21:38:22,Pearson chi2:,112.0
No. Iterations:,9,,

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
construction,0.0564,0.014,4.017,0.000,0.029 0.084
operation,0.0546,0.014,3.934,0.000,0.027 0.082
months,4.89e-05,7.11e-06,6.875,0.000,3.5e-05 6.28e-05
type_A,-217.1916,38.301,-5.671,0.000,-292.260 -142.123
type_B,-216.2441,38.373,-5.635,0.000,-291.454 -141.034
type_C,-218.4018,38.294,-5.703,0.000,-293.456 -143.347
type_D,-218.0558,38.297,-5.694,0.000,-293.116 -142.996
type_E,-217.3055,38.302,-5.674,0.000,-292.376 -142.235


This model is not very good. The chi2 statistic is 112 with 26 degrees of freedom. Since P(chi-sq-26) < 112 = 1, the model fits poorly. The sample size is very small. 

**Challenge 2**

In [244]:
poisson_model = sm.GLM(data['damage'],data.drop(['damage','months'],axis=1),
                       offset=np.log(data['months']),family=sm.families.Poisson())

In [245]:
results = poisson_model.fit()

In [246]:
results.summary()

0,1,2,3
Dep. Variable:,damage,No. Observations:,34.0
Model:,GLM,Df Residuals:,27.0
Model Family:,Poisson,Df Model:,6.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-78.076
Date:,"Sun, 07 Aug 2016",Deviance:,58.286
Time:,21:38:23,Pearson chi2:,64.6
No. Iterations:,9,,

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
construction,0.0445,0.013,3.450,0.001,0.019 0.070
operation,0.0349,0.012,2.880,0.004,0.011 0.059
type_A,-162.1934,27.270,-5.948,0.000,-215.641 -108.746
type_B,-162.7330,27.213,-5.980,0.000,-216.070 -109.396
type_C,-162.8204,27.239,-5.978,0.000,-216.207 -109.433
type_D,-162.4262,27.286,-5.953,0.000,-215.906 -108.947
type_E,-161.7863,27.258,-5.935,0.000,-215.211 -108.362


This model performs a little better - the log-liklihood has increased.

**Challenge 3**

I'll predict 10 hold-out points and calculate the MSE.

In [247]:
test_inds = np.random.choice(range(len(data)),10,replace=False)
train_inds = [i for i in range(len(data)) if i not in test_inds]

In [248]:
poisson_model = sm.GLM(data.ix[train_inds,'damage'],data.drop(['damage','months'],axis=1).iloc[train_inds,:],
                       offset=np.log(data.ix[train_inds,'months']),family=sm.families.Poisson())

In [249]:
results = poisson_model.fit()
predictions = results.predict(data.drop(['damage','months'],axis=1).iloc[test_inds,:],
                              offset=np.log(data.ix[test_inds,'months']))
true_values = data.ix[test_inds,'damage']

In [250]:
pd.DataFrame({'True Value':true_values,'Prediction':predictions})

Unnamed: 0,Prediction,True Value
22,0.166098,0.0
28,0.200016,0.0
30,2.568388,7.0
10,50.354988,53.0
33,3.82894,1.0
7,83.906596,39.0
8,38.169414,29.0
26,2.319248,11.0
16,1.663109,0.0
14,2.256969,1.0


In [251]:
print 'MSE: %g' % np.mean((predictions-true_values)**2)

MSE: 221.509


In [252]:
print 'MSE of prediction with mean: %g' % np.mean((np.mean(true_values)-true_values)**2)

MSE of prediction with mean: 335.49


I'm impressed!

**Challenge 4**

In [253]:
from scipy.stats import chi2

In [254]:
poisson_model = sm.GLM(data['damage'],data.drop(['damage','months'],axis=1),
                       offset=np.log(data['months']),family=sm.families.Poisson())

In [255]:
results = poisson_model.fit()

In [256]:
rv = chi2(7)

In [257]:
1 - rv.cdf(results.pearson_chi2)

1.7881696123822621e-11

The probability of observing a chi2 statistic more extreme than the one observed in the model by random chance is very small so we can safely conclude the the model is better than the null model.

**Challenge 5**

In [258]:
ols = sm.OLS(np.log(data['damage']+1),data.drop('damage',axis=1))

In [259]:
results = ols.fit()

In [260]:
results.summary()

0,1,2,3
Dep. Variable:,damage,R-squared:,0.761
Model:,OLS,Adj. R-squared:,0.697
Method:,Least Squares,F-statistic:,11.83
Date:,"Sun, 07 Aug 2016",Prob (F-statistic):,1.11e-06
Time:,21:38:27,Log-Likelihood:,-32.762
No. Observations:,34,AIC:,81.52
Df Residuals:,26,BIC:,93.73
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
construction,0.0876,0.026,3.305,0.003,0.033 0.142
operation,0.0242,0.026,0.916,0.368,-0.030 0.078
months,6.338e-05,2.3e-05,2.760,0.010,1.62e-05 0.000
type_A,-218.6444,63.946,-3.419,0.002,-350.088 -87.201
type_B,-217.7965,64.072,-3.399,0.002,-349.498 -86.095
type_C,-219.2652,63.943,-3.429,0.002,-350.702 -87.829
type_D,-219.3385,63.941,-3.430,0.002,-350.772 -87.905
type_E,-218.6381,63.949,-3.419,0.002,-350.088 -87.189

0,1,2,3
Omnibus:,0.377,Durbin-Watson:,1.834
Prob(Omnibus):,0.828,Jarque-Bera (JB):,0.437
Skew:,-0.225,Prob(JB):,0.804
Kurtosis:,2.674,Cond. No.,12300000.0


In general the coefficients are similiar - they're signs are the same. The magnitudes seem more extreme with OLS.

Let's see how the MSE on a test set compares to Poisson GLM.

In [261]:
ols = sm.OLS(np.log(data.ix[train_inds,'damage']+1),data.drop('damage',axis=1).iloc[train_inds,:])

In [262]:
results = ols.fit()

In [263]:
predictions = np.exp(results.predict(data.drop('damage',axis=1).iloc[test_inds,:]))

In [264]:
pd.DataFrame({'True Value':true_values,'Prediction':predictions})

Unnamed: 0,Prediction,True Value
22,0.756647,0.0
28,2.953097,0.0
30,5.668235,7.0
10,34.456175,53.0
33,15.951005,1.0
7,253.980043,39.0
8,13.562853,29.0
26,2.498855,11.0
16,2.061596,0.0
14,1.189208,1.0


In [265]:
print 'MSE: %g' % np.mean((predictions-true_values)**2)

MSE: 4710.98


It's way worse.

**Challenge 6**

In [266]:
!curl http://data.princeton.edu/wws509/datasets/smoking.dta > smoking.dta

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1840  100  1840    0     0  37550      0 --:--:-- --:--:-- --:--:-- 39148


In [267]:
reader = StataReader('smoking.dta')
data = reader.read()

In [268]:
data.head(2)

Unnamed: 0,age,smoke,pop,dead
0,40-44,Doesn't smoke,656.0,18.0
1,45-49,Doesn't smoke,359.0,22.0


In [269]:
data.age = data.age.cat.rename_categories([42,47,52,57,62,67,72,77,83]).astype(int)

In [270]:
data = pd.get_dummies(data)

In [271]:
data.head(2)

Unnamed: 0,age,pop,dead,smoke_Doesn't smoke,smoke_Smokes cigars or pipe only,smoke_Smokes cigarettes and cigar or pipe,smoke_smokes cigarettes only
0,42,656.0,18.0,1.0,0.0,0.0,0.0
1,47,359.0,22.0,1.0,0.0,0.0,0.0


In [272]:
poisson_model = sm.GLM(data['dead'],data.drop(['dead','pop'],axis=1),
                       offset=np.log(data['pop']),family=sm.families.Poisson())

In [273]:
results = poisson_model.fit()

In [274]:
results.summary()

0,1,2,3
Dep. Variable:,dead,No. Observations:,36.0
Model:,GLM,Df Residuals:,31.0
Model Family:,Poisson,Df Model:,4.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-162.95
Date:,"Sun, 07 Aug 2016",Deviance:,85.874
Time:,21:38:31,Pearson chi2:,81.2
No. Iterations:,9,,

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
age,0.0657,0.001,59.801,0.000,0.064 0.068
smoke_Doesn't smoke,-6.1519,0.084,-73.499,0.000,-6.316 -5.988
smoke_Smokes cigars or pipe only,-6.1218,0.086,-70.973,0.000,-6.291 -5.953
smoke_Smokes cigarettes and cigar or pipe,-5.9125,0.072,-81.716,0.000,-6.054 -5.771
smoke_smokes cigarettes only,-5.7105,0.071,-80.003,0.000,-5.850 -5.571


In [275]:
ols = sm.OLS(np.log(data['dead']),data.drop('dead',axis=1))

In [276]:
results = ols.fit()

In [277]:
results.summary()

0,1,2,3
Dep. Variable:,dead,R-squared:,0.829
Model:,OLS,Adj. R-squared:,0.801
Method:,Least Squares,F-statistic:,29.12
Date:,"Sun, 07 Aug 2016",Prob (F-statistic):,1.17e-10
Time:,21:38:31,Log-Likelihood:,-34.119
No. Observations:,36,AIC:,80.24
Df Residuals:,30,BIC:,89.74
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
age,0.0785,0.010,8.259,0.000,0.059 0.098
pop,0.0007,0.000,6.483,0.000,0.001 0.001
smoke_Doesn't smoke,-1.1181,0.660,-1.695,0.100,-2.465 0.229
smoke_Smokes cigars or pipe only,-1.5690,0.656,-2.392,0.023,-2.909 -0.229
smoke_Smokes cigarettes and cigar or pipe,-1.2479,0.828,-1.507,0.142,-2.939 0.443
smoke_smokes cigarettes only,-0.8442,0.753,-1.121,0.271,-2.382 0.693

0,1,2,3
Omnibus:,7.055,Durbin-Watson:,0.993
Prob(Omnibus):,0.029,Jarque-Bera (JB):,6.189
Skew:,-1.008,Prob(JB):,0.0453
Kurtosis:,3.253,Cond. No.,26700.0


Let's see how the MSE of the two models compares on some hold out data.

In [282]:
test_inds = np.random.choice(range(len(data)),10,replace=False)
train_inds = [i for i in range(len(data)) if i not in test_inds]

In [283]:
poisson_model = sm.GLM(data.ix[train_inds,'dead'],data.drop(['dead','pop'],axis=1).iloc[train_inds,:],
                       offset=np.log(data.ix[train_inds,'pop']),family=sm.families.Poisson())
ols = sm.OLS(np.log(data.ix[train_inds,'dead']),data.drop('dead',axis=1).iloc[train_inds,:])
results_p = poisson_model.fit()
results_o = ols.fit()
predictions_p = results_p.predict(data.drop(['dead','pop'],axis=1).iloc[test_inds,:],
                                  offset = np.log(data.ix[test_inds,'pop']))
predictions_o = np.exp(results_o.predict(data.drop('dead',axis=1).iloc[test_inds,:]))
true_values = data.ix[test_inds,'dead']

In [284]:
pd.DataFrame({'True Values':true_values,
              'Poisson Model Predictions':predictions_p,
              'OLS Model Predictions':predictions_o})[['True Values','Poisson Model Predictions','OLS Model Predictions']]

Unnamed: 0,True Values,Poisson Model Predictions,OLS Model Predictions
11,3.0,6.428349,14.919311
4,117.0,140.445758,106.672713
33,432.0,432.463428,257.429855
15,212.0,203.920014,121.141717
2,19.0,16.873836,26.501693
25,337.0,377.395733,177.736577
8,120.0,145.407613,295.334732
20,193.0,186.812594,74.520865
32,689.0,628.653746,443.921383
31,778.0,706.326754,853.941339


In [285]:
print 'MSE of prediction with mean: %g' % np.mean((true_values-np.mean(true_values))**2)
print 'MSE of prediction with GLM: %g' % np.mean((true_values-predictions_p)**2)
print 'MSE of prediction with OLS: %g' % np.mean((true_values-predictions_o)**2)

MSE of prediction with mean: 64985
MSE of prediction with GLM: 1172.58
MSE of prediction with OLS: 17501


Wow, GLMs are the bomb!