In [3]:
from pandas.io.stata import StataReader

import pandas as pd
import numpy as np

import statsmodels.api as sm
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error

### Challenge 1

In [4]:
reader = StataReader('ships.dta')
df = reader.read()

In [5]:
data = pd.get_dummies(df)
y = data.damage
X = data.drop('damage', axis = 1)

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, y,test_size=0.30,random_state=4444)

In [49]:
# Data 
pois_no_offset_model=sm.GLM(y_train, sm.add_constant(x_train), family=sm.families.Poisson(sm.families.links.log))

# Fitting our model using Maximum likelihood
pois_no_offset_model_results=pois_no_offset_model.fit()
y_hat_no_offset = pois_no_offset_model_results.predict(sm.add_constant(x_test))

print(pois_no_offset_model_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 damage   No. Observations:                   23
Model:                            GLM   Df Residuals:                       13
Model Family:                 Poisson   Df Model:                            9
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -57.077
Date:                Mon, 15 Aug 2016   Deviance:                       50.036
Time:                        16:05:18   Pearson chi2:                     45.9
No. Iterations:                     9                                         
                           coef    std err          z      P>|z|      [95.0% Conf. Int.]
----------------------------------------------------------------------------------------
const                    0.4767      0.087      5.464      0.000         0.306     0.648
months                

For this model a lot of coefficients appear significant, we may try to improve the model and reduce number of significant values. Question: does Pearson chi-squared test if the model follows a Poisson distribution or if residuals follow normal distribution?

### Challenge 2

In [11]:
# Data 
pois_offset_model=sm.GLM(y_train, sm.add_constant(x_train), family=sm.families.Poisson(sm.families.links.log), offset = np.log(x_train['months']))

# Fitting our model using Maximum likelihood
pois_offset_model_results=pois_offset_model.fit()
print(pois_offset_model_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 damage   No. Observations:                   23
Model:                            GLM   Df Residuals:                       13
Model Family:                 Poisson   Df Model:                            9
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -44.986
Date:                Mon, 15 Aug 2016   Deviance:                       25.853
Time:                        15:43:09   Pearson chi2:                     28.3
No. Iterations:                     9                                         
                           coef    std err          z      P>|z|      [95.0% Conf. Int.]
----------------------------------------------------------------------------------------
const                   -3.0585      0.088    -34.658      0.000        -3.232    -2.886
months                

### Challenge 3


In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, y,test_size=0.30,random_state=4444)

In [48]:
pois_m = sm.GLM(y_train, sm.add_constant(x_train), family=sm.families.Poisson(sm.families.links.log), offset = np.log(x_train['months']))
pois_results = pois_m.fit()

y_hat_offset = pois_results.predict(sm.add_constant(x_test))

print('Mean Squared Error =', mean_squared_error(y_test,y_hat_offset))

Mean Squared Error = 189.567282412


### Challenge 4

It is possible to compute null_deviance without fitting the null model.

In [12]:
from scipy.stats import chisqprob


print('P-value of the difference btwn null and full model with the offset',
      chisqprob(pois_offset_model_results.null_deviance - pois_offset_model_results.deviance,12))

print('P-value of the difference btwn null and full model without offset',
      chisqprob(pois_no_offset_model_results.null_deviance - pois_no_offset_model_results.deviance,12))


P-value of the difference btwn null and full model with the offset 0.000212155421377
P-value of the difference btwn null and full model without offset 5.103091909e-84


Since, both p-values are small we would reject the hypothesis that the parameters of the model are zero, hence use the full model, not the null model with or wothout the offset.

### Challenge 5

In [13]:
from statsmodels.formula.api import OLS

Compute OLS for log(Y), fit to test data

In [22]:
ls_m = sm.OLS(np.log(y_train + 0.0001), sm.add_constant(x_train))
ls_m_results = ls_m.fit()

log_y_hat = ls_m_results.predict(sm.add_constant(x_test))

print('Mean Squared Error =', mean_squared_error(y_test,np.exp(log_y_hat)))

Mean Squared Error = 140438.582342


Compare coefficients for all three models. I am not sure it makes sense.

In [45]:
dictionary = {'GLM_offset':pois_offset_model_results.params,'GLM_no_offset': pois_no_offset_model_results.params,'OLS':ls_m_results.params}

table1 = pd.DataFrame.from_dict(dictionary)
table1

Unnamed: 0,GLM_no_offset,GLM_offset,OLS
const,0.476679,-3.05854,-1.377016
months,0.000116,1.9e-05,0.000351
type_A,0.515644,-0.422135,0.519184
type_B,0.481056,-1.266918,-0.233956
type_C,-0.623834,-1.236767,-1.04051
type_D,-0.173691,-0.420427,-1.816054
type_E,0.277504,0.287707,1.194321
construction_1960-64,-0.502999,-1.070824,-5.814979
construction_1965-70,-0.01777,-0.67914,-1.168822
construction_1970-74,0.83955,-0.290216,3.283761


Compare mean squared errors.

In [53]:
dictionary = {'GLM_offset':mean_squared_error(y_test,y_hat_offset),'GLM_no_offset': mean_squared_error(y_test,y_hat_no_offset),'OLS': mean_squared_error(y_test,np.exp(log_y_hat))}

table2 = pd.DataFrame(dictionary,index = ['MSE'])
table2

Unnamed: 0,GLM_no_offset,GLM_offset,OLS
MSE,4240.470053,189.567282,140438.582342


### Challenge 6

In [54]:
reader = StataReader('smoking.dta')
df = reader.read()

In [101]:
df.head(5)

Unnamed: 0,age,smoke,pop,dead
0,40-44,Doesn't smoke,656.0,18.0
1,45-49,Doesn't smoke,359.0,22.0
2,50-54,Doesn't smoke,249.0,19.0
3,55-59,Doesn't smoke,632.0,55.0
4,60-64,Doesn't smoke,1067.0,117.0


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36 entries, 0 to 35
Data columns (total 4 columns):
age      36 non-null category
smoke    36 non-null category
pop      36 non-null float32
dead     36 non-null float32
dtypes: category(2), float32(2)
memory usage: 752.0 bytes


In [112]:
y = df.dead
X = pd.get_dummies(df.iloc[:,:3])

In [119]:
x_train, x_test, y_train, y_test = train_test_split(sm.add_constant(X), y,test_size=0.15,random_state=4444)

In [120]:
pois_m = sm.GLM(y_train, x_train, family=sm.families.Poisson(sm.families.links.log), offset = np.log(x_train['pop']))
pois_results = pois_m.fit()
y_GLM = pois_results.predict(x_test)


ls_m = sm.OLS(np.log(y_train + 0.0001), x_train)
ls_results = ls_m.fit()
y_OLS = ls_results.predict(x_test)

In [121]:
dictionary = {'GLM_offset':mean_squared_error(y_test,y_GLM),'OLS': mean_squared_error(y_test,np.exp(y_OLS))}

table2 = pd.DataFrame(dictionary,index = ['MSE'])
table2

Unnamed: 0,GLM_offset,OLS
MSE,192817.404474,71088.147661


For Poisson model the offset is the population size.