forked from statsmodels/statsmodels
-
Notifications
You must be signed in to change notification settings - Fork 0
/
example_glm.py
153 lines (131 loc) · 5.14 KB
/
example_glm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
'''Generalized Linear Models
'''
import numpy as np
import statsmodels.api as sm
from scipy import stats
from matplotlib import pyplot as plt
#Example for using GLM on binomial response data
#the input response vector in this case is N by 2 (success, failure)
#This data is taken with permission from
#Jeff Gill (2000) Generalized linear models: A unified approach
#The response variable is
#(of students above the math national median, # of students below)
#| The explanatory variables are (in column order)
#| The proportion of low income families "LOWINC"
#| The proportions of minority students,"PERASIAN","PERBLACK","PERHISP"
#| The percentage of minority teachers "PERMINTE",
#| The median teacher salary including benefits in 1000s "AVSALK"
#| The mean teacher experience in years "AVYRSEXP",
#| The per-pupil expenditures in thousands "PERSPENK"
#| The pupil-teacher ratio "PTRATIO"
#| The percent of students taking college credit courses "PCTAF",
#| The percentage of charter schools in the districut "PCTCHRT"
#| The percent of schools in the district operating year round "PCTYRRND"
#| The following are interaction terms "PERMINTE_AVYRSEXP","PERMINTE_AVSAL",
#| "AVYRSEXP_AVSAL","PERSPEN_PTRATIO","PERSPEN_PCTAF","PTRATIO_PCTAF",
#| "PERMINTE_AVYRSEXP_AVSAL","PERSPEN_PTRATIO_PCTAF"
data = sm.datasets.star98.load()
data.exog = sm.add_constant(data.exog)
#The response variable is (success, failure). Eg., the first
#observation is
print data.endog[0]
#Giving a total number of trials for this observation of
#print data.endog[0].sum()
glm_binom = sm.GLM(data.endog, data.exog, family=sm.families.Binomial())
binom_results = glm_binom.fit()
#The fitted values are
print binom_results.params
#The corresponding t-values are
print binom_results.tvalues
#It is common in GLMs with interactions to compare first differences.
#We are interested in the difference of the impact of the explanatory variable
#on the response variable. This example uses interquartile differences for
#the percentage of low income households while holding the other values
#constant at their mean.
means = data.exog.mean(axis=0)
means25 = means.copy()
means25[0] = stats.scoreatpercentile(data.exog[:,0], 25)
means75 = means.copy()
means75[0] = lowinc_75per = stats.scoreatpercentile(data.exog[:,0], 75)
resp_25 = binom_results.predict(means25)
resp_75 = binom_results.predict(means75)
diff = resp_75 - resp_25
#.. print """The interquartile first difference for the percentage of low income
#.. households in a school district is %2.4f %%""" % (diff*100)
#The interquartile first difference for the percentage of low income
#households in a school district is
print diff*100
means0 = means.copy()
means100 = means.copy()
means0[0] = data.exog[:,0].min()
means100[0] = data.exog[:,0].max()
resp_0 = binom_results.predict(means0)
resp_100 = binom_results.predict(means100)
diff_full = resp_100 - resp_0
print """The full range difference is %2.4f %%""" % (diff_full*100)
nobs = binom_results.nobs
y = data.endog[:,0]/data.endog.sum(1)
yhat = binom_results.mu
#Plot of yhat vs y
plt.figure()
plt.scatter(yhat, y)
line_fit = sm.OLS(y, sm.add_constant(yhat)).fit().params
fit = lambda x: line_fit[1]+line_fit[0]*x # better way in scipy?
plt.plot(np.linspace(0,1,nobs), fit(np.linspace(0,1,nobs)))
plt.title('Model Fit Plot')
plt.ylabel('Observed values')
#@savefig glm_fitted.png
plt.xlabel('Fitted values')
#Plot of yhat vs. Pearson residuals
plt.figure()
plt.scatter(yhat, binom_results.resid_pearson)
plt.plot([0.0, 1.0],[0.0, 0.0], 'k-')
plt.title('Residual Dependence Plot')
plt.ylabel('Pearson Residuals')
#@savefig glm_resids.png
plt.xlabel('Fitted values')
#Histogram of standardized deviance residuals
plt.figure()
res = binom_results.resid_deviance.copy()
stdres = (res - res.mean())/res.std()
plt.hist(stdres, bins=25)
#@savefig glm_hist_res.png
plt.title('Histogram of standardized deviance residuals')
#QQ Plot of Deviance Residuals
plt.figure()
res.sort()
p = np.linspace(0 + 1./(nobs-1), 1-1./(nobs-1), nobs)
quants = np.zeros_like(res)
for i in range(nobs):
quants[i] = stats.scoreatpercentile(res, p[i]*100)
mu = res.mean()
sigma = res.std()
y = stats.norm.ppf(p, loc=mu, scale=sigma)
plt.scatter(y, quants)
plt.plot([y.min(),y.max()],[y.min(),y.max()],'r--')
plt.title('Normal - Quantile Plot')
plt.ylabel('Deviance Residuals Quantiles')
plt.xlabel('Quantiles of N(0,1)')
from statsmodels import graphics
#@savefig glm_qqplot.png
img = graphics.gofplots.qqplot(res, line='r')
#.. plt.show()
#.. plt.close('all')
#Example for using GLM Gamma for a proportional count response
#Brief description of the data and design
print sm.datasets.scotland.DESCRLONG
data2 = sm.datasets.scotland.load()
data2.exog = sm.add_constant(data2.exog)
glm_gamma = sm.GLM(data2.endog, data2.exog, family=sm.families.Gamma())
glm_results = glm_gamma.fit()
##Example for Gaussian distribution with a noncanonical link
nobs2 = 100
x = np.arange(nobs2)
np.random.seed(54321)
X = np.column_stack((x,x**2))
X = sm.add_constant(X)
lny = np.exp(-(.03*x + .0001*x**2 - 1.0)) + .001 * np.random.rand(nobs2)
gauss_log = sm.GLM(lny, X, family=sm.families.Gaussian(sm.families.links.log))
gauss_log_results = gauss_log.fit()
#check summary
print binom_results.summary()