forked from statsmodels/statsmodels
-
Notifications
You must be signed in to change notification settings - Fork 0
/
example_glm.py
141 lines (117 loc) · 4.11 KB
/
example_glm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""Generalized Linear Models
"""
import numpy as np
import statsmodels.api as sm
from scipy import stats
from matplotlib import pyplot as plt
#GLM: Binomial response data
#---------------------------
#Load data
#^^^^^^^^^
# In this example, we use the Star98 dataset which was taken with permission
# from Jeff Gill (2000) Generalized linear models: A unified approach. Codebook
# information can be obtained by typing:
print sm.datasets.star98.NOTE
# Load the data and add a constant to the exogenous (independent) variables:
data = sm.datasets.star98.load()
data.exog = sm.add_constant(data.exog, prepend=False)
# The dependent variable is N by 2 (Success: NABOVE, Failure: NBELOW):
print data.endog[:5, :]
# The independent variables include all the other variables described above, as
# well as the interaction terms:
print data.exog[:2, :]
#Fit and summary
#^^^^^^^^^^^^^^^
glm_binom = sm.GLM(data.endog, data.exog, family=sm.families.Binomial())
res = glm_binom.fit()
print res.summary()
#Quantities of interest
#^^^^^^^^^^^^^^^^^^^^^^
# Total number of trials:
print data.endog[0].sum()
# Parameter estimates:
print res.params
# The corresponding t-values:
print res.tvalues
# First differences: We hold all explanatory variables constant at their means
# and manipulate the percentage of low income households to assess its impact
# on the response variables:
means = data.exog.mean(axis=0)
means25 = means.copy()
means25[0] = stats.scoreatpercentile(data.exog[:, 0], 25)
means75 = means.copy()
means75[0] = lowinc_75per = stats.scoreatpercentile(data.exog[:, 0], 75)
resp_25 = res.predict(means25)
resp_75 = res.predict(means75)
diff = resp_75 - resp_25
# The interquartile first difference for the percentage of low income
# households in a school district is:
print '%2.4f%%' % (diff * 100)
#Plots
#^^^^^
# We extract information that will be used to draw some interesting plots:
nobs = res.nobs
y = data.endog[:, 0] / data.endog.sum(1)
yhat = res.mu
# Plot yhat vs y:
plt.figure();
plt.scatter(yhat, y);
line_fit = sm.OLS(y, sm.add_constant(yhat, prepend=False)).fit().params
fit = lambda x: line_fit[1] + line_fit[0] * x # better way in scipy?
plt.plot(np.linspace(0, 1, nobs), fit(np.linspace(0, 1, nobs)));
plt.title('Model Fit Plot');
plt.ylabel('Observed values');
#@savefig glm_fitted.png
plt.xlabel('Fitted values');
# Plot yhat vs. Pearson residuals:
plt.figure();
plt.scatter(yhat, res.resid_pearson);
plt.plot([0.0, 1.0], [0.0, 0.0], 'k-');
plt.title('Residual Dependence Plot');
plt.ylabel('Pearson Residuals');
#@savefig glm_resids.png
plt.xlabel('Fitted values');
#Histogram of standardized deviance residuals
plt.figure();
resid = res.resid_deviance.copy()
resid_std = (resid - resid.mean()) / resid.std()
plt.hist(resid_std, bins=25);
#@savefig glm_hist_res.png
plt.title('Histogram of standardized deviance residuals');
#QQ Plot of Deviance Residuals
from statsmodels import graphics
#@savefig glm_qqplot.png
graphics.gofplots.qqplot(resid, line='r');
#GLM: Gamma for proportional count response
#------------------------------------------
#Load data
#^^^^^^^^^
# In the example above, we printed the ``NOTE`` attribute to learn about the
# Star98 dataset. Statsmodels datasets ships with other useful information. For
# example:
print sm.datasets.scotland.DESCRLONG
# Load the data and add a constant to the exogenous variables:
data2 = sm.datasets.scotland.load()
data2.exog = sm.add_constant(data2.exog, prepend=False)
print data2.exog[:5, :]
print data2.endog[:5]
#Fit and summary
#^^^^^^^^^^^^^^^
glm_gamma = sm.GLM(data2.endog, data2.exog, family=sm.families.Gamma())
glm_results = glm_gamma.fit()
print glm_results.summary()
#GLM: Gaussian distribution with a noncanonical link
#---------------------------------------------------
#Artificial data
#^^^^^^^^^^^^^^^
nobs2 = 100
x = np.arange(nobs2)
np.random.seed(54321)
X = np.column_stack((x, x**2))
X = sm.add_constant(X, prepend=False)
lny = np.exp(-(.03 * x + .0001 * x**2 - 1.0)) + .001 * np.random.rand(nobs2)
#Fit and summary
#^^^^^^^^^^^^^^^
gauss_log = sm.GLM(lny, X, family=sm.families.Gaussian(sm.families.links.log))
gauss_log_results = gauss_log.fit()
print gauss_log_results.summary()