Skip to content

Commit

Permalink
Merge pull request #7059 from kshedden/gee_unstr
Browse files Browse the repository at this point in the history
ENH: Unstructured covariance for GEE
  • Loading branch information
kshedden committed Oct 3, 2020
2 parents a5a6c44 + 8ead01c commit 4201f12
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 12 deletions.
86 changes: 85 additions & 1 deletion statsmodels/genmod/cov_struct.py
Expand Up @@ -28,7 +28,7 @@ class CovStruct(object):
random errors in the model.
The current state of the covariance structure is represented
through the value of the `dep_params` attribute.
through the value of the `dep_params` attribute.
The default state of a newly-created instance should always be
the identity correlation matrix.
Expand Down Expand Up @@ -216,6 +216,90 @@ def summary(self):
return ("Observations within a cluster are modeled "
"as being independent.")

class Unstructured(CovStruct):
"""
An unstructured dependence structure.
To use the unstructured dependence structure, a `time`
argument must be provided when creating the GEE. The
time argument must be of integer dtype, and indicates
which position in a complete data vector is occupied
by each observed value.
"""

def __init__(self, cov_nearest_method="clipped"):

super(Unstructured, self).__init__(cov_nearest_method)

def initialize(self, model):

self.model = model

import numbers
if not issubclass(self.model.time.dtype.type, numbers.Integral):
msg = "time must be provided and must have integer dtype"
raise ValueError(msg)

q = self.model.time[:, 0].max() + 1

self.dep_params = np.eye(q)

@Appender(CovStruct.covariance_matrix.__doc__)
def covariance_matrix(self, endog_expval, index):

if hasattr(self.model, "time"):
time_li = self.model.time_li
ix = time_li[index][:, 0]
return self.dep_params[np.ix_(ix, ix)],True

return self.dep_params, True

@Appender(CovStruct.update.__doc__)
def update(self, params):

endog = self.model.endog_li
nobs = self.model.nobs
varfunc = self.model.family.variance
cached_means = self.model.cached_means
has_weights = self.model.weights is not None
weights_li = self.model.weights

time_li = self.model.time_li
q = self.model.time.max() + 1
csum = np.zeros((q, q))
wsum = 0.
cov = np.zeros((q, q))

scale = 0.
for i in range(self.model.num_group):

# Get the Pearson residuals
expval, _ = cached_means[i]
stdev = np.sqrt(varfunc(expval))
resid = (endog[i] - expval) / stdev

ix = time_li[i][:, 0]
m = np.outer(resid, resid)
ssr = np.sum(np.diag(m))

w = weights_li[i] if has_weights else 1.
csum[np.ix_(ix, ix)] += w
wsum += w * len(ix)
cov[np.ix_(ix, ix)] += w * m
scale += w * ssr
ddof = self.model.ddof_scale
scale /= wsum * (nobs - ddof) / float(nobs)
cov /= (csum - ddof)

sd = np.sqrt(np.diag(cov))
cov /= np.outer(sd, sd)

self.dep_params = cov

def summary(self):
print("Estimated covariance structure:")
print(self.dep_params)


class Exchangeable(CovStruct):
"""
Expand Down
30 changes: 19 additions & 11 deletions statsmodels/genmod/generalized_estimating_equations.py
Expand Up @@ -230,7 +230,7 @@ def unpack_cov(self, bcov):
Gaussian | x x x
inv Gaussian | x x x
binomial | x x x x x x x x x
Poisson | x x x
Poisson | x x x
neg binomial | x x x x
gamma | x x x
Expand All @@ -252,12 +252,18 @@ def unpack_cov(self, bcov):
and agrees with R's gee implementation. To obtain the robust
standard errors reported in Stata, multiply by sqrt(N / (N - g)),
where N is the total sample size, and g is the average group size.
%(notes)s
Examples
--------
%(example)s
"""

_gee_nointercept = """
The nominal and ordinal GEE models should not have an intercept
(either implicit or explicit). Use "0 + " in a formula to
suppress the intercept.
"""

_gee_family_doc = """\
The default is Gaussian. To specify the binomial
distribution use `family=sm.families.Binomial()`. Each family
Expand Down Expand Up @@ -390,7 +396,7 @@ def unpack_cov(self, bcov):
>>> import statsmodels.api as sm
>>> fam = sm.families.Poisson()
>>> ind = sm.cov_struct.Independence()
>>> model = sm.GEE.from_formula("y ~ age + trt + base", "subject", \
>>> model = sm.GEE.from_formula("y ~ age + trt + base", "subject",
data, cov_struct=ind, family=fam)
>>> result = model.fit()
>>> print(result.summary())
Expand All @@ -401,7 +407,7 @@ def unpack_cov(self, bcov):
>>> import statsmodels.formula.api as smf
>>> fam = sm.families.Poisson()
>>> ind = sm.cov_struct.Independence()
>>> model = smf.gee("y ~ age + trt + base", "subject", \
>>> model = smf.gee("y ~ age + trt + base", "subject",
data, cov_struct=ind, family=fam)
>>> result = model.fit()
>>> print(result.summary())
Expand All @@ -420,7 +426,7 @@ def unpack_cov(self, bcov):
Using formulas:
>>> import statsmodels.formula.api as smf
>>> model = smf.ordinal_gee("y ~ x1 + x2", groups, data,
>>> model = smf.ordinal_gee("y ~ 0 + x1 + x2", groups, data,
cov_struct=gor)
>>> result = model.fit()
>>> print(result.summary())
Expand All @@ -439,15 +445,15 @@ def unpack_cov(self, bcov):
Using formulas:
>>> import statsmodels.api as sm
>>> model = sm.NominalGEE.from_formula("y ~ x1 + x2", groups,
>>> model = sm.NominalGEE.from_formula("y ~ 0 + x1 + x2", groups,
data, cov_struct=gor)
>>> result = model.fit()
>>> print(result.summary())
Using the formula API:
>>> import statsmodels.formula.api as smf
>>> model = smf.nominal_gee("y ~ x1 + x2", groups, data,
>>> model = smf.nominal_gee("y ~ 0 + x1 + x2", groups, data,
cov_struct=gor)
>>> result = model.fit()
>>> print(result.summary())
Expand Down Expand Up @@ -480,7 +486,8 @@ class GEE(GLM):
"Equations.\n" + _gee_init_doc %
{'extra_params': base._missing_param_doc,
'family_doc': _gee_family_doc,
'example': _gee_example})
'example': _gee_example,
'notes': ""})

cached_means = None

Expand Down Expand Up @@ -596,7 +603,6 @@ def __init__(self, endog, exog, groups, time=None, family=None,

# Time defaults to a 1d grid with equal spacing
if self.time is not None:
self.time = np.asarray(self.time, np.float64)
if self.time.ndim == 1:
self.time = self.time[:, None]
self.time_li = self.cluster_list(self.time)
Expand Down Expand Up @@ -2265,7 +2271,8 @@ class OrdinalGEE(GEE):
" Ordinal Response Marginal Regression Model using GEE\n" +
_gee_init_doc % {'extra_params': base._missing_param_doc,
'family_doc': _gee_ordinal_family_doc,
'example': _gee_ordinal_example})
'example': _gee_ordinal_example,
'notes': _gee_nointercept})

def __init__(self, endog, exog, groups, time=None, family=None,
cov_struct=None, missing='none', offset=None,
Expand Down Expand Up @@ -2547,7 +2554,8 @@ class NominalGEE(GEE):
" Nominal Response Marginal Regression Model using GEE.\n" +
_gee_init_doc % {'extra_params': base._missing_param_doc,
'family_doc': _gee_nominal_family_doc,
'example': _gee_nominal_example})
'example': _gee_nominal_example,
'notes': _gee_nointercept})

def __init__(self, endog, exog, groups, time=None, family=None,
cov_struct=None, missing='none', offset=None,
Expand Down
63 changes: 63 additions & 0 deletions statsmodels/genmod/tests/test_gee.py
Expand Up @@ -2047,3 +2047,66 @@ def test_grid_ar():
rtol=0.05)
assert_allclose(result1.cov_struct.dep_params,
result3.cov_struct.dep_params[1], rtol=0.05)


def test_unstructured_complete():

np.random.seed(43)
ngrp = 400
cov = np.asarray([[1, 0.7, 0.2], [0.7, 1, 0.5], [0.2, 0.5, 1]])
covr = np.linalg.cholesky(cov)
e = np.random.normal(size=(ngrp, 3))
e = np.dot(e, covr.T)
xmat = np.random.normal(size=(3*ngrp, 3))
par = np.r_[1, -2, 0.1]
ey = np.dot(xmat, par)
y = ey + e.ravel()
g = np.kron(np.arange(ngrp), np.ones(3))
t = np.kron(np.ones(ngrp), np.r_[0, 1, 2]).astype(np.int)

m = gee.GEE(y, xmat, time=t, cov_struct=cov_struct.Unstructured(),
groups=g)
r = m.fit()

assert_allclose(r.params, par, 0.05, 0.5)
assert_allclose(m.cov_struct.dep_params, cov, 0.05, 0.5)


def test_unstructured_incomplete():

np.random.seed(43)
ngrp = 400
cov = np.asarray([[1, 0.7, 0.2], [0.7, 1, 0.5], [0.2, 0.5, 1]])
covr = np.linalg.cholesky(cov)
e = np.random.normal(size=(ngrp, 3))
e = np.dot(e, covr.T)
xmat = np.random.normal(size=(3*ngrp, 3))
par = np.r_[1, -2, 0.1]
ey = np.dot(xmat, par)

yl, xl, tl, gl = [], [], [], []
for i in range(ngrp):

# Omit one observation from each group of 3
ix = [0, 1, 2]
ix.pop(i % 3)
ix = np.asarray(ix)
tl.append(ix)

yl.append(ey[3*i + ix] + e[i, ix])
x = xmat[3*i + ix, :]
xl.append(x)
gl.append(i * np.ones(2))

y = np.concatenate(yl)
x = np.concatenate(xl, axis=0)
t = np.concatenate(tl)
t = np.asarray(t, dtype=np.int)
g = np.concatenate(gl)

m = gee.GEE(y, x, time=t[:, None], cov_struct=cov_struct.Unstructured(),
groups=g)
r = m.fit()

assert_allclose(r.params, par, 0.05, 0.5)
assert_allclose(m.cov_struct.dep_params, cov, 0.05, 0.5)

0 comments on commit 4201f12

Please sign in to comment.