Skip to content

Commit

Permalink
Merge pull request #5345 from kshedden/gee_dep_params
Browse files Browse the repository at this point in the history
ENH: Allow dep_data to be specified using formula or names
  • Loading branch information
kshedden committed Nov 3, 2018
2 parents 0f25d54 + 83725a0 commit 16bf05a
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 20 deletions.
23 changes: 16 additions & 7 deletions statsmodels/genmod/cov_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ class Nested(CovStruct):
The variance components are estimated using least squares
regression of the products r*r', for standardized residuals r and
r' in the same group, on a vector of indicators defining which
r' in the same group, on a matrix of indicators defining which
variance components are shared by r and r'.
"""

Expand Down Expand Up @@ -481,12 +481,21 @@ def summary(self):
dependence structure.
"""

msg = "Variance estimates\n------------------\n"
for k in range(len(self.vcomp_coeff)):
msg += "Component %d: %.3f\n" % (k + 1, self.vcomp_coeff[k])
msg += "Residual: %.3f\n" % (self.scale -
np.sum(self.vcomp_coeff))
return msg
dep_names = ["Groups"]
if hasattr(self.model, "_dep_data_names"):
dep_names.extend(self.model._dep_data_names)
else:
dep_names.extend(["Component %d:" % (k + 1) for k in range(len(self.vcomp_coeff) - 1)])
if hasattr(self.model, "_groups_name"):
dep_names[0] = self.model._groups_name
dep_names.append("Residual")

vc = self.vcomp_coeff.tolist()
vc.append(self.scale - np.sum(vc))

smry = pd.DataFrame({"Variance": vc}, index=dep_names)

return smry


class Stationary(CovStruct):
Expand Down
53 changes: 41 additions & 12 deletions statsmodels/genmod/generalized_estimating_equations.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import numpy as np
from scipy import stats
import pandas as pd
import patsy

from statsmodels.tools.decorators import (cache_readonly,
resettable_cache)
Expand Down Expand Up @@ -643,12 +644,23 @@ def from_formula(cls, formula, groups, data, subset=None,
args : extra arguments
These are passed to the model
kwargs : extra keyword arguments
These are passed to the model with one exception. The
``eval_env`` keyword is passed to patsy. It can be either a
:class:`patsy:patsy.EvalEnvironment` object or an integer
indicating the depth of the namespace to use. For example, the
default ``eval_env=0`` uses the calling namespace. If you wish
to use a "clean" environment set ``eval_env=-1``.
These are passed to the model with two exceptions. `dep_data`
is processed as described below. The ``eval_env`` keyword is
passed to patsy. It can be either a :class:`patsy:patsy.EvalEnvironment`
object or an integer indicating the depth of the namespace to use.
For example, the default ``eval_env=0`` uses the calling namespace.
If you wish to use a "clean" environment set ``eval_env=-1``.
Optional arguments
------------------
dep_data : string or array-like
Data used for estimating the dependence structure. See
specific dependence structure classes (e.g. Nested) for
details. If `dep_data` is a string, it is interpreted as
a formula that is applied to `data`. If it is an array, it
must be an array of strings corresponding to column names in
`data`. Otherwise it must be an array-like with the same
number of rows as data.
Returns
-------
Expand All @@ -666,24 +678,41 @@ def from_formula(cls, formula, groups, data, subset=None,
the DataFrame before calling this method.
""" % {'missing_param_doc': base._missing_param_doc}

if type(groups) == str:
groups_name = "Groups"
if isinstance(groups, str):
groups_name = groups
groups = data[groups]

if type(time) == str:
if isinstance(time, str):
time = data[time]

if type(offset) == str:
if isinstance(offset, str):
offset = data[offset]

if type(exposure) == str:
if isinstance(exposure, str):
exposure = data[exposure]

dep_data = kwargs.get("dep_data")
dep_data_names = None
if dep_data is not None:
if isinstance(dep_data, str):
dep_data = patsy.dmatrix(dep_data, data, return_type='dataframe')
dep_data_names = dep_data.columns.tolist()
else:
dep_data_names = list(dep_data)
dep_data = data[dep_data]
kwargs["dep_data"] = np.asarray(dep_data)

model = super(GEE, cls).from_formula(formula, data=data, subset=subset,
groups=groups, time=time,
offset=offset,
exposure=exposure,
*args, **kwargs)

if dep_data_names is not None:
model._dep_data_names = dep_data_names
model._groups_name = groups_name

return model

def cluster_list(self, array):
Expand Down Expand Up @@ -2291,7 +2320,7 @@ def setup_nominal(self, endog, exog, groups, time, offset):
jrow += 1

# exog names
if type(self.exog_orig) == pd.DataFrame:
if isinstance(self.exog_orig, pd.DataFrame):
xnames_in = self.exog_orig.columns
else:
xnames_in = ["x%d" % k for k in range(1, exog.shape[1] + 1)]
Expand All @@ -2302,7 +2331,7 @@ def setup_nominal(self, endog, exog, groups, time, offset):
exog_out = pd.DataFrame(exog_out, columns=xnames)

# Preserve endog name if there is one
if type(self.endog_orig) == pd.Series:
if isinstance(self.endog_orig, pd.Series):
endog_out = pd.Series(endog_out, name=self.endog_orig.name)

return endog_out, exog_out, groups_out, time_out, offset_out
Expand Down
38 changes: 37 additions & 1 deletion statsmodels/genmod/tests/test_gee.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import numpy as np
import pytest
from numpy.testing import (assert_almost_equal, assert_equal, assert_allclose,
assert_array_less, assert_raises, assert_, dec)
assert_array_less, assert_raises, assert_)
from statsmodels.genmod.generalized_estimating_equations import (
GEE, OrdinalGEE, NominalGEE, NominalGEEResults, OrdinalGEEResults,
NominalGEEResultsWrapper, OrdinalGEEResultsWrapper)
Expand Down Expand Up @@ -728,6 +728,42 @@ def test_nested_linear(self):
assert_almost_equal(mdf2.standard_errors(), se,
decimal=6)

smry = mdf2.cov_struct.summary()
assert_allclose(smry.Variance, np.r_[1.043878, 0.611656, 1.421205], atol=1e-5, rtol=1e-5)


def test_nested_pandas(self):

np.random.seed(4234)
n = 10000

# Outer groups
groups = np.kron(np.arange(n // 100), np.ones(100)).astype(np.int)

# Inner groups
groups1 = np.kron(np.arange(n // 50), np.ones(50)).astype(np.int)
groups2 = np.kron(np.arange(n // 10), np.ones(10)).astype(np.int)

# Group effects
groups_e = np.random.normal(size=n // 100)
groups1_e = 2 * np.random.normal(size=n // 50)
groups2_e = 3 * np.random.normal(size=n // 10)

y = groups_e[groups] + groups1_e[groups1] + groups2_e[groups2]
y += 0.5 * np.random.normal(size=n)

df = pd.DataFrame({"y": y, "TheGroups": groups, "groups1": groups1, "groups2": groups2})

model = sm.GEE.from_formula("y ~ 1", groups="TheGroups",
dep_data="0 + groups1 + groups2",
cov_struct=sm.cov_struct.Nested(),
data=df)
result = model.fit()

# The true variances are 1, 4, 9, 0.25
smry = result.cov_struct.summary()
assert_allclose(smry.Variance, np.r_[1.437299, 4.421543, 8.905295, 0.258480], atol=1e-5, rtol=1e-5)

def test_ordinal(self):

family = Binomial()
Expand Down

0 comments on commit 16bf05a

Please sign in to comment.