From f0474f67f77c429172915f098785841a04bb1ce5 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Thu, 25 Oct 2018 12:01:40 -0400 Subject: [PATCH 1/2] Allow dep_data to be specified using formula or names --- statsmodels/genmod/cov_struct.py | 23 +++++++---- .../generalized_estimating_equations.py | 41 ++++++++++++++++--- statsmodels/genmod/tests/test_gee.py | 38 ++++++++++++++++- 3 files changed, 88 insertions(+), 14 deletions(-) diff --git a/statsmodels/genmod/cov_struct.py b/statsmodels/genmod/cov_struct.py index 500b09a5c6c..cd54347e174 100644 --- a/statsmodels/genmod/cov_struct.py +++ b/statsmodels/genmod/cov_struct.py @@ -345,7 +345,7 @@ class Nested(CovStruct): The variance components are estimated using least squares regression of the products r*r', for standardized residuals r and - r' in the same group, on a vector of indicators defining which + r' in the same group, on a matrix of indicators defining which variance components are shared by r and r'. """ @@ -481,12 +481,21 @@ def summary(self): dependence structure. """ - msg = "Variance estimates\n------------------\n" - for k in range(len(self.vcomp_coeff)): - msg += "Component %d: %.3f\n" % (k + 1, self.vcomp_coeff[k]) - msg += "Residual: %.3f\n" % (self.scale - - np.sum(self.vcomp_coeff)) - return msg + dep_names = ["Groups"] + if hasattr(self.model, "_dep_data_names"): + dep_names.extend(self.model._dep_data_names) + else: + dep_names.extend(["Component %d:" % (k + 1) for k in range(len(self.vcomp_coeff) - 1)]) + if hasattr(self.model, "_groups_name"): + dep_names[0] = self.model._groups_name + dep_names.append("Residual") + + vc = self.vcomp_coeff.tolist() + vc.append(self.scale - np.sum(vc)) + + smry = pd.DataFrame({"Variance": vc}, index=dep_names) + + return smry class Stationary(CovStruct): diff --git a/statsmodels/genmod/generalized_estimating_equations.py b/statsmodels/genmod/generalized_estimating_equations.py index 817f3b013e2..2333ab1deec 100644 --- a/statsmodels/genmod/generalized_estimating_equations.py +++ b/statsmodels/genmod/generalized_estimating_equations.py @@ -28,6 +28,7 @@ import numpy as np from scipy import stats import pandas as pd +import patsy from statsmodels.tools.decorators import (cache_readonly, resettable_cache) @@ -643,12 +644,23 @@ def from_formula(cls, formula, groups, data, subset=None, args : extra arguments These are passed to the model kwargs : extra keyword arguments - These are passed to the model with one exception. The - ``eval_env`` keyword is passed to patsy. It can be either a - :class:`patsy:patsy.EvalEnvironment` object or an integer - indicating the depth of the namespace to use. For example, the - default ``eval_env=0`` uses the calling namespace. If you wish - to use a "clean" environment set ``eval_env=-1``. + These are passed to the model with two exceptions. `dep_data` + is processed as described below. The ``eval_env`` keyword is + passed to patsy. It can be either a :class:`patsy:patsy.EvalEnvironment` + object or an integer indicating the depth of the namespace to use. + For example, the default ``eval_env=0`` uses the calling namespace. + If you wish to use a "clean" environment set ``eval_env=-1``. + + Optional arguments + ------------------ + dep_data : string or array-like + Data used for estimating the dependence structure. See + specific dependence structure classes (e.g. Nested) for + details. If `dep_data` is a string, it is interpreted as + a formula that is applied to `data`. If it is an array, it + must be an array of strings corresponding to column names in + `data`. Otherwise it must be an array-like with the same + number of rows as data. Returns ------- @@ -666,7 +678,9 @@ def from_formula(cls, formula, groups, data, subset=None, the DataFrame before calling this method. """ % {'missing_param_doc': base._missing_param_doc} + groups_name = "Groups" if type(groups) == str: + groups_name = groups groups = data[groups] if type(time) == str: @@ -678,12 +692,27 @@ def from_formula(cls, formula, groups, data, subset=None, if type(exposure) == str: exposure = data[exposure] + dep_data = kwargs.get("dep_data") + dep_data_names = None + if dep_data is not None: + if type(dep_data) is str: + dep_data = patsy.dmatrix(dep_data, data, return_type='dataframe') + dep_data_names = dep_data.columns.tolist() + else: + dep_data_names = list(dep_data) + dep_data = data[dep_data] + kwargs["dep_data"] = np.asarray(dep_data) + model = super(GEE, cls).from_formula(formula, data=data, subset=subset, groups=groups, time=time, offset=offset, exposure=exposure, *args, **kwargs) + if dep_data_names is not None: + model._dep_data_names = dep_data_names + model._groups_name = groups_name + return model def cluster_list(self, array): diff --git a/statsmodels/genmod/tests/test_gee.py b/statsmodels/genmod/tests/test_gee.py index 96a5348451a..28ed5d30a28 100644 --- a/statsmodels/genmod/tests/test_gee.py +++ b/statsmodels/genmod/tests/test_gee.py @@ -14,7 +14,7 @@ import numpy as np import pytest from numpy.testing import (assert_almost_equal, assert_equal, assert_allclose, - assert_array_less, assert_raises, assert_, dec) + assert_array_less, assert_raises, assert_) from statsmodels.genmod.generalized_estimating_equations import ( GEE, OrdinalGEE, NominalGEE, NominalGEEResults, OrdinalGEEResults, NominalGEEResultsWrapper, OrdinalGEEResultsWrapper) @@ -728,6 +728,42 @@ def test_nested_linear(self): assert_almost_equal(mdf2.standard_errors(), se, decimal=6) + smry = mdf2.cov_struct.summary() + assert_allclose(smry.Variance, np.r_[1.043878, 0.611656, 1.421205], atol=1e-5, rtol=1e-5) + + + def test_nested_pandas(self): + + np.random.seed(4234) + n = 10000 + + # Outer groups + groups = np.kron(np.arange(n // 100), np.ones(100)).astype(np.int) + + # Inner groups + groups1 = np.kron(np.arange(n // 50), np.ones(50)).astype(np.int) + groups2 = np.kron(np.arange(n // 10), np.ones(10)).astype(np.int) + + # Group effects + groups_e = np.random.normal(size=n // 100) + groups1_e = 2 * np.random.normal(size=n // 50) + groups2_e = 3 * np.random.normal(size=n // 10) + + y = groups_e[groups] + groups1_e[groups1] + groups2_e[groups2] + y += 0.5 * np.random.normal(size=n) + + df = pd.DataFrame({"y": y, "TheGroups": groups, "groups1": groups1, "groups2": groups2}) + + model = sm.GEE.from_formula("y ~ 1", groups="TheGroups", + dep_data="0 + groups1 + groups2", + cov_struct=sm.cov_struct.Nested(), + data=df) + result = model.fit() + + # The true variances are 1, 4, 9, 0.25 + smry = result.cov_struct.summary() + assert_allclose(smry.Variance, np.r_[1.437299, 4.421543, 8.905295, 0.258480], atol=1e-5, rtol=1e-5) + def test_ordinal(self): family = Binomial() From 83725a00571569a45965e0df244539432e0306bd Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Thu, 25 Oct 2018 23:20:11 -0400 Subject: [PATCH 2/2] Use isinstance instead of comparing types with == --- .../genmod/generalized_estimating_equations.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/statsmodels/genmod/generalized_estimating_equations.py b/statsmodels/genmod/generalized_estimating_equations.py index 2333ab1deec..4e9a789be18 100644 --- a/statsmodels/genmod/generalized_estimating_equations.py +++ b/statsmodels/genmod/generalized_estimating_equations.py @@ -679,23 +679,23 @@ def from_formula(cls, formula, groups, data, subset=None, """ % {'missing_param_doc': base._missing_param_doc} groups_name = "Groups" - if type(groups) == str: + if isinstance(groups, str): groups_name = groups groups = data[groups] - if type(time) == str: + if isinstance(time, str): time = data[time] - if type(offset) == str: + if isinstance(offset, str): offset = data[offset] - if type(exposure) == str: + if isinstance(exposure, str): exposure = data[exposure] dep_data = kwargs.get("dep_data") dep_data_names = None if dep_data is not None: - if type(dep_data) is str: + if isinstance(dep_data, str): dep_data = patsy.dmatrix(dep_data, data, return_type='dataframe') dep_data_names = dep_data.columns.tolist() else: @@ -2320,7 +2320,7 @@ def setup_nominal(self, endog, exog, groups, time, offset): jrow += 1 # exog names - if type(self.exog_orig) == pd.DataFrame: + if isinstance(self.exog_orig, pd.DataFrame): xnames_in = self.exog_orig.columns else: xnames_in = ["x%d" % k for k in range(1, exog.shape[1] + 1)] @@ -2331,7 +2331,7 @@ def setup_nominal(self, endog, exog, groups, time, offset): exog_out = pd.DataFrame(exog_out, columns=xnames) # Preserve endog name if there is one - if type(self.endog_orig) == pd.Series: + if isinstance(self.endog_orig, pd.Series): endog_out = pd.Series(endog_out, name=self.endog_orig.name) return endog_out, exog_out, groups_out, time_out, offset_out