Merge pull request #5345 from kshedden/gee_dep_params

ENH: Allow dep_data to be specified using formula or names
statsmodels · Nov 3, 2018 · 16bf05a · 16bf05a
2 parents 0f25d54 + 83725a0
commit 16bf05a
Show file tree

Hide file tree

Showing 3 changed files with 94 additions and 20 deletions.
diff --git a/statsmodels/genmod/cov_struct.py b/statsmodels/genmod/cov_struct.py
@@ -345,7 +345,7 @@ class Nested(CovStruct):
 
     The variance components are estimated using least squares
     regression of the products r*r', for standardized residuals r and
-    r' in the same group, on a vector of indicators defining which
+    r' in the same group, on a matrix of indicators defining which
     variance components are shared by r and r'.
     """
 
@@ -481,12 +481,21 @@ def summary(self):
         dependence structure.
         """
 
-        msg = "Variance estimates\n------------------\n"
-        for k in range(len(self.vcomp_coeff)):
-            msg += "Component %d: %.3f\n" % (k + 1, self.vcomp_coeff[k])
-        msg += "Residual: %.3f\n" % (self.scale -
-                                     np.sum(self.vcomp_coeff))
-        return msg
+        dep_names = ["Groups"]
+        if hasattr(self.model, "_dep_data_names"):
+            dep_names.extend(self.model._dep_data_names)
+        else:
+            dep_names.extend(["Component %d:" % (k + 1) for k in range(len(self.vcomp_coeff) - 1)])
+        if hasattr(self.model, "_groups_name"):
+            dep_names[0] = self.model._groups_name
+        dep_names.append("Residual")
+
+        vc = self.vcomp_coeff.tolist()
+        vc.append(self.scale - np.sum(vc))
+
+        smry = pd.DataFrame({"Variance": vc}, index=dep_names)
+
+        return smry
 
 
 class Stationary(CovStruct):

diff --git a/statsmodels/genmod/generalized_estimating_equations.py b/statsmodels/genmod/generalized_estimating_equations.py
@@ -28,6 +28,7 @@
 import numpy as np
 from scipy import stats
 import pandas as pd
+import patsy
 
 from statsmodels.tools.decorators import (cache_readonly,
                                           resettable_cache)
@@ -643,12 +644,23 @@ def from_formula(cls, formula, groups, data, subset=None,
         args : extra arguments
             These are passed to the model
         kwargs : extra keyword arguments
-            These are passed to the model with one exception. The
-            ``eval_env`` keyword is passed to patsy. It can be either a
-            :class:`patsy:patsy.EvalEnvironment` object or an integer
-            indicating the depth of the namespace to use. For example, the
-            default ``eval_env=0`` uses the calling namespace. If you wish
-            to use a "clean" environment set ``eval_env=-1``.
+            These are passed to the model with two exceptions. `dep_data`
+            is processed as described below.  The ``eval_env`` keyword is
+            passed to patsy. It can be either a :class:`patsy:patsy.EvalEnvironment`
+            object or an integer indicating the depth of the namespace to use.
+            For example, the default ``eval_env=0`` uses the calling namespace.
+            If you wish to use a "clean" environment set ``eval_env=-1``.
+
+        Optional arguments
+        ------------------
+        dep_data : string or array-like
+            Data used for estimating the dependence structure.  See
+            specific dependence structure classes (e.g. Nested) for
+            details.  If `dep_data` is a string, it is interpreted as
+            a formula that is applied to `data`. If it is an array, it
+            must be an array of strings corresponding to column names in
+            `data`.  Otherwise it must be an array-like with the same
+            number of rows as data.
 
         Returns
         -------
@@ -666,24 +678,41 @@ def from_formula(cls, formula, groups, data, subset=None,
         the DataFrame before calling this method.
         """ % {'missing_param_doc': base._missing_param_doc}
 
-        if type(groups) == str:
+        groups_name = "Groups"
+        if isinstance(groups, str):
+            groups_name = groups
             groups = data[groups]
 
-        if type(time) == str:
+        if isinstance(time, str):
             time = data[time]
 
-        if type(offset) == str:
+        if isinstance(offset, str):
             offset = data[offset]
 
-        if type(exposure) == str:
+        if isinstance(exposure, str):
             exposure = data[exposure]
 
+        dep_data = kwargs.get("dep_data")
+        dep_data_names = None
+        if dep_data is not None:
+            if isinstance(dep_data, str):
+                dep_data = patsy.dmatrix(dep_data, data, return_type='dataframe')
+                dep_data_names = dep_data.columns.tolist()
+            else:
+                dep_data_names = list(dep_data)
+                dep_data = data[dep_data]
+            kwargs["dep_data"] = np.asarray(dep_data)
+
         model = super(GEE, cls).from_formula(formula, data=data, subset=subset,
                                              groups=groups, time=time,
                                              offset=offset,
                                              exposure=exposure,
                                              *args, **kwargs)
 
+        if dep_data_names is not None:
+            model._dep_data_names = dep_data_names
+        model._groups_name = groups_name
+
         return model
 
     def cluster_list(self, array):
@@ -2291,7 +2320,7 @@ def setup_nominal(self, endog, exog, groups, time, offset):
                 jrow += 1
 
         # exog names
-        if type(self.exog_orig) == pd.DataFrame:
+        if isinstance(self.exog_orig, pd.DataFrame):
             xnames_in = self.exog_orig.columns
         else:
             xnames_in = ["x%d" % k for k in range(1, exog.shape[1] + 1)]
@@ -2302,7 +2331,7 @@ def setup_nominal(self, endog, exog, groups, time, offset):
         exog_out = pd.DataFrame(exog_out, columns=xnames)
 
         # Preserve endog name if there is one
-        if type(self.endog_orig) == pd.Series:
+        if isinstance(self.endog_orig, pd.Series):
             endog_out = pd.Series(endog_out, name=self.endog_orig.name)
 
         return endog_out, exog_out, groups_out, time_out, offset_out

diff --git a/statsmodels/genmod/tests/test_gee.py b/statsmodels/genmod/tests/test_gee.py
@@ -14,7 +14,7 @@
 import numpy as np
 import pytest
 from numpy.testing import (assert_almost_equal, assert_equal, assert_allclose,
-                           assert_array_less, assert_raises, assert_, dec)
+                           assert_array_less, assert_raises, assert_)
 from statsmodels.genmod.generalized_estimating_equations import (
     GEE, OrdinalGEE, NominalGEE, NominalGEEResults, OrdinalGEEResults,
     NominalGEEResultsWrapper, OrdinalGEEResultsWrapper)
@@ -728,6 +728,42 @@ def test_nested_linear(self):
         assert_almost_equal(mdf2.standard_errors(), se,
                             decimal=6)
 
+        smry = mdf2.cov_struct.summary()
+        assert_allclose(smry.Variance, np.r_[1.043878, 0.611656, 1.421205], atol=1e-5, rtol=1e-5)
+
+
+    def test_nested_pandas(self):
+
+        np.random.seed(4234)
+        n = 10000
+
+        # Outer groups
+        groups = np.kron(np.arange(n // 100), np.ones(100)).astype(np.int)
+
+        # Inner groups
+        groups1 = np.kron(np.arange(n // 50), np.ones(50)).astype(np.int)
+        groups2 = np.kron(np.arange(n // 10), np.ones(10)).astype(np.int)
+
+        # Group effects
+        groups_e = np.random.normal(size=n // 100)
+        groups1_e = 2 * np.random.normal(size=n // 50)
+        groups2_e = 3 * np.random.normal(size=n // 10)
+
+        y = groups_e[groups] + groups1_e[groups1] + groups2_e[groups2]
+        y += 0.5 * np.random.normal(size=n)
+
+        df = pd.DataFrame({"y": y, "TheGroups": groups, "groups1": groups1, "groups2": groups2})
+
+        model = sm.GEE.from_formula("y ~ 1", groups="TheGroups",
+                                    dep_data="0 + groups1 + groups2",
+                                    cov_struct=sm.cov_struct.Nested(),
+                                    data=df)
+        result = model.fit()
+
+        # The true variances are 1, 4, 9, 0.25
+        smry = result.cov_struct.summary()
+        assert_allclose(smry.Variance, np.r_[1.437299, 4.421543, 8.905295, 0.258480], atol=1e-5, rtol=1e-5)
+
     def test_ordinal(self):
 
         family = Binomial()