From 45728d3536ac99178a49f8982dc5a989d8556205 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Mon, 28 Apr 2014 03:33:08 -0400 Subject: [PATCH 1/6] Refactor to allow matrix structure in covariance matrices to be exploited --- .../genmod/dependence_structures/covstruct.py | 156 +++++++++++++++--- .../generalized_estimating_equations.py | 92 ++++------- statsmodels/genmod/tests/test_gee.py | 54 +++++- 3 files changed, 214 insertions(+), 88 deletions(-) diff --git a/statsmodels/genmod/dependence_structures/covstruct.py b/statsmodels/genmod/dependence_structures/covstruct.py index 0502e0b2c4b..525018f10bb 100644 --- a/statsmodels/genmod/dependence_structures/covstruct.py +++ b/statsmodels/genmod/dependence_structures/covstruct.py @@ -1,5 +1,6 @@ from statsmodels.compat.python import iterkeys, itervalues, zip, range import numpy as np +from scipy import linalg as spl class CovStruct(object): @@ -16,7 +17,6 @@ class CovStruct(object): # Parameters describing the dependency structure dep_params = None - def initialize(self, model): """ Called by GEE, used by implementations that need additional @@ -29,7 +29,6 @@ def initialize(self, model): """ self.model = model - def update(self, params): """ Updates the association parameter values based on the current @@ -42,7 +41,6 @@ def update(self, params): """ raise NotImplementedError - def covariance_matrix(self, endog_expval, index): """Returns the working covariance or correlation matrix for a given cluster of data. @@ -66,6 +64,53 @@ def covariance_matrix(self, endog_expval, index): """ raise NotImplementedError + def covariance_matrix_solve(self, expval, i, sdev, rhs): + """ + Solves the matrix equation `covmat * soln = rhs` and returns + `soln`, where `covmat` is the covariance matrix represented by + this class. + + Parameters + ---------- + expval: array-like + The expected values of endog for the cluster for which the + covariance or correlation matrix will be returned + index: integer + The index of the cluster for which the covariance or + correlation matrix will be returned + rhs : list/tuple of array-like + A set of right-hand sides; each defines a matrix equation + to be solved. + + Returns + ------- + soln : list/tuple of array-like + The solutions to the matrix equations. + + Notes + ----- + Returns None if the solver fails. + + Multiple systems with the same LHS are solved for different + RHS's; the LHS is only factorized once to save time. + + This is a default implementation, it can be reimplemented in + subclasses to optimize the linear algebra according to the + covariance matrix structure. + """ + + vmat, is_cor = self.covariance_matrix(expval, i) + + if is_cor: + vmat *= np.outer(sdev, sdev) + + try: + vco = spl.cho_factor(vmat) + except np.linalg.LinAlgError: + return None + + soln = [spl.cho_solve(vco, x) for x in rhs] + return soln def summary(self): """ @@ -86,11 +131,19 @@ class Independence(CovStruct): def update(self, params): return - def covariance_matrix(self, expval, index): dim = len(expval) return np.eye(dim, dtype=np.float64), True + def covariance_matrix_solve(self, expval, i, sdev, rhs): + v = sdev**2 + rslt = [] + for x in rhs: + if x.ndim == 1: + rslt.append(x / v) + else: + rslt.append(x / v[:, None]) + return rslt def summary(self): return "Observations within a cluster are independent." @@ -104,7 +157,6 @@ class Exchangeable(CovStruct): # The correlation between any two values in the same cluster dep_params = 0 - def update(self, params): endog = self.model.endog_li @@ -119,11 +171,7 @@ def update(self, params): residsq_sum, scale, nterm = 0, 0, 0 for i in range(self.model.num_group): - if len(endog[i]) == 0: - continue - expval, _ = cached_means[i] - sdev = np.sqrt(varfunc(expval)) resid = (endog[i] - expval) / sdev @@ -137,19 +185,39 @@ def update(self, params): scale /= (nobs - dim) self.dep_params = residsq_sum / (scale * (nterm - dim)) - def covariance_matrix(self, expval, index): dim = len(expval) dp = self.dep_params * np.ones((dim, dim), dtype=np.float64) return dp + (1 - self.dep_params) * np.eye(dim), True + def covariance_matrix_solve(self, expval, i, sdev, rhs): + + k = len(expval) + c = self.dep_params / (1 - self.dep_params) + c /= 1 + self.dep_params * (k - 1) + + rslt = [] + for x in rhs: + if x.ndim == 1: + x1 = x / sdev + y = x1 / (1 - self.dep_params) + y -= c * sum(x1) + y /= sdev + else: + x1 = x / sdev[:, None] + y = x1 / (1 - self.dep_params) + y -= c * x1.sum(0) + y /= sdev[:, None] + rslt.append(y) + + return rslt + def summary(self): return ("The correlation between two observations in the " + "same cluster is %.3f" % self.dep_params) - class Nested(CovStruct): """A nested working dependence structure. @@ -506,6 +574,61 @@ def covariance_matrix(self, endog_expval, index): cmat = self.dep_params**np.abs(idx[:, None] - idx[None, :]) return cmat, True + def covariance_matrix_solve(self, expval, i, sdev, rhs): + + k = len(expval) + soln = [] + + if k == 1: + return [x / sdev**2 for x in rhs] + + if k == 2: + mat = np.array([[1, -self.dep_params], [-self.dep_params, 1]]) + mat /= (1 - self.dep_params**2) + for x in rhs: + if x.ndim == 1: + x1 = x / sdev + else: + x1 = x / sdev[:, None] + x1 = np.dot(mat, x1) + if x.ndim == 1: + x1 /= sdev + else: + x1 /= sdev[:, None] + soln.append(x1) + return soln + + # General case: c0, c1, c2 define the inverse. c0 is on the + # diagonal, except for the first and last position. c1 is on + # the first and last position of the diagonal. c2 is on the + # sub/super diagonal. + c0 = (1 + self.dep_params**2) / (1 - self.dep_params**2) + c1 = 1 / (1 - self.dep_params**2) + c2 = self.dep_params / (self.dep_params**2 - 1) + soln = [] + for x in rhs: + flatten = False + if x.ndim == 1: + x = x[:, None] + flatten = True + x1 = x / sdev[:, None] + + z0 = np.zeros((1, x.shape[1])) + rhs1 = np.concatenate((x[1:,:], z0), axis=0) + rhs2 = np.concatenate((z0, x[0:-1,:]), axis=0) + + y = c0*x + c2*rhs1 + c2*rhs2 + y[0, :] = c1*x[0, :] + c2*x[1, :] + y[-1, :] = c1*x[-1, :] + c2*x[-2, :] + + y /= sdev[:, None] + + if flatten: + y = y[:,0] + + soln.append(y) + + return soln def summary(self): @@ -513,7 +636,6 @@ def summary(self): self.dep_params) - class GlobalOddsRatio(CovStruct): """ Estimate the global odds ratio for a GEE with ordinal or nominal @@ -625,14 +747,12 @@ def pooled_odds_ratio(self, tables): return np.exp(log_pooled_or) - def covariance_matrix(self, expected_value, index): vmat = self.get_eyy(expected_value, index) vmat -= np.outer(expected_value, expected_value) return vmat, False - def observed_crude_oddsratio(self): """The crude odds ratio is obtained by pooling all data corresponding to a given pair of cut points (c,c'), then @@ -653,9 +773,6 @@ def observed_crude_oddsratio(self): # Get the observed crude OR for i in range(len(endog)): - if len(endog[i]) == 0: - continue - # The observed joint values for the current cluster yvec = endog[i] endog_11 = np.outer(yvec, yvec) @@ -673,8 +790,6 @@ def observed_crude_oddsratio(self): return self.pooled_odds_ratio(list(itervalues(tables))) - - def get_eyy(self, endog_expval, index): """ Returns a matrix V such that V[i,j] is the joint probability @@ -709,7 +824,6 @@ def get_eyy(self, endog_expval, index): return vmat - def update(self, params): """Update the global odds ratio based on the current value of params.""" @@ -747,12 +861,10 @@ def update(self, params): tables[ky][0, 1] += emat_01[ix[:, 0], ix[:, 1]].sum() tables[ky][0, 0] += emat_00[ix[:, 0], ix[:, 1]].sum() - cor_expval = self.pooled_odds_ratio(list(itervalues(tables))) self.dep_params[0] *= self.crude_or / cor_expval - def summary(self): return "Global odds ratio: %.3f\n" % self.dep_params[0] diff --git a/statsmodels/genmod/generalized_estimating_equations.py b/statsmodels/genmod/generalized_estimating_equations.py index c41f3ee9bd6..f8c16acac73 100644 --- a/statsmodels/genmod/generalized_estimating_equations.py +++ b/statsmodels/genmod/generalized_estimating_equations.py @@ -25,7 +25,6 @@ from statsmodels.compat.python import iterkeys, range, lrange, lzip, zip import numpy as np from scipy import stats -from scipy import linalg as spl import pandas as pd from statsmodels.tools.decorators import (cache_readonly, @@ -171,9 +170,9 @@ class GEE(base.Model): __doc__ = """ Generalized Estimating Equations Models - GEE estimates Generalized Linear Models when the data has a - cluster structure and the observations are possibly correlated - within but not across clusters + GEE estimates Generalized Linear Models when the data have a + grouped structure, and the observations are possibly correlated + within groups but not between groups. Parameters ---------- @@ -185,7 +184,7 @@ class GEE(base.Model): intercept is not included by default and should be added by the user. See `statsmodels.tools.add_constant`. groups : array-like - A 1d array of length `nobs` containing the cluster labels. + A 1d array of length `nobs` containing the group labels. time : array-like A 2d array of time (or other index) values, used by some dependence structures to define similarity relationships among @@ -396,6 +395,12 @@ def mean_deriv_exog(exog, params): self.mean_deriv_exog = mean_deriv_exog + # Skip the covariance updates if all groups have a single + # observation (reduces to fitting a GLM). + self._do_cov_update = True + if max([len(x) for x in self.endog_li]) == 1: + self._do_cov_update = False + def cluster_list(self, array): """ Returns `array` split into subarrays corresponding to the @@ -467,28 +472,16 @@ def _update_mean_params(self): bmat, score = 0, 0 for i in range(self.num_group): - if len(endog[i]) == 0: - continue - expval, lpr = cached_means[i] - + resid = endog[i] - expval dmat = self.mean_deriv(exog[i], lpr) - sdev = np.sqrt(varfunc(expval)) - vmat, is_cor = self.cov_struct.covariance_matrix(expval, i) - if is_cor: - vmat *= np.outer(sdev, sdev) - try: - vco = spl.cho_factor(vmat) - except np.linalg.LinAlgError: - return None, None + rslt = self.cov_struct.covariance_matrix_solve(expval, i, + sdev, (dmat, resid)) + vinv_d, vinv_resid = tuple(rslt) - vinv_d = spl.cho_solve(vco, dmat) bmat += np.dot(dmat.T, vinv_d) - - resid = endog[i] - expval - vinv_resid = spl.cho_solve(vco, resid) score += np.dot(dmat.T, vinv_resid) update = np.linalg.solve(bmat, score) @@ -498,7 +491,7 @@ def _update_mean_params(self): def update_cached_means(self, mean_params): """ cached_means should always contain the most recent calculation - of the cluster-wise mean vectors. This function should be + of the group-wise mean vectors. This function should be called every time the regression parameters are changed, to keep the cached means up to date. """ @@ -555,32 +548,16 @@ def _covmat(self): bmat, cmat = 0, 0 for i in range(self.num_group): - if len(endog[i]) == 0: - continue - expval, lpr = cached_means[i] - + resid = endog[i] - expval dmat = self.mean_deriv(exog[i], lpr) - sdev = np.sqrt(varfunc(expval)) - vmat, is_cor = self.cov_struct.covariance_matrix(expval, i) - if is_cor: - vmat *= np.outer(sdev, sdev) - - try: - vco = spl.cho_factor(vmat) - except np.linalg.LinAlgError: - warnings.warn("Singular matrix encountered in GEE " - "covariance estimation", - ConvergenceWarning) - return None, None, None, None - vinv_d = spl.cho_solve(vco, dmat) + rslt = self.cov_struct.covariance_matrix_solve(expval, i, + sdev, (dmat, resid)) + vinv_d, vinv_resid = tuple(rslt) bmat += np.dot(dmat.T, vinv_d) - - resid = endog[i] - expval - vinv_resid = spl.cho_solve(vco, resid) dvinv_resid = np.dot(dmat.T, vinv_resid) cmat += np.outer(dvinv_resid, dvinv_resid) @@ -596,38 +573,29 @@ def _covmat(self): bcm = 0 for i in range(self.num_group): - if len(endog[i]) == 0: - continue - expval, lpr = cached_means[i] - + resid = endog[i] - expval dmat = self.mean_deriv(exog[i], lpr) - sdev = np.sqrt(varfunc(expval)) - vmat, is_cor = self.cov_struct.covariance_matrix(expval, i) - if is_cor: - vmat *= np.outer(sdev, sdev) - vmat *= scale - try: - vco = spl.cho_factor(vmat) - except np.linalg.LinAlgError: - return None, None + vinv_d, = self.cov_struct.covariance_matrix_solve(expval, + i, sdev, (dmat,)) + vinv_d /= scale - vinv_d = spl.cho_solve(vco, dmat) hmat = np.dot(vinv_d, naive_covariance) hmat = np.dot(hmat, dmat.T).T - resid = endog[i] - expval aresid = np.linalg.solve(np.eye(len(resid)) - hmat, resid) - srt = np.dot(dmat.T, spl.cho_solve(vco, aresid)) + srt, = self.cov_struct.covariance_matrix_solve(expval, i, + sdev, (aresid,)) + srt = np.dot(dmat.T, srt) / scale bcm += np.outer(srt, srt) robust_covariance_bc = np.dot(naive_covariance, np.dot(bcm, naive_covariance)) - return robust_covariance, naive_covariance, \ - robust_covariance_bc, cmat + return (robust_covariance, naive_covariance, + robust_covariance_bc, cmat) def predict(self, params, exog=None, offset=None, linear=False): """ @@ -748,7 +716,9 @@ def fit(self, maxiter=60, ctol=1e-6, start_params=None, # updated at least once. if fitlack < ctol and itr > 0: break - self._update_assoc(mean_params) + + if self._do_cov_update: + self._update_assoc(mean_params) if fitlack >= ctol: warnings.warn("Iteration limit reached prior to convergence", diff --git a/statsmodels/genmod/tests/test_gee.py b/statsmodels/genmod/tests/test_gee.py index 4d7114b4d6b..445310ff058 100644 --- a/statsmodels/genmod/tests/test_gee.py +++ b/statsmodels/genmod/tests/test_gee.py @@ -14,11 +14,11 @@ import os from numpy.testing import assert_almost_equal -from statsmodels.genmod.generalized_estimating_equations import GEE,\ - GEEMargins, Multinomial -from statsmodels.genmod.families import Gaussian,Binomial,Poisson -from statsmodels.genmod.dependence_structures import Exchangeable,\ - Independence,GlobalOddsRatio,Autoregressive,Nested +from statsmodels.genmod.generalized_estimating_equations import (GEE, + GEEMargins, Multinomial) +from statsmodels.genmod.families import Gaussian, Binomial, Poisson +from statsmodels.genmod.dependence_structures import (Exchangeable, + Independence, GlobalOddsRatio, Autoregressive, Nested) import pandas as pd import statsmodels.formula.api as sm @@ -251,6 +251,50 @@ def test_logistic(self): # Check for run-time exceptions in summary # print(mdf.summary()) + def test_autoregressive(self): + + dep_params_true = [0, 0.589208623896, 0.559823804948] + + params_true = [[1.08043787, 1.12709319, 0.90133927], + [0.9613677, 1.05826987, 0.90832055], + [1.05370439, 0.96084864, 0.93923374]] + + np.random.seed(342837482) + + num_group = 100 + ar_param = 0.5 + k = 3 + + ga = Gaussian() + + for gsize in 1,2,3: + + ix = np.arange(gsize)[:,None] - np.arange(gsize)[None,:] + ix = np.abs(ix) + cmat = ar_param ** ix + cmat_r = np.linalg.cholesky(cmat) + + endog = [] + exog = [] + groups = [] + for i in range(num_group): + x = np.random.normal(size=(gsize,k)) + exog.append(x) + expval = x.sum(1) + errors = np.dot(cmat_r, np.random.normal(size=gsize)) + endog.append(expval + errors) + groups.append(i*np.ones(gsize)) + + endog = np.concatenate(endog) + groups = np.concatenate(groups) + exog = np.concatenate(exog, axis=0) + + ar = Autoregressive() + md = GEE(endog, exog, groups, family=ga, cov_struct = ar) + mdf = md.fit() + + assert_almost_equal(ar.dep_params, dep_params_true[gsize-1]) + assert_almost_equal(mdf.params, params_true[gsize-1]) def test_post_estimation(self): From 078eaae3a8516d1ff2b5d79d3eb99a736126bcf5 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Mon, 28 Apr 2014 11:05:56 -0400 Subject: [PATCH 2/6] Improve handling of linear algebra issues; add options to fine tune iterations --- .../genmod/dependence_structures/covstruct.py | 102 ++++++------------ .../generalized_estimating_equations.py | 62 ++++++++--- statsmodels/genmod/tests/test_gee.py | 2 +- 3 files changed, 84 insertions(+), 82 deletions(-) diff --git a/statsmodels/genmod/dependence_structures/covstruct.py b/statsmodels/genmod/dependence_structures/covstruct.py index 525018f10bb..27f303dadda 100644 --- a/statsmodels/genmod/dependence_structures/covstruct.py +++ b/statsmodels/genmod/dependence_structures/covstruct.py @@ -5,17 +5,24 @@ class CovStruct(object): """ - The base class for correlation and covariance structures of - cluster data. + A base class for correlation and covariance structures of grouped + data. Each implementation of this class takes the residuals from a - regression model that has been fitted to clustered data, and uses - them to estimate the within-cluster variance and dependence - structure of the random errors in the model. + regression model that has been fitted to grouped data, and uses + them to estimate the within-group dependence structure of the + random errors in the model. + + The state of the covariance structure is represented through the + value of the class variable `dep_params`. The default state of a + newly-created instance should correspond to the identity + correlation matrix. """ - # Parameters describing the dependency structure - dep_params = None + def __init__(self): + + # Parameters describing the dependency structure + self.dep_params = None def initialize(self, model): """ @@ -125,8 +132,6 @@ class Independence(CovStruct): An independence working dependence structure. """ - dep_params = np.r_[[]] - # Nothing to update def update(self, params): return @@ -154,8 +159,12 @@ class Exchangeable(CovStruct): An exchangeable working dependence structure. """ - # The correlation between any two values in the same cluster - dep_params = 0 + def __init__(self): + + super(Exchangeable, self).__init__() + + # The correlation between any two values in the same cluster + self.dep_params = 0. def update(self, params): @@ -268,25 +277,6 @@ class Nested(CovStruct): variance components are shared by r and r'. """ - # The regression design matrix for estimating the variance - # components - designx = None - - # Matrices containing labels that indicate which covariance - # parameters are constrained to be equal - ilabels = None - - # The SVD of designx - designx_u = None - designx_s = None - designx_v = None - - # The scale parameter - scale = None - - # The regression coefficients for estimating the variance - # components - vcomp_coeff = None def initialize(self, model): """ @@ -460,28 +450,22 @@ class Autoregressive(CovStruct): in medicine. Vol 7, 59-71, 1988. """ - # The autoregression parameter - dep_params = 0 - - designx = None - - # The function for determining distances based on time - dist_func = None - - def __init__(self, dist_func=None): + super(Autoregressive, self).__init__() + + # The function for determining distances based on time if dist_func is None: self.dist_func = lambda x, y: np.abs(x - y).sum() else: self.dist_func = dist_func + self.designx = None - def update(self, params): + # The autocorrelation parameter + self.dep_params = 0. - if self.model.time is None: - raise ValueError("GEE: time must be provided to GEE if " - "using AR dependence structure") + def update(self, params): endog = self.model.endog_li time = self.model.time_li @@ -507,25 +491,19 @@ def update(self, params): self.designx = designx scale = self.model.estimate_scale() - varfunc = self.model.family.variance - cached_means = self.model.cached_means # Weights - var = (1 - self.dep_params**(2 * designx)) /\ - (1 - self.dep_params**2) + var = 1 - self.dep_params**(2*designx) + var /= 1 - self.dep_params**2 wts = 1 / var wts /= wts.sum() residmat = [] for i in range(self.model.num_group): - if len(endog[i]) == 0: - continue - expval, _ = cached_means[i] - sdev = np.sqrt(scale * varfunc(expval)) resid = (endog[i] - expval) / sdev @@ -565,7 +543,6 @@ def fitfunc(a): from scipy.optimize import brent self.dep_params = brent(fitfunc, brack=[b_lft, b_ctr, b_rgt]) - def covariance_matrix(self, endog_expval, index): ngrp = len(endog_expval) if self.dep_params == 0: @@ -665,21 +642,10 @@ class GlobalOddsRatio(CovStruct): from the given cut points. """ - # The current estimate of the odds ratio - dep_params = [None,] - - # The current estimate of the crude odds ratio - crude_or = None - - # See docstring - ibd = None - - # See docstring - cpp = None - def __init__(self, endog_type): super(GlobalOddsRatio, self).__init__() self.endog_type = endog_type + self.dep_params = 0. def initialize(self, model): @@ -717,7 +683,7 @@ def initialize(self, model): # Initialize the dependence parameters self.crude_or = self.observed_crude_oddsratio() - self.dep_params[0] = self.crude_or + self.dep_params = self.crude_or def pooled_odds_ratio(self, tables): @@ -797,7 +763,7 @@ def get_eyy(self, endog_expval, index): probabilities of endog and the odds ratio `current_or`. """ - current_or = self.dep_params[0] + current_or = self.dep_params ibd = self.ibd[index] # The between-observation joint probabilities @@ -863,8 +829,8 @@ def update(self, params): cor_expval = self.pooled_odds_ratio(list(itervalues(tables))) - self.dep_params[0] *= self.crude_or / cor_expval + self.dep_params *= self.crude_or / cor_expval def summary(self): - return "Global odds ratio: %.3f\n" % self.dep_params[0] + return "Global odds ratio: %.3f\n" % self.dep_params diff --git a/statsmodels/genmod/generalized_estimating_equations.py b/statsmodels/genmod/generalized_estimating_equations.py index f8c16acac73..a06a2c571d9 100644 --- a/statsmodels/genmod/generalized_estimating_equations.py +++ b/statsmodels/genmod/generalized_estimating_equations.py @@ -479,6 +479,8 @@ def _update_mean_params(self): rslt = self.cov_struct.covariance_matrix_solve(expval, i, sdev, (dmat, resid)) + if rslt is None: + return None, None vinv_d, vinv_resid = tuple(rslt) bmat += np.dot(dmat.T, vinv_d) @@ -555,6 +557,8 @@ def _covmat(self): rslt = self.cov_struct.covariance_matrix_solve(expval, i, sdev, (dmat, resid)) + if rslt is None: + return None, None, None, None vinv_d, vinv_resid = tuple(rslt) bmat += np.dot(dmat.T, vinv_d) @@ -578,16 +582,22 @@ def _covmat(self): dmat = self.mean_deriv(exog[i], lpr) sdev = np.sqrt(varfunc(expval)) - vinv_d, = self.cov_struct.covariance_matrix_solve(expval, - i, sdev, (dmat,)) + rslt = self.cov_struct.covariance_matrix_solve(expval, + i, sdev, (dmat,)) + if rslt is None: + return None, None, None, None + vinv_d = rslt[0] vinv_d /= scale hmat = np.dot(vinv_d, naive_covariance) hmat = np.dot(hmat, dmat.T).T aresid = np.linalg.solve(np.eye(len(resid)) - hmat, resid) - srt, = self.cov_struct.covariance_matrix_solve(expval, i, - sdev, (aresid,)) + rslt = self.cov_struct.covariance_matrix_solve(expval, i, + sdev, (aresid,)) + if rslt is None: + return None, NOne, None, None + srt = rslt[0] srt = np.dot(dmat.T, srt) / scale bcm += np.outer(srt, srt) @@ -660,6 +670,7 @@ def _starting_params(self): def fit(self, maxiter=60, ctol=1e-6, start_params=None, + params_niter=1, first_dep_update=0, covariance_type='robust'): """ Fits a GEE model. @@ -674,12 +685,31 @@ def fit(self, maxiter=60, ctol=1e-6, start_params=None, start_params : array-like A vector of starting values for the regression coefficients. If None, a default is chosen. + params_niter : integer + The number of Gauss-Seidel updates of the mean structure + parameters that take place prior to each update of the + dependence structure. + first_dep_update : integer + No dependence structure updates occur before this + iteration number. covariance_type : string One of "robust", "naive", or "bias_reduced". Returns ------- An instance of the GEEResults class + + Notes + ----- + If convergence difficulties occur, increase the values of + `first_dep_update` and/or `params_niter`. Setting + `first_dep_update` to a greater value (e.g. ~10-20) causes the + algorithm to move close to the GLM solution before attempting + to identify the dependence structure. + + For the Gaussian family, there is no benefit to setting + `params_niter` to a value greater than 1, since the mean + structure parameters converge in one step. """ self.fit_history = {'params': [], @@ -695,7 +725,8 @@ def fit(self, maxiter=60, ctol=1e-6, start_params=None, # Define here in case singularity encountered on first # iteration. - fitlack = -1. + del_params = -1. + num_assoc_updates = 0 for itr in range(maxiter): update, score = self._update_mean_params() @@ -705,7 +736,10 @@ def fit(self, maxiter=60, ctol=1e-6, start_params=None, break mean_params += update self.update_cached_means(mean_params) - fitlack = np.sqrt(np.sum(score**2)) + + # L2 norm of the change in mean structure parameters at + # this iteration. + del_params = np.sqrt(np.sum(score**2)) self.fit_history['params'].append(mean_params.copy()) self.fit_history['score'].append(score) @@ -714,13 +748,15 @@ def fit(self, maxiter=60, ctol=1e-6, start_params=None, # Don't exit until the association parameters have been # updated at least once. - if fitlack < ctol and itr > 0: + if del_params < ctol and num_assoc_updates > 0: break - if self._do_cov_update: + if self._do_cov_update and (itr % params_niter) == 0\ + and itr >= first_dep_update: self._update_assoc(mean_params) + num_assoc_updates += 1 - if fitlack >= ctol: + if del_params >= ctol: warnings.warn("Iteration limit reached prior to convergence", IterationLimitWarning) @@ -731,8 +767,8 @@ def fit(self, maxiter=60, ctol=1e-6, start_params=None, bcov, ncov, bc_cov, _ = self._covmat() if bcov is None: - warnings.warn("Unable to determine covariance structure " - "for GEE estimates", ConvergenceWarning) + warnings.warn("Estimated covariance structure for GEE " + "estimates is singular", ConvergenceWarning) return None if self.constraint is not None: @@ -753,8 +789,8 @@ def fit(self, maxiter=60, ctol=1e-6, start_params=None, results.fit_history = self.fit_history results.naive_covariance = ncov results.robust_covariance_bc = bc_cov - results.score_norm = fitlack - results.converged = (fitlack < ctol) + results.score_norm = del_params + results.converged = (del_params < ctol) results.cov_struct = self.cov_struct return results diff --git a/statsmodels/genmod/tests/test_gee.py b/statsmodels/genmod/tests/test_gee.py index 445310ff058..a1b2c0cff76 100644 --- a/statsmodels/genmod/tests/test_gee.py +++ b/statsmodels/genmod/tests/test_gee.py @@ -360,7 +360,7 @@ def test_linear(self): [0.0440369906460754,0.0480069787567662, 0.049519758758187,0.0479760443027526]] - for j,v in enumerate((vi,ve)): + for j,v in enumerate((vi, ve)): md = GEE(endog, exog, group, None, family, v) mdf = md.fit() assert_almost_equal(mdf.params, cf[j], decimal=10) From 4bf5117d3cb54ea71ce0115d4cce50d0f1f83543 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Mon, 28 Apr 2014 12:50:45 -0400 Subject: [PATCH 3/6] Fixed typo NOne -> None --- statsmodels/genmod/generalized_estimating_equations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/statsmodels/genmod/generalized_estimating_equations.py b/statsmodels/genmod/generalized_estimating_equations.py index a06a2c571d9..d600d7d0ce4 100644 --- a/statsmodels/genmod/generalized_estimating_equations.py +++ b/statsmodels/genmod/generalized_estimating_equations.py @@ -596,7 +596,7 @@ def _covmat(self): rslt = self.cov_struct.covariance_matrix_solve(expval, i, sdev, (aresid,)) if rslt is None: - return None, NOne, None, None + return None, None, None, None srt = rslt[0] srt = np.dot(dmat.T, srt) / scale bcm += np.outer(srt, srt) From a0fb449812e0a1fe37bf35951fe5555344f7f69a Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 29 Apr 2014 01:43:20 -0400 Subject: [PATCH 4/6] Minor renaming and docstring edits --- .../genmod/dependence_structures/covstruct.py | 118 ++++++++++-------- 1 file changed, 65 insertions(+), 53 deletions(-) diff --git a/statsmodels/genmod/dependence_structures/covstruct.py b/statsmodels/genmod/dependence_structures/covstruct.py index 27f303dadda..20476e831ce 100644 --- a/statsmodels/genmod/dependence_structures/covstruct.py +++ b/statsmodels/genmod/dependence_structures/covstruct.py @@ -71,20 +71,22 @@ def covariance_matrix(self, endog_expval, index): """ raise NotImplementedError - def covariance_matrix_solve(self, expval, i, sdev, rhs): + def covariance_matrix_solve(self, expval, index, stdev, rhs): """ - Solves the matrix equation `covmat * soln = rhs` and returns - `soln`, where `covmat` is the covariance matrix represented by - this class. + Solves matrix equations of the form `covmat * soln = rhs` and + returns the values of `soln`, where `covmat` is the covariance + matrix represented by this class. Parameters ---------- expval: array-like - The expected values of endog for the cluster for which the - covariance or correlation matrix will be returned + The expected value of endog for each observed value in the + group. index: integer - The index of the cluster for which the covariance or - correlation matrix will be returned + The group index. + stdev : array-like + The standard deviation of endog for each observation in + the group. rhs : list/tuple of array-like A set of right-hand sides; each defines a matrix equation to be solved. @@ -98,18 +100,24 @@ def covariance_matrix_solve(self, expval, i, sdev, rhs): ----- Returns None if the solver fails. - Multiple systems with the same LHS are solved for different - RHS's; the LHS is only factorized once to save time. + Some dependence structures do not use `expval` and/or `index` + to determine the correlation matrix. Some families + (e.g. binomial) do not use the `stdev` parameter when forming + the covariance matrix. + + Systems of linear equations with the covariance matrix as the + left hand side (LHS) are solved for different right hand sides + (RHS); the LHS is only factorized once to save time. This is a default implementation, it can be reimplemented in subclasses to optimize the linear algebra according to the - covariance matrix structure. + struture of the covariance matrix. """ - vmat, is_cor = self.covariance_matrix(expval, i) + vmat, is_cor = self.covariance_matrix(expval, index) if is_cor: - vmat *= np.outer(sdev, sdev) + vmat *= np.outer(stdev, stdev) try: vco = spl.cho_factor(vmat) @@ -140,8 +148,8 @@ def covariance_matrix(self, expval, index): dim = len(expval) return np.eye(dim, dtype=np.float64), True - def covariance_matrix_solve(self, expval, i, sdev, rhs): - v = sdev**2 + def covariance_matrix_solve(self, expval, i, stdev, rhs): + v = stdev**2 rslt = [] for x in rhs: if x.ndim == 1: @@ -181,8 +189,8 @@ def update(self, params): for i in range(self.model.num_group): expval, _ = cached_means[i] - sdev = np.sqrt(varfunc(expval)) - resid = (endog[i] - expval) / sdev + stdev = np.sqrt(varfunc(expval)) + resid = (endog[i] - expval) / stdev ngrp = len(resid) residsq = np.outer(resid, resid) @@ -199,7 +207,7 @@ def covariance_matrix(self, expval, index): dp = self.dep_params * np.ones((dim, dim), dtype=np.float64) return dp + (1 - self.dep_params) * np.eye(dim), True - def covariance_matrix_solve(self, expval, i, sdev, rhs): + def covariance_matrix_solve(self, expval, i, stdev, rhs): k = len(expval) c = self.dep_params / (1 - self.dep_params) @@ -208,15 +216,15 @@ def covariance_matrix_solve(self, expval, i, sdev, rhs): rslt = [] for x in rhs: if x.ndim == 1: - x1 = x / sdev + x1 = x / stdev y = x1 / (1 - self.dep_params) y -= c * sum(x1) - y /= sdev + y /= stdev else: - x1 = x / sdev[:, None] + x1 = x / stdev[:, None] y = x1 / (1 - self.dep_params) y -= c * x1.sum(0) - y /= sdev[:, None] + y /= stdev[:, None] rslt.append(y) return rslt @@ -366,8 +374,8 @@ def update(self, params): expval, _ = cached_means[i] - sdev = np.sqrt(varfunc(expval)) - resid = (endog[i] - offset[i] - expval) / sdev + stdev = np.sqrt(varfunc(expval)) + resid = (endog[i] - offset[i] - expval) / stdev ix1, ix2 = np.tril_indices(len(resid), -1) dvmat.append(resid[ix1] * resid[ix2]) @@ -421,27 +429,28 @@ def summary(self): class Autoregressive(CovStruct): """ - An autoregressive working dependence structure. The dependence is - defined in terms of the `time` component of the parent GEE class. - Time represents a potentially multidimensional index from which - distances between pairs of obsercations can be determined. The - correlation between two observations in the same cluster is - dep_params**distance, where `dep_params` is the autocorrelation - parameter to be estimated, and distance is the distance between - the two observations, calculated from their corresponding time - values. `time` is stored as an n_obs x k matrix, where `k` - represents the number of dimensions in the time index. + An autoregressive working dependence structure. + + The dependence is defined in terms of the `time` component of the + parent GEE class. Time represents a potentially multidimensional + index from which distances between pairs of observations can be + determined. The correlation between two observations in the same + cluster is dep_params^distance, where `dep_params` is the + autocorrelation parameter to be estimated, and `distance` is the + distance between the two observations, calculated from their + corresponding time values. `time` is stored as an n_obs x k + matrix, where `k` represents the number of dimensions in the time + index. The autocorrelation parameter is estimated using weighted nonlinear least squares, regressing each value within a cluster on - each preceeding value within the same cluster. + each preceeding value in the same cluster. Parameters ---------- dist_func: function from R^k x R^k to R^+, optional - A function that takes the time vector for two observations and - computed the distance between the two observations based on the - time vector. + A function that computes the distance between the two + observations based on their `time` values. Reference --------- @@ -504,8 +513,8 @@ def update(self, params): for i in range(self.model.num_group): expval, _ = cached_means[i] - sdev = np.sqrt(scale * varfunc(expval)) - resid = (endog[i] - expval) / sdev + stdev = np.sqrt(scale * varfunc(expval)) + resid = (endog[i] - expval) / stdev ngrp = len(resid) for j1 in range(ngrp): @@ -551,34 +560,37 @@ def covariance_matrix(self, endog_expval, index): cmat = self.dep_params**np.abs(idx[:, None] - idx[None, :]) return cmat, True - def covariance_matrix_solve(self, expval, i, sdev, rhs): + def covariance_matrix_solve(self, expval, i, stdev, rhs): + # The inverse of an AR(1) covariance matrix is tri-diagonal. k = len(expval) soln = [] + # LHS has 1 column if k == 1: - return [x / sdev**2 for x in rhs] + return [x / stdev**2 for x in rhs] + # LHS has 2 columns if k == 2: mat = np.array([[1, -self.dep_params], [-self.dep_params, 1]]) mat /= (1 - self.dep_params**2) for x in rhs: if x.ndim == 1: - x1 = x / sdev + x1 = x / stdev else: - x1 = x / sdev[:, None] + x1 = x / stdev[:, None] x1 = np.dot(mat, x1) if x.ndim == 1: - x1 /= sdev + x1 /= stdev else: - x1 /= sdev[:, None] + x1 /= stdev[:, None] soln.append(x1) return soln - # General case: c0, c1, c2 define the inverse. c0 is on the - # diagonal, except for the first and last position. c1 is on - # the first and last position of the diagonal. c2 is on the - # sub/super diagonal. + # LHS has >= 3 columns: values c0, c1, c2 defined below give + # the inverse. c0 is on the diagonal, except for the first + # and last position. c1 is on the first and last position of + # the diagonal. c2 is on the sub/super diagonal. c0 = (1 + self.dep_params**2) / (1 - self.dep_params**2) c1 = 1 / (1 - self.dep_params**2) c2 = self.dep_params / (self.dep_params**2 - 1) @@ -588,7 +600,7 @@ def covariance_matrix_solve(self, expval, i, sdev, rhs): if x.ndim == 1: x = x[:, None] flatten = True - x1 = x / sdev[:, None] + x1 = x / stdev[:, None] z0 = np.zeros((1, x.shape[1])) rhs1 = np.concatenate((x[1:,:], z0), axis=0) @@ -598,10 +610,10 @@ def covariance_matrix_solve(self, expval, i, sdev, rhs): y[0, :] = c1*x[0, :] + c2*x[1, :] y[-1, :] = c1*x[-1, :] + c2*x[-2, :] - y /= sdev[:, None] + y /= stdev[:, None] if flatten: - y = y[:,0] + y = np.squeeze(y) soln.append(y) From 69e1166099ee0bd8746feb47bab7d769834167b7 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Tue, 29 Apr 2014 01:50:07 -0400 Subject: [PATCH 5/6] Minor typographical changes --- statsmodels/genmod/generalized_estimating_equations.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/statsmodels/genmod/generalized_estimating_equations.py b/statsmodels/genmod/generalized_estimating_equations.py index d600d7d0ce4..948f16ad2bc 100644 --- a/statsmodels/genmod/generalized_estimating_equations.py +++ b/statsmodels/genmod/generalized_estimating_equations.py @@ -723,15 +723,13 @@ def fit(self, maxiter=60, ctol=1e-6, start_params=None, self.update_cached_means(mean_params) - # Define here in case singularity encountered on first - # iteration. del_params = -1. num_assoc_updates = 0 - for itr in range(maxiter): + update, score = self._update_mean_params() if update is None: - warnings.warn("Singular matrix encountered in GEE update", + warnings.warn("Singular matrix encountered in GEE update", ConvergenceWarning) break mean_params += update @@ -752,7 +750,7 @@ def fit(self, maxiter=60, ctol=1e-6, start_params=None, break if self._do_cov_update and (itr % params_niter) == 0\ - and itr >= first_dep_update: + and (itr >= first_dep_update): self._update_assoc(mean_params) num_assoc_updates += 1 @@ -782,7 +780,7 @@ def fit(self, maxiter=60, ctol=1e-6, start_params=None, # The superclass constructor will multiply the covariance # matrix argument bcov by scale, which we don't want, so we - # divide bvov by the scale parameter here + # divide bcov by the scale parameter here results = GEEResults(self, mean_params, bcov / scale, scale) results.covariance_type = covariance_type From 8e19eb9c3c888f5e8d13200a162f4fe3b181ec2f Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Wed, 30 Apr 2014 01:46:50 -0400 Subject: [PATCH 6/6] Minor docstring and naming changes, ready to merge --- .../genmod/dependence_structures/covstruct.py | 83 ++++++++++--------- statsmodels/genmod/tests/test_gee.py | 1 - 2 files changed, 45 insertions(+), 39 deletions(-) diff --git a/statsmodels/genmod/dependence_structures/covstruct.py b/statsmodels/genmod/dependence_structures/covstruct.py index 20476e831ce..06d3b08fd97 100644 --- a/statsmodels/genmod/dependence_structures/covstruct.py +++ b/statsmodels/genmod/dependence_structures/covstruct.py @@ -148,7 +148,7 @@ def covariance_matrix(self, expval, index): dim = len(expval) return np.eye(dim, dtype=np.float64), True - def covariance_matrix_solve(self, expval, i, stdev, rhs): + def covariance_matrix_solve(self, expval, index, stdev, rhs): v = stdev**2 rslt = [] for x in rhs: @@ -158,6 +158,10 @@ def covariance_matrix_solve(self, expval, i, stdev, rhs): rslt.append(x / v[:, None]) return rslt + update.__doc__ = CovStruct.update.__doc__ + covariance_matrix.__doc__ = CovStruct.covariance_matrix.__doc__ + covariance_matrix_solve.__doc__ = CovStruct.covariance_matrix_solve.__doc__ + def summary(self): return "Observations within a cluster are independent." @@ -205,38 +209,42 @@ def update(self, params): def covariance_matrix(self, expval, index): dim = len(expval) dp = self.dep_params * np.ones((dim, dim), dtype=np.float64) - return dp + (1 - self.dep_params) * np.eye(dim), True + return dp + (1. - self.dep_params) * np.eye(dim), True - def covariance_matrix_solve(self, expval, i, stdev, rhs): + def covariance_matrix_solve(self, expval, index, stdev, rhs): k = len(expval) - c = self.dep_params / (1 - self.dep_params) - c /= 1 + self.dep_params * (k - 1) + c = self.dep_params / (1. - self.dep_params) + c /= 1. + self.dep_params * (k - 1) rslt = [] for x in rhs: if x.ndim == 1: x1 = x / stdev - y = x1 / (1 - self.dep_params) + y = x1 / (1. - self.dep_params) y -= c * sum(x1) y /= stdev else: x1 = x / stdev[:, None] - y = x1 / (1 - self.dep_params) + y = x1 / (1. - self.dep_params) y -= c * x1.sum(0) y /= stdev[:, None] rslt.append(y) return rslt + update.__doc__ = CovStruct.update.__doc__ + covariance_matrix.__doc__ = CovStruct.covariance_matrix.__doc__ + covariance_matrix_solve.__doc__ = CovStruct.covariance_matrix_solve.__doc__ + def summary(self): return ("The correlation between two observations in the " + "same cluster is %.3f" % self.dep_params) - class Nested(CovStruct): - """A nested working dependence structure. + """ + A nested working dependence structure. A working dependence structure that captures a nested hierarchy of groups, each level of which contributes to the random error term @@ -285,7 +293,6 @@ class Nested(CovStruct): variance components are shared by r and r'. """ - def initialize(self, model): """ Called on the first call to update @@ -348,7 +355,6 @@ def initialize(self, model): self.designx_s = svd[1] self.designx_v = svd[2].T - def update(self, params): endog = self.model.endog_li @@ -368,10 +374,6 @@ def update(self, params): scale = 0. for i in range(self.model.num_group): - # Needed? - if len(endog[i]) == 0: - continue - expval, _ = cached_means[i] stdev = np.sqrt(varfunc(expval)) @@ -395,7 +397,6 @@ def update(self, params): self.dep_params = self.vcomp_coeff.copy() - def covariance_matrix(self, expval, index): dim = len(expval) @@ -411,6 +412,8 @@ def covariance_matrix(self, expval, index): vmat /= self.scale return vmat, True + update.__doc__ = CovStruct.update.__doc__ + covariance_matrix.__doc__ = CovStruct.covariance_matrix.__doc__ def summary(self): """ @@ -504,9 +507,9 @@ def update(self, params): cached_means = self.model.cached_means # Weights - var = 1 - self.dep_params**(2*designx) - var /= 1 - self.dep_params**2 - wts = 1 / var + var = 1. - self.dep_params**(2*designx) + var /= 1. - self.dep_params**2 + wts = 1. / var wts /= wts.sum() residmat = [] @@ -543,9 +546,9 @@ def fitfunc(a): # Right bracket point b_rgt, f_rgt = 0.75, fitfunc(0.75) while f_rgt < f_ctr: - b_rgt = b_rgt + (1 - b_rgt) / 2 + b_rgt = b_rgt + (1. - b_rgt) / 2 f_rgt = fitfunc(b_rgt) - if b_rgt > 1 - 1e-6: + if b_rgt > 1. - 1e-6: raise ValueError( "Autoregressive: unable to find right bracket") @@ -560,7 +563,7 @@ def covariance_matrix(self, endog_expval, index): cmat = self.dep_params**np.abs(idx[:, None] - idx[None, :]) return cmat, True - def covariance_matrix_solve(self, expval, i, stdev, rhs): + def covariance_matrix_solve(self, expval, index, stdev, rhs): # The inverse of an AR(1) covariance matrix is tri-diagonal. k = len(expval) @@ -573,7 +576,7 @@ def covariance_matrix_solve(self, expval, i, stdev, rhs): # LHS has 2 columns if k == 2: mat = np.array([[1, -self.dep_params], [-self.dep_params, 1]]) - mat /= (1 - self.dep_params**2) + mat /= (1. - self.dep_params**2) for x in rhs: if x.ndim == 1: x1 = x / stdev @@ -591,9 +594,9 @@ def covariance_matrix_solve(self, expval, i, stdev, rhs): # the inverse. c0 is on the diagonal, except for the first # and last position. c1 is on the first and last position of # the diagonal. c2 is on the sub/super diagonal. - c0 = (1 + self.dep_params**2) / (1 - self.dep_params**2) - c1 = 1 / (1 - self.dep_params**2) - c2 = self.dep_params / (self.dep_params**2 - 1) + c0 = (1. + self.dep_params**2) / (1. - self.dep_params**2) + c1 = 1. / (1. - self.dep_params**2) + c2 = -self.dep_params / (1. - self.dep_params**2) soln = [] for x in rhs: flatten = False @@ -619,6 +622,10 @@ def covariance_matrix_solve(self, expval, i, stdev, rhs): return soln + update.__doc__ = CovStruct.update.__doc__ + covariance_matrix.__doc__ = CovStruct.covariance_matrix.__doc__ + covariance_matrix_solve.__doc__ = CovStruct.covariance_matrix_solve.__doc__ + def summary(self): return ("Autoregressive(1) dependence parameter: %.3f\n" % @@ -687,7 +694,7 @@ def initialize(self, model): cpp1 = {} for k1 in range(self.ncut): for k2 in range(k1+1): - v1, v2 = np.nonzero((j1==k1) & (j2==k2)) + v1, v2 = np.nonzero((j1 == k1) & (j2 == k2)) cpp1[(k2, k1)] = \ np.hstack((v2[:, None], v1[:, None])) cpp.append(cpp1) @@ -754,9 +761,9 @@ def observed_crude_oddsratio(self): # The observed joint values for the current cluster yvec = endog[i] endog_11 = np.outer(yvec, yvec) - endog_10 = np.outer(yvec, 1 - yvec) - endog_01 = np.outer(1 - yvec, yvec) - endog_00 = np.outer(1 - yvec, 1 - yvec) + endog_10 = np.outer(yvec, 1. - yvec) + endog_01 = np.outer(1. - yvec, yvec) + endog_00 = np.outer(1. - yvec, 1. - yvec) cpp1 = cpp[i] for ky in iterkeys(cpp1): @@ -784,10 +791,10 @@ def get_eyy(self, endog_expval, index): else: psum = endog_expval[:, None] + endog_expval[None, :] pprod = endog_expval[:, None] * endog_expval[None, :] - pfac = np.sqrt((1 + psum * (current_or - 1))**2 + - 4 * current_or * (1 - current_or) * pprod) - vmat = 1 + psum * (current_or - 1) - pfac - vmat /= 2 * (current_or - 1) + pfac = np.sqrt((1. + psum * (current_or - 1.))**2 + + 4 * current_or * (1. - current_or) * pprod) + vmat = 1. + psum * (current_or - 1.) - pfac + vmat /= 2. * (current_or - 1) # Fix E[YY'] for elements that belong to same observation for bdl in ibd: @@ -821,15 +828,12 @@ def update(self, params): for i in range(self.model.num_group): - if len(endog[i]) == 0: - continue - endog_expval, _ = cached_means[i] emat_11 = self.get_eyy(endog_expval, i) emat_10 = endog_expval[:, None] - emat_11 emat_01 = -emat_11 + endog_expval - emat_00 = 1 - (emat_11 + emat_10 + emat_01) + emat_00 = 1. - (emat_11 + emat_10 + emat_01) cpp1 = cpp[i] for ky in iterkeys(cpp1): @@ -843,6 +847,9 @@ def update(self, params): self.dep_params *= self.crude_or / cor_expval + update.__doc__ = CovStruct.update.__doc__ + covariance_matrix.__doc__ = CovStruct.covariance_matrix.__doc__ + def summary(self): return "Global odds ratio: %.3f\n" % self.dep_params diff --git a/statsmodels/genmod/tests/test_gee.py b/statsmodels/genmod/tests/test_gee.py index a1b2c0cff76..32e6f42f64c 100644 --- a/statsmodels/genmod/tests/test_gee.py +++ b/statsmodels/genmod/tests/test_gee.py @@ -11,7 +11,6 @@ from __future__ import print_function from statsmodels.compat import lrange import numpy as np - import os from numpy.testing import assert_almost_equal from statsmodels.genmod.generalized_estimating_equations import (GEE,