From 04aa54710fd1fc3aa1f55d8b0016ee58e62c5807 Mon Sep 17 00:00:00 2001
From: Josef Perktold
Date: Tue, 13 Nov 2018 21:56:43 -0500
Subject: [PATCH] ENH: add OLSVectorized, initial version

---
 statsmodels/regression/linear_model.py        |  19 ++-
 .../regression/special_linear_model.py        | 161 ++++++++++++++++++
 .../tests/test_special_linear_model.py        |  74 ++++++++
 3 files changed, 248 insertions(+), 6 deletions(-)
 create mode 100644 statsmodels/regression/special_linear_model.py
 create mode 100644 statsmodels/regression/tests/test_special_linear_model.py

diff --git a/statsmodels/regression/linear_model.py b/statsmodels/regression/linear_model.py
index 5b86dd8f824..d04a27725cb 100644
--- a/statsmodels/regression/linear_model.py
+++ b/statsmodels/regression/linear_model.py
@@ -306,7 +306,14 @@ def fit(self, method="pinv", cov_type='nonrobust', cov_kwds=None,
         if self._df_resid is None:
             self.df_resid = self.nobs - self.rank
 
-        if isinstance(self, OLS):
+        results_class = getattr(self, '_results_class', None)
+        if results_class is not None:
+            lfit = results_class(
+                self, beta,
+                normalized_cov_params=self.normalized_cov_params,
+                cov_type=cov_type, cov_kwds=cov_kwds, use_t=use_t,
+                **kwargs)
+        elif isinstance(self, OLS):
             lfit = OLSResults(
                 self, beta,
                 normalized_cov_params=self.normalized_cov_params,
@@ -842,7 +849,7 @@ def loglike(self, params, scale=None):
         resid = self.endog - np.dot(self.exog, params)
         if hasattr(self, 'offset'):
             resid -= self.offset
-        ssr = np.sum(resid**2)
+        ssr = np.sum(resid**2, axis=0)
         if scale is None:
             # profile log likelihood
             llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
@@ -1555,7 +1562,7 @@ def resid(self):
     @cache_writable()
     def scale(self):
         wresid = self.wresid
-        return np.dot(wresid, wresid) / self.df_resid
+        return wresid.T.dot(wresid) / self.df_resid
 
     @cache_readonly
     def ssr(self):
@@ -1568,8 +1575,8 @@ def centered_tss(self):
         weights = getattr(model, 'weights', None)
         sigma = getattr(model, 'sigma', None)
         if weights is not None:
-            mean = np.average(model.endog, weights=weights)
-            return np.sum(weights * (model.endog - mean)**2)
+            mean = np.average(model.endog, weights=weights, axis=0)
+            return np.sum(weights * (model.endog - mean).T**2, axis=-1)
         elif sigma is not None:
             # Exactly matches WLS when sigma is diagonal
             iota = np.ones_like(model.endog)
@@ -1579,7 +1586,7 @@ def centered_tss(self):
             err = model.whiten(err)
             return np.sum(err**2)
         else:
-            centered_endog = model.wendog - model.wendog.mean()
+            centered_endog = model.wendog - model.wendog.mean(0)
             return np.dot(centered_endog, centered_endog)
 
     @cache_readonly
diff --git a/statsmodels/regression/special_linear_model.py b/statsmodels/regression/special_linear_model.py
new file mode 100644
index 00000000000..13c1549f2c9
--- /dev/null
+++ b/statsmodels/regression/special_linear_model.py
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Nov 13 16:24:12 2018
+
+Author: Josef Perktold
+"""
+
+import numpy as np
+from statsmodels.tools.decorators import (cache_readonly,
+                                          cache_writable)
+
+from statsmodels.regression.linear_model import OLS, RegressionResults
+
+from numpy.testing import assert_allclose
+
+
+class OLSVectorizedResults(RegressionResults):
+    r"""results class for vectorized OLS
+    """
+
+    @cache_writable()
+    def scale(self):
+        wresid = self.wresid
+        return (wresid * wresid).sum(0) / self.df_resid
+
+
+    @cache_readonly
+    def bse(self):
+        return np.sqrt(self.scale *
+                       np.diag(self.normalized_cov_params)[:, None])
+
+    @cache_readonly
+    def ssr(self):
+        wresid = self.wresid
+        return (wresid * wresid).sum(0)
+
+    @cache_readonly
+    def uncentered_tss(self):
+        wendog = self.model.wendog
+        return (wendog * wendog).sum(0)
+
+    def conf_int(self, alpha=.05, cols=None):
+        #print('using OLSVectorizedResults.conf_int')
+        ci = super(OLSVectorizedResults, self).conf_int(alpha=alpha,
+                                                        cols=cols)
+
+        return np.rollaxis(ci, -1)
+
+    def t_test(self, r_matrix, cov_p=None, scale=None, use_t=None):
+        """
+        Compute a t-test for each linear hypothesis of the form Rb = q.
+
+        This is an adjusted version of the corresponding
+        LikelihoodModelResults method, vectorized across endog.
+
+        Parameters
+        ----------
+        r_matrix : array-like, str, tuple
+            - array : If an array is given, a p x k 2d array or length k 1d
+              array specifying the linear restrictions. It is assumed
+              that the linear combination is equal to zero.
+            - str : The full hypotheses to test can be given as a string.
+              See the examples.
+            - tuple : A tuple of arrays in the form (R, q). If q is given,
+              it can be either a scalar or a length p row vector.
+        cov_p : array-like, optional
+            An alternative estimate for the parameter covariance matrix.
+            If None is given, self.normalized_cov_params is used.
+        scale : float, optional
+            An optional `scale` to use. Default is the scale specified
+            by the model fit.
+        use_t : bool, optional
+            If use_t is None, then the default of the model is used.
+            If use_t is True, then the p-values are based on the t
+            distribution.
+            If use_t is False, then the p-values are based on the normal
+            distribution.
+
+        Returns
+        -------
+        res : ContrastResults instance
+            The results for the test are attributes of this results instance.
+            The available results have the same elements as the parameter
+            table in `summary()`.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> import statsmodels.api as sm
+
+        See Also
+        --------
+        tvalues : individual t statistics
+        f_test : for F tests
+        patsy.DesignInfo.linear_constraint
+        """
+        from statsmodels.tools.tools import recipr
+        from statsmodels.stats.contrast import ContrastResults
+        from patsy import DesignInfo
+        names = self.model.data.param_names
+        LC = DesignInfo(names).linear_constraint(r_matrix)
+        r_matrix, q_matrix = LC.coefs, LC.constants
+        num_ttests = r_matrix.shape[0]
+        num_params = r_matrix.shape[1]
+
+        if (cov_p is None and self.normalized_cov_params is None and
+                not hasattr(self, 'cov_params_default')):
+            raise ValueError('Need covariance of parameters for computing '
+                             'T statistics')
+        if num_params != self.params.shape[0]:
+            raise ValueError('r_matrix and params are not aligned')
+        if q_matrix is None:
+            q_matrix = np.zeros(num_ttests)
+        else:
+            q_matrix = np.asarray(q_matrix)
+            q_matrix = q_matrix.squeeze()
+        if q_matrix.size > 1:
+            if q_matrix.shape[0] != num_ttests:
+                raise ValueError("r_matrix and q_matrix must have the same "
+                                 "number of rows")
+
+        if use_t is None:
+            # switch to use_t false if undefined
+            use_t = (hasattr(self, 'use_t') and self.use_t)
+
+        _t = _sd = None
+
+        _effect = np.dot(r_matrix, self.params)
+
+        # vectorized OLS: we use initially scale=1 and then add the scale
+        cov_p = self.normalized_cov_params
+        # Perform the test
+        if num_ttests > 1:
+            _sd = np.sqrt(np.diag(self.cov_params(
+                r_matrix=r_matrix, cov_p=cov_p)))
+        else:
+            _sd = np.sqrt(self.cov_params(r_matrix=r_matrix, cov_p=cov_p))
+
+        _sd = np.squeeze(_sd) * np.sqrt(self.scale)
+        _effect = np.squeeze(_effect)
+        _t = (_effect - q_matrix) * recipr(_sd)
+
+        df_resid = getattr(self, 'df_resid_inference', self.df_resid)
+
+        if use_t:
+            return ContrastResults(effect=_effect, t=_t, sd=_sd,
+                                   df_denom=df_resid)
+        else:
+            return ContrastResults(effect=_effect, statistic=_t, sd=_sd,
+                                   df_denom=df_resid,
+                                   distribution='norm')
+
+    def summary(self):
+        from statsmodels.iolib.summary import summary_params_2dflat
+        summ = summary_params_2dflat(self,
+                                     endog_names=self.model.endog_names)[1]
+        return summ
+
+
+class OLSVectorized(OLS):
+    _results_class = OLSVectorizedResults
diff --git a/statsmodels/regression/tests/test_special_linear_model.py b/statsmodels/regression/tests/test_special_linear_model.py
new file mode 100644
index 00000000000..5591b58af9b
--- /dev/null
+++ b/statsmodels/regression/tests/test_special_linear_model.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Nov 13 20:33:02 2018
+
+Author: Josef Perktold
+"""
+
+import numpy as np
+from numpy.testing import assert_allclose
+from statsmodels.regression.linear_model import OLS
+from statsmodels.regression.special_linear_model import OLSVectorized
+
+
+class TestOLSVectorized(object):
+
+    @classmethod
+    def setup_class(cls):
+        np.random.seed(3)
+        sig_e = 0.1
+
+        exog = (np.repeat(np.arange(3), 20)[:, None] ==
+                np.arange(3)).astype(float)
+
+        params_true = np.array([[1, 1, 1], [1, 1, 1],
+                                [1, 1.5, 2], [1, 1.5, 2]]).T
+
+        y_true = exog.dot(params_true)
+        endog = y_true + sig_e * np.random.randn(*y_true.shape)
+
+        cls.res1 = OLSVectorized(endog, exog).fit()
+        cls.res2_list = [OLS(endog[:, i], exog).fit() for i in range(4)]
+
+    def test_attributes(self):
+        res1 = self.res1
+        res2_list = self.res2_list
+
+        attrs = ['params', 'scale', 'bse', 'tvalues', 'pvalues', 'ssr',
+                 'centered_tss', 'rsquared', 'llf', 'aic', 'bic',
+                 'fvalue', 'f_pvalue']
+        for attr in attrs:
+            value = getattr(res1, attr)
+            res2_value = np.array([getattr(r, attr) for r in res2_list]).T
+            assert_allclose(value, res2_value, rtol=1e-13)
+
+    def test_methods(self):
+        res1 = self.res1
+        res2_list = self.res2_list
+
+        ci1 = res1.conf_int()
+        ci2 = np.array([r.conf_int() for r in res2_list])
+        ci2 = np.rollaxis(ci2, 0, 3)
+        assert_allclose(ci1, ci2, rtol=1e-13)
+
+        # smoke test
+        res1.summary()
+
+    def test_wald(self):
+        res1 = self.res1
+        res2_list = self.res2_list
+
+        tt1 = res1.t_test('x1=x2')
+        tt2_list = [r.t_test('x1=x2') for r in res2_list]
+        attrs = ['effect', 'sd', 'tvalue', 'pvalue', 'df_denom', 'statistic']
+        for attr in attrs:
+            value = getattr(tt1, attr)
+            res2_value = np.array([getattr(t, attr) for t in tt2_list]).squeeze()
+            assert_allclose(value, res2_value, rtol=1e-12, err_msg=attr)
+
+        ci1 = tt1.conf_int()
+        ci2 = np.array([t.conf_int() for t in tt2_list]).squeeze()
+        assert_allclose(ci1, ci2, rtol=1e-12)
+
+        # smoke test
+        tt1.summary()
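
Usage sketch (not part of the patch, assumes the patch is applied): the
snippet below mirrors the setup in the new test module. OLSVectorized takes a
2-d endog and fits every column against the same exog in one pass; vectorized
results such as params and bse then carry one column per endog column.

    import numpy as np
    from statsmodels.regression.linear_model import OLS
    from statsmodels.regression.special_linear_model import OLSVectorized

    np.random.seed(3)
    # three-group dummy design, 20 observations per group; with no constant
    # column the default parameter names are x1, x2, x3
    exog = (np.repeat(np.arange(3), 20)[:, None] ==
            np.arange(3)).astype(float)
    # four endog columns sharing the same design matrix
    params_true = np.array([[1, 1, 1], [1, 1, 1],
                            [1, 1.5, 2], [1, 1.5, 2]]).T
    endog = exog.dot(params_true) + 0.1 * np.random.randn(60, 4)

    res = OLSVectorized(endog, exog).fit()
    res.params              # shape (3, 4): one coefficient column per equation
    res.bse                 # standard errors in the same layout
    res.t_test('x1 = x2')   # Wald t-test broadcast across all four equations

    # the same numbers, one endog column at a time
    res_loop = [OLS(endog[:, i], exog).fit() for i in range(4)]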