ENH: add OLSVectorized, initial version
josef-pkt committed Nov 14, 2018
1 parent 14fb572 commit 04aa547
Showing 3 changed files with 248 additions and 6 deletions.
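This commit adds OLSVectorized, an OLS subclass whose fit and results are vectorized across the columns of a multi-column endog. A minimal usage sketch, assuming the data layout used in the new test file below (three-level dummy exog, four endog columns); nothing in this sketch beyond that layout is part of the commit:

import numpy as np
from statsmodels.regression.linear_model import OLS
from statsmodels.regression.special_linear_model import OLSVectorized

# three-group dummy design, 20 observations per group, four response columns
exog = (np.repeat(np.arange(3), 20)[:, None] == np.arange(3)).astype(float)
params_true = np.array([[1, 1, 1], [1, 1, 1], [1, 1.5, 2], [1, 1.5, 2]]).T
endog = exog.dot(params_true) + 0.1 * np.random.randn(60, 4)

# one vectorized fit instead of a loop of single-endog fits
res_vec = OLSVectorized(endog, exog).fit()
res_list = [OLS(endog[:, i], exog).fit() for i in range(endog.shape[1])]

# params, bse, tvalues, pvalues, ... carry one column per endog column,
# e.g. res_vec.params has shape (3, 4) here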
19 changes: 13 additions & 6 deletions statsmodels/regression/linear_model.py
@@ -306,7 +306,14 @@ def fit(self, method="pinv", cov_type='nonrobust', cov_kwds=None,
         if self._df_resid is None:
             self.df_resid = self.nobs - self.rank
 
-        if isinstance(self, OLS):
+        results_class = getattr(self, '_results_class', None)
+        if results_class is not None:
+            lfit = results_class(
+                self, beta,
+                normalized_cov_params=self.normalized_cov_params,
+                cov_type=cov_type, cov_kwds=cov_kwds, use_t=use_t,
+                **kwargs)
+        elif isinstance(self, OLS):
             lfit = OLSResults(
                 self, beta,
                 normalized_cov_params=self.normalized_cov_params,
@@ -842,7 +849,7 @@ def loglike(self, params, scale=None):
         resid = self.endog - np.dot(self.exog, params)
         if hasattr(self, 'offset'):
             resid -= self.offset
-        ssr = np.sum(resid**2)
+        ssr = np.sum(resid**2, axis=0)
         if scale is None:
             # profile log likelihood
             llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
@@ -1555,7 +1562,7 @@ def resid(self):
     @cache_writable()
     def scale(self):
         wresid = self.wresid
-        return np.dot(wresid, wresid) / self.df_resid
+        return wresid.T.dot(wresid) / self.df_resid
 
     @cache_readonly
     def ssr(self):
@@ -1568,8 +1575,8 @@ def centered_tss(self):
         weights = getattr(model, 'weights', None)
         sigma = getattr(model, 'sigma', None)
         if weights is not None:
-            mean = np.average(model.endog, weights=weights)
-            return np.sum(weights * (model.endog - mean)**2)
+            mean = np.average(model.endog, weights=weights, axis=0)
+            return np.sum(weights * (model.endog - mean).T**2, axis=-1)
         elif sigma is not None:
             # Exactly matches WLS when sigma is diagonal
             iota = np.ones_like(model.endog)
@@ -1579,7 +1586,7 @@ def centered_tss(self):
             err = model.whiten(err)
             return np.sum(err**2)
         else:
-            centered_endog = model.wendog - model.wendog.mean()
+            centered_endog = model.wendog - model.wendog.mean(0)
             return np.dot(centered_endog, centered_endog)
 
     @cache_readonly
161 changes: 161 additions & 0 deletions statsmodels/regression/special_linear_model.py
@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 13 16:24:12 2018
Author: Josef Perktold
"""

import numpy as np
from statsmodels.tools.decorators import (cache_readonly,
                                          cache_writable)

from statsmodels.regression.linear_model import OLS, RegressionResults

from numpy.testing import assert_allclose


class OLSVectorizedResults(RegressionResults):
    r"""results class for vectorized OLS
    """

    @cache_writable()
    def scale(self):
        wresid = self.wresid
        return (wresid * wresid).sum(0) / self.df_resid

    @cache_readonly
    def bse(self):
        return np.sqrt(self.scale *
                       np.diag(self.normalized_cov_params)[:, None])

    @cache_readonly
    def ssr(self):
        wresid = self.wresid
        return (wresid * wresid).sum(0)

    @cache_readonly
    def uncentered_tss(self):
        wendog = self.model.wendog
        return (wendog * wendog).sum(0)

    def conf_int(self, alpha=.05, cols=None):
        #print('using OLSVectorizedResults.conf_int')
        ci = super(OLSVectorizedResults, self).conf_int(alpha=alpha,
                                                        cols=cols)

        return np.rollaxis(ci, -1)

    def t_test(self, r_matrix, cov_p=None, scale=None, use_t=None):
        """
        Compute a t-test for a single linear hypothesis of the form Rb = q.

        This is an adjusted version of the corresponding
        LikelihoodModelResults method, vectorized across endog.

        Parameters
        ----------
        r_matrix : array-like, str, tuple
            - array : If an array is given, a p x k 2d array or length k 1d
              array specifying the linear restrictions. It is assumed
              that the linear combination is equal to zero.
            - str : The full hypotheses to test can be given as a string.
              See the examples.
            - tuple : A tuple of arrays in the form (R, q). If q is given,
              can be either a scalar or a length p row vector.
        cov_p : array-like, optional
            An alternative estimate for the parameter covariance matrix.
            If None is given, self.normalized_cov_params is used.
        scale : float, optional
            An optional `scale` to use. Default is the scale specified
            by the model fit.
        use_t : bool, optional
            If use_t is None, then the default of the model is used.
            If use_t is True, then the p-values are based on the t
            distribution.
            If use_t is False, then the p-values are based on the normal
            distribution.

        Returns
        -------
        res : ContrastResults instance
            The results for the test are attributes of this results
            instance. The available results have the same elements as the
            parameter table in `summary()`.

        Examples
        --------
        >>> import numpy as np
        >>> import statsmodels.api as sm

        See Also
        --------
        tvalues : individual t statistics
        f_test : for F tests
        patsy.DesignInfo.linear_constraint
        """
        from statsmodels.tools.tools import recipr
        from statsmodels.stats.contrast import ContrastResults
        from patsy import DesignInfo
        names = self.model.data.param_names
        LC = DesignInfo(names).linear_constraint(r_matrix)
        r_matrix, q_matrix = LC.coefs, LC.constants
        num_ttests = r_matrix.shape[0]
        num_params = r_matrix.shape[1]

        if (cov_p is None and self.normalized_cov_params is None and
                not hasattr(self, 'cov_params_default')):
            raise ValueError('Need covariance of parameters for computing '
                             'T statistics')
        if num_params != self.params.shape[0]:
            raise ValueError('r_matrix and params are not aligned')
        if q_matrix is None:
            q_matrix = np.zeros(num_ttests)
        else:
            q_matrix = np.asarray(q_matrix)
            q_matrix = q_matrix.squeeze()
        if q_matrix.size > 1:
            if q_matrix.shape[0] != num_ttests:
                raise ValueError("r_matrix and q_matrix must have the same "
                                 "number of rows")

        if use_t is None:
            # switch to use_t false if undefined
            use_t = (hasattr(self, 'use_t') and self.use_t)

        _t = _sd = None

        _effect = np.dot(r_matrix, self.params)

        # vectorized OLS: we use initially scale=1 and then add the scale
        cov_p = self.normalized_cov_params
        # Perform the test
        if num_ttests > 1:
            _sd = np.sqrt(np.diag(self.cov_params(
                r_matrix=r_matrix, cov_p=cov_p)))
        else:
            _sd = np.sqrt(self.cov_params(r_matrix=r_matrix, cov_p=cov_p))

        _sd = np.squeeze(_sd) * np.sqrt(self.scale)
        _effect = np.squeeze(_effect)
        _t = (_effect - q_matrix) * recipr(_sd)

        df_resid = getattr(self, 'df_resid_inference', self.df_resid)

        if use_t:
            return ContrastResults(effect=_effect, t=_t, sd=_sd,
                                   df_denom=df_resid)
        else:
            return ContrastResults(effect=_effect, statistic=_t, sd=_sd,
                                   df_denom=df_resid,
                                   distribution='norm')

    def summary(self):
        from statsmodels.iolib.summary import summary_params_2dflat
        summ = summary_params_2dflat(self,
                                     endog_names=self.model.endog_names)[1]
        return summ


class OLSVectorized(OLS):
    _results_class = OLSVectorizedResults
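
The `_results_class` hook added to `fit` in linear_model.py above is what routes `OLSVectorized.fit()` to this results class. A short sketch of the vectorized t_test, following the constraint string and attributes exercised in test_wald below; `res_vec` is a fitted OLSVectorized result as in the earlier sketch:

# one linear constraint, evaluated for every endog column at once
tt = res_vec.t_test('x1=x2')
tt.effect, tt.sd         # per-column contrast estimate and standard error
tt.tvalue, tt.pvalue     # per-column t statistic and p-value
tt.conf_int()            # per-column confidence interval
tt.summary()             # smoke-tested in the new tests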
74 changes: 74 additions & 0 deletions statsmodels/regression/tests/test_special_linear_model.py
@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 13 20:33:02 2018
Author: Josef Perktold
"""

import numpy as np
from numpy.testing import assert_allclose
from statsmodels.regression.linear_model import OLS
from statsmodels.regression.special_linear_model import OLSVectorized


class TestOLSVectorized(object):

    @classmethod
    def setup_class(cls):
        np.random.seed(3)
        sig_e = 0.1

        exog = (np.repeat(np.arange(3), 20)[:, None] ==
                np.arange(3)).astype(float)

        params_true = np.array([[1, 1, 1], [1, 1, 1],
                                [1, 1.5, 2], [1, 1.5, 2]]).T

        y_true = exog.dot(params_true)
        endog = y_true + sig_e * np.random.randn(*y_true.shape)

        cls.res1 = OLSVectorized(endog, exog).fit()
        cls.res2_list = [OLS(endog[:, i], exog).fit() for i in range(4)]

    def test_attributes(self):
        res1 = self.res1
        res2_list = self.res2_list

        attrs = ['params', 'scale', 'bse', 'tvalues', 'pvalues', 'ssr',
                 'centered_tss', 'rsquared', 'llf', 'aic', 'bic',
                 'fvalue', 'f_pvalue']
        for attr in attrs:
            value = getattr(res1, attr)
            res2_value = np.array([getattr(r, attr) for r in res2_list]).T
            assert_allclose(value, res2_value, rtol=1e-13)

    def test_methods(self):
        res1 = self.res1
        res2_list = self.res2_list

        ci1 = res1.conf_int()
        ci2 = np.array([r.conf_int() for r in res2_list])
        ci2 = np.rollaxis(ci2, 0, 3)
        assert_allclose(ci1, ci2, rtol=1e-13)

        # smoke test
        res1.summary()

    def test_wald(self):
        res1 = self.res1
        res2_list = self.res2_list

        tt1 = res1.t_test('x1=x2')
        tt2_list = [r.t_test('x1=x2') for r in res2_list]
        attrs = ['effect', 'sd', 'tvalue', 'pvalue', 'df_denom', 'statistic']
        for attr in attrs:
            value = getattr(tt1, attr)
            res2_value = np.array([getattr(t, attr)
                                   for t in tt2_list]).squeeze()
            assert_allclose(value, res2_value, rtol=1e-12, err_msg=attr)

        ci1 = tt1.conf_int()
        ci2 = np.array([t.conf_int() for t in tt2_list]).squeeze()
        assert_allclose(ci1, ci2, rtol=1e-12)

        # smoke test
        tt1.summary()
