Commit

Merge pull request #2385 from kshedden/elastic_net2
ENH Generic elastic net code
josef-pkt committed May 7, 2016
2 parents 0fac349 + f40c7f2 commit de15ec8
Showing 20 changed files with 1,505 additions and 401 deletions.
5 changes: 4 additions & 1 deletion statsmodels/base/_constraints.py
@@ -250,7 +250,10 @@ def fit_constrained(model, constraint_matrix, constraint_values,
#need copy, because we don't want to change it, we don't need deepcopy
import copy
init_kwds = copy.copy(self._get_init_kwds())

# TODO: refactor to combine with above or offset_all
if 'offset' in init_kwds:
del init_kwds['offset']

# using offset as keywords is not supported in all modules
mod_constr = self.__class__(endog, exogp_st, offset=offset, **init_kwds)
327 changes: 327 additions & 0 deletions statsmodels/base/elastic_net.py
@@ -0,0 +1,327 @@
import numpy as np
import statsmodels.base.wrapper as wrap
from statsmodels.base.model import Results
from statsmodels.tools.decorators import cache_readonly

"""
Elastic net regularization.
Routines for fitting regression models using elastic net
regularization. The elastic net minimizes the objective function
-llf / nobs + alpha((1 - L1_wt) * sum(params**2) / 2 + L1_wt * sum(abs(params)))
The algorithm implemented here closely follows the implementation in
the R glmnet package, documented here:
http://cran.r-project.org/web/packages/glmnet/index.html
and here:
http://www.jstatsoft.org/v33/i01/paper
This routine should work for any regression model that implements
loglike, score, and hess.
"""


def _gen_npfuncs(k, L1_wt, alpha, loglike_kwds, score_kwds, hess_kwds):
"""
Negative penalized log-likelihood functions.
Returns the negative penalized log-likelihood, its derivative, and
its Hessian. The penalty only includes the smooth (L2) term.
All three functions have argument signature (x, model), where
``x`` is a point in the parameter space and ``model`` is an
arbitrary statsmodels regression model.
"""

def nploglike(params, model):
nobs = model.nobs
pen_llf = alpha[k] * (1 - L1_wt) * np.sum(params**2) / 2
llf = model.loglike(np.r_[params], **loglike_kwds)
return - llf / nobs + pen_llf

def npscore(params, model):
nobs = model.nobs
pen_grad = alpha[k] * (1 - L1_wt) * params
gr = -model.score(np.r_[params], **score_kwds)[0] / nobs
return gr + pen_grad

def nphess(params, model):
nobs = model.nobs
pen_hess = alpha[k] * (1 - L1_wt)
h = -model.hessian(np.r_[params], **hess_kwds)[0,0] / nobs + pen_hess
return h

return nploglike, npscore, nphess
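
(Illustrative aside, not part of the committed file.) A small sketch of what the generated closures compute, assuming a one-variable OLS model, since these functions are only ever evaluated on the single-column models built inside fit_elasticnet below:

import numpy as np
import statsmodels.api as sm
from statsmodels.base.elastic_net import _gen_npfuncs

np.random.seed(0)
x = np.random.randn(200, 1)
y = 2 * x[:, 0] + np.random.randn(200)
model = sm.OLS(y, x)

# k=0 selects alpha[0]; only the smooth (L2) part of the penalty appears here.
nploglike, npscore, nphess = _gen_npfuncs(0, L1_wt=0.5, alpha=np.r_[0.1],
                                          loglike_kwds={}, score_kwds={},
                                          hess_kwds={})
b = 1.5
print(nploglike(b, model))  # -loglike/nobs + 0.1*(1-0.5)*b**2/2
print(npscore(b, model))    # first derivative of the line above
print(nphess(b, model))     # second derivative of the line above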



def fit_elasticnet(model, method="coord_descent", maxiter=100, alpha=0.,
L1_wt=1., start_params=None, cnvrg_tol=1e-7, zero_tol=1e-8,
refit=False, loglike_kwds=None, score_kwds=None,
hess_kwds=None):
"""
Return an elastic net regularized fit to a regression model.
Parameters
----------
model : model object
A statsmodels object implementing ``loglike``, ``score``, and
``hessian``.
method : string
Only the coordinate descent algorithm is implemented.
maxiter : integer
The maximum number of iteration cycles (an iteration cycle
involves running coordinate descent on all variables).
alpha : scalar or array-like
The penalty weight. If a scalar, the same penalty weight
applies to all variables in the model. If a vector, it
must have the same length as `params`, and contains a
penalty weight for each coefficient.
L1_wt : scalar
The fraction of the penalty given to the L1 penalty term.
Must be between 0 and 1 (inclusive). If 0, the fit is
a ridge fit, if 1 it is a lasso fit.
start_params : array-like
Starting values for `params`.
cnvrg_tol : scalar
If `params` changes by less than this amount (in sup-norm)
in one iteration cycle, the algorithm terminates with
convergence.
zero_tol : scalar
Any estimated coefficient smaller than this value is
replaced with zero.
refit : bool
If True, the model is refit using only the variables that have
non-zero coefficients in the regularized fit. The refitted
model is not regularized.
loglike_kwds : dict-like or None
Keyword arguments for the log-likelihood function.
score_kwds : dict-like or None
Keyword arguments for the score function.
hess_kwds : dict-like or None
Keyword arguments for the Hessian function.

Returns
-------
A results object.

Notes
-----
The ``elastic net`` penalty is a combination of L1 and L2
penalties.

The function that is minimized is:

-loglike/n + alpha*((1-L1_wt)*|params|_2^2/2 + L1_wt*|params|_1)

where |*|_1 and |*|_2 are the L1 and L2 norms.

The computational approach used here is to obtain a quadratic
approximation to the smooth part of the target function:

-loglike/n + alpha*(1-L1_wt)*|params|_2^2/2

then repeatedly optimize the L1 penalized version of this function
along coordinate axes.
"""

k_exog = model.exog.shape[1]
n_exog = model.exog.shape[0]

loglike_kwds = {} if loglike_kwds is None else loglike_kwds
score_kwds = {} if score_kwds is None else score_kwds
hess_kwds = {} if hess_kwds is None else hess_kwds

if np.isscalar(alpha):
alpha = alpha * np.ones(k_exog)

# Define starting params
if start_params is None:
params = np.zeros(k_exog)
else:
params = start_params.copy()

converged = False
btol = 1e-8
params_zero = np.zeros(len(params), dtype=bool)

init_args = dict([(k, getattr(model, k)) for k in model._init_keys
if k != "offset" and hasattr(model, k)])

fgh_list = [_gen_npfuncs(k, L1_wt, alpha, loglike_kwds, score_kwds, hess_kwds)
for k in range(k_exog)]

for itr in range(maxiter):

# Sweep through the parameters
params_save = params.copy()
for k in range(k_exog):

# Under the active set method, if a parameter becomes
# zero we don't try to change it again.
# TODO : give the user the option to switch this off
if params_zero[k]:
continue

# Set the offset to account for the variables that are
# being held fixed in the current coordinate
# optimization.
params0 = params.copy()
params0[k] = 0
offset = np.dot(model.exog, params0)
if hasattr(model, "offset") and model.offset is not None:
offset += model.offset

# Create a one-variable model for optimization.
model_1var = model.__class__(model.endog, model.exog[:, k], offset=offset,
**init_args)

# Do the one-dimensional optimization.
func, grad, hess = fgh_list[k]
params[k] = _opt_1d(func, grad, hess, model_1var, params[k], alpha[k]*L1_wt, tol=btol)

# Update the active set
if itr > 0 and np.abs(params[k]) < zero_tol:
params_zero[k] = True
params[k] = 0.

# Check for convergence
pchange = np.max(np.abs(params - params_save))
if pchange < cnvrg_tol:
converged = True
break

# Set approximate zero coefficients to be exactly zero
params[np.abs(params) < zero_tol] = 0

if not refit:
results = RegularizedResults(model, params)
return RegularizedResultsWrapper(results)

# Fit the reduced model to get standard errors and other
# post-estimation results.
ii = np.flatnonzero(params)
cov = np.zeros((k_exog, k_exog))
init_args = dict([(k, getattr(model, k, None)) for k in model._init_keys])
if len(ii) > 0:
model1 = model.__class__(model.endog, model.exog[:, ii],
**init_args)
rslt = model1.fit()
cov[np.ix_(ii, ii)] = rslt.normalized_cov_params
else:
# Hack: no variables were selected but we need to run fit in
# order to get the correct results class. So just fit a model
# with one variable.
model1 = model.__class__(model.endog, model.exog[:, 0], **init_args)
rslt = model1.fit(maxiter=0)

# fit may return a results or a results wrapper
if issubclass(rslt.__class__, wrap.ResultsWrapper):
klass = rslt._results.__class__
else:
klass = rslt.__class__

# Not all models have a scale
if hasattr(rslt, 'scale'):
scale = rslt.scale
else:
scale = 1.

# Assuming a standard signature for creating results classes.
refit = klass(model, params, cov, scale=scale)
refit.regularized = True
refit.method = method
refit.fit_history = {'iteration' : itr + 1}

return refit
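
(Illustrative aside, not part of the committed file.) A minimal usage sketch of fit_elasticnet; it assumes a Gaussian GLM, which supplies the loglike, score, and hessian methods and the offset keyword that the coordinate descent loop relies on:

import numpy as np
import statsmodels.api as sm
from statsmodels.base.elastic_net import fit_elasticnet

np.random.seed(12345)
n, p = 200, 6
exog = np.random.randn(n, p)
beta = np.r_[2.0, 0.0, 0.0, -1.5, 0.0, 0.0]        # sparse truth
endog = np.dot(exog, beta) + np.random.randn(n)

model = sm.GLM(endog, exog, family=sm.families.Gaussian())

# Penalize every coefficient equally; L1_wt=1 would give a pure lasso fit.
result = fit_elasticnet(model, alpha=0.05, L1_wt=0.5, maxiter=200)
print(result.params)           # coefficients shrunk toward zero; some may be exactly zero
print(result.fittedvalues[:5])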


def _opt_1d(func, grad, hess, model, start, L1_wt, tol):
"""
One-dimensional helper for elastic net.
Parameters:
-----------
func : function
A smooth function of a single variable to be optimized
with L1 penaty.
grad : function
The gradient of `func`.
hess : function
The Hessian of `func`.
model : statsmodels model
The model being fit.
start : real
A starting value for the function argument
L1_wt : non-negative real
The weight for the L1 penalty function.
tol : non-negative real
A convergence threshold.
Notes
-----
``func``, ``grad``, and ``hess`` have argument signature (x,
model), where ``x`` is a point in the parameter space and
``model`` is the model being fit.
If the log-likelihood for the model is exactly quadratic, the
global minimum is returned in one step. Otherwise numerical
bisection is used.
Returns:
--------
The argmin of the objective function.
"""

from scipy.optimize import brent

x = start
f = func(x, model)
b = grad(x, model)
c = hess(x, model)
d = b - c*x

if L1_wt > np.abs(d):
return 0.

if d >= 0:
h = (L1_wt - b) / c
elif d < 0:
h = -(L1_wt + b) / c
else:
return np.nan

f1 = func(x + h, model) + L1_wt*np.abs(x + h)
if f1 <= f + L1_wt*np.abs(x) + 1e-10:
return x + h

x_opt = brent(func, args=(model,), brack=(x-1, x+1), tol=tol)
return x_opt
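
(Illustrative aside, not part of the committed file.) A quick numeric check, with hand-built func/grad/hess that ignore the model argument, that the closed-form step reproduces the soft-threshold solution for an exactly quadratic objective:

import numpy as np
from statsmodels.base.elastic_net import _opt_1d

# Quadratic objective 0.5*c*(x - m)**2 with known curvature and center.
c, m = 2.0, 1.0
func = lambda x, model: 0.5 * c * (x - m) ** 2
grad = lambda x, model: c * (x - m)
hess = lambda x, model: c
L1_wt = 0.5

x_hat = _opt_1d(func, grad, hess, model=None, start=0.0, L1_wt=L1_wt, tol=1e-10)
# Analytic soft-threshold solution: sign(m) * max(|c*m| - L1_wt, 0) / c
print(x_hat, np.sign(m) * max(abs(c * m) - L1_wt, 0) / c)   # both 0.75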


class RegularizedResults(Results):

def __init__(self, model, params):
super(RegularizedResults, self).__init__(model, params)

@cache_readonly
def fittedvalues(self):
return self.model.predict(self.params)


class RegularizedResultsWrapper(wrap.ResultsWrapper):
_attrs = {
'params': 'columns',
'resid': 'rows',
'fittedvalues': 'rows',
}

_wrap_attrs = _attrs

wrap.populate_wrapper(RegularizedResultsWrapper,
RegularizedResults)
23 changes: 19 additions & 4 deletions statsmodels/base/model.py
@@ -770,6 +770,10 @@ def predict(self, exog=None, transform=True, *args, **kwargs):
return predict_results


def summary(self):
pass


#TODO: public method?
class LikelihoodModelResults(Results):
"""
@@ -1705,10 +1709,20 @@ def remove_data(self):
Not fully tested for time series models, tsa, and might delete too much
for prediction or not all that would be possible.
The lists of arrays to delete are maintained as attributes of
the result and model instance, except for cached values. These
lists could be changed before calling remove_data.
The attributes to remove are named in:
model._data_attr : arrays attached to both the model instance
and the results instance with the same attribute name.
result.data_in_cache : arrays that may exist as values in
result._cache (TODO : should privatize name)
result._data_attr_model : arrays attached to the model
instance but not to the results instance
'''
def wipe(obj, att):
#get to last element in attribute path
@@ -1725,8 +1739,9 @@ def wipe(obj, att):
except AttributeError:
pass

model_only = ['model.' + i for i in getattr(self, "_data_attr_model", [])]
model_attr = ['model.' + i for i in self.model._data_attr]
for att in self._data_attr + model_attr + model_only:
#print('removing', att)
wipe(self, att)

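
(Illustrative aside, not part of the diff.) A hedged sketch of the expanded remove_data behavior, assuming an ordinary OLS fit; remove_data is the existing public results method whose attribute lists are extended here:

import numpy as np
import statsmodels.api as sm

np.random.seed(0)
X = sm.add_constant(np.random.randn(50, 2))
y = np.dot(X, [1.0, 2.0, -1.0]) + np.random.randn(50)

res = sm.OLS(y, X).fit()
res.remove_data()    # wipes arrays named in model._data_attr, result.data_in_cache,
                     # and (after this change) result._data_attr_model
print(res.params)    # scalar results such as the estimates survive the removal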
