From aa12688b5ed96b113b320862b27359c52e31349d Mon Sep 17 00:00:00 2001
From: alex
Date: Tue, 29 Oct 2013 09:58:42 -0400
Subject: [PATCH 1/5] ENH: add MNLogit loglike_and_score

---
 statsmodels/discrete/discrete_model.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/statsmodels/discrete/discrete_model.py b/statsmodels/discrete/discrete_model.py
index 2b65a03bdb6..f60f5cd5ea6 100644
--- a/statsmodels/discrete/discrete_model.py
+++ b/statsmodels/discrete/discrete_model.py
@@ -1588,6 +1588,21 @@ def score(self, params):
         #NOTE: might need to switch terms if params is reshaped
         return np.dot(firstterm.T, self.exog).flatten()
 
+    def loglike_and_score(self, params):
+        """
+        Returns the log likelihood and score, efficiently reusing calculations.
+
+        Note that both returned quantities must be negated before use by
+        the fitting machinery, which minimizes the negative log likelihood.
+
+        """
+        params = params.reshape(self.K, -1, order='F')
+        cdf_dot_exog_params = self.cdf(np.dot(self.exog, params))
+        loglike_value = np.sum(self.wendog * np.log(cdf_dot_exog_params))
+        firstterm = self.wendog[:, 1:] - cdf_dot_exog_params[:, 1:]
+        score_array = np.dot(firstterm.T, self.exog).flatten()
+        return loglike_value, score_array
+
     def jac(self, params):
         """
         Jacobian matrix for multinomial logit model log-likelihood
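For context: scipy's fmin_l_bfgs_b accepts a single callable that returns the
objective value and its gradient together, which is what makes the combined
loglike_and_score above worthwhile — the cdf(np.dot(exog, params)) evaluation
is shared by both quantities and is computed once per iteration instead of
twice. A minimal standalone sketch of the same idea for a binary logit (all
names here are illustrative, not part of the patch):

    import numpy as np
    from scipy.optimize import fmin_l_bfgs_b

    def negloglike_and_grad(params, exog, endog):
        # The fitted probabilities are shared by the value and the gradient,
        # so computing them once saves the dominant cost of each iteration.
        prob = 1.0 / (1.0 + np.exp(-exog.dot(params)))
        loglike = np.sum(endog * np.log(prob) + (1 - endog) * np.log(1 - prob))
        score = exog.T.dot(endog - prob)
        return -loglike, -score  # negate both: fmin_l_bfgs_b minimizes

    rng = np.random.RandomState(0)
    exog = np.column_stack([np.ones(200), rng.randn(200)])
    endog = (rng.uniform(size=200) < 0.5).astype(float)
    # With no fprime and approx_grad left False, fmin_l_bfgs_b expects the
    # callable to return the (value, gradient) pair.
    xopt, fopt, info = fmin_l_bfgs_b(negloglike_and_grad, np.zeros(2),
                                     args=(exog, endog))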
From 0474015ad015aa2e92ea26860178deb6e24fd31b Mon Sep 17 00:00:00 2001
From: alex
Date: Tue, 29 Oct 2013 14:06:35 -0400
Subject: [PATCH 2/5] ENH: mle lbfgs now uses simultaneous loglike and score,
 tested with MNLogit

---
 statsmodels/base/model.py                   | 61 +++++++++++++++++----
 statsmodels/discrete/tests/test_discrete.py | 47 ++++++++++++----
 2 files changed, 84 insertions(+), 24 deletions(-)

diff --git a/statsmodels/base/model.py b/statsmodels/base/model.py
index 9d0f5e3432d..c7a704f0f15 100644
--- a/statsmodels/base/model.py
+++ b/statsmodels/base/model.py
@@ -501,6 +501,21 @@ def _fit_mle_bfgs(f, score, start_params, fargs, kwargs, disp=True,
 def _fit_mle_lbfgs(f, score, start_params, fargs, kwargs, disp=True,
                    maxiter=None, callback=None, retall=False,
                    full_output=True, hess=None):
+    """
+    Parameters
+    ----------
+    f : function
+        Returns negative log likelihood given parameters.
+    score : function
+        Returns gradient of negative log likelihood with respect to params.
+
+    Notes
+    -----
+    Within the MLE part of statsmodels, the log likelihood function and its
+    gradient do not follow a single sign convention; signs are normalized
+    here so that fmin_l_bfgs_b always minimizes the negative log likelihood.
+
+    """
 
     # The maxiter may be set at multiple points throughout statsmodels.
     # In the following lines of code, we track how its value changes
@@ -509,31 +524,53 @@ def _fit_mle_lbfgs(f, score, start_params, fargs, kwargs, disp=True,
     if maxiter is None:
         maxiter = 100
 
+    # Use unconstrained optimization by default.
+    bounds = kwargs.setdefault('bounds', [(None, None)] * len(start_params))
+
     # Pass the following keyword argument names through to fmin_l_bfgs_b
     # if they are present in kwargs, otherwise use the fmin_l_bfgs_b
     # default values.
-    names = ('m', 'pgtol', 'factr', 'maxfun', 'approx_grad')
+    names = ('m', 'pgtol', 'factr', 'maxfun', 'epsilon', 'approx_grad')
     extra_kwargs = dict((x, kwargs[x]) for x in names if x in kwargs)
 
-    if extra_kwargs.get('approx_grad', False):
-        score = None
-
-    epsilon = kwargs.setdefault('epsilon', 1e-8)
-    bounds = kwargs.setdefault('bounds', [(None, None)] * len(start_params))
+    # Extract values for the options related to the gradient.
+    approx_grad = extra_kwargs.get('approx_grad', False)
+    loglike_and_score = kwargs.get('loglike_and_score', None)
+    epsilon = kwargs.get('epsilon', None)
+
+    # Choose among three options for dealing with the gradient (the gradient
+    # of a log likelihood function with respect to its parameters
+    # is more specifically called the score in statistics terminology).
+    # The first option is to use the finite-differences
+    # approximation that is built into the fmin_l_bfgs_b optimizer.
+    # The second option is to use the provided score function.
+    # The third option is to use the score component of a provided
+    # function that simultaneously evaluates the log likelihood and score.
+    if epsilon and not approx_grad:
+        raise ValueError('a finite-differences epsilon was provided '
+                         'even though we are not using approx_grad')
+    if approx_grad and (score or loglike_and_score):
+        raise ValueError('gradient approximation was requested '
+                         'even though an analytic score function was given')
+    if loglike_and_score:
+        func = lambda p, *a: tuple(-x for x in loglike_and_score(p, *a))
+    elif score:
+        func = f
+        extra_kwargs['fprime'] = score
+    elif approx_grad:
+        func = f
 
     # Customize the fmin_l_bfgs_b call according to the scipy version.
     # Old scipy does not support maxiter and callback.
     scipy_version_curr = distutils.version.LooseVersion(scipy.__version__)
     scipy_version_12 = distutils.version.LooseVersion('0.12.0')
     if scipy_version_curr < scipy_version_12:
-        retvals = optimize.fmin_l_bfgs_b(f, start_params,
-                                         fprime=score, args=fargs,
-                                         bounds=bounds, epsilon=epsilon, disp=disp, **extra_kwargs)
+        retvals = optimize.fmin_l_bfgs_b(func, start_params,
-                                         args=fargs, bounds=bounds, disp=disp, **extra_kwargs)
     else:
-        retvals = optimize.fmin_l_bfgs_b(f, start_params,
-                                         fprime=score, args=fargs,
+        retvals = optimize.fmin_l_bfgs_b(func, start_params,
                                          maxiter=maxiter, callback=callback,
-                                         bounds=bounds, epsilon=epsilon, disp=disp, **extra_kwargs)
+                                         args=fargs, bounds=bounds, disp=disp, **extra_kwargs)
 
     if full_output:
         xopt, fopt, d = retvals

diff --git a/statsmodels/discrete/tests/test_discrete.py b/statsmodels/discrete/tests/test_discrete.py
index 672f97f1823..2a91b2dd312 100644
--- a/statsmodels/discrete/tests/test_discrete.py
+++ b/statsmodels/discrete/tests/test_discrete.py
@@ -1062,18 +1062,7 @@ def test_bse(self):
 
     test_jac = no_info
 
-class TestMNLogitNewtonBaseZero(CheckModelResults):
-    @classmethod
-    def setupClass(cls):
-        from results.results_discrete import Anes
-        data = sm.datasets.anes96.load()
-        cls.data = data
-        exog = data.exog
-        exog = sm.add_constant(exog, prepend=False)
-        cls.res1 = MNLogit(data.endog, exog).fit(method="newton", disp=0)
-        res2 = Anes()
-        res2.mnlogit_basezero()
-        cls.res2 = res2
+class CheckMNLogitBaseZero(CheckModelResults):
 
     def test_margeff_overall(self):
         me = self.res1.get_margeff()
@@ -1178,6 +1167,40 @@ def test_pred_table(self):
     def test_resid(self):
         assert_array_equal(self.res1.resid_misclassified, self.res2.resid)
 
+
+class TestMNLogitNewtonBaseZero(CheckMNLogitBaseZero):
+    @classmethod
+    def setupClass(cls):
+        from results.results_discrete import Anes
+        data = sm.datasets.anes96.load()
+        cls.data = data
+        exog = data.exog
+        exog = sm.add_constant(exog, prepend=False)
+        cls.res1 = MNLogit(data.endog, exog).fit(method="newton", disp=0)
+        res2 = Anes()
+        res2.mnlogit_basezero()
+        cls.res2 = res2
+
+
+class TestMNLogitLBFGSBaseZero(CheckMNLogitBaseZero):
+    @classmethod
+    def setupClass(cls):
+        from results.results_discrete import Anes
+        data = sm.datasets.anes96.load()
+        cls.data = data
+        exog = data.exog
+        exog = sm.add_constant(exog, prepend=False)
+        mymodel = MNLogit(data.endog, exog)
+        cls.res1 = mymodel.fit(method="lbfgs", disp=0, maxiter=50000,
+                               #m=12, pgtol=1e-7, factr=1e3,  # 5 failures
+                               #m=20, pgtol=1e-8, factr=1e2,  # 3 failures
+                               #m=30, pgtol=1e-9, factr=1e1,  # 1 failure
+                               m=40, pgtol=1e-10, factr=5e0,
+                               loglike_and_score=mymodel.loglike_and_score)
+        res2 = Anes()
+        res2.mnlogit_basezero()
+        cls.res2 = res2
+
+
 def test_perfect_prediction():
     cur_dir = os.path.dirname(os.path.abspath(__file__))
     iris_dir = os.path.join(cur_dir, '..', '..', 'genmod', 'tests', 'results')
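The three-way dispatch above maps onto the three calling conventions that
fmin_l_bfgs_b supports. A condensed standalone illustration with a toy
quadratic objective (hypothetical names, not statsmodels code). One caveat
visible in the patch: if a caller supplies neither a score, approx_grad, nor
loglike_and_score, none of the three branches assigns func and the subsequent
call raises NameError.

    import numpy as np
    from scipy.optimize import fmin_l_bfgs_b

    def neg_loglike(p):            # objective: negative log likelihood
        return np.sum((p - 3.0) ** 2)

    def neg_score(p):              # its analytic gradient
        return 2.0 * (p - 3.0)

    def loglike_and_score(p):      # positive-sign pair, as MNLogit provides
        return -np.sum((p - 3.0) ** 2), -2.0 * (p - 3.0)

    x0 = np.zeros(2)

    # Option 1: finite differences inside the optimizer.
    x1, f1, _ = fmin_l_bfgs_b(neg_loglike, x0, approx_grad=True)

    # Option 2: objective and analytic gradient as separate callables.
    x2, f2, _ = fmin_l_bfgs_b(neg_loglike, x0, fprime=neg_score)

    # Option 3: one callable returns (value, gradient); negate both pieces
    # so the minimizer sees the negative log likelihood and its gradient.
    func = lambda p: tuple(-v for v in loglike_and_score(p))
    x3, f3, _ = fmin_l_bfgs_b(func, x0)

All three runs converge to p = [3, 3]; they differ only in how the gradient
reaches the optimizer.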
From 81a6f528cdb0af21c65c25587380d74ad15c124d Mon Sep 17 00:00:00 2001
From: alex
Date: Tue, 29 Oct 2013 14:59:32 -0400
Subject: [PATCH 3/5] MAINT: change default maxiter to match docstring

---
 statsmodels/tsa/arima_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/statsmodels/tsa/arima_model.py b/statsmodels/tsa/arima_model.py
index 0ef4d1f82cd..6d8c8fd2a98 100644
--- a/statsmodels/tsa/arima_model.py
+++ b/statsmodels/tsa/arima_model.py
@@ -954,7 +954,7 @@ def _get_predict_end(self, end, dynamic=False):
         return end - self.k_diff, out_of_sample
 
     def fit(self, start_params=None, trend='c', method = "css-mle",
-            transparams=True, solver='lbfgs', maxiter=35, full_output=1,
+            transparams=True, solver='lbfgs', maxiter=50, full_output=1,
             disp=5, callback=None, **kwargs):
         """
         Fits ARIMA(p,d,q) model by exact maximum likelihood via Kalman filter.

From 907122b797b4bf19ad3cf5e25686bc40c4464c51 Mon Sep 17 00:00:00 2001
From: alex
Date: Tue, 29 Oct 2013 15:03:26 -0400
Subject: [PATCH 4/5] MAINT: do not tell lbfgs to approximate the gradient if
 we are already passing it a score function

---
 statsmodels/tsa/arima_model.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/statsmodels/tsa/arima_model.py b/statsmodels/tsa/arima_model.py
index 6d8c8fd2a98..e902b7f9077 100644
--- a/statsmodels/tsa/arima_model.py
+++ b/statsmodels/tsa/arima_model.py
@@ -861,10 +861,9 @@ def fit(self, order=None, start_params=None, trend='c', method = "css-mle",
         kwargs.setdefault('pgtol', 1e-8)
         kwargs.setdefault('factr', 1e2)
         kwargs.setdefault('m', 12)
-        kwargs.setdefault('approx_grad', True)
         mlefit = super(ARMA, self).fit(start_params, method=solver,
                        maxiter=maxiter, full_output=full_output, disp=disp,
-                       callback = callback, **kwargs)
+                       callback=callback, **kwargs)
         self.mlefit = mlefit
         params = mlefit.params
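Patches 4 and 5 hinge on how dict.setdefault interacts with caller-supplied
options: it only fills a key that is absent, so restoring the approx_grad
default in patch 5 still lets an explicit caller value win. A quick
illustration with hypothetical values:

    kwargs = {'factr': 1e3, 'approx_grad': False}  # caller's choices
    kwargs.setdefault('factr', 1e2)          # ignored: key already present
    kwargs.setdefault('m', 12)               # filled in: key was absent
    kwargs.setdefault('approx_grad', True)   # ignored: caller said False
    assert kwargs == {'factr': 1e3, 'approx_grad': False, 'm': 12}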
From 3b14099a1969d55400d9898e5336a1eb9b1649ba Mon Sep 17 00:00:00 2001
From: alex
Date: Sat, 2 Nov 2013 10:35:44 -0400
Subject: [PATCH 5/5] MAINT: restore approx_grad flag superpowers

---
 statsmodels/base/model.py      | 19 ++++++++-----------
 statsmodels/tsa/arima_model.py |  1 +
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/statsmodels/base/model.py b/statsmodels/base/model.py
index c7a704f0f15..b63a1bc9f12 100644
--- a/statsmodels/base/model.py
+++ b/statsmodels/base/model.py
@@ -499,7 +499,7 @@ def _fit_mle_bfgs(f, score, start_params, fargs, kwargs, disp=True,
 
 
 def _fit_mle_lbfgs(f, score, start_params, fargs, kwargs, disp=True,
-                   maxiter=None, callback=None, retall=False,
+                   maxiter=100, callback=None, retall=False,
                    full_output=True, hess=None):
     """
     Parameters
@@ -517,13 +517,6 @@ def _fit_mle_lbfgs(f, score, start_params, fargs, kwargs, disp=True,
 
     """
 
-    # The maxiter may be set at multiple points throughout statsmodels.
-    # In the following lines of code, we track how its value changes
-    # between layers.
-    maxiter_ = maxiter
-    if maxiter is None:
-        maxiter = 100
-
     # Use unconstrained optimization by default.
     bounds = kwargs.setdefault('bounds', [(None, None)] * len(start_params))
 
@@ -534,10 +527,14 @@ def _fit_mle_lbfgs(f, score, start_params, fargs, kwargs, disp=True,
     extra_kwargs = dict((x, kwargs[x]) for x in names if x in kwargs)
 
     # Extract values for the options related to the gradient.
-    approx_grad = extra_kwargs.get('approx_grad', False)
+    approx_grad = kwargs.get('approx_grad', False)
     loglike_and_score = kwargs.get('loglike_and_score', None)
     epsilon = kwargs.get('epsilon', None)
 
+    # The approx_grad flag has superpowers: it nullifies the score argument.
+    if approx_grad:
+        score = None
+
     # Choose among three options for dealing with the gradient (the gradient
     # of a log likelihood function with respect to its parameters
     # is more specifically called the score in statistics terminology).
@@ -549,9 +546,9 @@ def _fit_mle_lbfgs(f, score, start_params, fargs, kwargs, disp=True,
     if epsilon and not approx_grad:
         raise ValueError('a finite-differences epsilon was provided '
                          'even though we are not using approx_grad')
-    if approx_grad and (score or loglike_and_score):
+    if approx_grad and loglike_and_score:
         raise ValueError('gradient approximation was requested '
-                         'even though an analytic score function was given')
+                         'even though an analytic loglike_and_score function was given')
     if loglike_and_score:
         func = lambda p, *a: tuple(-x for x in loglike_and_score(p, *a))
     elif score:

diff --git a/statsmodels/tsa/arima_model.py b/statsmodels/tsa/arima_model.py
index e902b7f9077..fbf977df720 100644
--- a/statsmodels/tsa/arima_model.py
+++ b/statsmodels/tsa/arima_model.py
@@ -861,6 +861,7 @@ def fit(self, order=None, start_params=None, trend='c', method = "css-mle",
         kwargs.setdefault('pgtol', 1e-8)
         kwargs.setdefault('factr', 1e2)
         kwargs.setdefault('m', 12)
+        kwargs.setdefault('approx_grad', True)
         mlefit = super(ARMA, self).fit(start_params, method=solver,
                        maxiter=maxiter, full_output=full_output, disp=disp,
                        callback=callback, **kwargs)
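Taken together, the series enables an MNLogit fit via lbfgs with the analytic
combined objective, mirroring the new TestMNLogitLBFGSBaseZero test above (a
sketch; uses the anes96 dataset shipped with statsmodels):

    import statsmodels.api as sm
    from statsmodels.discrete.discrete_model import MNLogit

    data = sm.datasets.anes96.load()
    exog = sm.add_constant(data.exog, prepend=False)
    model = MNLogit(data.endog, exog)
    # loglike_and_score is forwarded through kwargs to _fit_mle_lbfgs, which
    # negates both outputs and hands the pair to fmin_l_bfgs_b.
    res = model.fit(method="lbfgs", disp=0, maxiter=50000,
                    m=40, pgtol=1e-10, factr=5e0,
                    loglike_and_score=model.loglike_and_score)
    print(res.params)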