Merge 7664e04 into 4b55fa4

statsmodels · Jun 2, 2015 · 0a77d67 · 0a77d67
2 parents 4b55fa4 + 7664e04
commit 0a77d67
Show file tree

Hide file tree

Showing 4 changed files with 602 additions and 271 deletions.
diff --git a/statsmodels/tsa/statespace/mlemodel.py b/statsmodels/tsa/statespace/mlemodel.py
@@ -726,6 +726,189 @@ def zvalues(self):
         """
         return self.params / self.bse
 
+    def test_normality(self):
+        """
+        Jarque-Bera test for normaility of standardized residuals.
+
+        Notes
+        -----
+        If the first `d` loglikelihood values were burned (i.e. in the
+        specified model, `loglikelihood_burn=d`), then this test is calculated
+        ignoring the first `d` residuals.
+        """
+        from statsmodels.stats.stattools import jarque_bera
+        return np.array(jarque_bera(
+            self.standardized_forecasts_error[:, self.loglikelihood_burn:],
+            axis=1
+        )).transpose()
+
+    def test_heteroskedasticity(self, alternative='two-sided',
+                                asymptotic=False):
+        """
+        Sum-of-squares test for heteroskedasticity of standardized residuals
+
+        Tests whether the sum-of-squares in the first third of the sample is
+        significantly different than the sum-of-squares in the last third
+        of the sample. Analogous to a Goldfeld-Quandt test.
+
+        Parameters
+        ----------
+        alternative : string, 'increasing', 'decreasing' or 'two-sided'
+            This specifies the alternative for the p-value calculation. Default
+            is two-sided.
+        asymptotic : boolean, optional
+            Whether or not to compare against the asymptotic distribution
+            (chi-squared) or the approximate small-sample distribution (F).
+            Default is False (i.e. default is to compare against an F
+            distribution).
+
+        Notes
+        -----
+        The null hypothesis is of no heteroskedasticity. That means different
+        things depending on which alternative is selected:
+
+        - Increasing: Null hypothesis is that the variance is not increasing
+          throughout the sample; that the sum-of-squares in the later
+          subsample is *not* greater than the sum-of-squares in the earlier
+          subsample.
+        - Decreasing: Null hypothesis is that the variance is not decreasing
+          throughout the sample; that the sum-of-squares in the earlier
+          subsample is *not* greater than the sum-of-squares in the later
+          subsample.
+        - Two-sided: Null hypothesis is that the variance is not changing
+          throughout the sample. Both that the sum-of-squares in the earlier
+          subsample is not greater than the sum-of-squares in the later
+          subsample *and* that the sum-of-squares in the later subsample is
+          not greater than the sum-of-squares in the earlier subsample.
+
+        For :math:`h = [T/3]`, the test statistic is:
+
+        .. math::
+
+            H(h) = \sum_{t=T-h+1}^T  \tilde v_t^2
+            \left / \sum_{t=d+1}^{d+1+h} \tilde v_t^2 \right .
+
+        where :math:`d` is the number of periods in which the loglikelihood was
+        burned in the parent model (usually corresponding to diffuse
+        initialization).
+
+        This statistic can be tested against an :math:`F(h,h)` distribution.
+        Alternatively, :math:`h H(h)` is asymptotically distributed according
+        to :math:`\chi_h^2`; this second test can be applied by passing
+        `asymptotic=True` as an argument.
+
+        See section 5.4 of [1]_ for the above formula and discussion, as well
+        as additional details.
+
+        TODO
+
+        - Allow specification of :math:`h`
+
+        Returns
+        -------
+        output : array
+            An array with `(test_statistic, pvalue)` for each endogenous
+            variable. The array is then sized `(k_endog, 2)`. If the method is
+            called as `het = res.test_heteroskedasticity()`, then `het[0]` is
+            an array of size 2 corresponding to the first endogenous variable,
+            where `het[0][0]` is the test statistic, and `het[0][1]` is the
+            p-value.
+
+        References
+        ----------
+        .. [1] Harvey, Andrew C. 1990.
+           Forecasting, Structural Time Series Models and the Kalman Filter.
+           Cambridge University Press.
+
+        """
+        # Store some values
+        squared_resid = self.standardized_forecasts_error**2
+        d = self.loglikelihood_burn
+        h = np.round((self.nobs - d) / 3)
+
+        # Calculate the test statistics for each endogenous variable
+        test_statistics = np.array([
+            np.sum(squared_resid[i][-h:]) / np.sum(squared_resid[i][d:d+h])
+            for i in range(self.k_endog)
+        ])
+
+        # Setup functions to calculate the p-values
+        if not asymptotic:
+            from scipy.stats import f
+            pval_lower = lambda test_statistics: f.cdf(test_statistics, h, h)
+            pval_upper = lambda test_statistics: f.sf(test_statistics, h, h)
+        else:
+            from scipy.stats import chi2
+            pval_lower = lambda test_statistics: chi2.cdf(h*test_statistics, h)
+            pval_upper = lambda test_statistics: chi2.sf(h*test_statistics, h)
+
+        # Calculate the one- or two-sided p-values
+        alternative = alternative.lower()
+        if alternative in ['i', 'inc', 'increasing']:
+            p_values = pval_upper(test_statistics)
+        elif alternative in ['d', 'dec', 'decreasing']:
+            test_statistics = 1. / test_statistics
+            p_values = pval_upper(test_statistics)
+        elif alternative in ['2', '2-sided', 'two-sided']:
+            p_values = 2 * np.minimum(
+                pval_lower(test_statistics),
+                pval_upper(test_statistics)
+            )
+        else:
+            raise ValueError('Invalid alternative.')
+
+        return np.c_[test_statistics, p_values]
+
+    def test_serial_correlation(self, lags=None, boxpierce=False):
+        """
+        Ljung-box test for no serial correlation of standardized residuals
+
+        Null hypothesis is no serial correlation.
+
+        Parameters
+        ----------
+        lags : None, int or array_like
+            If lags is an integer then this is taken to be the largest lag
+            that is included, the test result is reported for all smaller lag
+            length.
+            If lags is a list or array, then all lags are included up to the
+            largest lag in the list, however only the tests for the lags in the
+            list are reported.
+            If lags is None, then the default maxlag is 12*(nobs/100)^{1/4}
+        boxpierce : {False, True}
+            If true, then additional to the results of the Ljung-Box test also
+            the Box-Pierce test results are returned.
+
+        Returns
+        -------
+        output : array
+            An array with `(test_statistic, pvalue)` for each endogenous
+            variable and each lag. The array is then sized
+            `(k_endog, 2, lags)`. If the method is called as
+            `ljungbox = res.test_serial_correlation()`, then `ljungbox[i]`
+            holds the results of the Ljung-Box test (as would be returned by
+            `statsmodels.stats.diagnostic.acorr_ljungbox`) for the `i`th
+            endogenous variable.
+
+        Notes
+        -----
+        If the first `d` loglikelihood values were burned (i.e. in the
+        specified model, `loglikelihood_burn=d`), then this test is calculated
+        ignoring the first `d` residuals.
+
+        See Also
+        --------
+        statsmodels.stats.diagnostic.acorr_ljungbox
+
+        """
+        from statsmodels.stats.diagnostic import acorr_ljungbox
+        return np.c_[[
+            acorr_ljungbox(
+                self.standardized_forecasts_error[i][self.loglikelihood_burn:]
+            )
+            for i in range(self.k_endog)
+        ]]
+
     def predict(self, start=None, end=None, dynamic=False, full_results=False,
                 **kwargs):
         """
@@ -838,6 +1021,86 @@ def forecast(self, steps=1, **kwargs):
         """
         return self.predict(start=self.nobs, end=self.nobs+steps-1, **kwargs)
 
+    def plot_diagnostics(self, variable=0, lags=10, fig=None, figsize=None):
+        """
+        Diagnostic plots for the standardized residual assocaited with one
+        endogenous variable.
+
+        Parameters
+        ----------
+        variable : integer, optional
+            Index of the endogenous variable for which the diagnostic plots
+            should be created. Default is 0.
+        lags : integer, optional
+            Number of lags to include in the correlogram. Default is 10.
+        fig : Matplotlib Figure instance, optional
+            If given, subplots are created in this figure instead of in a new
+            figure. Note that the 2x2 grid will be created in the provided
+            figure using `fig.add_subplot()`.
+        figsize : tuple, optional
+            If a figure is created, this argument allows specifying a size.
+            The tuple is (width, height).
+
+        Notes
+        -----
+        Produces a 2x2 plot grid with the following plots (ordered clockwise
+        from top left):
+
+        1. Standardized residuals over time
+        2. Histogram plus estimated density of standardized residulas, along
+           with a Normal(0,1) density plotted for reference.
+        3. Normal Q-Q plot, with Normal reference line.
+        4. Correlogram
+
+        """
+        from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
+        _ = _import_mpl()
+        fig = create_mpl_fig(fig, figsize)
+        # Eliminate residuals associated with burned likelihoods
+        resid = self.standardized_forecasts_error[variable,
+                                                  self.loglikelihood_burn:]
+
+        # Top-left: residuals vs time
+        ax = fig.add_subplot(221)
+        if hasattr(self.data, 'dates') and self.data.dates is not None:
+            x = self.data.dates[self.loglikelihood_burn:]._mpl_repr()
+        else:
+            x = np.arange(len(resid))
+        ax.plot(x, resid)
+        ax.hlines(0, x[0], x[-1], alpha=0.5)
+        ax.set_xlim(x[0], x[-1])
+        ax.set_title('Standardized residual')
+
+        # Top-right: histogram, Gaussian kernel density, Normal density
+        ax = fig.add_subplot(222)
+        ax.hist(resid, normed=True, label='Hist')
+        from scipy.stats import gaussian_kde, norm
+        kde = gaussian_kde(resid)
+        xlim = (-1.96*2, 1.96*2)
+        x = np.linspace(xlim[0], xlim[1])
+        ax.plot(x, kde(x), label='KDE')
+        ax.plot(x, norm.pdf(x), label='N(0,1)')
+        ax.set_xlim(xlim)
+        ax.legend()
+        ax.set_title('Histogram plus estimated density')
+
+        # Bottom-left: QQ plot
+        ax = fig.add_subplot(223)
+        from statsmodels.graphics.gofplots import qqplot
+        qqplot(resid, line='s', ax=ax)
+        ax.set_title('Normal Q-Q')
+
+        # Bottom-right: Correlogram
+        ax = fig.add_subplot(224)
+        from statsmodels.graphics.tsaplots import plot_acf
+        plot_acf(resid, ax=ax, lags=10)
+        ax.set_title('Correlogram')
+
+        ax.set_ylim(-1,1)
+
+        return fig
+
+
     def summary(self, alpha=.05, start=None, model_name=None):
         """
         Summarize the Model
@@ -862,6 +1125,8 @@ def summary(self, alpha=.05, start=None, model_name=None):
         statsmodels.iolib.summary.Summary
         """
         from statsmodels.iolib.summary import Summary
+
+        # Model specification results
         model = self.model
         title = 'Statespace Model Results'
 
@@ -879,6 +1144,13 @@ def summary(self, alpha=.05, start=None, model_name=None):
         if model_name is None:
             model_name = model.__class__.__name__
 
+        # Diagnostic tests results
+        het = self.test_heteroskedasticity()
+        lb = self.test_serial_correlation()
+        jb = self.test_normality()
+
+        # Create the tables
+
         top_left = [
             ('Dep. Variable:', None),
             ('Model:', [model_name]),
@@ -896,10 +1168,25 @@ def summary(self, alpha=.05, start=None, model_name=None):
             ('HQIC', ["%#5.3f" % self.hqic])
         ]
 
+        format_str = lambda array: [', '.join(['{:.2f}'.format(i) for i in array])]
+        diagn_left = [('Ljung-Box (Q):', format_str(lb[:,0,-1])),
+                      ('Prob(Q):', format_str(lb[:,1,-1])),
+                      ('Heteroskedasticity (H):', format_str(het[:,0])),
+                      ('Prob(H) (two-sided):', format_str(het[:,1]))
+                      ]
+
+        diagn_right = [('Jarque-Bera (JB):', format_str(jb[:,0])),
+                       ('Prob(JB):', format_str(jb[:,1])),
+                       ('Skew:', format_str(jb[:,2])),
+                       ('Kurtosis:', format_str(jb[:,3]))
+                       ]
+
         summary = Summary()
         summary.add_table_2cols(self, gleft=top_left, gright=top_right,
                                 title=title)
         summary.add_table_params(self, alpha=alpha, xname=self._param_names,
                                  use_t=False)
+        summary.add_table_2cols(self, gleft=diagn_left, gright=diagn_right,
+                                title="")
 
         return summary