Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Silverman enhancement - Issue #1243 #1271

Merged
merged 12 commits into from Dec 30, 2013
58 changes: 58 additions & 0 deletions statsmodels/examples/ex_kde_normalreference.py
@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
"""
Author: Padarn Wilson

Performance of normal reference plug-in estimator vs silverman. Sample is drawn
from a mixture of gaussians. Distribution has been chosen to be reasonably
to normal.
"""

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import statsmodels.nonparametric.api as npar
from statsmodels.sandbox.nonparametric import kernels
from statsmodels.distributions.mixture_rvs import mixture_rvs

# example from test_kde.py mixture of two normal distributions
np.random.seed(12345)
x = mixture_rvs([.1, .9], size=200, dist=[stats.norm, stats.norm],
kwargs=(dict(loc=0, scale=.5), dict(loc=1, scale=.5)))

kde = npar.KDEUnivariate(x)


# Human-readable kernel names and the matching statsmodels kernel codes.
# The two lists must stay index-aligned: kernel_names[i] labels the subplot
# that is fitted with kernel code kernel_switch[i].
# Fix: 'tri' (Triangular) and 'biw' (Biweight) were swapped relative to
# kernel_names, so the Biweight and Triangular subplots were mislabeled.
kernel_names = ['Gaussian', 'Epanechnikov', 'Biweight',
                'Triangular', 'Triweight', 'Cosine'
                ]

kernel_switch = ['gau', 'epa', 'biw', 'tri',
                 'triw', 'cos'
                 ]


def true_pdf(x):
    """Exact density of the mixture 0.1 * N(0, 0.5) + 0.9 * N(1, 0.5).

    Matches the distribution the sample ``x`` above was drawn from, so it
    serves as the ground truth for the KDE comparison plots.
    """
    components = ((0.1, 0, 0.5), (0.9, 1, 0.5))
    return sum(weight * stats.norm.pdf(x, loc=mean, scale=sd)
               for weight, mean, sd in components)

# One subplot per kernel: histogram of the sample plus the KDE fitted with
# the 'silverman' and 'normal_reference' bandwidths, against the true pdf.
fig = plt.figure()
for ii, kn in enumerate(kernel_switch):

    ax = fig.add_subplot(2, 3, ii + 1)  # without uniform
    # NOTE(review): `normed=` was removed from newer matplotlib in favor of
    # `density=` — confirm against the matplotlib version in use.
    ax.hist(x, bins=20, normed=True, alpha=0.25)

    # Silverman rule-of-thumb bandwidth (fft=False so non-gaussian kernels work).
    kde.fit(kernel=kn, bw='silverman', fft=False)
    ax.plot(kde.support, kde.density)

    # Kernel-specific normal-reference plug-in bandwidth.
    kde.fit(kernel=kn, bw='normal_reference', fft=False)
    ax.plot(kde.support, kde.density)

    ax.plot(kde.support, true_pdf(kde.support), color='black', linestyle='--')

    ax.set_title(kernel_names[ii])


# These apply only to the last subplot; its kernel title is overwritten
# with the sample-size label.
ax.legend(['silverman', 'normal reference', 'true pdf'], loc='lower right')
ax.set_title('200 points')
plt.show()
69 changes: 62 additions & 7 deletions statsmodels/nonparametric/bandwidths.py
@@ -1,5 +1,7 @@
import numpy as np
from scipy.stats import scoreatpercentile as sap
from statsmodels.sandbox.nonparametric import kernels


#from scipy.stats import norm

Expand All @@ -20,14 +22,16 @@ def _select_sigma(X):


## Univariate Rule of Thumb Bandwidths ##
def bw_scott(x):
def bw_scott(x, kernel=None):
"""
Scott's Rule of Thumb

Parameters
----------
x : array-like
Array for which to get the bandwidth
kernel : CustomKernel object
Unused

Returns
-------
Expand All @@ -49,16 +53,18 @@ def bw_scott(x):
"""
A = _select_sigma(x)
n = len(x)
return 1.059 * A * n ** -.2
return 1.059 * A * n ** (-0.2)

def bw_silverman(x):
def bw_silverman(x, kernel=None):
"""
Silverman's Rule of Thumb

Parameters
----------
x : array-like
Array for which to get the bandwidth
kernel : CustomKernel object
Unused

Returns
-------
Expand All @@ -79,15 +85,64 @@ def bw_silverman(x):
"""
A = _select_sigma(x)
n = len(x)
return .9 * A * n ** -.2
return .9 * A * n ** (-0.2)


def bw_normal_reference(x, kernel=kernels.Gaussian):
    """
    Plug-in bandwidth with kernel specific constant based on normal reference.

    This bandwidth minimizes the mean integrated square error if the true
    distribution is the normal. This choice is an appropriate bandwidth for
    single peaked distributions that are similar to the normal distribution.

    Parameters
    ----------
    x : array-like
        Array for which to get the bandwidth
    kernel : CustomKernel object
        Used to calculate the constant for the plug-in bandwidth.

    Returns
    -------
    bw : float
        The estimate of the bandwidth

    Notes
    -----
    Returns C * A * n ** (-1/5.) where ::

       A = min(std(x, ddof=1), IQR/1.349)
       IQR = q3 - q1, the interquartile range of x
       C = constant from Hansen (2009)

    When using a Gaussian kernel this is equivalent to the 'scott' bandwidth
    up to two decimal places. This is the accuracy to which the 'scott'
    constant is specified.

    References
    ----------

    Silverman, B.W. (1986) `Density Estimation.`
    Hansen, B.E. (2009) `Lecture Notes on Nonparametrics.`
    """
    # Kernel-specific constant (Hansen 2009 normal-reference constant).
    C = kernel.normal_reference_constant
    # Robust scale estimate: min(sample std, IQR/1.349).
    A = _select_sigma(x)
    n = len(x)
    return C * A * n ** (-0.2)

## Plug-In Methods ##

## Least Squares Cross-Validation ##

## Helper Functions ##

bandwidth_funcs = dict(scott=bw_scott,silverman=bw_silverman)
bandwidth_funcs = {
"scott": bw_scott,
"silverman": bw_silverman,
"normal_reference": bw_normal_reference
}


def select_bandwidth(x, bw, kernel):
"""
Expand All @@ -111,11 +166,11 @@ def select_bandwidth(x, bw, kernel):

"""
bw = bw.lower()
if bw not in ["scott","silverman"]:
if bw not in ["scott","silverman","normal_reference"]:
raise ValueError("Bandwidth %s not understood" % bw)
#TODO: uncomment checks when we have non-rule of thumb bandwidths for diff. kernels
# if kernel == "gauss":
return bandwidth_funcs[bw](x)
return bandwidth_funcs[bw](x, kernel)
# else:
# raise ValueError("Only Gaussian Kernels are currently supported")

25 changes: 18 additions & 7 deletions statsmodels/nonparametric/kde.py
Expand Up @@ -82,7 +82,7 @@ class KDEUnivariate(object):
def __init__(self, endog):
self.endog = np.asarray(endog)

def fit(self, kernel="gau", bw="scott", fft=True, weights=None,
def fit(self, kernel="gau", bw="normal_reference", fft=True, weights=None,
gridsize=None, adjust=1, cut=3, clip=(-np.inf, np.inf)):
"""
Attach the density estimate to the KDEUnivariate class.
Expand All @@ -107,6 +107,9 @@ def fit(self, kernel="gau", bw="scott", fft=True, weights=None,
`min(std(X),IQR/1.34)`
- "silverman" - .9 * A * nobs ** (-1/5.), where A is
`min(std(X),IQR/1.34)`
- "normal_reference" - C * A * nobs ** (-1/5.), where C is
calculated from the kernel. Equivalent (up to 2 dp) to the
"scott" bandwidth for gaussian kernels. See bandwidths.py
- If a float is given, it is the bandwidth.

fft : bool
Expand Down Expand Up @@ -270,7 +273,7 @@ def __init__(self, endog):

#### Kernel Density Estimator Functions ####

def kdensity(X, kernel="gau", bw="scott", weights=None, gridsize=None,
def kdensity(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None,
adjust=1, clip=(-np.inf,np.inf), cut=3, retgrid=True):
"""
Rosenblatt-Parzen univariate kernel density estimator.
Expand Down Expand Up @@ -345,11 +348,14 @@ def kdensity(X, kernel="gau", bw="scott", weights=None, gridsize=None,
weights = weights[clip_x.squeeze()]
q = weights.sum()

# Get kernel object corresponding to selection
kern = kernel_switch[kernel]()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the same is on line 365, which looks now redundant since it's already here. ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On line 365 it is

kernel_switch[kernel](h=bw)

So that the kernel is given it's calculated bandwidth. But this could be replaced by

kern.setH(bw)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok I got confused, if setH works, then that's better
otherwise it would also be possible to get the kernel class just once kernel_class = kernel_switch[kernel] or something like that.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, now that I look at this I'm a bit concerned there is an error in kernels.py. The kernel's domain does not seem to be affected by the bandwidth (no matter how you set it), this can't be right.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm actually maybe it is right. Will just use seth then.


# if bw is None, select optimal bandwidth for kernel
try:
bw = float(bw)
except:
bw = bandwidths.select_bandwidth(X, bw, kernel)
bw = bandwidths.select_bandwidth(X, bw, kern)
bw *= adjust

a = np.min(X,axis=0) - cut*bw
Expand All @@ -358,8 +364,9 @@ def kdensity(X, kernel="gau", bw="scott", weights=None, gridsize=None,

k = (X.T - grid[:,None])/bw # uses broadcasting to make a gridsize x nobs

# instantiate kernel class
kern = kernel_switch[kernel](h=bw)
# set kernel bandwidth
kern.seth(bw)

# truncate to domain
if kern.domain is not None: # won't work for piecewise kernels like parzen
z_lo, z_high = kern.domain
Expand All @@ -378,7 +385,7 @@ def kdensity(X, kernel="gau", bw="scott", weights=None, gridsize=None,
else:
return dens, bw

def kdensityfft(X, kernel="gau", bw="scott", weights=None, gridsize=None,
def kdensityfft(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None,
adjust=1, clip=(-np.inf,np.inf), cut=3, retgrid=True):
"""
Rosenblatt-Parzen univariate kernel density estimator
Expand Down Expand Up @@ -452,10 +459,14 @@ def kdensityfft(X, kernel="gau", bw="scott", weights=None, gridsize=None,
X = np.asarray(X)
X = X[np.logical_and(X>clip[0], X<clip[1])] # won't work for two columns.
# will affect underlying data?

# Get kernel object corresponding to selection
kern = kernel_switch[kernel]()

try:
bw = float(bw)
except:
bw = bandwidths.select_bandwidth(X, bw, kernel) # will cross-val fit this pattern?
bw = bandwidths.select_bandwidth(X, bw, kern) # will cross-val fit this pattern?
bw *= adjust

nobs = float(len(X)) # after trim
Expand Down
78 changes: 78 additions & 0 deletions statsmodels/nonparametric/tests/test_bandwidths.py
@@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
"""

Tests for bandwidth selection and calculation.

Author: Padarn Wilson
"""

import numpy as np
from scipy import stats

from statsmodels.sandbox.nonparametric import kernels
from statsmodels.distributions.mixture_rvs import mixture_rvs
from statsmodels.nonparametric.kde import KDEUnivariate as KDE
from statsmodels.nonparametric.bandwidths import select_bandwidth



from numpy.testing import assert_allclose

# setup test data
# Fixed seed so the expected bandwidth values below are reproducible; the
# sample is a bimodal mixture of two normals.

np.random.seed(12345)
Xi = mixture_rvs([.25,.75], size=200, dist=[stats.norm, stats.norm],
                 kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.5)))

class TestBandwidthCalculation(object):
    """Check the rule-of-thumb bandwidth selectors against known values."""

    def test_calculate_bandwidth_gaussian(self):
        # Reference bandwidths computed for the fixed-seed sample Xi above.
        bw_expected = [0.29774853596742024,
                       0.25304408155871411,
                       0.29781147113698891]

        kern = kernels.Gaussian()
        bw_calc = [select_bandwidth(Xi, bw_name, kern)
                   for bw_name in ('scott', 'silverman', 'normal_reference')]

        assert_allclose(bw_expected, bw_calc)


class CheckNormalReferenceConstant(object):
    """Mixin comparing a kernel's normal_reference_constant to a tabulated value.

    Subclasses provide ``kern`` (a kernel instance) and ``constant`` (the
    reference value, given to two decimal places — hence the loose rtol).
    """

    def test_calculate_normal_reference_constant(self):
        assert_allclose(self.constant, self.kern.normal_reference_constant,
                        1e-2)


class TestEpanechnikov(CheckNormalReferenceConstant):
    # Tabulated normal-reference constant for the Epanechnikov kernel
    # (2 decimal places; see Hansen (2009) lecture notes).
    kern = kernels.Epanechnikov()
    constant = 2.34


class TestGaussian(CheckNormalReferenceConstant):
    # Tabulated normal-reference constant for the Gaussian kernel
    # (matches the 'scott' constant 1.059 to 2 decimal places).
    kern = kernels.Gaussian()
    constant = 1.06


class TestBiweight(CheckNormalReferenceConstant):
    # Tabulated normal-reference constant for the Biweight kernel
    # (2 decimal places; see Hansen (2009) lecture notes).
    kern = kernels.Biweight()
    constant = 2.78


class TestTriweight(CheckNormalReferenceConstant):
    # Tabulated normal-reference constant for the Triweight kernel
    # (2 decimal places; see Hansen (2009) lecture notes).
    kern = kernels.Triweight()
    constant = 3.15


if __name__ == "__main__":
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb'], exit=False)
3 changes: 2 additions & 1 deletion statsmodels/nonparametric/tests/test_kde.py
Expand Up @@ -138,7 +138,8 @@ def setupClass(cls):
cls.x = x = KDEWResults['x']
weights = KDEWResults['weights']
res1 = KDE(x)
res1.fit(kernel=cls.kernel_name, weights=weights, fft=False)
# default kernel was scott when reference values computed
res1.fit(kernel=cls.kernel_name, weights=weights, fft=False, bw="scott")
cls.res1 = res1
cls.res_density = KDEWResults[cls.res_kernel_name]

Expand Down
4 changes: 2 additions & 2 deletions statsmodels/nonparametric/tests/test_kernel_density.py
Expand Up @@ -66,7 +66,7 @@ class TestKDEUnivariate(MyTest):
def test_pdf_non_fft(self):

kde = nparam.KDEUnivariate(self.noise)
kde.fit(fft=False)
kde.fit(fft=False, bw='scott')


grid = kde.support
Expand All @@ -92,7 +92,7 @@ def test_pdf_non_fft(self):
def test_weighted_pdf_non_fft(self):

kde = nparam.KDEUnivariate(self.noise)
kde.fit(weights=self.weights, fft=False)
kde.fit(weights=self.weights, fft=False, bw='scott')

grid = kde.support
testx = [grid[10*i] for i in range(6)]
Expand Down