Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Silverman enhancement - Issue #1243 #1271

Merged
merged 12 commits into from Dec 30, 2013
58 changes: 58 additions & 0 deletions statsmodels/examples/ex_kde_normalreference.py
@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
"""
Author: Padarn Wilson

Performance of normal reference plug-in estimator vs silverman. Sample is drawn
from a mixture of gaussians. Distribution has been chosen to be reasonably
to normal.
"""

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import statsmodels.nonparametric.api as npar
from statsmodels.sandbox.nonparametric import kernels
from statsmodels.distributions.mixture_rvs import mixture_rvs

# example from test_kde.py mixture of two normal distributions
np.random.seed(12345)
x = mixture_rvs([.1, .9], size=200, dist=[stats.norm, stats.norm],
kwargs=(dict(loc=0, scale=.5), dict(loc=1, scale=.5)))

kde = npar.KDEUnivariate(x)


# Human-readable kernel names and the matching statsmodels kernel codes.
# The two lists must stay index-aligned: kernel_names[i] labels the subplot
# that is fitted with kernel code kernel_switch[i].
# Fix: 'tri' (Triangular) and 'biw' (Biweight) were swapped relative to
# kernel_names, so the Biweight and Triangular subplots were mislabeled.
kernel_names = ['Gaussian', 'Epanechnikov', 'Biweight',
                'Triangular', 'Triweight', 'Cosine'
                ]

kernel_switch = ['gau', 'epa', 'biw', 'tri',
                 'triw', 'cos'
                 ]


def true_pdf(x):
    """Exact density of the mixture 0.1 * N(0, 0.5) + 0.9 * N(1, 0.5).

    Matches the distribution the sample ``x`` above was drawn from, so it
    serves as the ground truth for the KDE comparison plots.
    """
    components = ((0.1, 0, 0.5), (0.9, 1, 0.5))
    return sum(weight * stats.norm.pdf(x, loc=mean, scale=sd)
               for weight, mean, sd in components)

# One subplot per kernel: histogram of the sample plus the KDE fitted with
# the 'silverman' and 'normal_reference' bandwidths, against the true pdf.
fig = plt.figure()
for ii, kn in enumerate(kernel_switch):

    ax = fig.add_subplot(2, 3, ii + 1)  # without uniform
    # NOTE(review): `normed=` was removed from newer matplotlib in favor of
    # `density=` — confirm against the matplotlib version in use.
    ax.hist(x, bins=20, normed=True, alpha=0.25)

    # Silverman rule-of-thumb bandwidth (fft=False so non-gaussian kernels work).
    kde.fit(kernel=kn, bw='silverman', fft=False)
    ax.plot(kde.support, kde.density)

    # Kernel-specific normal-reference plug-in bandwidth.
    kde.fit(kernel=kn, bw='normal_reference', fft=False)
    ax.plot(kde.support, kde.density)

    ax.plot(kde.support, true_pdf(kde.support), color='black', linestyle='--')

    ax.set_title(kernel_names[ii])


# These apply only to the last subplot; its kernel title is overwritten
# with the sample-size label.
ax.legend(['silverman', 'normal reference', 'true pdf'], loc='lower right')
ax.set_title('200 points')
plt.show()
69 changes: 62 additions & 7 deletions statsmodels/nonparametric/bandwidths.py
@@ -1,5 +1,7 @@
import numpy as np
from scipy.stats import scoreatpercentile as sap
from statsmodels.sandbox.nonparametric import kernels


#from scipy.stats import norm

Expand All @@ -20,14 +22,16 @@ def _select_sigma(X):


## Univariate Rule of Thumb Bandwidths ##
def bw_scott(x):
def bw_scott(x, kernel=None):
"""
Scott's Rule of Thumb

Parameters
----------
x : array-like
Array for which to get the bandwidth
kernel : CustomKernel object
Unused

Returns
-------
Expand All @@ -49,16 +53,18 @@ def bw_scott(x):
"""
A = _select_sigma(x)
n = len(x)
return 1.059 * A * n ** -.2
return 1.059 * A * n ** (-0.2)

def bw_silverman(x):
def bw_silverman(x, kernel=None):
"""
Silverman's Rule of Thumb

Parameters
----------
x : array-like
Array for which to get the bandwidth
kernel : CustomKernel object
Unused

Returns
-------
Expand All @@ -79,15 +85,64 @@ def bw_silverman(x):
"""
A = _select_sigma(x)
n = len(x)
return .9 * A * n ** -.2
return .9 * A * n ** (-0.2)


def bw_normal_reference(x, kernel=kernels.Gaussian):
    """
    Plug-in bandwidth with kernel specific constant based on normal reference.

    This bandwidth minimizes the mean integrated square error if the true
    distribution is the normal. This choice is an appropriate bandwidth for
    single peaked distributions that are similar to the normal distribution.

    Parameters
    ----------
    x : array-like
        Array for which to get the bandwidth
    kernel : CustomKernel object
        Used to calculate the constant for the plug-in bandwidth.

    Returns
    -------
    bw : float
        The estimate of the bandwidth

    Notes
    -----
    Returns C * A * n ** (-1/5.) where ::

       A = min(std(x, ddof=1), IQR/1.349)
       IQR = q3 - q1, the interquartile range of x
       C = constant from Hansen (2009)

    When using a Gaussian kernel this is equivalent to the 'scott' bandwidth
    up to two decimal places. This is the accuracy to which the 'scott'
    constant is specified.

    References
    ----------

    Silverman, B.W. (1986) `Density Estimation.`
    Hansen, B.E. (2009) `Lecture Notes on Nonparametrics.`
    """
    # Kernel-specific constant (Hansen 2009 normal-reference constant).
    C = kernel.normal_reference_constant
    # Robust scale estimate: min(sample std, IQR/1.349).
    A = _select_sigma(x)
    n = len(x)
    return C * A * n ** (-0.2)

## Plug-In Methods ##

## Least Squares Cross-Validation ##

## Helper Functions ##

bandwidth_funcs = dict(scott=bw_scott,silverman=bw_silverman)
bandwidth_funcs = {
"scott": bw_scott,
"silverman": bw_silverman,
"normal_reference": bw_normal_reference
}


def select_bandwidth(x, bw, kernel):
"""
Expand All @@ -111,11 +166,11 @@ def select_bandwidth(x, bw, kernel):

"""
bw = bw.lower()
if bw not in ["scott","silverman"]:
if bw not in ["scott","silverman","normal_reference"]:
raise ValueError("Bandwidth %s not understood" % bw)
#TODO: uncomment checks when we have non-rule of thumb bandwidths for diff. kernels
# if kernel == "gauss":
return bandwidth_funcs[bw](x)
return bandwidth_funcs[bw](x, kernel)
# else:
# raise ValueError("Only Gaussian Kernels are currently supported")

25 changes: 18 additions & 7 deletions statsmodels/nonparametric/kde.py
Expand Up @@ -82,7 +82,7 @@ class KDEUnivariate(object):
def __init__(self, endog):
self.endog = np.asarray(endog)

def fit(self, kernel="gau", bw="scott", fft=True, weights=None,
def fit(self, kernel="gau", bw="normal_reference", fft=True, weights=None,
gridsize=None, adjust=1, cut=3, clip=(-np.inf, np.inf)):
"""
Attach the density estimate to the KDEUnivariate class.
Expand All @@ -107,6 +107,9 @@ def fit(self, kernel="gau", bw="scott", fft=True, weights=None,
`min(std(X),IQR/1.34)`
- "silverman" - .9 * A * nobs ** (-1/5.), where A is
`min(std(X),IQR/1.34)`
- "normal_reference" - C * A * nobs ** (-1/5.), where C is
calculated from the kernel. Equivalent (up to 2 dp) to the
"scott" bandwidth for gaussian kernels. See bandwidths.py
- If a float is given, it is the bandwidth.

fft : bool
Expand Down Expand Up @@ -270,7 +273,7 @@ def __init__(self, endog):

#### Kernel Density Estimator Functions ####

def kdensity(X, kernel="gau", bw="scott", weights=None, gridsize=None,
def kdensity(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None,
adjust=1, clip=(-np.inf,np.inf), cut=3, retgrid=True):
"""
Rosenblatt-Parzen univariate kernel density estimator.
Expand Down Expand Up @@ -345,11 +348,14 @@ def kdensity(X, kernel="gau", bw="scott", weights=None, gridsize=None,
weights = weights[clip_x.squeeze()]
q = weights.sum()

# Get kernel object corresponding to selection
kern = kernel_switch[kernel]()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the same is on line 365, which looks now redundant since it's already here. ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On line 365 it is

kernel_switch[kernel](h=bw)

So that the kernel is given it's calculated bandwidth. But this could be replaced by

kern.setH(bw)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok I got confused, if setH works, then that's better
otherwise it would also be possible to get the kernel class just once kernel_class = kernel_switch[kernel] or something like that.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, now that I look at this I'm a bit concerned there is an error in kernels.py. The kernel's domain does not seem to be affected by the bandwidth (no matter how you set it), this can't be right.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm actually maybe it is right. Will just use seth then.


# if bw is None, select optimal bandwidth for kernel
try:
bw = float(bw)
except:
bw = bandwidths.select_bandwidth(X, bw, kernel)
bw = bandwidths.select_bandwidth(X, bw, kern)
bw *= adjust

a = np.min(X,axis=0) - cut*bw
Expand All @@ -358,8 +364,9 @@ def kdensity(X, kernel="gau", bw="scott", weights=None, gridsize=None,

k = (X.T - grid[:,None])/bw # uses broadcasting to make a gridsize x nobs

# instantiate kernel class
kern = kernel_switch[kernel](h=bw)
# set kernel bandwidth
kern.seth(bw)

# truncate to domain
if kern.domain is not None: # won't work for piecewise kernels like parzen
z_lo, z_high = kern.domain
Expand All @@ -378,7 +385,7 @@ def kdensity(X, kernel="gau", bw="scott", weights=None, gridsize=None,
else:
return dens, bw

def kdensityfft(X, kernel="gau", bw="scott", weights=None, gridsize=None,
def kdensityfft(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None,
adjust=1, clip=(-np.inf,np.inf), cut=3, retgrid=True):
"""
Rosenblatt-Parzen univariate kernel density estimator
Expand Down Expand Up @@ -452,10 +459,14 @@ def kdensityfft(X, kernel="gau", bw="scott", weights=None, gridsize=None,
X = np.asarray(X)
X = X[np.logical_and(X>clip[0], X<clip[1])] # won't work for two columns.
# will affect underlying data?

# Get kernel object corresponding to selection
kern = kernel_switch[kernel]()

try:
bw = float(bw)
except:
bw = bandwidths.select_bandwidth(X, bw, kernel) # will cross-val fit this pattern?
bw = bandwidths.select_bandwidth(X, bw, kern) # will cross-val fit this pattern?
bw *= adjust

nobs = float(len(X)) # after trim
Expand Down
78 changes: 78 additions & 0 deletions statsmodels/nonparametric/tests/test_bandwidths.py
@@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
"""

Tests for bandwidth selection and calculation.

Author: Padarn Wilson
"""

import numpy as np
from scipy import stats

from statsmodels.sandbox.nonparametric import kernels
from statsmodels.distributions.mixture_rvs import mixture_rvs
from statsmodels.nonparametric.kde import KDEUnivariate as KDE
from statsmodels.nonparametric.bandwidths import select_bandwidth



from numpy.testing import assert_allclose

# setup test data
# Fixed seed so the expected bandwidth values below are reproducible; the
# sample is a bimodal mixture of two normals.

np.random.seed(12345)
Xi = mixture_rvs([.25,.75], size=200, dist=[stats.norm, stats.norm],
                 kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.5)))

class TestBandwidthCalculation(object):
    """Check the rule-of-thumb bandwidth selectors against known values."""

    def test_calculate_bandwidth_gaussian(self):
        # Reference bandwidths computed for the fixed-seed sample Xi above.
        bw_expected = [0.29774853596742024,
                       0.25304408155871411,
                       0.29781147113698891]

        kern = kernels.Gaussian()
        bw_calc = [select_bandwidth(Xi, bw_name, kern)
                   for bw_name in ('scott', 'silverman', 'normal_reference')]

        assert_allclose(bw_expected, bw_calc)


class CheckNormalReferenceConstant(object):
    """Mixin comparing a kernel's normal_reference_constant to a tabulated value.

    Subclasses provide ``kern`` (a kernel instance) and ``constant`` (the
    reference value, given to two decimal places — hence the loose rtol).
    """

    def test_calculate_normal_reference_constant(self):
        assert_allclose(self.constant, self.kern.normal_reference_constant,
                        1e-2)


class TestEpanechnikov(CheckNormalReferenceConstant):
    # Tabulated normal-reference constant for the Epanechnikov kernel
    # (2 decimal places; see Hansen (2009) lecture notes).
    kern = kernels.Epanechnikov()
    constant = 2.34


class TestGaussian(CheckNormalReferenceConstant):
    # Tabulated normal-reference constant for the Gaussian kernel
    # (matches the 'scott' constant 1.059 to 2 decimal places).
    kern = kernels.Gaussian()
    constant = 1.06


class TestBiweight(CheckNormalReferenceConstant):
    # Tabulated normal-reference constant for the Biweight kernel
    # (2 decimal places; see Hansen (2009) lecture notes).
    kern = kernels.Biweight()
    constant = 2.78


class TestTriweight(CheckNormalReferenceConstant):
    # Tabulated normal-reference constant for the Triweight kernel
    # (2 decimal places; see Hansen (2009) lecture notes).
    kern = kernels.Triweight()
    constant = 3.15


if __name__ == "__main__":
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb'], exit=False)
3 changes: 2 additions & 1 deletion statsmodels/nonparametric/tests/test_kde.py
Expand Up @@ -138,7 +138,8 @@ def setupClass(cls):
cls.x = x = KDEWResults['x']
weights = KDEWResults['weights']
res1 = KDE(x)
res1.fit(kernel=cls.kernel_name, weights=weights, fft=False)
# default kernel was scott when reference values computed
res1.fit(kernel=cls.kernel_name, weights=weights, fft=False, bw="scott")
cls.res1 = res1
cls.res_density = KDEWResults[cls.res_kernel_name]

Expand Down
4 changes: 2 additions & 2 deletions statsmodels/nonparametric/tests/test_kernel_density.py
Expand Up @@ -66,7 +66,7 @@ class TestKDEUnivariate(MyTest):
def test_pdf_non_fft(self):

kde = nparam.KDEUnivariate(self.noise)
kde.fit(fft=False)
kde.fit(fft=False, bw='scott')


grid = kde.support
Expand All @@ -92,7 +92,7 @@ def test_pdf_non_fft(self):
def test_weighted_pdf_non_fft(self):

kde = nparam.KDEUnivariate(self.noise)
kde.fit(weights=self.weights, fft=False)
kde.fit(weights=self.weights, fft=False, bw='scott')

grid = kde.support
testx = [grid[10*i] for i in range(6)]
Expand Down