New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Silverman enhancement - Issue #1243 #1271
Changes from all commits
2268a27
d100a48
e860186
4db8e9f
b504ba3
96c9b7f
edf3792
7050daa
d76e27a
c22cd3b
2a97860
f6a5a1f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Author: Padarn Wilson | ||
|
||
Performance of normal reference plug-in estimator vs silverman. Sample is drawn | ||
from a mixture of gaussians. Distribution has been chosen to be reasonably close | ||
to normal. | ||
""" | ||
|
||
import numpy as np | ||
from scipy import stats | ||
import matplotlib.pyplot as plt | ||
import statsmodels.nonparametric.api as npar | ||
from statsmodels.sandbox.nonparametric import kernels | ||
from statsmodels.distributions.mixture_rvs import mixture_rvs | ||
|
||
# example from test_kde.py mixture of two normal distributions | ||
np.random.seed(12345) | ||
x = mixture_rvs([.1, .9], size=200, dist=[stats.norm, stats.norm], | ||
kwargs=(dict(loc=0, scale=.5), dict(loc=1, scale=.5))) | ||
|
||
kde = npar.KDEUnivariate(x) | ||
|
||
|
||
kernel_names = ['Gaussian', 'Epanechnikov', 'Biweight', | ||
'Triangular', 'Triweight', 'Cosine' | ||
] | ||
|
||
kernel_switch = ['gau', 'epa', 'tri', 'biw', | ||
'triw', 'cos' | ||
] | ||
|
||
|
||
def true_pdf(x): | ||
pdf = 0.1 * stats.norm.pdf(x, loc=0, scale=0.5) | ||
pdf += 0.9 * stats.norm.pdf(x, loc=1, scale=0.5) | ||
return pdf | ||
|
||
fig = plt.figure() | ||
for ii, kn in enumerate(kernel_switch): | ||
|
||
ax = fig.add_subplot(2, 3, ii + 1) # without uniform | ||
ax.hist(x, bins=20, normed=True, alpha=0.25) | ||
|
||
kde.fit(kernel=kn, bw='silverman', fft=False) | ||
ax.plot(kde.support, kde.density) | ||
|
||
kde.fit(kernel=kn, bw='normal_reference', fft=False) | ||
ax.plot(kde.support, kde.density) | ||
|
||
ax.plot(kde.support, true_pdf(kde.support), color='black', linestyle='--') | ||
|
||
ax.set_title(kernel_names[ii]) | ||
|
||
|
||
ax.legend(['silverman', 'normal reference', 'true pdf'], loc='lower right') | ||
ax.set_title('200 points') | ||
plt.show() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
import numpy as np | ||
from scipy.stats import scoreatpercentile as sap | ||
from statsmodels.sandbox.nonparametric import kernels | ||
|
||
|
||
#from scipy.stats import norm | ||
|
||
|
@@ -20,14 +22,16 @@ def _select_sigma(X): | |
|
||
|
||
## Univariate Rule of Thumb Bandwidths ## | ||
def bw_scott(x): | ||
def bw_scott(x, kernel=None): | ||
""" | ||
Scott's Rule of Thumb | ||
|
||
Parameters | ||
---------- | ||
x : array-like | ||
Array for which to get the bandwidth | ||
kernel : CustomKernel object | ||
Unused | ||
|
||
Returns | ||
------- | ||
|
@@ -49,16 +53,18 @@ def bw_scott(x): | |
""" | ||
A = _select_sigma(x) | ||
n = len(x) | ||
return 1.059 * A * n ** -.2 | ||
return 1.059 * A * n ** (-0.2) | ||
|
||
def bw_silverman(x): | ||
def bw_silverman(x, kernel=None): | ||
""" | ||
Silverman's Rule of Thumb | ||
|
||
Parameters | ||
---------- | ||
x : array-like | ||
Array for which to get the bandwidth | ||
kernel : CustomKernel object | ||
Unused | ||
|
||
Returns | ||
------- | ||
|
@@ -79,15 +85,64 @@ def bw_silverman(x): | |
""" | ||
A = _select_sigma(x) | ||
n = len(x) | ||
return .9 * A * n ** -.2 | ||
return .9 * A * n ** (-0.2) | ||
|
||
|
||
def bw_normal_reference(x, kernel=kernels.Gaussian): | ||
""" | ||
Plug-in bandwidth with kernel specific constant based on normal reference. | ||
|
||
This bandwidth minimizes the mean integrated square error if the true | ||
distribution is the normal. This choice is an appropriate bandwidth for | ||
single peaked distributions that are similar to the normal distribution. | ||
|
||
Parameters | ||
---------- | ||
x : array-like | ||
Array for which to get the bandwidth | ||
kernel : CustomKernel object | ||
Used to calculate the constant for the plug-in bandwidth. | ||
|
||
Returns | ||
------- | ||
bw : float | ||
The estimate of the bandwidth | ||
|
||
Notes | ||
----- | ||
Returns C * A * n ** (-1/5.) where :: | ||
|
||
A = min(std(x, ddof=1), IQR/1.349) | ||
IQR = np.subtract.reduce(np.percentile(x, [75,25])) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. subtract reduce in docstring sounds strange, takes some time and experience to understand |
||
C = constant from Hansen (2009) | ||
|
||
When using a gaussian kernel this is equivalent to the 'scott' bandwidth up | ||
to two decimal places. This is the accuracy to which the 'scott' constant is | ||
specified. | ||
|
||
References | ||
---------- | ||
|
||
Silverman, B.W. (1986) `Density Estimation.` | ||
Hansen, B.E. (2009) `Lecture Notes on Nonparametrics.` | ||
""" | ||
C = kernel.normal_reference_constant | ||
A = _select_sigma(x) | ||
n = len(x) | ||
return C * A * n ** (-0.2) | ||
|
||
## Plug-In Methods ## | ||
|
||
## Least Squares Cross-Validation ## | ||
|
||
## Helper Functions ## | ||
|
||
bandwidth_funcs = dict(scott=bw_scott,silverman=bw_silverman) | ||
bandwidth_funcs = { | ||
"scott": bw_scott, | ||
"silverman": bw_silverman, | ||
"normal_reference": bw_normal_reference | ||
} | ||
|
||
|
||
def select_bandwidth(x, bw, kernel): | ||
""" | ||
|
@@ -111,11 +166,11 @@ def select_bandwidth(x, bw, kernel): | |
|
||
""" | ||
bw = bw.lower() | ||
if bw not in ["scott","silverman"]: | ||
if bw not in ["scott","silverman","normal_reference"]: | ||
raise ValueError("Bandwidth %s not understood" % bw) | ||
#TODO: uncomment checks when we have non-rule of thumb bandwidths for diff. kernels | ||
# if kernel == "gauss": | ||
return bandwidth_funcs[bw](x) | ||
return bandwidth_funcs[bw](x, kernel) | ||
# else: | ||
# raise ValueError("Only Gaussian Kernels are currently supported") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -82,7 +82,7 @@ class KDEUnivariate(object): | |
def __init__(self, endog): | ||
self.endog = np.asarray(endog) | ||
|
||
def fit(self, kernel="gau", bw="scott", fft=True, weights=None, | ||
def fit(self, kernel="gau", bw="normal_reference", fft=True, weights=None, | ||
gridsize=None, adjust=1, cut=3, clip=(-np.inf, np.inf)): | ||
""" | ||
Attach the density estimate to the KDEUnivariate class. | ||
|
@@ -107,6 +107,9 @@ def fit(self, kernel="gau", bw="scott", fft=True, weights=None, | |
`min(std(X),IQR/1.34)` | ||
- "silverman" - .9 * A * nobs ** (-1/5.), where A is | ||
`min(std(X),IQR/1.34)` | ||
- "normal_reference" - C * A * nobs ** (-1/5.), where C is | ||
calculated from the kernel. Equivalent (up to 2 dp) to the | ||
"scott" bandwidth for gaussian kernels. See bandwidths.py | ||
- If a float is given, it is the bandwidth. | ||
|
||
fft : bool | ||
|
@@ -270,7 +273,7 @@ def __init__(self, endog): | |
|
||
#### Kernel Density Estimator Functions #### | ||
|
||
def kdensity(X, kernel="gau", bw="scott", weights=None, gridsize=None, | ||
def kdensity(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None, | ||
adjust=1, clip=(-np.inf,np.inf), cut=3, retgrid=True): | ||
""" | ||
Rosenblatt-Parzen univariate kernel density estimator. | ||
|
@@ -345,11 +348,14 @@ def kdensity(X, kernel="gau", bw="scott", weights=None, gridsize=None, | |
weights = weights[clip_x.squeeze()] | ||
q = weights.sum() | ||
|
||
# Get kernel object corresponding to selection | ||
kern = kernel_switch[kernel]() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the same is on line 365, which looks now redundant since it's already here. ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On line 365 it is
So that the kernel is given its calculated bandwidth. But this could be replaced by | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok I got confused, if setH works, then that's better There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay, now that I look at this I'm a bit concerned there is an error in kernels.py. The kernel's domain does not seem to be affected by the bandwidth (no matter how you set it), this can't be right. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm actually maybe it is right. Will just use |
||
|
||
# if bw is None, select optimal bandwidth for kernel | ||
try: | ||
bw = float(bw) | ||
except: | ||
bw = bandwidths.select_bandwidth(X, bw, kernel) | ||
bw = bandwidths.select_bandwidth(X, bw, kern) | ||
bw *= adjust | ||
|
||
a = np.min(X,axis=0) - cut*bw | ||
|
@@ -358,8 +364,9 @@ def kdensity(X, kernel="gau", bw="scott", weights=None, gridsize=None, | |
|
||
k = (X.T - grid[:,None])/bw # uses broadcasting to make a gridsize x nobs | ||
|
||
# instantiate kernel class | ||
kern = kernel_switch[kernel](h=bw) | ||
# set kernel bandwidth | ||
kern.seth(bw) | ||
|
||
# truncate to domain | ||
if kern.domain is not None: # won't work for piecewise kernels like parzen | ||
z_lo, z_high = kern.domain | ||
|
@@ -378,7 +385,7 @@ def kdensity(X, kernel="gau", bw="scott", weights=None, gridsize=None, | |
else: | ||
return dens, bw | ||
|
||
def kdensityfft(X, kernel="gau", bw="scott", weights=None, gridsize=None, | ||
def kdensityfft(X, kernel="gau", bw="normal_reference", weights=None, gridsize=None, | ||
adjust=1, clip=(-np.inf,np.inf), cut=3, retgrid=True): | ||
""" | ||
Rosenblatt-Parzen univariate kernel density estimator | ||
|
@@ -452,10 +459,14 @@ def kdensityfft(X, kernel="gau", bw="scott", weights=None, gridsize=None, | |
X = np.asarray(X) | ||
X = X[np.logical_and(X>clip[0], X<clip[1])] # won't work for two columns. | ||
# will affect underlying data? | ||
|
||
# Get kernel object corresponding to selection | ||
kern = kernel_switch[kernel]() | ||
|
||
try: | ||
bw = float(bw) | ||
except: | ||
bw = bandwidths.select_bandwidth(X, bw, kernel) # will cross-val fit this pattern? | ||
bw = bandwidths.select_bandwidth(X, bw, kern) # will cross-val fit this pattern? | ||
bw *= adjust | ||
|
||
nobs = float(len(X)) # after trim | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# -*- coding: utf-8 -*- | ||
""" | ||
|
||
Tests for bandwidth selection and calculation. | ||
|
||
Author: Padarn Wilson | ||
""" | ||
|
||
import numpy as np | ||
from scipy import stats | ||
|
||
from statsmodels.sandbox.nonparametric import kernels | ||
from statsmodels.distributions.mixture_rvs import mixture_rvs | ||
from statsmodels.nonparametric.kde import KDEUnivariate as KDE | ||
from statsmodels.nonparametric.bandwidths import select_bandwidth | ||
|
||
|
||
|
||
from numpy.testing import assert_allclose | ||
|
||
# setup test data | ||
|
||
np.random.seed(12345) | ||
Xi = mixture_rvs([.25,.75], size=200, dist=[stats.norm, stats.norm], | ||
kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.5))) | ||
|
||
class TestBandwidthCalculation(object): | ||
|
||
def test_calculate_bandwidth_gaussian(self): | ||
|
||
bw_expected = [0.29774853596742024, | ||
0.25304408155871411, | ||
0.29781147113698891] | ||
|
||
kern = kernels.Gaussian() | ||
|
||
bw_calc = [0, 0, 0] | ||
for ii, bw in enumerate(['scott','silverman','normal_reference']): | ||
bw_calc[ii] = select_bandwidth(Xi, bw, kern) | ||
|
||
assert_allclose(bw_expected, bw_calc) | ||
|
||
|
||
class CheckNormalReferenceConstant(object): | ||
|
||
def test_calculate_normal_reference_constant(self): | ||
const = self.constant | ||
kern = self.kern | ||
assert_allclose(const, kern.normal_reference_constant, 1e-2) | ||
|
||
|
||
class TestEpanechnikov(CheckNormalReferenceConstant): | ||
|
||
kern = kernels.Epanechnikov() | ||
constant = 2.34 | ||
|
||
|
||
class TestGaussian(CheckNormalReferenceConstant): | ||
|
||
kern = kernels.Gaussian() | ||
constant = 1.06 | ||
|
||
|
||
class TestBiweight(CheckNormalReferenceConstant): | ||
|
||
kern = kernels.Biweight() | ||
constant = 2.78 | ||
|
||
|
||
class TestTriweight(CheckNormalReferenceConstant): | ||
|
||
kern = kernels.Triweight() | ||
constant = 3.15 | ||
|
||
|
||
if __name__ == "__main__": | ||
import nose | ||
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb'], exit=False) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would add a sentence here first, saying that
The bandwidth is optimal (minimizes expected integrated squared error ?) if the true distribution is the normal. This is an appropriate bandwidth for single peaked distributions that are similar to the normal distribution.
or something along those lines