Additions to Univariate KDEs #973

Open · wants to merge 19 commits into base: main
Changes from 10 commits
114 changes: 94 additions & 20 deletions statsmodels/nonparametric/kde.py
@@ -16,7 +16,7 @@
import warnings

import numpy as np
from scipy import integrate, stats
from scipy import integrate, stats, interpolate
from statsmodels.sandbox.nonparametric import kernels
from statsmodels.tools.decorators import (cache_readonly,
resettable_cache)
@@ -115,7 +115,7 @@ def fit(self, kernel="gau", bw="scott", fft=True, weights=None,
is implemented. If FFT is False, then a 'nobs' x 'gridsize'
intermediate array is created.
gridsize : int
If gridsize is None, max(len(X), 50) is used.
If gridsize is None, max(len(X), 512) is used.
cut : float
Defines the length of the grid past the lowest and highest values
of X so that the kernel goes to zero. The end points are
@@ -149,6 +149,8 @@ def fit(self, kernel="gau", bw="scott", fft=True, weights=None,
self.bw = bw
self.kernel = kernel_switch[kernel](h=bw) # we instantiate twice,
# should this passed to funcs?


# put here to ensure empty cache after re-fit with new options
self._cache = resettable_cache()

@@ -160,23 +162,55 @@ def cdf(self):
Notes
-----
Will not work if fit has not been called.

If there is an analytic integrated kernel available for the kernel then
this is used to find the cdf on self.support. Otherwise the cdf is evaluated
numerically.
"""
_checkisfit(self)
density = self.density
kern = self.kernel
if kern.domain is None: # TODO: test for grid point at domain bound
a,b = -np.inf,np.inf

if getattr(self.kernel, 'cdf', None):
return sum(self.kernel.cdf(np.tile(self.endog,[len(self.support),1]).T, self.support, self.bw))/len(self.endog)

else:
a,b = kern.domain
func = lambda x,s: kern.density(s,x)
density = self.density
kern = self.kernel
if kern.domain is None: # TODO: test for grid point at domain bound
a,b = -np.inf,np.inf
else:
a,b = kern.domain
func = lambda x,s: kern.density(s,x)

support = self.support
support = np.r_[a,support]
gridsize = len(support)
endog = self.endog
probs = [integrate.quad(func, support[i-1], support[i],
args=endog)[0] for i in xrange(1,gridsize)]
return np.cumsum(probs)

def cdf_eval(self, x):
"""
Returns the cumulative distribution function evaluated at x.

support = self.support
support = np.r_[a,support]
gridsize = len(support)
endog = self.endog
probs = [integrate.quad(func, support[i-1], support[i],
args=endog)[0] for i in xrange(1,gridsize)]
return np.cumsum(probs)
Notes
-----
Will not work if fit has not been called.

If there is an analytic integrated kernel available for the kernel then
this is used to find the cdf on self.support. Otherwise the cdf is evaluated
numerically.
"""
_checkisfit(self)

if getattr(self.kernel, 'cdf', None):
return sum(self.kernel.cdf(self.endog, x, self.bw))/len(self.endog)

else:
kern = self.kernel
func = lambda y: kern.density(self.endog, y)

return integrate.quad(func, self.support[0], x)[0]

@cache_readonly
def cumhazard(self):
@@ -231,19 +265,59 @@ def entr(x,s):
return -integrate.quad(entr, a,b, args=(endog,))[0]

@cache_readonly
def icdf(self):
def icdf(self, sample_quantile = False):
Review comment (Member): we might need a third option for the exact calculation, then instead of a boolean sample_quantiles we might need a string, something like method='interpolate'
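One way the suggestion above could be read is sketched below; the parameter name `method`, the branch labels, and the NotImplementedError placeholder for the exact case are assumptions for illustration, not part of the PR:

def icdf(self, method='interpolate'):
    _checkisfit(self)
    gridsize = len(self.density)
    if method == 'sample_quantiles':
        # quantiles of the raw data, as in the current implementation
        return stats.mstats.mquantiles(self.endog, np.linspace(0, 1, gridsize))
    elif method == 'interpolate':
        # invert the estimated cdf by linear interpolation over the support grid
        icdf_interp = interpolate.interp1d(self.cdf, self.support, kind='linear')
        return icdf_interp(np.linspace(self.cdf[0], self.cdf[-1], gridsize))
    elif method == 'exact':
        # placeholder for an exact inversion, e.g. root-finding on cdf_eval
        raise NotImplementedError
    else:
        raise ValueError("method must be 'sample_quantiles', 'interpolate' or 'exact'")
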

"""
Inverse Cumulative Distribution (Quantile) Function
Inverse Cumulative Distribution (Quantile) Function over the
range of the cdf stored.

Notes
-----
Will not work if fit has not been called. Uses
`scipy.stats.mstats.mquantiles`.

Uses linear interpolation to get values on a uniform
grid.
"""
_checkisfit(self)
gridsize = len(self.density)
return stats.mstats.mquantiles(self.endog, np.linspace(0,1,
gridsize))

if sample_quantile:
gridsize = len(self.density)
return stats.mstats.mquantiles(self.endog, np.linspace(0,1,
gridsize))

else:
icdf_interp = interpolate.interp1d(self.cdf, self.support, kind='linear')
return icdf_interp(np.linspace(self.cdf[0],self.cdf[-1],len(self.support)))

def icdf_eval(self, x):

_checkisfit(self)

if getattr(self.kernel, 'icdf', None):
print sum(self.kernel.icdf(self.endog, x, self.bw))/len(self.endog)

if x >= self.support[-1]:
return np.infty
if x <= self.support[0]:
return -1*np.infty

index = np.searchsorted(self.cdf, x)
support = self.support
cdf = self.cdf

return (x-cdf[index-1])*1.0/(cdf[index]-cdf[index-1])*(support[index]-support[index-1])+ support[index-1]
Review comment (Member): split into several lines (line length) and make separate return
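A sketch of how the requested split could look (same logic as the single-line return above; the intermediate names are illustrative):

index = np.searchsorted(self.cdf, x)
cdf = self.cdf
support = self.support

# linear interpolation between the two support points that bracket x on the cdf
cdf_step = cdf[index] - cdf[index - 1]
support_step = support[index] - support[index - 1]
value = support[index - 1] + (x - cdf[index - 1]) / cdf_step * support_step
return value
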


def variance(self, point):
"""
Evaluates the variance of the kernel estimator according
to the approximate formula
v = 1/n * (1/h^2 * sum(K((x-X_i)/h)^2) - fhat(x,h)^2)

NOTE : NOT WORKING
"""

_checkisfit(self)
return self.kernel.density_variance(self.endog, point)

def evaluate(self, point):
"""
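For context, a small usage sketch of the point-wise evaluators this file adds; it assumes the class is KDEUnivariate as in current statsmodels, and the data and values are purely illustrative:

import numpy as np
from statsmodels.nonparametric.kde import KDEUnivariate

x = np.random.standard_normal(500)   # any 1-d sample
kde = KDEUnivariate(x)
kde.fit(kernel="gau")

p = kde.cdf_eval(0.5)    # estimated P(X <= 0.5)
q = kde.icdf_eval(p)     # should roughly recover 0.5 away from the tails
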
9 changes: 6 additions & 3 deletions statsmodels/nonparametric/kernels.py
@@ -13,8 +13,7 @@


import numpy as np
from scipy.special import erf

from scipy.stats import norm

#TODO:
# - make sure we only receive int input for wang-ryzin and aitchison-aitken
@@ -155,7 +154,11 @@ def aitchison_aitken_convolution(h, Xi, Xj):


def gaussian_cdf(h, Xi, x):
return 0.5 * h * (1 + erf((x - Xi) / (h * np.sqrt(2))))
return h * norm._cdf((x - Xi)/h)


def gaussian_icdf(h, Xi, x):
return Xi + h**2 * norm._ppf(x)


def aitchison_aitken_cdf(h, Xi, x_u):
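The rewritten gaussian_cdf should agree with the erf form it replaces, since h * Phi((x - Xi)/h) = 0.5 * h * (1 + erf((x - Xi)/(h*sqrt(2)))). A quick check of that identity, using the public scipy.stats.norm.cdf rather than the private norm._cdf used in the diff:

import numpy as np
from scipy.special import erf
from scipy.stats import norm

h, Xi, x = 0.7, 0.2, 1.5
old = 0.5 * h * (1 + erf((x - Xi) / (h * np.sqrt(2))))
new = h * norm.cdf((x - Xi) / h)
assert np.allclose(old, new)
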
30 changes: 28 additions & 2 deletions statsmodels/sandbox/nonparametric/kernels.py
@@ -22,7 +22,7 @@
import numpy as np
import scipy.integrate
from numpy import exp, multiply, square, divide, subtract, inf

from scipy.special import erf

class NdKernel(object):
"""Generic N-dimensial kernel
@@ -68,7 +68,6 @@ def setH(self, value):
def density(self, xs, x):
n = len(xs)
#xs = self.inDomain( xs, xs, x )[0]

if len(xs)>0: ## Need to do product of marginal distributions
#w = np.sum([self(self._Hrootinv * (xx-x).T ) for xx in xs])/n
#vectorized doesn't work:
@@ -190,6 +189,29 @@ def density(self, xs, x):
else:
return np.nan

def density_variance(self, xs, x):
"""Returns the variance of the kernel density estimate for a point x
based on x-values xs.

This uses the approximate variance formula
$var(\hat{f}(x)) \approx \frac{f(x)R(K)}{nh} - \frac{f(x)^2}{n}$,
where $R(K) = \int K(u)^2\,du$.
"""

xs = np.asarray(xs)
n = len(xs)
xs = self.inDomain( xs, xs, x )[0]
if xs.ndim == 1:
xs = xs[:, None]
if len(xs)>0:
h = self.h
# Note 05/07/13 - _L2Norm actually seems to store the square of the
# norm. This is equal to R(K).
R = self._L2Norm
w = (1./h * self.density(xs, x) * R - self.density(xs, x)**2)/n
return w
else:
return np.nan

def smooth(self, xs, ys, x):
"""Returns the kernel smoothing estimate for point x based on x-values
xs and y-values ys.
@@ -375,6 +397,7 @@ def __init__(self, h=1.0):
np.exp(-x**2/2.0), h = h, domain = None, norm = 1.0)
self._L2Norm = 1.0/(2.0*np.sqrt(np.pi))


def smooth(self, xs, ys, x):
"""Returns the kernel smoothing estimate for point x based on x-values
xs and y-values ys.
@@ -388,6 +411,9 @@ def smooth(self, xs, ys, x):
self.h)), -0.5))))
return v/w

def cdf(self, Xi, x, h):
return 0.5 * (1 + erf( (x - Xi)/(h * np.sqrt(2) ) ) )

class Cosine(CustomKernel):
"""
Cosine Kernel
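Finally, a standalone sketch of the approximate pointwise variance that density_variance computes, specialised to the Gaussian kernel where R(K) = 1/(2*sqrt(pi)); the sample and bandwidth here are illustrative:

import numpy as np

def kde_variance_gaussian(xs, x, h):
    # approximate variance of a Gaussian KDE at x:
    # var(fhat(x)) ~ f(x) * R(K) / (n * h) - f(x)**2 / n, with R(K) = 1/(2*sqrt(pi))
    xs = np.asarray(xs)
    n = len(xs)
    R = 1.0 / (2.0 * np.sqrt(np.pi))
    fhat = np.mean(np.exp(-0.5 * ((x - xs) / h) ** 2)) / (h * np.sqrt(2 * np.pi))
    return (fhat * R / h - fhat ** 2) / n

xs = np.random.standard_normal(200)
print(kde_variance_gaussian(xs, 0.0, h=0.4))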