Additions to Univariate KDEs #973

Open · wants to merge 19 commits into base: main
Changes from 10 commits
114 changes: 94 additions & 20 deletions statsmodels/nonparametric/kde.py
@@ -16,7 +16,7 @@
import warnings

import numpy as np
from scipy import integrate, stats
from scipy import integrate, stats, interpolate
from statsmodels.sandbox.nonparametric import kernels
from statsmodels.tools.decorators import (cache_readonly,
resettable_cache)
@@ -115,7 +115,7 @@ def fit(self, kernel="gau", bw="scott", fft=True, weights=None,
is implemented. If FFT is False, then a 'nobs' x 'gridsize'
intermediate array is created.
gridsize : int
If gridsize is None, max(len(X), 50) is used.
If gridsize is None, max(len(X), 512) is used.
cut : float
Defines the length of the grid past the lowest and highest values
of X so that the kernel goes to zero. The end points are
@@ -149,6 +149,8 @@ def fit(self, kernel="gau", bw="scott", fft=True, weights=None,
self.bw = bw
self.kernel = kernel_switch[kernel](h=bw) # we instantiate twice,
# should this passed to funcs?


# put here to ensure empty cache after re-fit with new options
self._cache = resettable_cache()

@@ -160,23 +162,55 @@ def cdf(self):
Notes
-----
Will not work if fit has not been called.

If there is an analytic integrated kernel available for the kernel then
this is used to find the cdf on self.support. Otherwise the cdf is evaluated
numerically.
"""
_checkisfit(self)
density = self.density
kern = self.kernel
if kern.domain is None: # TODO: test for grid point at domain bound
a,b = -np.inf,np.inf

if getattr(self.kernel, 'cdf', None):
return sum(self.kernel.cdf(np.tile(self.endog,[len(self.support),1]).T, self.support, self.bw))/len(self.endog)

else:
a,b = kern.domain
func = lambda x,s: kern.density(s,x)
density = self.density
kern = self.kernel
if kern.domain is None: # TODO: test for grid point at domain bound
a,b = -np.inf,np.inf
else:
a,b = kern.domain
func = lambda x,s: kern.density(s,x)

support = self.support
support = np.r_[a,support]
gridsize = len(support)
endog = self.endog
probs = [integrate.quad(func, support[i-1], support[i],
args=endog)[0] for i in xrange(1,gridsize)]
return np.cumsum(probs)

def cdf_eval(self, x):
"""
Returns the cumulative distribution function evaluated at x.

support = self.support
support = np.r_[a,support]
gridsize = len(support)
endog = self.endog
probs = [integrate.quad(func, support[i-1], support[i],
args=endog)[0] for i in xrange(1,gridsize)]
return np.cumsum(probs)
Notes
-----
Will not work if fit has not been called.

If there is an analytic integrated kernel available for the kernel then
this is used to find the cdf on self.support. Otherwise the cdf is evaluated
numerically.
"""
_checkisfit(self)

if getattr(self.kernel, 'cdf', None):
return sum(self.kernel.cdf(self.endog, x, self.bw))/len(self.endog)

else:
kern = self.kernel
func = lambda y: kern.density(self.endog, y)

return integrate.quad(func, self.support[0], x)[0]

@cache_readonly
def cumhazard(self):
@@ -231,19 +265,59 @@ def entr(x,s):
return -integrate.quad(entr, a,b, args=(endog,))[0]

@cache_readonly
def icdf(self):
def icdf(self, sample_quantile = False):
Review comment (Member): we might need a third option for the exact calculation, then instead of a boolean sample_quantiles we might need a string, something like method='interpolate'
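One way the suggestion above could be read is sketched below; the parameter name `method`, the branch labels, and the NotImplementedError placeholder for the exact case are assumptions for illustration, not part of the PR:

def icdf(self, method='interpolate'):
    _checkisfit(self)
    gridsize = len(self.density)
    if method == 'sample_quantiles':
        # quantiles of the raw data, as in the current implementation
        return stats.mstats.mquantiles(self.endog, np.linspace(0, 1, gridsize))
    elif method == 'interpolate':
        # invert the estimated cdf by linear interpolation over the support grid
        icdf_interp = interpolate.interp1d(self.cdf, self.support, kind='linear')
        return icdf_interp(np.linspace(self.cdf[0], self.cdf[-1], gridsize))
    elif method == 'exact':
        # placeholder for an exact inversion, e.g. root-finding on cdf_eval
        raise NotImplementedError
    else:
        raise ValueError("method must be 'sample_quantiles', 'interpolate' or 'exact'")
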

"""
Inverse Cumulative Distribution (Quantile) Function
Inverse Cumulative Distribution (Quantile) Function over the
range of the cdf stored.

Notes
-----
Will not work if fit has not been called. Uses
`scipy.stats.mstats.mquantiles`.

Uses linear interpolation to get values on a uniform
grid.
"""
_checkisfit(self)
gridsize = len(self.density)
return stats.mstats.mquantiles(self.endog, np.linspace(0,1,
gridsize))

if sample_quantile:
gridsize = len(self.density)
return stats.mstats.mquantiles(self.endog, np.linspace(0,1,
gridsize))

else:
icdf_interp = interpolate.interp1d(self.cdf, self.support, kind='linear')
return icdf_interp(np.linspace(self.cdf[0],self.cdf[-1],len(self.support)))

def icdf_eval(self, x):

_checkisfit(self)

if getattr(self.kernel, 'icdf', None):
print sum(self.kernel.icdf(self.endog, x, self.bw))/len(self.endog)

if x >= self.support[-1]:
return np.infty
if x <= self.support[0]:
return -1*np.infty

index = np.searchsorted(self.cdf, x)
support = self.support
cdf = self.cdf

return (x-cdf[index-1])*1.0/(cdf[index]-cdf[index-1])*(support[index]-support[index-1])+ support[index-1]
Review comment (Member): split into several lines (line length) and make separate return
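A sketch of how the requested split could look (same logic as the single-line return above; the intermediate names are illustrative):

index = np.searchsorted(self.cdf, x)
cdf = self.cdf
support = self.support

# linear interpolation between the two support points that bracket x on the cdf
cdf_step = cdf[index] - cdf[index - 1]
support_step = support[index] - support[index - 1]
value = support[index - 1] + (x - cdf[index - 1]) / cdf_step * support_step
return value
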


def variance(self, point):
"""
Evaluates the variance of the kernel estimator according
to the approximate formula
v = 1/n * (1/h^2 * sum(K((x-X_i)/h)^2) - fhat(x,h)^2)

NOTE : NOT WORKING
"""

_checkisfit(self)
return self.kernel.density_variance(self.endog, point)

def evaluate(self, point):
"""
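For context, a small usage sketch of the point-wise evaluators this file adds; it assumes the class is KDEUnivariate as in current statsmodels, and the data and values are purely illustrative:

import numpy as np
from statsmodels.nonparametric.kde import KDEUnivariate

x = np.random.standard_normal(500)   # any 1-d sample
kde = KDEUnivariate(x)
kde.fit(kernel="gau")

p = kde.cdf_eval(0.5)    # estimated P(X <= 0.5)
q = kde.icdf_eval(p)     # should roughly recover 0.5 away from the tails
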
9 changes: 6 additions & 3 deletions statsmodels/nonparametric/kernels.py
@@ -13,8 +13,7 @@


import numpy as np
from scipy.special import erf

from scipy.stats import norm

#TODO:
# - make sure we only receive int input for wang-ryzin and aitchison-aitken
@@ -155,7 +154,11 @@ def aitchison_aitken_convolution(h, Xi, Xj):


def gaussian_cdf(h, Xi, x):
return 0.5 * h * (1 + erf((x - Xi) / (h * np.sqrt(2))))
return h * norm._cdf((x - Xi)/h)


def gaussian_icdf(h, Xi, x):
return Xi + h**2 * norm._ppf(x)


def aitchison_aitken_cdf(h, Xi, x_u):
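The rewritten gaussian_cdf should agree with the erf form it replaces, since h * Phi((x - Xi)/h) = 0.5 * h * (1 + erf((x - Xi)/(h*sqrt(2)))). A quick check of that identity, using the public scipy.stats.norm.cdf rather than the private norm._cdf used in the diff:

import numpy as np
from scipy.special import erf
from scipy.stats import norm

h, Xi, x = 0.7, 0.2, 1.5
old = 0.5 * h * (1 + erf((x - Xi) / (h * np.sqrt(2))))
new = h * norm.cdf((x - Xi) / h)
assert np.allclose(old, new)
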
30 changes: 28 additions & 2 deletions statsmodels/sandbox/nonparametric/kernels.py
@@ -22,7 +22,7 @@
import numpy as np
import scipy.integrate
from numpy import exp, multiply, square, divide, subtract, inf

from scipy.special import erf

class NdKernel(object):
"""Generic N-dimensial kernel
@@ -68,7 +68,6 @@ def setH(self, value):
def density(self, xs, x):
n = len(xs)
#xs = self.inDomain( xs, xs, x )[0]

if len(xs)>0: ## Need to do product of marginal distributions
#w = np.sum([self(self._Hrootinv * (xx-x).T ) for xx in xs])/n
#vectorized doesn't work:
@@ -190,6 +189,29 @@ def density(self, xs, x):
else:
return np.nan

def density_variance(self, xs, x):
"""Returns the variance of the kernel density estimate for a point x
based on x-values xs.

This uses the approximate variance formula
$var(\hat{f}(x)) \approx \frac{f(x)R(K)}{nh} - \frac{f(x)^2}{n}$,
where $R(K) = \int K(u)^2\,du$.
"""

xs = np.asarray(xs)
n = len(xs)
xs = self.inDomain( xs, xs, x )[0]
if xs.ndim == 1:
xs = xs[:, None]
if len(xs)>0:
h = self.h
# Note 05/07/13 - _L2Norm actually seems to store the square of the
# norm. This is equal to R(K).
R = self._L2Norm
w = (1./h * self.density(xs, x) * R - self.density(xs, x)**2)/n
return w
else:
return np.nan

def smooth(self, xs, ys, x):
"""Returns the kernel smoothing estimate for point x based on x-values
xs and y-values ys.
@@ -375,6 +397,7 @@ def __init__(self, h=1.0):
np.exp(-x**2/2.0), h = h, domain = None, norm = 1.0)
self._L2Norm = 1.0/(2.0*np.sqrt(np.pi))


def smooth(self, xs, ys, x):
"""Returns the kernel smoothing estimate for point x based on x-values
xs and y-values ys.
@@ -388,6 +411,9 @@ def smooth(self, xs, ys, x):
self.h)), -0.5))))
return v/w

def cdf(self, Xi, x, h):
return 0.5 * (1 + erf( (x - Xi)/(h * np.sqrt(2) ) ) )

class Cosine(CustomKernel):
"""
Cosine Kernel
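Finally, a standalone sketch of the approximate pointwise variance that density_variance computes, specialised to the Gaussian kernel where R(K) = 1/(2*sqrt(pi)); the sample and bandwidth here are illustrative:

import numpy as np

def kde_variance_gaussian(xs, x, h):
    # approximate variance of a Gaussian KDE at x:
    # var(fhat(x)) ~ f(x) * R(K) / (n * h) - f(x)**2 / n, with R(K) = 1/(2*sqrt(pi))
    xs = np.asarray(xs)
    n = len(xs)
    R = 1.0 / (2.0 * np.sqrt(np.pi))
    fhat = np.mean(np.exp(-0.5 * ((x - xs) / h) ** 2)) / (h * np.sqrt(2 * np.pi))
    return (fhat * R / h - fhat ** 2) / n

xs = np.random.standard_normal(200)
print(kde_variance_gaussian(xs, 0.0, h=0.4))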