statsmodels/statsmodels

Switch branches/tags
Nothing to show
Fetching contributors…
Cannot retrieve contributors at this time
184 lines (157 sloc) 4.93 KB
 """ Empirical CDF Functions """ import numpy as np from scipy.interpolate import interp1d def _conf_set(F, alpha=.05): r""" Constructs a Dvoretzky-Kiefer-Wolfowitz confidence band for the eCDF. Parameters ---------- F : array-like The empirical distributions alpha : float Set alpha for a (1 - alpha) % confidence band. Notes ----- Based on the DKW inequality. .. math:: P \left( \sup_x \left| F(x) - \hat(F)_n(X) \right| > \epsilon \right) \leq 2e^{-2n\epsilon^2} References ---------- Wasserman, L. 2006. All of Nonparametric Statistics. Springer. """ nobs = len(F) epsilon = np.sqrt(np.log(2./alpha) / (2 * nobs)) lower = np.clip(F - epsilon, 0, 1) upper = np.clip(F + epsilon, 0, 1) return lower, upper class StepFunction(object): """ A basic step function. Values at the ends are handled in the simplest way possible: everything to the left of x[0] is set to ival; everything to the right of x[-1] is set to y[-1]. Parameters ---------- x : array-like y : array-like ival : float ival is the value given to the values to the left of x[0]. Default is 0. sorted : bool Default is False. side : {'left', 'right'}, optional Default is 'left'. Defines the shape of the intervals constituting the steps. 'right' correspond to [a, b) intervals and 'left' to (a, b]. Examples -------- >>> import numpy as np >>> from statsmodels.distributions.empirical_distribution import StepFunction >>> >>> x = np.arange(20) >>> y = np.arange(20) >>> f = StepFunction(x, y) >>> >>> print(f(3.2)) 3.0 >>> print(f([[3.2,4.5],[24,-3.1]])) [[ 3. 4.] [ 19. 0.]] >>> f2 = StepFunction(x, y, side='right') >>> >>> print(f(3.0)) 2.0 >>> print(f2(3.0)) 3.0 """ def __init__(self, x, y, ival=0., sorted=False, side='left'): if side.lower() not in ['right', 'left']: msg = "side can take the values 'right' or 'left'" raise ValueError(msg) self.side = side _x = np.asarray(x) _y = np.asarray(y) if _x.shape != _y.shape: msg = "x and y do not have the same shape" raise ValueError(msg) if len(_x.shape) != 1: msg = 'x and y must be 1-dimensional' raise ValueError(msg) self.x = np.r_[-np.inf, _x] self.y = np.r_[ival, _y] if not sorted: asort = np.argsort(self.x) self.x = np.take(self.x, asort, 0) self.y = np.take(self.y, asort, 0) self.n = self.x.shape[0] def __call__(self, time): tind = np.searchsorted(self.x, time, self.side) - 1 return self.y[tind] class ECDF(StepFunction): """ Return the Empirical CDF of an array as a step function. Parameters ---------- x : array-like Observations side : {'left', 'right'}, optional Default is 'right'. Defines the shape of the intervals constituting the steps. 'right' correspond to [a, b) intervals and 'left' to (a, b]. Returns ------- Empirical CDF as a step function. Examples -------- >>> import numpy as np >>> from statsmodels.distributions.empirical_distribution import ECDF >>> >>> ecdf = ECDF([3, 3, 1, 4]) >>> >>> ecdf([3, 55, 0.5, 1.5]) array([ 0.75, 1. , 0. , 0.25]) """ def __init__(self, x, side='right'): step = True if step: #TODO: make this an arg and have a linear interpolation option? x = np.array(x, copy=True) x.sort() nobs = len(x) y = np.linspace(1./nobs,1,nobs) super(ECDF, self).__init__(x, y, side=side, sorted=True) else: return interp1d(x,y,drop_errors=False,fill_values=ival) def monotone_fn_inverter(fn, x, vectorized=True, **keywords): """ Given a monotone function fn (no checking is done to verify monotonicity) and a set of x values, return an linearly interpolated approximation to its inverse from its values on x. """ x = np.asarray(x) if vectorized: y = fn(x, **keywords) else: y = [] for _x in x: y.append(fn(_x, **keywords)) y = np.array(y) a = np.argsort(y) return interp1d(y[a], x[a]) if __name__ == "__main__": #TODO: Make sure everything is correctly aligned and make a plotting # function from statsmodels.compat.python import urlopen import matplotlib.pyplot as plt nerve_data = urlopen('http://www.statsci.org/data/general/nerve.txt') nerve_data = np.loadtxt(nerve_data) x = nerve_data / 50. # was in 1/50 seconds cdf = ECDF(x) x.sort() F = cdf(x) plt.step(x, F) lower, upper = _conf_set(F) plt.step(x, lower, 'r') plt.step(x, upper, 'r') plt.xlim(0, 1.5) plt.ylim(0, 1.05) plt.vlines(x, 0, .05) plt.show()