Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

remove models code because it isn't ready for release

  • Loading branch information...
commit 174e708f823162bb56248c63ee1985f672797503 1 parent d38ffbe
@jarrodmillman jarrodmillman authored
Showing with 0 additions and 5,790 deletions.
  1. +0 −28 scipy/stats/models/LICENSE.txt
  2. +0 −36 scipy/stats/models/TODO.txt
  3. +0 −20 scipy/stats/models/__init__.py
  4. +0 −663 scipy/stats/models/bspline.py
  5. +0 −154 scipy/stats/models/contrast.py
  6. +0 −210 scipy/stats/models/cox.py
  7. +0 −14 scipy/stats/models/family/__init__.py
  8. +0 −258 scipy/stats/models/family/family.py
  9. +0 −390 scipy/stats/models/family/links.py
  10. +0 −87 scipy/stats/models/family/varfuncs.py
  11. +0 −754 scipy/stats/models/formula.py
  12. +0 −248 scipy/stats/models/gam.py
  13. +0 −89 scipy/stats/models/glm.py
  14. +0 −28 scipy/stats/models/info.py
  15. +0 −346 scipy/stats/models/mixed.py
  16. +0 −192 scipy/stats/models/model.py
  17. +0 −376 scipy/stats/models/regression.py
  18. +0 −80 scipy/stats/models/rlm.py
  19. +0 −8 scipy/stats/models/robust/__init__.py
  20. +0 −221 scipy/stats/models/robust/norms.py
  21. +0 −123 scipy/stats/models/robust/scale.py
  22. +0 −20 scipy/stats/models/setup.py
  23. +0 −20 scipy/stats/models/setupscons.py
  24. +0 −250 scipy/stats/models/smoothers.py
  25. +0 −135 scipy/stats/models/src/bspline_ext.c
  26. +0 −274 scipy/stats/models/src/bspline_impl.c
  27. +0 −18 scipy/stats/models/survival.py
  28. +0 −48 scipy/stats/models/tests/test_bspline.py
  29. +0 −318 scipy/stats/models/tests/test_formula.py
  30. +0 −34 scipy/stats/models/tests/test_glm.py
  31. +0 −45 scipy/stats/models/tests/test_regression.py
  32. +0 −30 scipy/stats/models/tests/test_rlm.py
  33. +0 −58 scipy/stats/models/tests/test_scale.py
  34. +0 −58 scipy/stats/models/tests/test_utils.py
  35. +0 −155 scipy/stats/models/utils.py
  36. +0 −1  scipy/stats/setup.py
  37. +0 −1  scipy/stats/setupscons.py
View
28 scipy/stats/models/LICENSE.txt
@@ -1,28 +0,0 @@
-Copyright (C) 2006, Jonathan E. Taylor
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials provided
- with the distribution.
-
-3. The name of the author may not be used to endorse or promote
- products derived from this software without specific prior
- written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
-IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
View
36 scipy/stats/models/TODO.txt
@@ -1,36 +0,0 @@
-TODO for scipy.stats.models
-===========================
-
-In converting the bspline.so from a weave build to a C extension, we
-found several things that should be fixed or looked into more
-thoroughly. Hopefully we can dedicate some time to this effort at the
-Scipy Conf 2008. However, many of these items should be addressed
-before stats.models goes into a release of scipy.
-
-Items
------
-
-* Run pychecker on the stats.models and fix numerous errors. There
- are import errors, undefined globals, undefined attrs,
- etc... Running the command below in stats/models produced 140+
- errors.::
-
- # Run pychecker on all python modules except __init__.py
- $ grind "[a-z|_][a-z]*.py" | xargs pychecker
-
-* Address the FIXME issues in the code.
-
-* Determine and cleanup the public API. Functions/classes used
- internally should be private (leading underscore). Public functions
- should be obvious and documented. Packaging should be reviewed and
- cleaned up.
-
-* Update documentation to scipy standards. Especially adding example
- sections showing how to use the public functions.
-
-* Tests! Robust tests are needed! Of the subset of tests we looked
- at, most only checked attribute setting, not the results of applying
- the function to data.
-
-* Remove code duplication. smoothers.py and bspline.py define
- SmoothingSpline class.
View
20 scipy/stats/models/__init__.py
@@ -1,20 +0,0 @@
-#
-# models - Statistical Models
-#
-
-__docformat__ = 'restructuredtext'
-
-from scipy.stats.models.info import __doc__
-
-import scipy.stats.models.model
-import scipy.stats.models.formula
-import scipy.stats.models.regression
-import scipy.stats.models.robust
-import scipy.stats.models.family
-from scipy.stats.models.glm import Model as glm
-from scipy.stats.models.rlm import Model as rlm
-
-__all__ = filter(lambda s:not s.startswith('_'),dir())
-
-from numpy.testing import Tester
-test = Tester().test
View
663 scipy/stats/models/bspline.py
@@ -1,663 +0,0 @@
-'''
-Bsplines and smoothing splines.
-
-General references:
-
- Craven, P. and Wahba, G. (1978) "Smoothing noisy data with spline functions.
- Estimating the correct degree of smoothing by
- the method of generalized cross-validation."
- Numerische Mathematik, 31(4), 377-403.
-
- Hastie, Tibshirani and Friedman (2001). "The Elements of Statistical
- Learning." Springer-Verlag. 536 pages.
-
- Hutchison, M. and Hoog, F. "Smoothing noisy data with spline functions."
- Numerische Mathematik, 47(1), 99-106.
-'''
-
-import numpy as N
-import numpy.linalg as L
-
-from scipy.linalg import solveh_banded
-from scipy.optimize import golden
-from scipy.stats.models import _hbspline
-
-
-# Issue warning regarding heavy development status of this module
-import warnings
-_msg = "The bspline code is technology preview and requires significant work\
-on the public API and documentation. The API will likely change in the future"
-warnings.warn(_msg, UserWarning)
-
-
-def _band2array(a, lower=0, symmetric=False, hermitian=False):
- """
- Take an upper or lower triangular banded matrix and return a
- numpy array.
-
- INPUTS:
- a -- a matrix in upper or lower triangular banded matrix
- lower -- is the matrix upper or lower triangular?
- symmetric -- if True, return the original result plus its transpose
- hermitian -- if True (and symmetric False), return the original
- result plus its conjugate transposed
-
- """
-
- n = a.shape[1]
- r = a.shape[0]
- _a = 0
-
- if not lower:
- for j in range(r):
- _b = N.diag(a[r-1-j],k=j)[j:(n+j),j:(n+j)]
- _a += _b
- if symmetric and j > 0: _a += _b.T
- elif hermitian and j > 0: _a += _b.conjugate().T
- else:
- for j in range(r):
- _b = N.diag(a[j],k=j)[0:n,0:n]
- _a += _b
- if symmetric and j > 0: _a += _b.T
- elif hermitian and j > 0: _a += _b.conjugate().T
- _a = _a.T
-
- return _a
-
-
-def _upper2lower(ub):
- """
- Convert upper triangular banded matrix to lower banded form.
-
- INPUTS:
- ub -- an upper triangular banded matrix
-
- OUTPUTS: lb
- lb -- a lower triangular banded matrix with same entries
- as ub
- """
-
- lb = N.zeros(ub.shape, ub.dtype)
- nrow, ncol = ub.shape
- for i in range(ub.shape[0]):
- lb[i,0:(ncol-i)] = ub[nrow-1-i,i:ncol]
- lb[i,(ncol-i):] = ub[nrow-1-i,0:i]
- return lb
-
-def _lower2upper(lb):
- """
- Convert lower triangular banded matrix to upper banded form.
-
- INPUTS:
- lb -- a lower triangular banded matrix
-
- OUTPUTS: ub
- ub -- an upper triangular banded matrix with same entries
- as lb
- """
-
- ub = N.zeros(lb.shape, lb.dtype)
- nrow, ncol = lb.shape
- for i in range(lb.shape[0]):
- ub[nrow-1-i,i:ncol] = lb[i,0:(ncol-i)]
- ub[nrow-1-i,0:i] = lb[i,(ncol-i):]
- return ub
-
-def _triangle2unit(tb, lower=0):
- """
- Take a banded triangular matrix and return its diagonal and the
- unit matrix: the banded triangular matrix with 1's on the diagonal,
- i.e. each row is divided by the corresponding entry on the diagonal.
-
- INPUTS:
- tb -- a lower triangular banded matrix
- lower -- if True, then tb is assumed to be lower triangular banded,
- in which case return value is also lower triangular banded.
-
- OUTPUTS: d, b
- d -- diagonal entries of tb
- b -- unit matrix: if lower is False, b is upper triangular
- banded and its rows of have been divided by d,
- else lower is True, b is lower triangular banded
- and its columns have been divided by d.
-
- """
-
- if lower: d = tb[0].copy()
- else: d = tb[-1].copy()
-
- if lower: return d, (tb / d)
- else:
- l = _upper2lower(tb)
- return d, _lower2upper(l / d)
-
-def _trace_symbanded(a, b, lower=0):
- """
- Compute the trace(ab) for two banded real symmetric matrices
- stored in either upper or lower form.
-
- INPUTS:
- a, b -- two banded real symmetric matrices (either lower or upper)
- lower -- if True, a and b are assumed to be the lower half
-
-
- OUTPUTS: trace
- trace -- trace(ab)
-
- """
-
- if lower:
- t = _zero_triband(a * b, lower=1)
- return t[0].sum() + 2 * t[1:].sum()
- else:
- t = _zero_triband(a * b, lower=0)
- return t[-1].sum() + 2 * t[:-1].sum()
-
-
-def _zero_triband(a, lower=0):
- """
- Explicitly zero out unused elements of a real symmetric banded matrix.
-
- INPUTS:
- a -- a real symmetric banded matrix (either upper or lower half)
- lower -- if True, a is assumed to be the lower half
-
- """
-
- nrow, ncol = a.shape
- if lower:
- for i in range(nrow): a[i,(ncol-i):] = 0.
- else:
- for i in range(nrow): a[i,0:i] = 0.
- return a
-
-
-class BSpline(object):
-
- '''
-
- Bsplines of a given order and specified knots.
-
- Implementation is based on description in Chapter 5 of
-
- Hastie, Tibshirani and Friedman (2001). "The Elements of Statistical
- Learning." Springer-Verlag. 536 pages.
-
-
- INPUTS:
- knots -- a sorted array of knots with knots[0] the lower boundary,
- knots[1] the upper boundary and knots[1:-1] the internal
- knots.
- order -- order of the Bspline, default is 4 which yields cubic
- splines
- M -- number of additional boundary knots, if None it defaults
- to order
- coef -- an optional array of real-valued coefficients for the Bspline
- of shape (knots.shape + 2 * (M - 1) - order,).
- x -- an optional set of x values at which to evaluate the
- Bspline to avoid extra evaluation in the __call__ method
-
- '''
- # FIXME: update parameter names, replace single character names
- # FIXME: `order` should be actual spline order (implemented as order+1)
- ## FIXME: update the use of spline order in extension code (evaluate is recursively called)
- # FIXME: eliminate duplicate M and m attributes (m is order, M is related to tau size)
-
- def __init__(self, knots, order=4, M=None, coef=None, x=None):
-
- knots = N.squeeze(N.unique(N.asarray(knots)))
-
- if knots.ndim != 1:
- raise ValueError, 'expecting 1d array for knots'
-
- self.m = order
- if M is None:
- M = self.m
- self.M = M
-
- self.tau = N.hstack([[knots[0]]*(self.M-1), knots, [knots[-1]]*(self.M-1)])
-
- self.K = knots.shape[0] - 2
- if coef is None:
- self.coef = N.zeros((self.K + 2 * self.M - self.m), N.float64)
- else:
- self.coef = N.squeeze(coef)
- if self.coef.shape != (self.K + 2 * self.M - self.m):
- raise ValueError, 'coefficients of Bspline have incorrect shape'
- if x is not None:
- self.x = x
-
- def _setx(self, x):
- self._x = x
- self._basisx = self.basis(self._x)
-
- def _getx(self):
- return self._x
-
- x = property(_getx, _setx)
-
- def __call__(self, *args):
- """
- Evaluate the BSpline at a given point, yielding
- a matrix B and return
-
- B * self.coef
-
-
- INPUTS:
- args -- optional arguments. If None, it returns self._basisx,
- the BSpline evaluated at the x values passed in __init__.
- Otherwise, return the BSpline evaluated at the
- first argument args[0].
-
- OUTPUTS: y
- y -- value of Bspline at specified x values
-
- BUGS:
- If self has no attribute x, an exception will be raised
- because self has no attribute _basisx.
-
- """
-
- if not args:
- b = self._basisx.T
- else:
- x = args[0]
- b = N.asarray(self.basis(x)).T
- return N.squeeze(N.dot(b, self.coef))
-
- def basis_element(self, x, i, d=0):
- """
- Evaluate a particular basis element of the BSpline,
- or its derivative.
-
- INPUTS:
- x -- x values at which to evaluate the basis element
- i -- which element of the BSpline to return
- d -- the order of derivative
-
- OUTPUTS: y
- y -- value of d-th derivative of the i-th basis element
- of the BSpline at specified x values
-
- """
-
- x = N.asarray(x, N.float64)
- _shape = x.shape
- if _shape == ():
- x.shape = (1,)
- x.shape = (N.product(_shape,axis=0),)
- if i < self.tau.shape[0] - 1:
- ## TODO: OWNDATA flags...
- v = _hbspline.evaluate(x, self.tau, self.m, d, i, i+1)
- else:
- return N.zeros(x.shape, N.float64)
-
- if (i == self.tau.shape[0] - self.m):
- v = N.where(N.equal(x, self.tau[-1]), 1, v)
- v.shape = _shape
- return v
-
- def basis(self, x, d=0, lower=None, upper=None):
- """
- Evaluate the basis of the BSpline or its derivative.
- If lower or upper is specified, then only
- the [lower:upper] elements of the basis are returned.
-
- INPUTS:
- x -- x values at which to evaluate the basis element
- i -- which element of the BSpline to return
- d -- the order of derivative
- lower -- optional lower limit of the set of basis
- elements
- upper -- optional upper limit of the set of basis
- elements
-
- OUTPUTS: y
- y -- value of d-th derivative of the basis elements
- of the BSpline at specified x values
-
- """
- x = N.asarray(x)
- _shape = x.shape
- if _shape == ():
- x.shape = (1,)
- x.shape = (N.product(_shape,axis=0),)
-
- if upper is None:
- upper = self.tau.shape[0] - self.m
- if lower is None:
- lower = 0
- upper = min(upper, self.tau.shape[0] - self.m)
- lower = max(0, lower)
-
- d = N.asarray(d)
- if d.shape == ():
- v = _hbspline.evaluate(x, self.tau, self.m, int(d), lower, upper)
- else:
- if d.shape[0] != 2:
- raise ValueError, "if d is not an integer, expecting a jx2 \
- array with first row indicating order \
- of derivative, second row coefficient in front."
-
- v = 0
- for i in range(d.shape[1]):
- v += d[1,i] * _hbspline.evaluate(x, self.tau, self.m, d[0,i], lower, upper)
-
- v.shape = (upper-lower,) + _shape
- if upper == self.tau.shape[0] - self.m:
- v[-1] = N.where(N.equal(x, self.tau[-1]), 1, v[-1])
- return v
-
- def gram(self, d=0):
- """
- Compute Gram inner product matrix, storing it in lower
- triangular banded form.
-
- The (i,j) entry is
-
- G_ij = integral b_i^(d) b_j^(d)
-
- where b_i are the basis elements of the BSpline and (d) is the
- d-th derivative.
-
- If d is a matrix then, it is assumed to specify a differential
- operator as follows: the first row represents the order of derivative
- with the second row the coefficient corresponding to that order.
-
- For instance:
-
- [[2, 3],
- [3, 1]]
-
- represents 3 * f^(2) + 1 * f^(3).
-
- INPUTS:
- d -- which derivative to apply to each basis element,
- if d is a matrix, it is assumed to specify
- a differential operator as above
-
- OUTPUTS: gram
- gram -- the matrix of inner products of (derivatives)
- of the BSpline elements
-
- """
-
- d = N.squeeze(d)
- if N.asarray(d).shape == ():
- self.g = _hbspline.gram(self.tau, self.m, int(d), int(d))
- else:
- d = N.asarray(d)
- if d.shape[0] != 2:
- raise ValueError, "if d is not an integer, expecting a jx2 \
- array with first row indicating order \
- of derivative, second row coefficient in front."
- if d.shape == (2,):
- d.shape = (2,1)
- self.g = 0
- for i in range(d.shape[1]):
- for j in range(d.shape[1]):
- self.g += d[1,i]* d[1,j] * _hbspline.gram(self.tau, self.m, int(d[0,i]), int(d[0,j]))
- self.g = self.g.T
- self.d = d
- return N.nan_to_num(self.g)
-
-class SmoothingSpline(BSpline):
-
- penmax = 30.
- method = "target_df"
- target_df = 5
- default_pen = 1.0e-03
- optimize = True
-
- '''
- A smoothing spline, which can be used to smooth scatterplots, i.e.
- a list of (x,y) tuples.
-
- See fit method for more information.
-
- '''
-
- def fit(self, y, x=None, weights=None, pen=0.):
- """
- Fit the smoothing spline to a set of (x,y) pairs.
-
- INPUTS:
- y -- response variable
- x -- if None, uses self.x
- weights -- optional array of weights
- pen -- constant in front of Gram matrix
-
- OUTPUTS: None
- The smoothing spline is determined by self.coef,
- subsequent calls of __call__ will be the smoothing spline.
-
- ALGORITHM:
- Formally, this solves a minimization:
-
- fhat = ARGMIN_f SUM_i=1^n (y_i-f(x_i))^2 + pen * int f^(2)^2
-
- int is integral. pen is lambda (from Hastie)
-
- See Chapter 5 of
-
- Hastie, Tibshirani and Friedman (2001). "The Elements of Statistical
- Learning." Springer-Verlag. 536 pages.
-
- for more details.
-
- TODO:
- Should add arbitrary derivative penalty instead of just
- second derivative.
- """
-
- banded = True
-
- if x is None:
- x = self._x
- bt = self._basisx.copy()
- else:
- bt = self.basis(x)
-
- if pen == 0.: # can't use cholesky for singular matrices
- banded = False
-
- if x.shape != y.shape:
- raise ValueError, 'x and y shape do not agree, by default x are \
- the Bspline\'s internal knots'
-
- if pen >= self.penmax:
- pen = self.penmax
-
-
- if weights is not None:
- self.weights = weights
- else:
- self.weights = 1.
-
- _w = N.sqrt(self.weights)
- bt *= _w
-
- # throw out rows with zeros (this happens at boundary points!)
-
- mask = N.flatnonzero(1 - N.alltrue(N.equal(bt, 0), axis=0))
-
- bt = bt[:,mask]
- y = y[mask]
-
- self.df_total = y.shape[0]
-
- bty = N.squeeze(N.dot(bt, _w * y))
- self.N = y.shape[0]
-
- if not banded:
- self.btb = N.dot(bt, bt.T)
- _g = _band2array(self.g, lower=1, symmetric=True)
- self.coef, _, self.rank = L.lstsq(self.btb + pen*_g, bty)[0:3]
- self.rank = min(self.rank, self.btb.shape[0])
- del(_g)
- else:
- self.btb = N.zeros(self.g.shape, N.float64)
- nband, nbasis = self.g.shape
- for i in range(nbasis):
- for k in range(min(nband, nbasis-i)):
- self.btb[k,i] = (bt[i] * bt[i+k]).sum()
-
- bty.shape = (1,bty.shape[0])
- self.pen = pen
- self.chol, self.coef = solveh_banded(self.btb +
- pen*self.g,
- bty, lower=1)
-
- self.coef = N.squeeze(self.coef)
- self.resid = y * self.weights - N.dot(self.coef, bt)
- self.pen = pen
-
- del(bty); del(mask); del(bt)
-
- def smooth(self, y, x=None, weights=None):
-
- if self.method == "target_df":
- if hasattr(self, 'pen'):
- self.fit(y, x=x, weights=weights, pen=self.pen)
- else:
- self.fit_target_df(y, x=x, weights=weights, df=self.target_df)
- elif self.method == "optimize_gcv":
- self.fit_optimize_gcv(y, x=x, weights=weights)
-
-
- def gcv(self):
- """
- Generalized cross-validation score of current fit.
-
- Craven, P. and Wahba, G. "Smoothing noisy data with spline functions.
- Estimating the correct degree of smoothing by
- the method of generalized cross-validation."
- Numerische Mathematik, 31(4), 377-403.
- """
-
- norm_resid = (self.resid**2).sum()
- return norm_resid / (self.df_total - self.trace())
-
- def df_resid(self):
- """
- Residual degrees of freedom in the fit.
-
- self.N - self.trace()
-
- where self.N is the number of observations of last fit.
- """
-
- return self.N - self.trace()
-
- def df_fit(self):
- """
- How many degrees of freedom used in the fit?
-
- self.trace()
-
- """
- return self.trace()
-
- def trace(self):
- """
- Trace of the smoothing matrix S(pen)
-
- TODO: addin a reference to Wahba, and whoever else I used.
- """
-
- if self.pen > 0:
- _invband = _hbspline.invband(self.chol.copy())
- tr = _trace_symbanded(_invband, self.btb, lower=1)
- return tr
- else:
- return self.rank
-
- def fit_target_df(self, y, x=None, df=None, weights=None, tol=1.0e-03,
- apen=0, bpen=1.0e-03):
-
- """
- Fit smoothing spline with approximately df degrees of freedom
- used in the fit, i.e. so that self.trace() is approximately df.
-
- Uses binary search strategy.
-
- In general, df must be greater than the dimension of the null space
- of the Gram inner product. For cubic smoothing splines, this means
- that df > 2.
-
- INPUTS:
- y -- response variable
- x -- if None, uses self.x
- df -- target degrees of freedom
- weights -- optional array of weights
- tol -- (relative) tolerance for convergence
- apen -- lower bound of penalty for binary search
- bpen -- upper bound of penalty for binary search
-
- OUTPUTS: None
- The smoothing spline is determined by self.coef,
- subsequent calls of __call__ will be the smoothing spline.
-
- """
-
- df = df or self.target_df
-
- olddf = y.shape[0] - self.m
-
- if hasattr(self, "pen"):
- self.fit(y, x=x, weights=weights, pen=self.pen)
- curdf = self.trace()
- if N.fabs(curdf - df) / df < tol:
- return
- if curdf > df:
- apen, bpen = self.pen, 2 * self.pen
- else:
- apen, bpen = 0., self.pen
-
- while True:
-
- curpen = 0.5 * (apen + bpen)
- self.fit(y, x=x, weights=weights, pen=curpen)
- curdf = self.trace()
- if curdf > df:
- apen, bpen = curpen, 2 * curpen
- else:
- apen, bpen = apen, curpen
- if apen >= self.penmax:
- raise ValueError, "penalty too large, try setting penmax \
- higher or decreasing df"
- if N.fabs(curdf - df) / df < tol:
- break
-
- def fit_optimize_gcv(self, y, x=None, weights=None, tol=1.0e-03,
- brack=(-100,20)):
- """
- Fit smoothing spline trying to optimize GCV.
-
- Try to find a bracketing interval for scipy.optimize.golden
- based on bracket.
-
- It is probably best to use target_df instead, as it is
- sometimes difficult to find a bracketing interval.
-
- INPUTS:
- y -- response variable
- x -- if None, uses self.x
- df -- target degrees of freedom
- weights -- optional array of weights
- tol -- (relative) tolerance for convergence
- brack -- an initial guess at the bracketing interval
-
- OUTPUTS: None
- The smoothing spline is determined by self.coef,
- subsequent calls of __call__ will be the smoothing spline.
-
- """
-
- def _gcv(pen, y, x):
- self.fit(y, x=x, pen=N.exp(pen))
- a = self.gcv()
- return a
-
- a = golden(_gcv, args=(y,x), brack=bracket, tol=tol)
View
154 scipy/stats/models/contrast.py
@@ -1,154 +0,0 @@
-import copy
-
-import numpy as N
-from numpy.linalg import pinv
-from scipy.stats.models import utils
-
-class ContrastResults:
- """
- Results from looking at a particular contrast of coefficients in
- a parametric model. The class does nothing, it is a container
- for the results from T and F contrasts.
- """
-
- def __init__(self, t=None, F=None, sd=None, effect=None, df_denom=None,
- df_num=None):
- if F is not None:
- self.F = F
- self.df_denom = df_denom
- self.df_num = df_num
- else:
- self.t = t
- self.sd = sd
- self.effect = effect
- self.df_denom = df_denom
-
- def __str__(self):
- if hasattr(self, 'F'):
- return '<F contrast: F=%s, df_denom=%d, df_num=%d>' % \
- (`self.F`, self.df_denom, self.df_num)
- else:
- return '<T contrast: effect=%s, sd=%s, t=%s, df_denom=%d>' % \
- (`self.effect`, `self.sd`, `self.t`, self.df_denom)
-
-
-class Contrast:
- """
- This class is used to construct contrast matrices in regression models.
- They are specified by a (term, formula) pair.
-
- The term, T, is a linear combination of columns of the design
- matrix D=formula(). The getmatrix method constructs
- a contrast matrix C so that
-
- colspan(dot(D, C)) = colspan(dot(D, dot(pinv(D), T)))
-
- where pinv(D) is the generalized inverse of D. Further, the matrix
-
- Tnew = dot(C, D)
-
- is full rank. The rank attribute is the rank of
-
- dot(D, dot(pinv(D), T))
-
- In a regression model, the contrast tests that E(dot(Tnew, Y)) = 0
- for each column of Tnew.
-
- """
-
- def __init__(self, term, formula, name=''):
- self.term = term
- self.formula = formula
- if name is '':
- self.name = str(term)
- else:
- self.name = name
-
- def __str__(self):
- return '<contrast:%s>' % \
- `{'term':str(self.term), 'formula':str(self.formula)}`
-
- def getmatrix(self, *args, **kw):
- """
- Construct a contrast matrix C so that
-
- colspan(dot(D, C)) = colspan(dot(D, dot(pinv(D), T)))
-
- where pinv(D) is the generalized inverse of D=self.D=self.formula().
-
- If the design, self.D is already set,
- then evaldesign can be set to False.
- """
-
- t = copy.copy(self.term)
- t.namespace = self.formula.namespace
- T = N.transpose(N.array(t(*args, **kw)))
-
- if T.ndim == 1:
- T.shape = (T.shape[0], 1)
-
- self.T = utils.clean0(T)
-
- self.D = self.formula.design(*args, **kw)
-
- self.matrix = contrastfromcols(self.T, self.D)
- try:
- self.rank = self.matrix.shape[1]
- except:
- self.rank = 1
-
-
-def contrastfromcols(L, D, pseudo=None):
- """
- From an n x p design matrix D and a matrix L, tries
- to determine a p x q contrast matrix C which
- determines a contrast of full rank, i.e. the
- n x q matrix
-
- dot(transpose(C), pinv(D))
-
- is full rank.
-
- L must satisfy either L.shape[0] == n or L.shape[1] == p.
-
- If L.shape[0] == n, then L is thought of as representing
- columns in the column space of D.
-
- If L.shape[1] == p, then L is thought of as what is known
- as a contrast matrix. In this case, this function returns an estimable
- contrast corresponding to the dot(D, L.T)
-
- Note that this always produces a meaningful contrast, not always
- with the intended properties because q is always non-zero unless
- L is identically 0. That is, it produces a contrast that spans
- the column space of L (after projection onto the column space of D).
-
- """
-
- L = N.asarray(L)
- D = N.asarray(D)
-
- n, p = D.shape
-
- if L.shape[0] != n and L.shape[1] != p:
- raise ValueError, 'shape of L and D mismatched'
-
- if pseudo is None:
- pseudo = pinv(D)
-
- if L.shape[0] == n:
- C = N.dot(pseudo, L).T
- else:
- C = L
- C = N.dot(pseudo, N.dot(D, C.T)).T
-
- Lp = N.dot(D, C.T)
-
- if len(Lp.shape) == 1:
- Lp.shape = (n, 1)
-
- if utils.rank(Lp) != Lp.shape[1]:
- Lp = utils.fullrank(Lp)
- C = N.dot(pseudo, Lp).T
-
- return N.squeeze(C)
View
210 scipy/stats/models/cox.py
@@ -1,210 +0,0 @@
-import shutil
-import tempfile
-
-import numpy as N
-
-from scipy.stats.models import survival, model
-
-class Discrete:
-
- """
- A simple little class for working with discrete random vectors.
- """
-
- def __init__(self, x, w=None):
- self.x = N.squeeze(x)
- if self.x.shape == ():
- self.x = N.array([self.x])
- self.n = self.x.shape[0]
- if w is None:
- w = N.ones(self.n, N.float64)
- else:
- if w.shape[0] != self.n:
- raise ValueError, 'incompatible shape for weights w'
- if N.any(N.less(w, 0)):
- raise ValueError, 'weights should be non-negative'
- self.w = w / w.sum()
-
- def mean(self, f=None):
- if f is None:
- fx = self.x
- else:
- fx = f(self.x)
- return (fx * self.w).sum()
-
- def cov(self):
- mu = self.mean()
- dx = self.x - N.multiply.outer(mu, self.x.shape[1])
- return N.dot(dx, N.transpose(dx))
-
-class Observation(survival.RightCensored):
-
- def __getitem__(self, item):
- if self.namespace is not None:
- return self.namespace[item]
- else:
- return getattr(self, item)
-
- def __init__(self, time, delta, namespace=None):
- self.namespace = namespace
- survival.RightCensored.__init__(self, time, delta)
-
- def __call__(self, formula, time=None, **extra):
- return formula(namespace=self, time=time, **extra)
-
-class CoxPH(model.LikelihoodModel):
- """Cox proportional hazards regression model."""
-
- def __init__(self, subjects, formula, time_dependent=False):
- self.subjects, self.formula = subjects, formula
- self.time_dependent = time_dependent
- self.initialize(self.subjects)
-
- def initialize(self, subjects):
-
- self.failures = {}
- for i in range(len(subjects)):
- s = subjects[i]
- if s.delta:
- if s.time not in self.failures:
- self.failures[s.time] = [i]
- else:
- self.failures[s.time].append(i)
-
- self.failure_times = self.failures.keys()
- self.failure_times.sort()
-
- def cache(self):
- if self.time_dependent:
- self.cachedir = tempfile.mkdtemp()
-
- self.design = {}
- self.risk = {}
- first = True
-
- for t in self.failures.keys():
- if self.time_dependent:
- d = N.array([s(self.formula, time=t)
- for s in self.subjects]).astype('<f8')
- dshape = d.shape
- dfile = file(tempfile.mkstemp(dir=self.cachedir)[1], 'w')
- d.tofile(dfile)
- dfile.close()
- del(d)
- self.design[t] = N.memmap(dfile.name,
- dtype=N.dtype('<f8'),
- shape=dshape)
- elif first:
- d = N.array([s(self.formula, time=t)
- for s in self.subjects]).astype(N.float64)
- self.design[t] = d
- else:
- self.design[t] = d
- self.risk[t] = N.compress([s.atrisk(t) for s in self.subjects],
- N.arange(self.design[t].shape[0]),axis=-1)
- def __del__(self):
- shutil.rmtree(self.cachedir, ignore_errors=True)
-
- def logL(self, b, ties='breslow'):
-
- logL = 0
- for t in self.failures.keys():
- fail = self.failures[t]
- d = len(fail)
- risk = self.risk[t]
- Zb = N.dot(self.design[t], b)
-
- logL += Zb[fail].sum()
-
- if ties == 'breslow':
- s = N.exp(Zb[risk]).sum()
- logL -= N.log(N.exp(Zb[risk]).sum()) * d
- elif ties == 'efron':
- s = N.exp(Zb[risk]).sum()
- r = N.exp(Zb[fail]).sum()
- for j in range(d):
- logL -= N.log(s - j * r / d)
- elif ties == 'cox':
- raise NotImplementedError, 'Cox tie breaking method not implemented'
- else:
- raise NotImplementedError, 'tie breaking method not recognized'
- return logL
-
- def score(self, b, ties='breslow'):
-
- score = 0
- for t in self.failures.keys():
- fail = self.failures[t]
- d = len(fail)
- risk = self.risk[t]
- Z = self.design[t]
-
- score += Z[fail].sum()
-
- if ties == 'breslow':
- w = N.exp(N.dot(Z, b))
- rv = discrete(Z[risk], w=w[risk])
- score -= rv.mean() * d
- elif ties == 'efron':
- w = N.exp(N.dot(Z, b))
- score += Z[fail].sum()
- for j in range(d):
- efron_w = w
- efron_w[fail] -= i * w[fail] / d
- rv = discrete(Z[risk], w=efron_w[risk])
- score -= rv.mean()
- elif ties == 'cox':
- raise NotImplementedError, 'Cox tie breaking method not implemented'
- else:
- raise NotImplementedError, 'tie breaking method not recognized'
- return N.array([score])
-
- def information(self, b, ties='breslow'):
-
- info = 0
- score = 0
- for t in self.failures.keys():
- fail = self.failures[t]
- d = len(fail)
- risk = self.risk[t]
- Z = self.design[t]
-
- if ties == 'breslow':
- w = N.exp(N.dot(Z, b))
- rv = discrete(Z[risk], w=w[risk])
- info += rv.cov()
- elif ties == 'efron':
- w = N.exp(N.dot(Z, b))
- score += Z[fail].sum()
- for j in range(d):
- efron_w = w
- efron_w[fail] -= i * w[fail] / d
- rv = discrete(Z[risk], w=efron_w[risk])
- info += rv.cov()
- elif ties == 'cox':
- raise NotImplementedError, 'Cox tie breaking method not implemented'
- else:
- raise NotImplementedError, 'tie breaking method not recognized'
- return score
-
-if __name__ == '__main__':
- import numpy.random as R
- n = 100
- X = N.array([0]*n + [1]*n)
- b = 0.4
- lin = 1 + b*X
- Y = R.standard_exponential((2*n,)) / lin
- delta = R.binomial(1, 0.9, size=(2*n,))
-
- subjects = [observation(Y[i], delta[i]) for i in range(2*n)]
- for i in range(2*n):
- subjects[i].X = X[i]
-
- import scipy.stats.models.formula as F
- x = F.quantitative('X')
- f = F.formula(x)
-
- c = coxph(subjects, f)
-
- c.cache()
-# c.newton([0.4])
View
14 scipy/stats/models/family/__init__.py
@@ -1,14 +0,0 @@
-'''
-This module contains the one-parameter exponential families used
-for fitting GLMs and GAMs.
-
-These families are described in
-
- P. McCullagh and J. A. Nelder. "Generalized linear models."
- Monographs on Statistics and Applied Probability.
- Chapman & Hall, London, 1983.
-
-'''
-
-from scipy.stats.models.family.family import Gaussian, Family, \
- Poisson, Gamma, InverseGaussian, Binomial
View
258 scipy/stats/models/family/family.py
@@ -1,258 +0,0 @@
-import numpy as N
-from scipy.stats.models.family import links as L
-from scipy.stats.models.family import varfuncs as V
-
-class Family(object):
-
- """
- A class to model one-parameter exponential
- families.
-
- INPUTS:
- link -- a Link instance
- variance -- a variance function (models variance as a function
- of the mean)
-
- """
-
- valid = [-N.inf, N.inf]
-
- tol = 1.0e-05
- links = []
-
- def _setlink(self, link):
- self._link = link
- if hasattr(self, "links"):
- if link not in self.links:
- raise ValueError, 'invalid link for family, should be in %s' % `self.links`
-
- def _getlink(self):
- return self._link
-
- link = property(_getlink, _setlink)
-
- def __init__(self, link, variance):
-
- self.link = link
- self.variance = variance
-
- def weights(self, mu):
-
- """
- Weights for IRLS step.
-
- w = 1 / (link'(mu)**2 * variance(mu))
-
- INPUTS:
- mu -- mean parameter in exponential family
-
- OUTPUTS:
- w -- weights used in WLS step of GLM/GAM fit
-
- """
-
- return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
-
- def deviance(self, Y, mu, scale=1.):
- """
- Deviance of (Y,mu) pair. Deviance is usually defined
- as the difference
-
- DEV = (SUM_i -2 log Likelihood(Y_i,mu_i) + 2 log Likelihood(mu_i,mu_i)) / scale
-
- INPUTS:
- Y -- response variable
- mu -- mean parameter
- scale -- optional scale in denominator of deviance
-
- OUTPUTS: dev
- dev -- DEV, as described above
-
- """
-
- return N.power(self.devresid(Y, mu), 2).sum() / scale
-
- def devresid(self, Y, mu):
- """
- The deviance residuals, defined as the residuals
- in the deviance.
-
- Without knowing the link, they default to Pearson residuals
-
- resid_P = (Y - mu) * sqrt(weight(mu))
-
- INPUTS:
- Y -- response variable
- mu -- mean parameter
-
- OUTPUTS: resid
- resid -- deviance residuals
- """
-
- return (Y - mu) * N.sqrt(self.weights(mu))
-
- def fitted(self, eta):
- """
- Fitted values based on linear predictors eta.
-
- INPUTS:
- eta -- values of linear predictors, say,
- X beta in a generalized linear model.
-
- OUTPUTS: mu
- mu -- link.inverse(eta), mean parameter based on eta
-
- """
- return self.link.inverse(eta)
-
- def predict(self, mu):
- """
- Linear predictors based on given mu values.
-
- INPUTS:
- mu -- mean parameter of one-parameter exponential family
-
- OUTPUTS: eta
- eta -- link(mu), linear predictors, based on
- mean parameters mu
-
- """
- return self.link(mu)
-
-class Poisson(Family):
-
- """
- Poisson exponential family.
-
- INPUTS:
- link -- a Link instance
-
- """
-
- links = [L.log, L.identity, L.sqrt]
- variance = V.mu
- valid = [0, N.inf]
-
- def __init__(self, link=L.log):
- self.variance = Poisson.variance
- self.link = link
-
- def devresid(self, Y, mu):
- """
- Poisson deviance residual
-
- INPUTS:
- Y -- response variable
- mu -- mean parameter
-
- OUTPUTS: resid
- resid -- deviance residuals
-
- """
- return N.sign(Y - mu) * N.sqrt(2 * Y * N.log(Y / mu) - 2 * (Y - mu))
-
-class Gaussian(Family):
-
- """
- Gaussian exponential family.
-
- INPUTS:
- link -- a Link instance
-
- """
-
- links = [L.log, L.identity, L.inverse]
- variance = V.constant
-
- def __init__(self, link=L.identity):
- self.variance = Gaussian.variance
- self.link = link
-
- def devresid(self, Y, mu, scale=1.):
- """
- Gaussian deviance residual
-
- INPUTS:
- Y -- response variable
- mu -- mean parameter
- scale -- optional scale in denominator (after taking sqrt)
-
- OUTPUTS: resid
- resid -- deviance residuals
- """
-
- return (Y - mu) / N.sqrt(self.variance(mu) * scale)
-
-class Gamma(Family):
-
- """
- Gamma exponential family.
-
- INPUTS:
- link -- a Link instance
-
- BUGS:
- no deviance residuals?
-
- """
-
- links = [L.log, L.identity, L.inverse]
- variance = V.mu_squared
-
- def __init__(self, link=L.identity):
- self.variance = Gamma.variance
- self.link = link
-
-
-class Binomial(Family):
-
- """
- Binomial exponential family.
-
- INPUTS:
- link -- a Link instance
- n -- number of trials for Binomial
- """
-
- links = [L.logit, L.probit, L.cauchy, L.log, L.cloglog]
- variance = V.binary
-
- def __init__(self, link=L.logit, n=1):
- self.n = n
- self.variance = V.Binomial(n=self.n)
- self.link = link
-
- def devresid(self, Y, mu):
- """
- Binomial deviance residual
-
- INPUTS:
- Y -- response variable
- mu -- mean parameter
-
- OUTPUTS: resid
- resid -- deviance residuals
-
- """
-
- mu = self.link.clean(mu)
- return N.sign(Y - mu) * N.sqrt(-2 * (Y * N.log(mu / self.n) + (self.n - Y) * N.log(1 - mu / self.n)))
-
-class InverseGaussian(Family):
-
- """
- InverseGaussian exponential family.
-
- INPUTS:
- link -- a Link instance
- n -- number of trials for Binomial
-
- """
-
- links = [L.inverse_squared, L.inverse, L.identity, L.log]
- variance = V.mu_cubed
-
- def __init__(self, link=L.identity):
- self.n = n
- self.variance = InverseGaussian.variance
- self.link = link
View
390 scipy/stats/models/family/links.py
@@ -1,390 +0,0 @@
-import numpy as N
-import scipy.stats
-
-class Link:
-
- """
- A generic link function for one-parameter exponential
- family, with call, inverse and deriv methods.
-
- """
-
- def initialize(self, Y):
- return N.asarray(Y).mean() * N.ones(Y.shape)
-
- def __call__(self, p):
- return NotImplementedError
-
- def inverse(self, z):
- return NotImplementedError
-
- def deriv(self, p):
- return NotImplementedError
-
-
-class Logit(Link):
-
- """
- The logit transform as a link function:
-
- g'(x) = 1 / (x * (1 - x))
- g^(-1)(x) = exp(x)/(1 + exp(x))
-
- """
-
- tol = 1.0e-10
-
- def clean(self, p):
- """
- Clip logistic values to range (tol, 1-tol)
-
- INPUTS:
- p -- probabilities
-
- OUTPUTS: pclip
- pclip -- clipped probabilities
- """
-
- return N.clip(p, Logit.tol, 1. - Logit.tol)
-
- def __call__(self, p):
- """
- Logit transform
-
- g(p) = log(p / (1 - p))
-
- INPUTS:
- p -- probabilities
-
- OUTPUTS: z
- z -- logit transform of p
-
- """
-
- p = self.clean(p)
- return N.log(p / (1. - p))
-
- def inverse(self, z):
- """
- Inverse logit transform
-
- h(z) = exp(z)/(1+exp(z))
-
- INPUTS:
- z -- logit transform of p
-
- OUTPUTS: p
- p -- probabilities
-
- """
- t = N.exp(z)
- return t / (1. + t)
-
- def deriv(self, p):
-
- """
- Derivative of logit transform
-
- g(p) = 1 / (p * (1 - p))
-
- INPUTS:
- p -- probabilities
-
- OUTPUTS: y
- y -- derivative of logit transform of p
-
- """
- p = self.clean(p)
- return 1. / (p * (1 - p))
-
-logit = Logit()
-
-class Power(Link):
-
- """
- The power transform as a link function:
-
- g(x) = x**power
-
- """
-
- def __init__(self, power=1.):
- self.power = power
-
- def __call__(self, x):
- """
- Power transform
-
- g(x) = x**self.power
-
- INPUTS:
- x -- mean parameters
-
- OUTPUTS: z
- z -- power transform of x
-
- """
-
- return N.power(x, self.power)
-
- def inverse(self, z):
- """
- Inverse of power transform
-
- g(x) = x**(1/self.power)
-
- INPUTS:
- z -- linear predictors in GLM
-
- OUTPUTS: x
- x -- mean parameters
-
- """
- return N.power(z, 1. / self.power)
-
- def deriv(self, x):
- """
- Derivative of power transform
-
- g(x) = self.power * x**(self.power - 1)
-
- INPUTS:
- x -- mean parameters
-
- OUTPUTS: z
- z -- derivative of power transform of x
-
- """
-
- return self.power * N.power(x, self.power - 1)
-
-inverse = Power(power=-1.)
-inverse.__doc__ = """
-
-The inverse transform as a link function:
-
-g(x) = 1 / x
-"""
-
-sqrt = Power(power=0.5)
-sqrt.__doc__ = """
-
-The square-root transform as a link function:
-
-g(x) = sqrt(x)
-"""
-
-inverse_squared = Power(power=-2.)
-inverse_squared.__doc__ = """
-
-The inverse squared transform as a link function:
-
-g(x) = 1 / x**2
-"""
-
-identity = Power(power=1.)
-identity.__doc__ = """
-
-The identity transform as a link function:
-
-g(x) = x
-"""
-
-class Log(Link):
-
- """
- The log transform as a link function:
-
- g(x) = log(x)
-
- """
-
- tol = 1.0e-10
-
- def clean(self, x):
- return N.clip(x, Logit.tol, N.inf)
-
- def __call__(self, x, **extra):
- """
- Log transform
-
- g(x) = log(x)
-
- INPUTS:
- x -- mean parameters
-
- OUTPUTS: z
- z -- log(x)
-
- """
- x = self.clean(x)
- return N.log(x)
-
- def inverse(self, z):
- """
- Inverse of log transform
-
- g(x) = exp(x)
-
- INPUTS:
- z -- linear predictors in GLM
-
- OUTPUTS: x
- x -- exp(z)
-
- """
- return N.exp(z)
-
- def deriv(self, x):
- """
- Derivative of log transform
-
- g(x) = 1/x
-
- INPUTS:
- x -- mean parameters
-
- OUTPUTS: z
- z -- derivative of log transform of x
-
- """
-
- x = self.clean(x)
- return 1. / x
-
-log = Log()
-
-class CDFLink(Logit):
-
- """
- Use the CDF of a scipy.stats distribution as a link function:
-
- g(x) = dbn.ppf(x)
-
- """
-
- def __init__(self, dbn=scipy.stats.norm):
- self.dbn = dbn
-
- def __call__(self, p):
- """
- CDF link
-
- g(p) = self.dbn.ppf(p)
-
- INPUTS:
- p -- mean parameters
-
- OUTPUTS: z
- z -- CDF transform (quantile function) of p
-
- """
- p = self.clean(p)
- return self.dbn.ppf(p)
-
- def inverse(self, z):
- """
- Inverse of CDF link
-
- g(z) = self.dbn.cdf(z)
-
- INPUTS:
- z -- linear predictors in GLM
-
- OUTPUTS: p
- p -- inverse of CDF link of z
-
- """
- return self.dbn.cdf(z)
-
- def deriv(self, p):
- """
- Derivative of CDF link
-
- g(p) = 1/self.dbn.pdf(self.dbn.ppf(p))
-
- INPUTS:
- x -- mean parameters
-
- OUTPUTS: z
- z -- derivative of CDF transform of x
-
- """
- p = self.clean(p)
- return 1. / self.dbn.pdf(self(p))
-
-probit = CDFLink()
-probit.__doc__ = """
-
-The probit (standard normal CDF) transform as a link function:
-
-g(x) = scipy.stats.norm.ppf(x)
-
-"""
-
-cauchy = CDFLink(dbn=scipy.stats.cauchy)
-cauchy.__doc__ = """
-
-The Cauchy (standard Cauchy CDF) transform as a link function:
-
-g(x) = scipy.stats.cauchy.ppf(x)
-
-"""
-
-class CLogLog(Logit):
-
- """
- The complementary log-log transform as a link function:
-
- g(x) = log(-log(x))
-
- """
-
- def __call__(self, p):
- """
- C-Log-Log transform
-
- g(p) = log(-log(p))
-
- INPUTS:
- p -- mean parameters
-
- OUTPUTS: z
- z -- log(-log(p))
-
- """
- p = self.clean(p)
- return N.log(-N.log(p))
-
- def inverse(self, z):
- """
- Inverse of C-Log-Log transform
-
- g(z) = exp(-exp(z))
-
- INPUTS:
- z -- linear predictor scale
-
- OUTPUTS: p
- p -- mean parameters
-
- """
- return N.exp(-N.exp(z))
-
- def deriv(self, p):
- """
- Derivative of C-Log-Log transform
-
- g(p) = - 1 / (log(p) * p)
-
- INPUTS:
- p -- mean parameters
-
- OUTPUTS: z
- z -- - 1 / (log(p) * p)
-
- """
- p = self.clean(p)
- return -1. / (N.log(p) * p)
-
-cloglog = CLogLog()
View
87 scipy/stats/models/family/varfuncs.py
@@ -1,87 +0,0 @@
-__docformat__ = 'restructuredtext'
-
-import numpy as N
-
-class VarianceFunction:
- """
- Variance function that relates the variance of a random variable
- to its mean. Defaults to 1.
- """
-
- def __call__(self, mu):
- """
- Default variance function
-
- INPUTS:
- mu -- mean parameters
-
- OUTPUTS: v
- v -- ones(mu.shape)
- """
-
- return N.ones(mu.shape, N.float64)
-
-constant = VarianceFunction()
-
-class Power:
- """
- Power variance function:
-
- V(mu) = fabs(mu)**power
-
- INPUTS:
- power -- exponent used in power variance function
-
- """
-
- def __init__(self, power=1.):
- self.power = power
-
- def __call__(self, mu):
-
- """
- Power variance function
-
- INPUTS:
- mu -- mean parameters
-
- OUTPUTS: v
- v -- fabs(mu)**self.power
- """
- return N.power(N.fabs(mu), self.power)
-
-class Binomial:
- """
- Binomial variance function
-
- p = mu / n; V(mu) = p * (1 - p) * n
-
- INPUTS:
- n -- number of trials in Binomial
- """
-
- tol = 1.0e-10
-
- def __init__(self, n=1):
- self.n = n
-
- def clean(self, p):
- return N.clip(p, Binomial.tol, 1 - Binomial.tol)
-
- def __call__(self, mu):
- """
- Binomial variance function
-
- INPUTS:
- mu -- mean parameters
-
- OUTPUTS: v
- v -- mu / self.n * (1 - mu / self.n) * self.n
- """
- p = self.clean(mu / self.n)
- return p * (1 - p) * self.n
-
-mu = Power()
-mu_squared = Power(power=2)
-mu_cubed = Power(power=3)
-binary = Binomial()
View
754 scipy/stats/models/formula.py
@@ -1,754 +0,0 @@
-"""
-Provides the basic classes needed to specify statistical models.
-"""
-import copy
-import types
-import numpy as N
-
-try:
- set
-except NameError:
- from sets import Set as set
-
-__docformat__ = 'restructuredtext'
-
-default_namespace = {}
-
-class Term(object):
- """
- This class is very simple: it is just a named term in a model formula.
-
- It is also callable: by default it namespace[self.name], where namespace
- defaults to formula.default_namespace.
- When called in an instance of formula,
- the namespace used is that formula's namespace.
-
- Inheritance of the namespace under +,*,- operations:
- ----------------------------------------------------
-
- By default, the namespace is empty, which means it must be
- specified before evaluating the design matrix.
-
- When it is unambiguous, the namespaces of objects are derived from the
- context.
-
- Rules:
- ------
-
- i) "X * I", "X + I", "X**i": these inherit X's namespace
- ii) "F.main_effect()": this inherits the Factor F's namespace
- iii) "A-B": this inherits A's namespace
- iv) if A.namespace == B.namespace, then A+B inherits this namespace
- v) if A.namespace == B.namespace, then A*B inherits this namespace
-
- Equality of namespaces:
- -----------------------
-
- This is done by comparing the namespaces directly, if
- an exception is raised in the check of equality, they are
- assumed not to be equal.
- """
-
- def __pow__(self, power):
- """
- Raise the quantitative term's values to an integer power, i.e.
- polynomial.
- """
-
- try:
- power = float(power)
- except:
- raise ValueError, 'expecting a float'
-
- if power == int(power):
- name = '%s^%d' % (self.name, int(power))
- else:
- name = '%s^%0.2f' % (self.name, power)
-
- value = Quantitative(name, func=self, transform=lambda x: N.power(x, power))
- value.power = power
- value.namespace = self.namespace
- return value
-
- def __init__(self, name, func=None, termname=None):
-
- self.name = name
- self.__namespace = None
- if termname is None:
- self.termname = name
- else:
- self.termname = termname
-
- if type(self.termname) is not types.StringType:
- raise ValueError, 'expecting a string for termname'
- if func:
- self.func = func
-
- # Namespace in which self.name will be looked up in, if needed
-
- def _get_namespace(self):
- if isinstance(self.__namespace, N.ndarray):
- return self.__namespace
- else: return self.__namespace or default_namespace
-
- def _set_namespace(self, value): self.__namespace = value
- def _del_namespace(self): del self.__namespace
- namespace = property(_get_namespace, _set_namespace, _del_namespace)
-
- def __str__(self):
- """
- '<term: %s>' % self.termname
- """
- return '<term: %s>' % self.termname
-
- def __add__(self, other):
- """
- Formula(self) + Formula(other)
- """
- other = Formula(other)
- f = other + self
- if _namespace_equal(other.namespace, self.namespace):
- f.namespace = self.namespace
- return f
-
- def __mul__(self, other):
- """
- Formula(self) * Formula(other)
- """
-
- if other.name is 'intercept':
- f = Formula(self, namespace=self.namespace)
- elif self.name is 'intercept':
- f = Formula(other, namespace=other.namespace)
- else:
- other = Formula(other, namespace=other.namespace)
- f = other * self
- if _namespace_equal(other.namespace, self.namespace):
- f.namespace = self.namespace
- return f
-
- def names(self):
- """
- Return the names of the columns in design associated to the terms,
- i.e. len(self.names()) = self().shape[0].
- """
- if type(self.name) is types.StringType:
- return [self.name]
- else:
- return list(self.name)
-
- def __call__(self, *args, **kw):
- """
- Return the columns associated to self in a design matrix.
- If the term has no 'func' attribute, it returns
- ``self.namespace[self.termname]``
- else, it returns
- ``self.func(*args, **kw)``
- """
-
- if not hasattr(self, 'func'):
- val = self.namespace[self.termname]
- else:
- val = self.func
- if callable(val):
- if isinstance(val, (Term, Formula)):
- val = copy.copy(val)
- val.namespace = self.namespace
- val = val(*args, **kw)
-
- val = N.asarray(val)
- return N.squeeze(val)
-
-class Factor(Term):
- """A categorical factor."""
-
- def __init__(self, termname, keys, ordinal=False):
- """
- Factor is initialized with keys, representing all valid
- levels of the factor.
-
- If ordinal is False, keys can have repeats: set(keys) is what is
- used.
-
- If ordinal is True, the order is taken from the keys, and
- there should be no repeats.
- """
-
- if not ordinal:
- self.keys = list(set(keys))
- self.keys.sort()
- else:
- self.keys = keys
- if len(set(keys)) != len(list(keys)):
- raise ValueError, 'keys for ordinal Factor should be unique, in increasing order'
- self._name = termname
- self.termname = termname
- self.ordinal = ordinal
-
- if self.ordinal:
- name = self.termname
- else:
- name = ['(%s==%s)' % (self.termname, str(key)) for key in self.keys]
-
- Term.__init__(self, name, termname=self.termname, func=self.get_columns)
-
- def get_columns(self, *args, **kw):
- """
- Calling function for factor instance.
- """
-
- v = self.namespace[self._name]
- while True:
- if callable(v):
- if isinstance(v, (Term, Formula)):
- v = copy.copy(v)
- v.namespace = self.namespace
- v = v(*args, **kw)
- else: break
-
- n = len(v)
-
- if self.ordinal:
- col = [float(self.keys.index(v[i])) for i in range(n)]
- return N.array(col)
-
- else:
- value = []
- for key in self.keys:
- col = [float((v[i] == key)) for i in range(n)]
- value.append(col)
- return N.array(value)
-
- def values(self, *args, **kw):
- """
- Return the keys of the factor, rather than the columns of the design
- matrix.
- """
-
- del(self.func)
- val = self(*args, **kw)
- self.func = self.get_columns
- return val
-
- def verify(self, values):
- """
- Verify that all values correspond to valid keys in self.
- """
- s = set(values)
- if not s.issubset(self.keys):
- raise ValueError, 'unknown keys in values'
-
- def __add__(self, other):
- """
- Formula(self) + Formula(other)
-
- When adding \'intercept\' to a factor, this just returns
-
- Formula(self, namespace=self.namespace)
-
- """
-
- if other.name is 'intercept':
- return Formula(self, namespace=self.namespace)
- else:
- return Term.__add__(self, other)
-
- def main_effect(self, reference=None):
- """
- Return the 'main effect' columns of a factor, choosing
- an optional reference key.
-
- The reference key can be one of the keys of the Factor,
- or an integer, representing which column to remove.
- It defaults to 0.
-
- """
-
- names = self.names()
-
- if reference is None:
- reference = 0
- else:
- if reference in names:
- reference = names.index(reference)
- else:
- reference = int(reference)
-
- def maineffect_func(value, reference=reference):
- rvalue = []
- keep = range(value.shape[0])
- keep.pop(reference)
- for i in range(len(keep)):
- rvalue.append(value[keep[i]] - value[reference])
- return N.array(rvalue)
-
- keep = range(len(self.names()))
- keep.pop(reference)
- __names = self.names()
- _names = ['%s-%s' % (__names[keep[i]], __names[reference]) for i in range(len(keep))]
- value = Quantitative(_names, func=self,
- termname='%s:maineffect' % self.termname,
- transform=maineffect_func)
- value.namespace = self.namespace
- return value
-
- def __getitem__(self, key):
- """
- Retrieve the column corresponding to key in a Formula.
-
- :Parameters:
- key : one of the Factor's keys
-
- :Returns: ndarray corresponding to key, when evaluated in
- current namespace
- """
- if not self.ordinal:
- i = self.names().index('(%s==%s)' % (self.termname, str(key)))
- return self()[i]
- else:
- v = self.namespace[self._name]
- return N.array([(vv == key) for vv in v]).astype(N.float)
-
-
-class Quantitative(Term):
- """
- A subclass of term that can be used to apply point transformations
- of another term, i.e. to take powers:
-
- >>> import numpy as N
- >>> from scipy.stats.models import formula
- >>> X = N.linspace(0,10,101)
- >>> x = formula.Term('X')
- >>> x.namespace={'X':X}
- >>> x2 = x**2
- >>> print N.allclose(x()**2, x2())
- True
- >>> x3 = formula.Quantitative('x2', func=x, transform=lambda x: x**2)
- >>> x3.namespace = x.namespace
- >>> print N.allclose(x()**2, x3())
- True
-
- """
-
- def __init__(self, name, func=None, termname=None, transform=lambda x: x):
- self.transform = transform
- Term.__init__(self, name, func=func, termname=termname)
-
- def __call__(self, *args, **kw):
- """
- A quantitative is just like term, except there is an additional
- transformation: self.transform.
-
- """
- return self.transform(Term.__call__(self, *args, **kw))
-
-class Formula(object):
- """
- A formula object for manipulating design matrices in regression models,
- essentially consisting of a list of term instances.
-
- The object supports addition and multiplication which correspond
- to concatenation and pairwise multiplication, respectively,
- of the columns of the two formulas.
-
- """
-
- def _get_namespace(self):
- if isinstance(self.__namespace, N.ndarray):
- return self.__namespace
- else: return self.__namespace or default_namespace
-
- def _set_namespace(self, value): self.__namespace = value
- def _del_namespace(self): del self.__namespace
- namespace = property(_get_namespace, _set_namespace, _del_namespace)
-
- def _terms_changed(self):
- self._names = self.names()
- self._termnames = self.termnames()
-
- def __init__(self, termlist, namespace=default_namespace):
- """
- Create a formula from either:
- i. a `formula` object
- ii. a sequence of `term` instances
- iii. one `term`
- """
-
-
- self.__namespace = namespace
- if isinstance(termlist, Formula):
- self.terms = copy.copy(list(termlist.terms))
- elif type(termlist) is types.ListType:
- self.terms = termlist
- elif isinstance(termlist, Term):
- self.terms = [termlist]
- else:
- raise ValueError
-
- self._terms_changed()
-
- def __str__(self):
- """
- String representation of list of termnames of a formula.
- """
- value = []
- for term in self.terms:
- value += [term.termname]
- return '<formula: %s>' % ' + '.join(value)
-
- def __call__(self, *args, **kw):
-
- """
- Create (transpose) of the design matrix of the formula within
- namespace. Extra arguments are passed to each term instance. If
- the formula just contains an intercept, then the keyword
- argument 'nrow' indicates the number of rows (observations).
- """
-
- if 'namespace' in kw:
- namespace = kw['namespace']
- else:
- namespace = self.namespace
-
-
- allvals = []
- intercept = False
- iindex = 0
- for t in self.terms:
- t = copy.copy(t)
- t.namespace = namespace
- val = t(*args, **kw)
-
- isintercept = False
- if hasattr(t, "termname"):
- if t.termname == 'intercept':
- intercept = True
- isintercept = True
- interceptindex = iindex
- allvals.append(None)
-
- if val.ndim == 1 and not isintercept:
- val.shape = (1, val.shape[0])
- allvals.append(val)
- elif not isintercept:
- allvals.append(val)
- iindex += 1
-
- if not intercept:
- try:
- allvals = N.concatenate(allvals)
- except:
- pass
- else:
- nrow = kw.get('nrow', -1)
- if allvals != []:
- if interceptindex > 0:
- n = allvals[0].shape[1]
- else:
- n = allvals[1].shape[1]
- allvals[interceptindex] = N.ones((1,n), N.float64)
- allvals = N.concatenate(allvals)
- elif nrow <= 1:
- raise ValueError, 'with only intercept in formula, keyword \'nrow\' argument needed'
- else:
- allvals = I(nrow=nrow)
- allvals.shape = (1,) + allvals.shape
- return N.squeeze(allvals)
-
- def hasterm(self, query_term):
- """
- Determine whether a given term is in a formula.
- """
-
- if not isinstance(query_term, Formula):
- if type(query_term) == type("name"):
- try: query = self[query_term]
- except: return False
- elif isinstance(query_term, Term):
- return query_term.termname in self.termnames()
- elif len(query_term.terms) == 1:
- query_term = query_term.terms[0]
- return query_term.termname in self.termnames()
- else:
- raise ValueError, 'more than one term passed to hasterm'
-
- def __getitem__(self, name):
- t = self.termnames()
- if name in t:
- return self.terms[t.index(name)]
- else:
- raise KeyError, 'formula has no such term: %s' % repr(name)
-
- def termcolumns(self, query_term, dict=False):
- """
- Return a list of the indices of all columns associated
- to a given term.
- """
-
- if self.hasterm(query_term):
- names = query_term.names()
- value = {}
- for name in names:
- value[name] = self._names.index(name)
- else:
- raise ValueError, 'term not in formula'
- if dict:
- return value
- else:
- return value.values()
-
- def names(self):
- """
- Return a list of the names in the formula. The order of the
- names corresponds to the order of the columns when self
- is evaluated.
- """
-
- allnames = []
- for term in self.terms:
- allnames += term.names()
- return allnames
-
- def termnames(self):
- """
- Return a list of the term names in the formula. These
- are the names of each term instance in self.
- """
-
- names = []
- for term in self.terms:
- names += [term.termname]
- return names
-
- def design(self, *args, **kw):
- """
- ``transpose(self(*args, **kw))``
- """
- return self(*args, **kw).T
-
- def __mul__(self, other, nested=False):
- """
- This returns a formula whose columns are the pairwise
- product of the columns of self and other.
-
- TO DO: check for nesting relationship. Should not be too difficult.
- """
-
- other = Formula(other)
-
- selftermnames = self.termnames()
- othertermnames = other.termnames()
-
- I = len(selftermnames)
- J = len(othertermnames)
-
- terms = []
- termnames = []
-
- for i in range(I):
- for j in range(J):
- termname = '%s*%s' % (str(selftermnames[i]), str(othertermnames[j]))
- pieces = termname.split('*')
- pieces.sort()
- termname = '*'.join(pieces)
- termnames.append(termname)