
Merge branch 'master' of github.com:scikit-learn/scikit-learn

2 parents bf789be + b44a386 commit 7b12364f992fc0258ebca104f609da2e057d810f @GaelVaroquaux committed
19 examples/glm/plot_lasso_lars.py
@@ -24,14 +24,29 @@
# something's wrong with our dataset
X[:, 6] = -X[:, 6]
+
+m, n = 200, 200
+np.random.seed(0)
+X = np.random.randn(m, n)
+y = np.random.randn(m)
+
+
+_xmean = X.mean(0)
+_ymean = y.mean(0)
+X = X - _xmean
+y = y - _ymean
+_norms = np.apply_along_axis (np.linalg.norm, 0, X)
+nonzeros = np.flatnonzero(_norms)
+X[:, nonzeros] /= _norms[nonzeros]
+
################################################################################
# Demo path functions
################################################################################
-
+G = np.dot(X.T, X)
print "Computing regularization path using the LARS ..."
start = datetime.now()
-alphas, active, path = glm.lars_path(X, y, method='lasso', max_iter=12)
+alphas, active, path = glm.lars_path(X, y, Gram=G, method='lasso')
print "This took ", datetime.now() - start
alphas = np.sum(np.abs(path.T), axis=1)
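A minimal sketch of how the precomputed Gram matrix introduced above is meant to be used, assuming the lars_path signature shown in this commit; the side-by-side timing is illustrative only:

    import numpy as np
    from datetime import datetime
    from scikits.learn import glm

    np.random.seed(0)
    X = np.random.randn(200, 200)
    y = np.random.randn(200)

    # center and column-normalize, as the example above does
    X = X - X.mean(0)
    y = y - y.mean()
    norms = np.sqrt(np.sum(X ** 2, axis=0))
    nonzeros = np.flatnonzero(norms)
    X[:, nonzeros] /= norms[nonzeros]

    start = datetime.now()
    alphas, active, path = glm.lars_path(X, y, method='lasso')
    print "without Gram:         ", datetime.now() - start

    G = np.dot(X.T, X)  # Gram matrix, computed once and reused
    start = datetime.now()
    alphas, active, path = glm.lars_path(X, y, Gram=G, method='lasso')
    print "with precomputed Gram:", datetime.now() - start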
103 scikits/learn/benchmarks/bench_lasso.py
@@ -0,0 +1,103 @@
+"""
+Benchmarks of Lasso vs LassoLARS
+
+First, we fix the number of features and increase the number of
+samples. Then we plot the computation time as a function of
+the number of samples.
+
+In the second benchmark, we fix the number of samples and increase the
+number of features. Then we plot the computation time as a function of
+the number of features.
+
+In both cases, only 10% of the features are informative.
+"""
+import gc
+from time import time
+import numpy as np
+
+from bench_glm import make_data
+
+def bench(clf, X_train, Y_train, X_test, Y_test):
+ gc.collect()
+
+ # start time
+ tstart = time()
+ clf = clf.fit(X_train, Y_train)
+ delta = (time() - tstart)
+ # stop time
+
+ as_size = np.sum(np.abs(clf.coef_) > 0)
+ print "active set size: %s (%s %%)" % (as_size, 100. * as_size / X_train.shape[1])
+ return delta
+
+def compute_bench(alpha, n_samples, n_features):
+
+ def LassoFactory(alpha):
+ return Lasso(alpha=alpha, fit_intercept=False)
+
+ def LassoLARSFactory(alpha):
+ return LassoLARS(alpha=alpha, normalize=False)
+ # return LassoLARS(alpha=alpha, fit_intercept=False, normalize=False)
+
+ lasso_results = []
+ larslasso_results = []
+
+ n_tests = 1000
+ it = 0
+
+ for ns in n_samples:
+ for nf in n_features:
+ it += 1
+ print '============'
+ print 'Iteration %s' % it
+ print '============'
+ k = nf // 10
+ X, Y, X_test, Y_test, coef_ = make_data(
+ n_samples=ns, n_tests=n_tests, n_features=nf,
+ noise=0.1, k=k)
+
+ X /= np.sqrt(np.sum(X**2, axis=0)) # Normalize data
+
+ print "benching Lasso: "
+ lasso_results.append(bench(LassoFactory(alpha),
+ X, Y, X_test, Y_test))
+ print "benching LassoLARS: "
+ larslasso_results.append(bench(LassoLARSFactory(alpha),
+ X, Y, X_test, Y_test))
+
+ return lasso_results, larslasso_results
+
+if __name__ == '__main__':
+ from scikits.learn.glm import Lasso, LassoLARS
+ import pylab as pl
+
+ alpha = 0.01 # regularization parameter
+
+ n_features = 500
+ list_n_samples = range(500, 10001, 500)
+ lasso_results, larslasso_results = compute_bench(alpha, list_n_samples,
+ [n_features])
+
+ pl.close('all')
+ pl.title('Lasso benchmark (%d features - alpha=%s)' % (n_features, alpha))
+ pl.plot(list_n_samples, lasso_results, 'b-', label='Lasso')
+ pl.plot(list_n_samples, larslasso_results,'r-', label='LassoLARS')
+ pl.legend()
+ pl.xlabel('number of samples')
+ pl.ylabel('time (in seconds)')
+ pl.show()
+
+ n_samples = 500
+ list_n_features = range(500, 3001, 500)
+ lasso_results, larslasso_results = compute_bench(alpha, [n_samples],
+ list_n_features)
+
+ pl.figure()
+ pl.title('Lasso benchmark (%d samples - alpha=%s)' % (n_samples, alpha))
+ pl.plot(list_n_features, lasso_results, 'b-', label='Lasso')
+ pl.plot(list_n_features, larslasso_results,'r-', label='LassoLARS')
+ pl.legend()
+ pl.xlabel('number of features')
+ pl.ylabel('time (in seconds)')
+ pl.show()
+
218 scikits/learn/glm/lars.py
@@ -15,56 +15,62 @@
from .base import LinearModel
from .._minilearn import lars_fit_wrap
from ..utils.fixes import copysign
-
-# Notes: np.ma.dot copies the masked array before doing the dot
-# product. Maybe we should implement in C our own masked_dot that does
-# not make unnecessary copies.
+from ..utils import arrayfuncs
# all linalg.solve calls solve a triangular system, so this could be heavily
# optimized by binding (in scipy ?) trsv or trsm
-def lars_path(X, y, max_iter=None, alpha_min=0, method="lar",
- verbose=False, precompute=True):
+def lars_path(X, y, Gram=None, max_iter=None, alpha_min=0,
+ method="lar", precompute=True):
""" Compute Least Angle Regression and LASSO path
Parameters
-----------
X: array, shape: (n, p)
Input data
+
y: array, shape: (n)
Input targets
+
max_iter: integer, optional
The number of 'kinks' in the path
+
+ Gram: array, shape: (p, p), optional
+ Precomputed Gram matrix (X' * X)
+
alpha_min: float, optional
- The minimum correlation along the path. It corresponds
- to the regularization parameter alpha parameter in the Lasso.
+ The minimum correlation along the path. It corresponds to
+ the regularization parameter alpha in the Lasso.
+
method: 'lar' or 'lasso'
- Specifies the problem solved: the LAR or its variant the LASSO-LARS
- that gives the solution of the LASSO problem for any regularization
- parameter.
+ Specifies the problem solved: the LAR or its variant the
+ LASSO-LARS that gives the solution of the LASSO problem
+ for any regularization parameter.
Returns
--------
alphas: array, shape: (k)
The alphas along the path
-
+
active: array, shape (?)
Indices of active variables at the end of the path.
-
+
coefs: array, shape (p,k)
Coefficients along the path
Notes
------
- XXX : add reference papers and wikipedia page
-
+ http://en.wikipedia.org/wiki/Least-angle_regression
+ http://en.wikipedia.org/wiki/Lasso_(statistics)#LASSO_method
+ XXX : add reference papers
+
"""
# TODO: precompute : empty for now
#
# TODO: detect stationary points.
# Lasso variant
# store full path
-
+
X = np.atleast_2d(X)
y = np.atleast_1d(y)
@@ -75,12 +81,19 @@ def lars_path(X, y, max_iter=None, alpha_min=0, method="lar",
max_pred = max_iter # OK for now
+ # because of some restrictions in Cython, boolean values are
+ # simulated using np.int8
+
beta = np.zeros ((max_iter + 1, X.shape[1]))
alphas = np.zeros (max_iter + 1)
n_iter, n_pred = 0, 0
- active = list()
+ active = list()
+ unactive = range (X.shape[1])
+ active_mask = np.zeros (X.shape[1], dtype=np.uint8)
# holds the sign of covariance
sign_active = np.empty (max_pred, dtype=np.int8)
+ Cov = np.empty (X.shape[1])
+ a = np.empty (X.shape[1])
drop = False
# will hold the cholesky factorization
@@ -90,22 +103,36 @@ def lars_path(X, y, max_iter=None, alpha_min=0, method="lar",
L = np.zeros ((max_pred, max_pred), dtype=np.float64)
Xt = X.T
- Xna = Xt.view(np.ma.MaskedArray) # variables not in the active set
- # should have a better name
- Xna.soften_mask()
+ if Gram is not None:
+ res_init = np.dot (X.T, y)
while 1:
-
- # Calculate covariance matrix and get maximum
- res = y - np.dot (X, beta[n_iter]) # there are better ways
- Cov = np.ma.dot (Xna, res)
-
- imax = np.ma.argmax (np.ma.abs(Cov)) #rename
- Cov_max = Cov.data [imax]
-
- alpha = np.abs(Cov_max) #sum (np.abs(beta[n_iter]))
+ n_unactive = X.shape[1] - n_pred # number of unactive elements
+
+ if n_unactive:
+ # Calculate covariance matrix and get maximum
+ if Gram is None:
+ res = y - np.dot (X, beta[n_iter]) # there are better ways
+ arrayfuncs.dot_over (X.T, res, active_mask, np.False_, Cov)
+ else:
+ # could use dot_over
+ arrayfuncs.dot_over (Gram, beta[n_iter], active_mask, np.False_, a)
+ Cov = res_init[unactive] - a[:n_unactive]
+
+ imax = np.argmax (np.abs(Cov[:n_unactive])) #rename
+ C_ = Cov [imax]
+ # np.delete (Cov, imax) # very ugly, has to be fixed
+ else:
+ # special case when all elements are in the active set
+ if Gram is None:
+ res = y - np.dot (X, beta[n_iter])
+ C_ = np.dot (X.T[0], res)
+ else:
+ C_ = np.dot(Gram[0], beta[n_iter]) - res_init[0]
+
+ alpha = np.abs(C_) # ugly alpha vs alphas
alphas [n_iter] = alpha
if (n_iter >= max_iter or n_pred >= max_pred ):
@@ -113,60 +140,75 @@ def lars_path(X, y, max_iter=None, alpha_min=0, method="lar",
if (alpha < alpha_min): break
-
if not drop:
+ imax = unactive.pop (imax)
+
+
# Update the Cholesky factorization of (Xa * Xa') #
# #
- # ( L 0 ) #
- # L -> ( ) , where L * w = b #
- # ( w z ) z = 1 - ||w|| #
+ # ( L 0 ) #
+ # L -> ( ) , where L * w = b #
+ # ( w z ) z = 1 - ||w|| #
# #
# where u is the last added to the active set #
- n_pred += 1
- active.append(imax)
- Xna[imax] = np.ma.masked
- Cov[imax] = np.ma.masked
- sign_active [n_pred-1] = np.sign (Cov_max)
+ sign_active [n_pred] = np.sign (C_)
- X_max = Xt[imax]
+ if Gram is None:
+ X_max = Xt[imax]
+ c = np.dot (X_max, X_max)
+ b = np.dot (X_max, X[:, active])
+ else:
+ c = Gram[imax, imax]
+ b = Gram[imax, active]
+
+ n_pred += 1
+ active.append(imax)
- c = np.dot (X_max, X_max)
L [n_pred-1, n_pred-1] = c
if n_pred > 1:
- b = np.dot (X_max, Xa.T)
# please refactor me, using linalg.solve is overkill
- L [n_pred-1, :n_pred-1] = linalg.solve (L[:n_pred-1, :n_pred-1], b)
+ #L [n_pred-1, :n_pred-1] = linalg.solve (L[:n_pred-1, :n_pred-1], b)
+ arrayfuncs.solve_triangular (L[:n_pred-1, :n_pred-1],
+ b)
+ L [n_pred-1, :n_pred-1] = b[:]
v = np.dot(L [n_pred-1, :n_pred-1], L [n_pred - 1, :n_pred -1])
L [n_pred-1, n_pred-1] = np.sqrt (c - v)
- else:
- drop = False
-
- Xa = Xt[active] # also Xna[~Xna.mask]
# Now we go into the normal equations dance.
# (Golub & Van Loan, 1996)
- b = copysign (Cov_max.repeat(n_pred), sign_active[:n_pred])
+ b = copysign (C_.repeat(n_pred), sign_active[:n_pred])
b = linalg.cho_solve ((L[:n_pred, :n_pred], True), b)
- C = A = np.abs(Cov_max)
- u = np.dot (Xa.T, b)
- a = np.ma.dot (Xna, u)
+ C = A = np.abs(C_)
+ if Gram is None:
+ u = np.dot (Xt[active].T, b)
+ arrayfuncs.dot_over (X.T, u, active_mask, np.False_, a)
+
+ else:
+ # Not sure that this is not buggy ...
+ arrayfuncs.dot_over (Gram[active].T, b, active_mask, np.False_, a)
# equation 2.13, there's probably a simpler way
- g1 = (C - Cov) / (A - a)
- g2 = (C + Cov) / (A + a)
+ g1 = (C - Cov[:n_unactive]) / (A - a[:n_unactive])
+ g2 = (C + Cov[:n_unactive]) / (A + a[:n_unactive])
+
+ if not drop:
+ # Quickfix
+ active_mask [imax] = np.True_
+ else:
+ drop = False
# one for the border cases
- g = np.ma.concatenate((g1, g2))
+ g = np.concatenate((g1, g2, [1.]))
g = g[g > 0.]
- gamma_ = np.ma.min (g)
+ gamma_ = np.min (g)
if n_pred >= X.shape[1]:
gamma_ = 1.
@@ -186,14 +228,11 @@ def lars_path(X, y, max_iter=None, alpha_min=0, method="lar",
beta[n_iter, active] = beta[n_iter - 1, active] + gamma_ * b
if drop:
+ arrayfuncs.cholesky_delete (L[:n_pred, :n_pred], idx)
n_pred -= 1
drop_idx = active.pop (idx)
- # please please please remove this masked arrays pain from me
- Xna[drop_idx] = Xna.data[drop_idx]
- if verbose:
- print 'dropped %s at %s iteration' % (idx, n_iter)
- Xa = Xt[active] # duplicate
- L[:n_pred, :n_pred] = linalg.cholesky(np.dot(Xa, Xa.T), lower=True)
+ unactive.append(drop_idx)
+ active_mask[drop_idx] = False
sign_active = np.delete (sign_active, idx) # do an append to maintain size
sign_active = np.append (sign_active, 0.)
# should be done using cholesky deletes
@@ -212,13 +251,12 @@ def lars_path(X, y, max_iter=None, alpha_min=0, method="lar",
class LARS (LinearModel):
""" Least Angle Regression model a.k.a. LAR
-
+
Parameters
----------
n_features : int, optional
Number of selected active features
- XXX : todo add fit_intercept
fit_intercept : boolean
whether to calculate the intercept for this model. If set
to false, no intercept will be used in calculations
@@ -229,7 +267,6 @@ class LARS (LinearModel):
`coef_` : array, shape = [n_features]
parameter vector (w in the formulation)
- XXX : add intercept_
`intercept_` : float
independent term in decision function.
@@ -246,34 +283,33 @@ class LARS (LinearModel):
-----
See also scikits.learn.glm.LassoLARS that fits a LASSO model
using a variant of Least Angle Regression
-
- XXX : add ref + wikipedia page
-
+
+ http://en.wikipedia.org/wiki/Least_angle_regression
+
See examples. XXX : add examples names
"""
def __init__(self, n_features, normalize=True):
self.n_features = n_features
self.normalize = normalize
self.coef_ = None
+ self.fit_intercept = True
- def fit (self, X, y, **params):
+ def fit (self, X, y, Gram=None, **params):
self._set_params(**params)
# will only normalize non-zero columns
X = np.atleast_2d(X)
y = np.atleast_1d(y)
+ X, y, Xmean, Ymean = self._center_data(X, y)
+
if self.normalize:
- self._xmean = X.mean(0)
- self._ymean = y.mean(0)
- X = X - self._xmean
- y = y - self._ymean
- self._norms = np.apply_along_axis (np.linalg.norm, 0, X)
- nonzeros = np.flatnonzero(self._norms)
- X[:, nonzeros] /= self._norms[nonzeros]
+ norms = np.sqrt(np.sum(X**2, axis=0))
+ nonzeros = np.flatnonzero(norms)
+ X[:, nonzeros] /= norms[nonzeros]
method = 'lar'
- alphas_, active, coef_path_ = lars_path(X, y,
+ alphas_, active, coef_path_ = lars_path(X, y, Gram=Gram,
max_iter=self.n_features, method=method)
self.coef_ = coef_path_[:,-1]
return self
@@ -281,7 +317,7 @@ def fit (self, X, y, **params):
class LassoLARS (LinearModel):
""" Lasso model fit with Least Angle Regression a.k.a. LARS
-
+
It is a Linear Model trained with an L1 prior as regularizer (a.k.a. the Lasso).
@@ -290,7 +326,6 @@ class LassoLARS (LinearModel):
alpha : float, optional
Constant that multiplies the L1 term. Defaults to 1.0
- XXX : todo add fit_intercept
fit_intercept : boolean
whether to calculate the intercept for this model. If set
to false, no intercept will be used in calculations
@@ -301,7 +336,6 @@ class LassoLARS (LinearModel):
`coef_` : array, shape = [n_features]
parameter vector (w in the formulation)
- XXX : add intercept_
`intercept_` : float
independent term in decision function.
@@ -310,7 +344,7 @@ class LassoLARS (LinearModel):
>>> from scikits.learn import glm
>>> clf = glm.LassoLARS(alpha=0.1)
>>> clf.fit([[-1,1], [0, 0], [1, 1]], [-1, 0, -1])
- LassoLARS(normalize=True, alpha=0.1, max_iter=None)
+ LassoLARS(normalize=True, alpha=0.1, max_iter=None, fit_intercept=True)
>>> print clf.coef_
[ 0. -0.51649658]
@@ -320,7 +354,8 @@ class LassoLARS (LinearModel):
an alternative optimization strategy called 'coordinate descent.'
"""
- def __init__(self, alpha=1.0, max_iter=None, normalize=True):
+ def __init__(self, alpha=1.0, max_iter=None, normalize=True,
+ fit_intercept=True):
""" XXX : add doc
# will only normalize non-zero columns
"""
@@ -328,8 +363,9 @@ def __init__(self, alpha=1.0, max_iter=None, normalize=True):
self.normalize = normalize
self.coef_ = None
self.max_iter = max_iter
+ self.fit_intercept = fit_intercept
- def fit (self, X, y, **params):
+ def fit (self, X, y, Gram=None, **params):
""" XXX : add doc
"""
self._set_params(**params)
@@ -337,24 +373,26 @@ def fit (self, X, y, **params):
X = np.atleast_2d(X)
y = np.atleast_1d(y)
+ X, y, Xmean, Ymean = self._center_data(X, y)
+
n_samples = X.shape[0]
alpha = self.alpha * n_samples # scale alpha with number of samples
+ # XXX : should also handle unnormalized datasets
if self.normalize:
- self._xmean = X.mean(0)
- self._ymean = y.mean(0)
- X = X - self._xmean
- y = y - self._ymean
- self._norms = np.apply_along_axis (np.linalg.norm, 0, X)
- nonzeros = np.flatnonzero(self._norms)
- X[:, nonzeros] /= self._norms[nonzeros]
+ norms = np.sqrt(np.sum(X**2, axis=0))
+ nonzeros = np.flatnonzero(norms)
+ X[:, nonzeros] /= norms[nonzeros]
method = 'lasso'
- alphas_, active, coef_path_ = lars_path(X, y,
+ alphas_, active, coef_path_ = lars_path(X, y, Gram=Gram,
alpha_min=alpha, method=method,
max_iter=self.max_iter)
self.coef_ = coef_path_[:,-1]
+
+ self._set_intercept(Xmean, Ymean)
+
return self
@@ -490,4 +528,4 @@ def predict(self, X, normalize=True):
return np.dot(X, self.coef_) + self.intercept_
-
+
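A short usage sketch of the updated LassoLARS API (the fit_intercept option and the Gram keyword in fit), based on the doctest above. The fit_intercept=False / normalize=False combination in the Gram variant is an assumption made here so that the user-supplied Gram matrix matches the data lars_path actually receives:

    import numpy as np
    from scikits.learn.glm import LassoLARS

    X = np.array([[-1., 1.], [0., 0.], [1., 1.]])
    y = np.array([-1., 0., -1.])

    clf = LassoLARS(alpha=0.1)       # fit_intercept=True is now the default
    clf.fit(X, y)
    print clf.coef_                  # [ 0.         -0.51649658], per the doctest
    print clf.intercept_

    # variant with a precomputed Gram matrix; centering/normalization is
    # turned off so that G really is the Gram matrix of the fitted data
    clf2 = LassoLARS(alpha=0.1, fit_intercept=False, normalize=False)
    G = np.dot(X.T, X)
    clf2.fit(X, y, Gram=G)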
3 scikits/learn/glm/setup.py
@@ -31,8 +31,7 @@ def configuration(parent_package='', top_path=None):
include_dirs=[join('..', 'src', 'cblas'),
numpy.get_include(),
blas_info.pop('include_dirs', [])],
- extra_compile_args=['-std=c99'] + \
- blas_info.pop('extra_compile_args', []),
+ extra_compile_args=blas_info.pop('extra_compile_args', []),
**blas_info
)
5 scikits/learn/glm/tests/test_bayes.py
@@ -6,13 +6,14 @@
import numpy as np
from numpy.testing import assert_array_equal, \
- assert_array_almost_equal
+ assert_array_almost_equal, decorators
from ..bayes import BayesianRidge, ARDRegression
from scikits.learn import datasets
-def XFailed_test_bayesian_on_diabetes():
+@decorators.skipif(True, "XFailed test")
+def test_bayesian_on_diabetes():
"""
Test BayesianRidge on diabetes
"""
25 scikits/learn/glm/tests/test_lars.py
@@ -15,12 +15,12 @@
X, y = diabetes.data, diabetes.target
-def test_1():
+def test_simple():
"""
Principle of LARS is to keep covariances tied and decreasing
"""
max_pred = 10
- alphas_, active, coef_path_ = lars_path(diabetes.data, diabetes.target, max_pred, method="lar")
+ alphas_, active, coef_path_ = lars_path(diabetes.data, diabetes.target, max_iter=max_pred, method="lar")
for (i, coef_) in enumerate(coef_path_.T):
res = y - np.dot(X, coef_)
cov = np.dot(X.T, res)
@@ -34,6 +34,27 @@ def test_1():
assert ocur == max_pred
+def test_simple_precomputed():
+ """
+ The same, with precomputed Gram matrix
+ """
+ max_pred = 10
+ G = np.dot (diabetes.data.T, diabetes.data)
+ alphas_, active, coef_path_ = lars_path(diabetes.data, diabetes.target, Gram=G, max_iter=max_pred, method="lar")
+ for (i, coef_) in enumerate(coef_path_.T):
+ res = y - np.dot(X, coef_)
+ cov = np.dot(X.T, res)
+ C = np.max(abs(cov))
+ eps = 1e-3
+ ocur = len(cov[ C - eps < abs(cov)])
+ if i < max_pred:
+ assert ocur == i+1
+ else:
+ # no more than max_pred variables can go into the active set
+ assert ocur == max_pred
+
+
+
def test_lars_lstsq():
"""
Test that LARS gives least square solution at the end
38 scikits/learn/preprocessing.py
@@ -8,31 +8,47 @@
from .base import BaseEstimator
+
+def _mean_and_std(X, axis=0, with_std=True):
+ Xr = np.rollaxis(X, axis)
+ mean_ = Xr.mean(axis=0)
+ std_ = Xr.std(axis=0) if with_std else None
+ return mean_, std_
+
+
+def scale(X, axis=0, with_std=True, copy=True):
+ """Standardize a dataset along any axis
+ """
+ mean_, std_ = _mean_and_std(X, axis, with_std)
+ if copy:
+ X = X.copy()
+ Xr = np.rollaxis(X, axis)
+ Xr -= mean_
+ if with_std:
+ Xr /= std_
+ return X
+
+
class Scaler(BaseEstimator):
- """Object to standardize a dataset along any axis
+ """Object to standardize a dataset
It centers the dataset and optionally scales to
- fix the variance to 1.
+ fix the variance to 1 for each feature
"""
- def __init__(self, axis=0, with_std=True):
- self.axis = axis
+ def __init__(self, with_std=True):
self.with_std = with_std
def fit(self, X, y=None, **params):
self._set_params(**params)
- X = np.rollaxis(X, self.axis)
- self.mean = X.mean(axis=0)
- if self.with_std:
- self.std = X.std(axis=0)
+ self.mean_, self.std_ = _mean_and_std(X, axis=0, with_std=self.with_std)
return self
def transform(self, X, y=None, copy=True):
if copy:
X = X.copy()
# We are taking a view of the X array and modifying it
- Xr = np.rollaxis(X, self.axis)
- Xr -= self.mean
+ X -= self.mean_
if self.with_std:
- Xr /= self.std
+ X /= self.std_
return X
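A minimal sketch of the reworked preprocessing API (the module-level scale helper and the per-feature Scaler), mirroring the updated tests further below:

    import numpy as np
    from scikits.learn.preprocessing import Scaler, scale

    X = np.random.randn(4, 5)

    # stateless helper: standardize along any axis
    X0 = scale(X, axis=0)                    # zero mean, unit std per column
    X1 = scale(X, axis=1, with_std=False)    # center each row, keep its scale

    # estimator: learn mean_ / std_ on training data, reuse them later
    scaler = Scaler().fit(X)
    X_scaled = scaler.transform(X, copy=True)
    print X_scaled.mean(axis=0)              # ~0 for every feature
    print X_scaled.std(axis=0)               # ~1 for every feature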
2 scikits/learn/setup.py
@@ -119,10 +119,10 @@ def configuration(parent_package='', top_path=None):
include_dirs=[numpy.get_include()]
)
- config.add_subpackage('utils')
# this has to be build *after* cblas
config.add_subpackage('glm')
+ config.add_subpackage('utils')
# add the test directory
config.add_subpackage('tests')
23 scikits/learn/tests/test_preprocessing.py
@@ -2,25 +2,30 @@
from numpy.testing import assert_array_almost_equal, assert_array_equal, assert_
-from ..preprocessing import Scaler
+#from ..preprocessing import Scaler
+from scikits.learn.preprocessing import Scaler, scale
def test_scaler():
"""Test scaling of dataset along all axis
"""
-
X = np.random.randn(4, 5)
- scaler = Scaler(axis=1)
+ scaler = Scaler()
X_scaled = scaler.fit(X).transform(X, copy=False)
- assert_array_almost_equal(X_scaled.mean(axis=1), 4*[0.0])
- assert_array_almost_equal(X_scaled.std(axis=1), 4*[1.0])
+ assert_array_almost_equal(X_scaled.mean(axis=0), 5*[0.0])
+ assert_array_almost_equal(X_scaled.std(axis=0), 5*[1.0])
# Check that X has not been copied
assert_(X_scaled is X)
-
- scaler = Scaler(axis=0, with_std=False)
- X_orig = X.copy()
X_scaled = scaler.fit(X).transform(X, copy=True)
assert_array_almost_equal(X_scaled.mean(axis=0), 5*[0.0])
+ assert_array_almost_equal(X_scaled.std(axis=0), 5*[1.0])
+ # Check that X has not been copied
+ assert_(X_scaled is not X)
+
+ X_scaled = scale(X, axis=1, with_std=False)
+ assert_array_almost_equal(X_scaled.mean(axis=1), 4*[0.0])
+ X_scaled = scale(X, axis=1, with_std=True)
+ assert_array_almost_equal(X_scaled.std(axis=1), 4*[1.0])
# Check that the data hasn't been modified
- assert_array_equal(X, X_orig)
+
5,187 scikits/learn/utils/arrayfuncs.c
5,187 additions, 0 deletions not shown because the diff is too large.
89 scikits/learn/utils/arrayfuncs.pyx
@@ -0,0 +1,89 @@
+"""
+Small collection of auxiliary functions that operate on arrays
+
+"""
+cimport numpy as np
+import numpy as np
+
+
+cdef extern from "cblas.h":
+ enum CBLAS_ORDER:
+ CblasRowMajor=101
+ CblasColMajor=102
+ enum CBLAS_TRANSPOSE:
+ CblasNoTrans=111
+ CblasTrans=112
+ CblasConjTrans=113
+ AtlasConj=114
+ enum CBLAS_UPLO:
+ CblasUpper=121
+ CblasLower=122
+ enum CBLAS_DIAG:
+ CblasNonUnit=131
+ CblasUnit=132
+
+ double cblas_ddot(int N, double *X, int incX, double *Y, int incY)
+
+ void cblas_dtrsv(CBLAS_ORDER Order, CBLAS_UPLO Uplo,
+ CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag,
+ int N, double *A, int lda, double *X,
+ int incX)
+
+
+cdef extern from "src/cholesky_delete.c":
+ int c_cholesky_delete "cholesky_delete" (int m, int n, double *L, int go_out)
+
+
+
+ctypedef np.float64_t DOUBLE
+ctypedef np.uint8_t BOOL
+# cython has no support for np.bool type. See
+# http://www.mail-archive.com/cython-dev@codespeak.net/msg05913.html
+
+
+def dot_over (
+ np.ndarray[DOUBLE, ndim=2] X,
+ np.ndarray[DOUBLE, ndim=1] y,
+ np.ndarray[BOOL, ndim=1] mask,
+ BOOL val,
+ np.ndarray[DOUBLE, ndim=1] out):
+ """
+ Computes np.dot(X[i], y) for every row i of X where mask[i] == val,
+ packing the results contiguously into the leading entries of out
+ """
+
+ cdef int i, j=0, N, incX, incY, incXY
+ cdef double *X_ptr = <double *> X.data
+ N = X.shape[1]
+ incX = X.strides[1] / sizeof (double)
+ incY = y.strides[0] / sizeof (double)
+ incXY = X.strides[0] / sizeof (double)
+
+ for i in range(X.shape[0]):
+ if mask[i] == val:
+ out[j] = cblas_ddot (N, X_ptr + i*incXY, incX, <double *> y.data, incY)
+ j += 1
+
+
+def solve_triangular (
+ np.ndarray[DOUBLE, ndim=2] X,
+ np.ndarray[DOUBLE, ndim=1] y
+ ):
+ """
+ Solves a triangular system (overwrites y)
+
+ TODO: option for upper, lower, etc.
+ """
+ cdef int lda = <int> X.strides[0] / sizeof(double)
+
+ cblas_dtrsv (CblasRowMajor, CblasLower, CblasNoTrans,
+ CblasNonUnit, <int> X.shape[0], <double *> X.data,
+ lda, <double *> y.data, 1);
+
+
+def cholesky_delete (
+ np.ndarray[DOUBLE, ndim=2] L,
+ int go_out):
+
+ cdef int n = <int> L.shape[0]
+ cdef int m = <int> L.strides[0] / sizeof (double)
+ c_cholesky_delete (m, n, <double *> L.data, go_out)
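For readers less familiar with Cython, a plain-NumPy restatement of what dot_over computes, mirroring the loop above (for illustration only, not the optimized BLAS routine):

    import numpy as np

    def dot_over_reference(X, y, mask, val, out):
        # for every row i of X with mask[i] == val, compute np.dot(X[i], y)
        # and pack the results into the leading entries of out
        j = 0
        for i in range(X.shape[0]):
            if mask[i] == val:
                out[j] = np.dot(X[i], y)
                j += 1
        return out

In lars.py above it is called as arrayfuncs.dot_over(X.T, res, active_mask, np.False_, Cov), i.e. it fills Cov with the covariances of the residual against the columns of X that are still inactive.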
23 scikits/learn/utils/setup.py
@@ -1,3 +1,7 @@
+from os.path import join
+from numpy.distutils.system_info import get_info, get_standard_file, BlasNotFoundError
+
+
def configuration(parent_package='',top_path=None):
import numpy
from numpy.distutils.misc_util import Configuration
@@ -6,5 +10,24 @@ def configuration(parent_package='',top_path=None):
config.add_subpackage('sparsetools')
+ # cd fast needs CBLAS
+ blas_info = get_info('blas_opt', 0)
+ if (not blas_info) or (
+ ('NO_ATLAS_INFO', 1) in blas_info.get('define_macros', [])) :
+ cblas_libs = ['cblas']
+ blas_info.pop('libraries', None)
+ else:
+ cblas_libs = blas_info.pop('libraries', [])
+
+ config.add_extension('arrayfuncs',
+ sources=['arrayfuncs.c'],
+ libraries=cblas_libs,
+ include_dirs=[join('..', 'src', 'cblas'),
+ numpy.get_include(),
+ blas_info.pop('include_dirs', [])],
+ extra_compile_args=blas_info.pop('extra_compile_args', []),
+ **blas_info
+ )
+
return config
47 scikits/learn/utils/src/cholesky_delete.c
@@ -0,0 +1,47 @@
+#include <math.h>
+#include <cblas.h>
+
+#ifdef _MSC_VER
+ #define copysign _copysign
+#endif
+
+/* General Cholesky Delete.
+ * Remove an element from the cholesky factorization
+ * m = columns
+ * n = rows
+ *
+ * TODO: put transpose as an option
+ *
+ */
+int cholesky_delete (int m, int n, double *L, int go_out) {
+
+ double c, s;
+
+ /* delete row go_out */
+ double * _L = L + (go_out * m);
+ int i;
+ for (i = go_out; i < n - 1; ++i) {
+ cblas_dcopy (i + 2, _L + m , 1, _L, 1);
+ _L += m;
+ }
+
+ _L = L + (go_out * m);
+ for (i=go_out; i < n - 1; ++i) {
+
+ cblas_drotg (_L + i, _L + i + 1, &c, &s);
+ if (_L[i] < 0) {
+ /* Diagonals cannot be negative */
+ _L[i] = copysign(_L[i], 1.0);
+ c = -c;
+ s = -s;
+ }
+ _L[i+1] = 0.; /* just for cleanup */
+ _L += m;
+
+ cblas_drot (n - (i + 2), _L + i, m, _L + i + 1,
+ m, c, s);
+
+ }
+
+ return 0;
+}
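The rotation-based update above maintains the Cholesky factor when a variable leaves the active set. The same result can be checked, slowly, by re-factorizing from scratch; a NumPy sketch of that sanity check (not the algorithm used here):

    import numpy as np

    def cholesky_delete_reference(L, go_out):
        # given a lower-triangular L with L * L.T == G, return the Cholesky
        # factor of G with row and column `go_out` removed, by brute-force
        # re-factorization (O(n^3), unlike the Givens-rotation update above)
        G = np.dot(L, L.T)
        keep = [i for i in range(G.shape[0]) if i != go_out]
        G_del = G[np.ix_(keep, keep)]
        return np.linalg.cholesky(G_del)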
