# scikit-learn/scikit-learn

Switch branches/tags
Nothing to show
Fetching contributors…
Cannot retrieve contributors at this time
459 lines (360 sloc) 14.7 KB
 # Authors: Mathieu Blondel # Olivier Grisel # Peter Prettenhofer # Lars Buitinck # Giorgio Patrini # # License: BSD 3 clause #!python #cython: boundscheck=False, wraparound=False, cdivision=True from libc.math cimport fabs, sqrt, pow cimport numpy as np import numpy as np import scipy.sparse as sp cimport cython from cython cimport floating from numpy.math cimport isnan np.import_array() ctypedef fused integral: int long long ctypedef np.float64_t DOUBLE def csr_row_norms(X): """L2 norm of each row in CSR matrix X.""" if X.dtype not in [np.float32, np.float64]: X = X.astype(np.float64) return _csr_row_norms(X.data, X.shape, X.indices, X.indptr) def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data, shape, np.ndarray[integral, ndim=1, mode="c"] X_indices, np.ndarray[integral, ndim=1, mode="c"] X_indptr): cdef: unsigned long long n_samples = shape[0] unsigned long long n_features = shape[1] np.ndarray[DOUBLE, ndim=1, mode="c"] norms np.npy_intp i, j double sum_ norms = np.zeros(n_samples, dtype=np.float64) for i in range(n_samples): sum_ = 0.0 for j in range(X_indptr[i], X_indptr[i + 1]): sum_ += X_data[j] * X_data[j] norms[i] = sum_ return norms def csr_mean_variance_axis0(X): """Compute mean and variance along axis 0 on a CSR matrix Parameters ---------- X : CSR sparse matrix, shape (n_samples, n_features) Input data. Returns ------- means : float array with shape (n_features,) Feature-wise means variances : float array with shape (n_features,) Feature-wise variances """ if X.dtype not in [np.float32, np.float64]: X = X.astype(np.float64) means, variances, _ = _csr_mean_variance_axis0(X.data, X.shape[0], X.shape[1], X.indices) return means, variances def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, unsigned long long n_samples, unsigned long long n_features, np.ndarray[integral, ndim=1] X_indices): # Implement the function here since variables using fused types # cannot be declared directly and can only be passed as function arguments cdef: np.npy_intp i unsigned long long non_zero = X_indices.shape[0] np.npy_intp col_ind floating diff # means[j] contains the mean of feature j np.ndarray[floating, ndim=1] means # variances[j] contains the variance of feature j np.ndarray[floating, ndim=1] variances if floating is float: dtype = np.float32 else: dtype = np.float64 means = np.zeros(n_features, dtype=dtype) variances = np.zeros_like(means, dtype=dtype) cdef: # counts[j] contains the number of samples where feature j is non-zero np.ndarray[np.int64_t, ndim=1] counts = np.zeros(n_features, dtype=np.int64) # counts_nan[j] contains the number of NaNs for feature j np.ndarray[np.int64_t, ndim=1] counts_nan = np.zeros(n_features, dtype=np.int64) for i in xrange(non_zero): col_ind = X_indices[i] if not isnan(X_data[i]): means[col_ind] += X_data[i] else: counts_nan[col_ind] += 1 for i in xrange(n_features): means[i] /= (n_samples - counts_nan[i]) for i in xrange(non_zero): col_ind = X_indices[i] if not isnan(X_data[i]): diff = X_data[i] - means[col_ind] variances[col_ind] += diff * diff counts[col_ind] += 1 for i in xrange(n_features): variances[i] += (n_samples - counts_nan[i] - counts[i]) * means[i]**2 variances[i] /= (n_samples - counts_nan[i]) return means, variances, counts_nan def csc_mean_variance_axis0(X): """Compute mean and variance along axis 0 on a CSC matrix Parameters ---------- X : CSC sparse matrix, shape (n_samples, n_features) Input data. Returns ------- means : float array with shape (n_features,) Feature-wise means variances : float array with shape (n_features,) Feature-wise variances """ if X.dtype not in [np.float32, np.float64]: X = X.astype(np.float64) means, variances, _ = _csc_mean_variance_axis0(X.data, X.shape[0], X.shape[1], X.indices, X.indptr) return means, variances def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, unsigned long long n_samples, unsigned long long n_features, np.ndarray[integral, ndim=1] X_indices, np.ndarray[integral, ndim=1] X_indptr): # Implement the function here since variables using fused types # cannot be declared directly and can only be passed as function arguments cdef: np.npy_intp i, j unsigned long long counts unsigned long long startptr unsigned long long endptr floating diff # means[j] contains the mean of feature j np.ndarray[floating, ndim=1] means # variances[j] contains the variance of feature j np.ndarray[floating, ndim=1] variances if floating is float: dtype = np.float32 else: dtype = np.float64 means = np.zeros(n_features, dtype=dtype) variances = np.zeros_like(means, dtype=dtype) cdef np.ndarray[np.int64_t, ndim=1] counts_nan = np.zeros(n_features, dtype=np.int64) for i in xrange(n_features): startptr = X_indptr[i] endptr = X_indptr[i + 1] counts = endptr - startptr for j in xrange(startptr, endptr): if not isnan(X_data[j]): means[i] += X_data[j] else: counts_nan[i] += 1 counts -= counts_nan[i] means[i] /= (n_samples - counts_nan[i]) for j in xrange(startptr, endptr): if not isnan(X_data[j]): diff = X_data[j] - means[i] variances[i] += diff * diff variances[i] += (n_samples - counts_nan[i] - counts) * means[i]**2 variances[i] /= (n_samples - counts_nan[i]) return means, variances, counts_nan def incr_mean_variance_axis0(X, last_mean, last_var, last_n): """Compute mean and variance along axis 0 on a CSR or CSC matrix. last_mean, last_var are the statistics computed at the last step by this function. Both must be initialized to 0.0. last_n is the number of samples encountered until now and is initialized at 0. Parameters ---------- X : CSR or CSC sparse matrix, shape (n_samples, n_features) Input data. last_mean : float array with shape (n_features,) Array of feature-wise means to update with the new data X. last_var : float array with shape (n_features,) Array of feature-wise var to update with the new data X. last_n : int array with shape (n_features,) Number of samples seen so far, before X. Returns ------- updated_mean : float array with shape (n_features,) Feature-wise means updated_variance : float array with shape (n_features,) Feature-wise variances updated_n : int array with shape (n_features,) Updated number of samples seen Notes ----- NaNs are ignored during the computation. References ---------- T. Chan, G. Golub, R. LeVeque. Algorithms for computing the sample variance: recommendations, The American Statistician, Vol. 37, No. 3, pp. 242-247 Also, see the non-sparse implementation of this in `utils.extmath._batch_mean_variance_update`. """ if X.dtype not in [np.float32, np.float64]: X = X.astype(np.float64) return _incr_mean_variance_axis0(X.data, X.shape[0], X.shape[1], X.indices, X.indptr, X.format, last_mean, last_var, last_n) def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, unsigned long long n_samples, unsigned long long n_features, np.ndarray[integral, ndim=1] X_indices, np.ndarray[integral, ndim=1] X_indptr, str X_format, np.ndarray[floating, ndim=1] last_mean, np.ndarray[floating, ndim=1] last_var, np.ndarray[np.int64_t, ndim=1] last_n): # Implement the function here since variables using fused types # cannot be declared directly and can only be passed as function arguments cdef: np.npy_intp i # last = stats until now # new = the current increment # updated = the aggregated stats # when arrays, they are indexed by i per-feature cdef: np.ndarray[floating, ndim=1] new_mean np.ndarray[floating, ndim=1] new_var np.ndarray[floating, ndim=1] updated_mean np.ndarray[floating, ndim=1] updated_var if floating is float: dtype = np.float32 else: dtype = np.float64 new_mean = np.zeros(n_features, dtype=dtype) new_var = np.zeros_like(new_mean, dtype=dtype) updated_mean = np.zeros_like(new_mean, dtype=dtype) updated_var = np.zeros_like(new_mean, dtype=dtype) cdef: np.ndarray[np.int64_t, ndim=1] new_n np.ndarray[np.int64_t, ndim=1] updated_n np.ndarray[floating, ndim=1] last_over_new_n np.ndarray[np.int64_t, ndim=1] counts_nan # Obtain new stats first new_n = np.full(n_features, n_samples, dtype=np.int64) updated_n = np.zeros_like(new_n, dtype=np.int64) last_over_new_n = np.zeros_like(new_n, dtype=dtype) if X_format == 'csr': # X is a CSR matrix new_mean, new_var, counts_nan = _csr_mean_variance_axis0( X_data, n_samples, n_features, X_indices) else: # X is a CSC matrix new_mean, new_var, counts_nan = _csc_mean_variance_axis0( X_data, n_samples, n_features, X_indices, X_indptr) for i in xrange(n_features): new_n[i] -= counts_nan[i] # First pass cdef bint is_first_pass = True for i in xrange(n_features): if last_n[i] > 0: is_first_pass = False break if is_first_pass: return new_mean, new_var, new_n # Next passes for i in xrange(n_features): updated_n[i] = last_n[i] + new_n[i] last_over_new_n[i] = last_n[i] / new_n[i] # Unnormalized stats for i in xrange(n_features): last_mean[i] *= last_n[i] last_var[i] *= last_n[i] new_mean[i] *= new_n[i] new_var[i] *= new_n[i] # Update stats for i in xrange(n_features): updated_var[i] = (last_var[i] + new_var[i] + last_over_new_n[i] / updated_n[i] * (last_mean[i] / last_over_new_n[i] - new_mean[i])**2) updated_mean[i] = (last_mean[i] + new_mean[i]) / updated_n[i] updated_var[i] /= updated_n[i] return updated_mean, updated_var, updated_n def inplace_csr_row_normalize_l1(X): """Inplace row normalize using the l1 norm""" _inplace_csr_row_normalize_l1(X.data, X.shape, X.indices, X.indptr) def _inplace_csr_row_normalize_l1(np.ndarray[floating, ndim=1] X_data, shape, np.ndarray[integral, ndim=1] X_indices, np.ndarray[integral, ndim=1] X_indptr): cdef unsigned long long n_samples = shape[0] cdef unsigned long long n_features = shape[1] # the column indices for row i are stored in: # indices[indptr[i]:indices[i+1]] # and their corresponding values are stored in: # data[indptr[i]:indptr[i+1]] cdef np.npy_intp i, j cdef double sum_ for i in xrange(n_samples): sum_ = 0.0 for j in xrange(X_indptr[i], X_indptr[i + 1]): sum_ += fabs(X_data[j]) if sum_ == 0.0: # do not normalize empty rows (can happen if CSR is not pruned # correctly) continue for j in xrange(X_indptr[i], X_indptr[i + 1]): X_data[j] /= sum_ def inplace_csr_row_normalize_l2(X): """Inplace row normalize using the l2 norm""" _inplace_csr_row_normalize_l2(X.data, X.shape, X.indices, X.indptr) def _inplace_csr_row_normalize_l2(np.ndarray[floating, ndim=1] X_data, shape, np.ndarray[integral, ndim=1] X_indices, np.ndarray[integral, ndim=1] X_indptr): cdef integral n_samples = shape[0] cdef integral n_features = shape[1] cdef np.npy_intp i, j cdef double sum_ for i in xrange(n_samples): sum_ = 0.0 for j in xrange(X_indptr[i], X_indptr[i + 1]): sum_ += (X_data[j] * X_data[j]) if sum_ == 0.0: # do not normalize empty rows (can happen if CSR is not pruned # correctly) continue sum_ = sqrt(sum_) for j in xrange(X_indptr[i], X_indptr[i + 1]): X_data[j] /= sum_ def assign_rows_csr(X, np.ndarray[np.npy_intp, ndim=1] X_rows, np.ndarray[np.npy_intp, ndim=1] out_rows, np.ndarray[floating, ndim=2, mode="c"] out): """Densify selected rows of a CSR matrix into a preallocated array. Like out[out_rows] = X[X_rows].toarray() but without copying. No-copy supported for both dtype=np.float32 and dtype=np.float64. Parameters ---------- X : scipy.sparse.csr_matrix, shape=(n_samples, n_features) X_rows : array, dtype=np.intp, shape=n_rows out_rows : array, dtype=np.intp, shape=n_rows out : array, shape=(arbitrary, n_features) """ cdef: # npy_intp (np.intp in Python) is what np.where returns, # but int is what scipy.sparse uses. int i, ind, j np.npy_intp rX np.ndarray[floating, ndim=1] data = X.data np.ndarray[int, ndim=1] indices = X.indices, indptr = X.indptr if X_rows.shape[0] != out_rows.shape[0]: raise ValueError("cannot assign %d rows to %d" % (X_rows.shape[0], out_rows.shape[0])) out[out_rows] = 0. for i in range(X_rows.shape[0]): rX = X_rows[i] for ind in range(indptr[rX], indptr[rX + 1]): j = indices[ind] out[out_rows[i], j] = data[ind]