import numbers
import pprint

import cupy as cp
import numpy as np
import scipy.sparse as sp
from cuml import LinearRegression as cuLinearRegression
from scipy import linalg
from sklearn.linear_model import LinearRegression as skLinearRegression
from sklearn.preprocessing import normalize as f_normalize
from sklearn.utils import check_array
from sklearn.utils.sparsefuncs import mean_variance_axis, inplace_column_scale
from sklearn.utils.validation import FLOAT_DTYPES


def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
                     sample_weight=None, return_mean=False, check_input=True):
    """Center (and optionally column-normalize) X and y before a linear fit.

    Parameters
    ----------
    X : array-like or scipy sparse matrix (CSR/CSC accepted when validated)
        Design matrix.
    y : array-like
        Targets; converted to an ndarray with the dtype of ``X``.
    fit_intercept : bool
        When false, nothing is centered or scaled: the returned offsets are
        zeros and the returned scale is ones.
    normalize : bool, default=False
        When true (and ``fit_intercept`` is true), scale each column of X by
        its norm and return those norms as ``X_scale``.
    copy : bool, default=True
        Copy X before modifying it. With ``copy=False`` a dense X may be
        centered in place.
    sample_weight : array-like, number or None, default=None
        Weights for the column averages of X (dense only) and y. A scalar
        is treated the same as ``None``.
    return_mean : bool, default=False
        Sparse X only: when false, the reported ``X_offset`` is zeroed out.
        Sparse X itself is never centered here.
    check_input : bool, default=True
        Validate X with ``check_array`` (which also handles the copy).

    Returns
    -------
    X, y, X_offset, y_offset, X_scale
        The transformed data, followed by the offsets subtracted from X and
        y and the per-column scale applied to X.
    """
    # A scalar weight cannot change a weighted average, so drop it.
    if isinstance(sample_weight, numbers.Number):
        sample_weight = None
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)

    if check_input:
        X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
                        dtype=FLOAT_DTYPES)
    elif copy:
        if sp.issparse(X):
            X = X.copy()
        else:
            X = X.copy(order='K')

    y = np.asarray(y, dtype=X.dtype)

    if fit_intercept:
        if sp.issparse(X):
            X_offset, X_var = mean_variance_axis(X, axis=0)
            if not return_mean:
                X_offset[:] = X.dtype.type(0)

            if normalize:
                # TODO: f_normalize could be used here as well but the function
                # inplace_csr_row_normalize_l2 must be changed such that it
                # can return also the norms computed internally

                # transform variance to norm in-place
                X_var *= X.shape[0]
                X_scale = np.sqrt(X_var, X_var)
                del X_var
                # Avoid dividing by zero for constant columns.
                X_scale[X_scale == 0] = 1
                inplace_column_scale(X, 1. / X_scale)
            else:
                X_scale = np.ones(X.shape[1], dtype=X.dtype)
        else:
            X_offset = np.average(X, axis=0, weights=sample_weight)
            X -= X_offset
            if normalize:
                X, X_scale = f_normalize(X, axis=0, copy=False,
                                         return_norm=True)
            else:
                X_scale = np.ones(X.shape[1], dtype=X.dtype)
        y_offset = np.average(y, axis=0, weights=sample_weight)
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
        X_scale = np.ones(X.shape[1], dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_scale


# One of several examples found with Hypothesis
X_train = np.array(
    [[25000.750007327206, 25000.750007327206, 25000.750007327206,
      25000.750007327206, 25000.750007327206],
     [1.0000000000222042, 37525.13455882354, 25000.750007327206,
      25000.750007327206, 25000.750007327206],
     [25000.750007327206, 25000.750007327206, 25000.750007327206,
      25000.750007327206, 25000.750007327206],
     [25000.750007327206, 25000.750007327206, 25000.750007327206,
      25000.750007327206, 25000.750007327206]],
    dtype=np.float64)
y_train = np.array(
    [1.0, 2.003848073721735, 2.003848073721735, 2.003848073721735],
    dtype=np.float64)
X_test = np.array(
    [[25000.750007327206, 25000.750007327206, 25000.750007327206,
      25000.750007327206, 25000.750007327206]],
    dtype=np.float64)
y_test = np.array([2.003848073721735], dtype=np.float64)

X, y, X_offset, y_offset, X_scale = _preprocess_data(
    X_train, y_train, fit_intercept=True, normalize=False, copy=True,
    sample_weight=None, return_mean=True)

# Solve the centered least-squares problem with each LAPACK driver so the
# solutions can be compared side by side with cuML's.
results = {}
for lapack_method in ['gelsd', 'gelsy', 'gelss']:
    coef_, _residues, rank_, singular_ = linalg.lstsq(
        X, y, lapack_driver=lapack_method)
    coef_ = coef_.T
    if y.ndim == 1:
        coef_ = np.ravel(coef_)
    # Map the coefficients back to the original feature scale first, then
    # derive the intercept from those rescaled coefficients so the two stay
    # consistent even when X_scale is not all ones.
    coef_ = coef_ / X_scale
    results[lapack_method] = {
        'coef_': coef_,
        'intercept_': y_offset - np.dot(X_offset, coef_.T),
    }

# Fit the same (raw, uncentered) training data with cuML for comparison.
cuols = cuLinearRegression()
cuols.fit(X_train, y_train)
results['cuML'] = {
    'coef_': cuols.coef_,
    'intercept_': cuols.intercept_,
}

pprint.pprint(results)