In [1]:
import os

# Select a particular GPU to run the notebook.
# The variable is set *before* cudf / cuml are imported: CUDA consults
# CUDA_VISIBLE_DEVICES when the driver is initialised, and importing the GPU
# libraries may already trigger that initialisation, after which changing the
# variable has no effect.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import numpy as np
import pandas as pd
import cudf
from cuml import Ridge as cuRidge
from sklearn.linear_model import Ridge as skRidge
from sklearn.datasets import make_regression

# Helper Functions

In [2]:
from timeit import default_timer

class Timer(object):
    """Wall-clock stopwatch usable directly or as a context manager.

    Example::

        with Timer() as t:
            do_work()
        print(t.interval)

    Attributes set while running:
        start_time: clock reading taken by the most recent ``start()``.
        end:        clock reading taken by the most recent ``stop()``.
        interval:   ``end - start_time`` in seconds.

    The start reading is kept in ``start_time`` rather than ``start``:
    assigning to ``self.start`` would shadow the ``start()`` method on the
    instance and make a second ``start()`` (or a second ``with`` block) fail
    with ``TypeError: 'float' object is not callable``.
    """

    def __init__(self):
        self._timer = default_timer
        self.start_time = None
        self.end = None
        self.interval = None

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        # Always record the elapsed time; exceptions are not suppressed
        # because nothing truthy is returned.
        self.stop()

    def start(self):
        """Start the timer."""
        self.start_time = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds."""
        self.end = self._timer()
        self.interval = self.end - self.start_time

In [12]:
import gzip
def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):
    """Build an 80:20 train/test regression dataset as pandas DataFrames.

    If ``cached`` exists it is read as a gzip-compressed ``.npy`` matrix;
    column index 4 is used as the label and removed from the features, then
    ``nrows`` rows are drawn at random (with replacement) and the first
    ``ncols`` remaining feature columns are kept.  Otherwise a synthetic
    problem is generated with ``make_regression`` (``random_state=0``).

    Parameters
    ----------
    nrows : int
        Total number of samples (train + test).
    ncols : int
        Number of feature columns.
    cached : str
        Path of the optional gzip-compressed ``.npy`` file.

    Returns
    -------
    (df_X_train, df_X_test, df_y_train, df_y_test) : tuple of pd.DataFrame
        Columns are named ``fea0``, ``fea1``, ...; the first
        ``int(nrows*0.8)`` rows form the training split.
    """
    # split the dataset in a 80:20 split
    train_rows = int(nrows*0.8)
    if os.path.exists(cached):
        print('use mortgage data')

        with gzip.open(cached) as f:
            data = np.load(f)
        # column index 4 is 'adj_remaining_months_to_maturity'
        # used as the label
        label_col = 4
        # Take the label BEFORE dropping its column; slicing afterwards would
        # silently pick up the neighbouring feature instead.
        y = data[:, label_col:label_col + 1]
        X = data[:, [i for i in range(data.shape[1]) if i != label_col]]
        # randint's upper bound is exclusive, so X.shape[0] covers every row.
        rindices = np.random.randint(0, X.shape[0], nrows)
        X = X[rindices, :ncols]
        y = y[rindices]
        df_y_train = pd.DataFrame({'fea%d'%i:y[0:train_rows,i] for i in range(y.shape[1])})
        df_y_test = pd.DataFrame({'fea%d'%i:y[train_rows:,i] for i in range(y.shape[1])})
    else:
        print('use random data')
        X,y = make_regression(n_samples=nrows,n_features=ncols,n_informative=ncols, random_state=0)
        df_y_train = pd.DataFrame({'fea0':y[0:train_rows,]})
        df_y_test = pd.DataFrame({'fea0':y[train_rows:,]})

    df_X_train = pd.DataFrame({'fea%d'%i:X[0:train_rows,i] for i in range(X.shape[1])})
    df_X_test = pd.DataFrame({'fea%d'%i:X[train_rows:,i] for i in range(X.shape[1])})

    return df_X_train, df_X_test, df_y_train, df_y_test

In [13]:
from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=2e-3,with_sign=True):
    """Report whether two array-likes agree within a mean-squared-error bound.

    Both inputs are converted with ``to_nparray`` and flattened.  When
    ``with_sign`` equals ``False`` the comparison is done on absolute values.
    Returns the result of ``mean_squared_error(a, b) < threshold``.
    """
    lhs, rhs = to_nparray(a).ravel(), to_nparray(b).ravel()
    if with_sign == False:
        # Sign-insensitive comparison requested.
        lhs = np.abs(lhs)
        rhs = np.abs(rhs)
    return mean_squared_error(lhs, rhs) < threshold

def to_nparray(x):
    """Convert the supported container types to a NumPy array.

    * ``np.ndarray`` / ``pd.DataFrame`` -> ``np.array(x)``
    * ``np.float64`` scalar             -> one-element array
    * ``cudf.DataFrame`` / ``cudf.Series`` -> ``x.to_pandas().values``

    Any other object is handed back unchanged.
    """
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    if isinstance(x, np.float64):
        return np.array([x])
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        # Go through pandas to bring the GPU-backed object to host memory.
        return x.to_pandas().values
    return x

# Run tests

In [14]:
%%time
# Problem size: total number of samples and number of feature columns.
nrows, ncols = 2**15, 399

X_train, X_test, y_train, y_test = load_data(nrows, ncols)

# Report the shape of each split.
print('training data {}'.format(X_train.shape))
print('training label {}'.format(y_train.shape))
print('testing data {}'.format(X_test.shape))
print('testing label {}'.format(y_test.shape))

use random data
training data (26214, 399)
training label (26214, 1)
testing data (6554, 399)
testing label (6554, 1)
CPU times: user 8.44 s, sys: 196 ms, total: 8.63 s
Wall time: 953 ms


In [15]:
%%time
# CPU baseline: scikit-learn Ridge without an intercept term.
# `normalize=True` is intentionally not passed: scikit-learn documents the
# flag as ignored when fit_intercept=False, and it has been removed from
# recent releases, where passing it raises a TypeError.
skridge = skRidge(fit_intercept=False)
skridge.fit(X_train, y_train)

CPU times: user 1.07 s, sys: 48 ms, total: 1.12 s
Wall time: 109 ms


In [16]:
%%time
# Score the scikit-learn model on the held-out split.
sk_predict = skridge.predict(X_test)
error_sk = mean_squared_error(y_true=y_test, y_pred=sk_predict)

CPU times: user 224 ms, sys: 0 ns, total: 224 ms
Wall time: 5.64 ms


In [17]:
%%time
# Copy the pandas splits into cuDF objects for the GPU model.
X_cudf = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)
# The label frame has a single column; pass it on as a 1-D Series.
y_cudf = cudf.Series(y_train.values[:, 0])

CPU times: user 9.45 s, sys: 1.47 s, total: 10.9 s
Wall time: 3.51 s


In [18]:
%%time
# GPU model: cuML Ridge using the 'eig' solver, no intercept term.
curidge = cuRidge(
    fit_intercept=False,
    normalize=True,
    solver='eig',
)
curidge.fit(X_cudf, y_cudf)

CPU times: user 544 ms, sys: 20 ms, total: 564 ms
Wall time: 561 ms


In [19]:
%%time
# Score the cuML model on the held-out split.
# The prediction is brought to host memory with the notebook's own
# `to_nparray` helper (cudf -> pandas -> NumPy) instead of `Series.to_array()`,
# which is not available in newer cuDF releases.
cu_predict = to_nparray(curidge.predict(X_cudf_test))
error_cu = mean_squared_error(y_test,cu_predict)

CPU times: user 524 ms, sys: 4 ms, total: 528 ms
Wall time: 526 ms


In [20]:
# Show both test-set errors; each label is followed by its value on the next line.
print("SKL MSE(y):", error_sk, sep="\n")
print("CUML MSE(y):", error_cu, sep="\n")

SKL MSE(y):
0.0020651501306759553
CUML MSE(y):
0.0020651501306685753
