# Ridge Regression Demo
The ridge regression model implemented in the cuml library allows the user to change the fit_intercept, normalize, solver and alpha parameters. The model accepts only numpy arrays or cudf dataframes as the input. It is important to understand that the 'svd' solver will run slower than the 'eig' solver however, the 'svd' solver is more stable and robust. Therefore, we would recomend that you use the 'eig' solver when a slight error is acceptable. For additional information please refer to the documentation on https://rapidsai.github.io/projects/cuml/en/latest/index.html

In [1]:
import numpy as np
import pandas as pd
import cudf
import os
from cuml import Ridge as cuRidge
from sklearn.linear_model import Ridge as skRidge
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

# Select a particular GPU to run the notebook  
os.environ["CUDA_VISIBLE_DEVICES"]="0"

# Helper Functions

In [2]:
# helps to calculate the time required by a cell to run
from timeit import default_timer

class Timer(object):
    def __init__(self):
        self._timer = default_timer
    
    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self):
        """Start the timer."""
        self.start = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds."""
        self.end = self._timer()
        self.interval = self.end - self.start

In [3]:
# check if mortgage dataset is present and then extract the data from it, else just create a random dataset for regression
import gzip
# change the path of the mortgage dataset if you have saved it in a different directory
def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):
    train_rows = int(nrows*0.8)
    if os.path.exists(cached):
        print('use mortgage data')

        with gzip.open(cached) as f:
            X = np.load(f)
        # the 4th column is 'adj_remaining_months_to_maturity'
        # used as the label
        X = X[:,[i for i in range(X.shape[1]) if i!=4]]
        y = X[:,4:5]
        rindices = np.random.randint(0,X.shape[0]-1,nrows)
        X = X[rindices,:ncols]
        y = y[rindices]
        df_y_train = pd.DataFrame({'fea%d'%i:y[0:train_rows,i] for i in range(y.shape[1])})
        df_y_test = pd.DataFrame({'fea%d'%i:y[train_rows:,i] for i in range(y.shape[1])})
    else:
        print('use random data')
        # create a random regression dataset
        X,y = make_regression(n_samples=nrows,n_features=ncols,n_informative=ncols, random_state=0)
        df_y_train = pd.DataFrame({'fea0':y[0:train_rows,]})
        df_y_test = pd.DataFrame({'fea0':y[train_rows:,]})

    df_X_train = pd.DataFrame({'fea%d'%i:X[0:train_rows,i] for i in range(X.shape[1])})
    df_X_test = pd.DataFrame({'fea%d'%i:X[train_rows:,i] for i in range(X.shape[1])})

    return df_X_train, df_X_test, df_y_train, df_y_test

# Run tests

In [6]:
%%time
# nrows = number of samples
# ncols = number of features of each sample 

nrows = 2**15 
ncols = 399

#split the dataset into training and testing sets, in the ratio of 80:20 respectively
X_train, X_test, y_train, y_test = load_data(nrows,ncols)
print('training data',X_train.shape)
print('training label',y_train.shape)
print('testing data',X_test.shape)
print('testing label',y_test.shape)

use mortgage data
training data (26214, 399)
training label (26214, 1)
testing data (6554, 399)
testing label (6554, 1)


In [7]:
%%time
# use the sklearn ridge regression model to fit the training dataset 
skridge = skRidge(fit_intercept=False,
                  normalize=True, alpha=0.1)
skridge.fit(X_train, y_train)

Ridge(alpha=0.1, copy_X=True, fit_intercept=False, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [8]:
#%%time
# calculate the mean squared error of the sklearn ridge regression model on the testing dataset
sk_predict = skridge.predict(X_test)
error_sk = mean_squared_error(y_test,sk_predict)

In [9]:
%%time
# convert the pandas dataframe to cudf format
X_cudf = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)
y_cudf = y_train.values
y_cudf = y_cudf[:,0]
y_cudf = cudf.Series(y_cudf)

In [13]:
%%time
# run the cuml ridge regression model to fit the training dataset 
curidge = cuRidge(fit_intercept=False,
                  normalize=True,
                  solver='svd', alpha=0.1)
curidge.fit(X_cudf, y_cudf)

<cuml.linear_model.ridge.Ridge at 0x7f4f6658bb38>

In [14]:
%%time
# calculate the mean squared error of the testing dataset using the cuml ridge regression model
cu_predict = curidge.predict(X_cudf_test).to_array()
error_cu = mean_squared_error(y_test,cu_predict)

In [15]:
# print the mean squared error of the sklearn and cuml model to analyse them
print("SKL MSE(y):")
print(error_sk)
print("CUML MSE(y):")
print(error_cu)


SKL MSE(y):
1.824878534150775e-10
CUML MSE(y):
1.8368178e-10
