# Linear Regression

This notebook includes code example linear regression using RAPIDS cuDF and cuML.

In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model as sklGLM
from cuml import LinearRegression as cumlOLS
from cuml import Ridge as cumlRidge
import cudf
import os
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot as plt

### Helper Functions

In [2]:
from timeit import default_timer

class Timer(object):
    def __init__(self):
        self._timer = default_timer
    
    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self):
        """Start the timer."""
        self.start = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds."""
        self.end = self._timer()
        self.interval = self.end - self.start

In [3]:
import gzip
def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):
    if os.path.exists(cached):
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # the 4th column is 'adj_remaining_months_to_maturity'
        # used as the label
        X = X[:,[i for i in range(X.shape[1]) if i!=4]]
        y = X[:,4:5]
        rindices = np.random.randint(0,X.shape[0]-1,nrows)
        X = X[rindices,:ncols]
        y = y[rindices]
    else:
        print('use random data')
        X = np.random.rand(nrows,ncols)
        
    df_X = pd.DataFrame({'fea%d'%i:X[:,i] for i in range(X.shape[1])})
    df_y = pd.DataFrame({'fea%d'%i:y[:,i] for i in range(y.shape[1])})
    
    return df_X, df_y

In [4]:
from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=2e-3,with_sign=True):
    a = to_nparray(a).ravel()
    b = to_nparray(b).ravel()
    if with_sign == False:
        a,b = np.abs(a),np.abs(b)
    error = mean_squared_error(a,b)
    res = error<threshold
    return res

def to_nparray(x):
    if isinstance(x,np.ndarray) or isinstance(x,pd.DataFrame):
        return np.array(x)
    elif isinstance(x,np.float64):
        return np.array([x])
    elif isinstance(x,cudf.DataFrame) or isinstance(x,cudf.Series):
        return x.to_pandas().values
    return x    

In [5]:
%%time
nrows = 2**20
ncols = 399

X, y = load_data(nrows,ncols)
print('data',X.shape)
print('label',y.shape)

use mortgage data
data (1048576, 399)
label (1048576, 1)
CPU times: user 20.8 s, sys: 2.38 s, total: 23.2 s
Wall time: 20.7 s


Even though the OLS interface of cuML is very similar to Scikit-Learn's implemetation, cuML doesn't use some of the parameters such as "copy" and "n_jobs". Also, cuML includes two different implementation of OLS using SVD and Eigen decomposition. Eigen decomposition based implementation is very fast but causes very small errors in the coefficients which is negligible for most of the applications. SVD is stable but slower than eigen decomposition based implementation. 

In [6]:
fit_intercept = True
normalize = False
algorithm = "eig" # eig: eigen decomposition based method, svd: singular value decomposition based method.

In [7]:
%%time
reg_sk = sklGLM.LinearRegression(fit_intercept=fit_intercept, normalize=normalize)
result_sk = reg_sk.fit(X, y)

CPU times: user 1min 1s, sys: 12.9 s, total: 1min 14s
Wall time: 12.8 s


In [8]:
%%time
y_sk = reg_sk.predict(X)
error_sk = mean_squared_error(y,y_sk)

CPU times: user 1.23 s, sys: 16 ms, total: 1.25 s
Wall time: 254 ms


In [9]:
%%time
X_cudf = cudf.DataFrame.from_pandas(X)
y_cudf = np.array(y.as_matrix())
y_cudf = y_cudf[:,0]
y_cudf = cudf.Series(y_cudf)

CPU times: user 3.3 s, sys: 2.45 s, total: 5.75 s
Wall time: 4.95 s


In [10]:
%%time
reg_cuml = cumlOLS(fit_intercept=fit_intercept, normalize=normalize, algorithm=algorithm)
result_cuml = reg_cuml.fit(X_cudf, y_cudf)

CPU times: user 1.43 s, sys: 196 ms, total: 1.62 s
Wall time: 649 ms


In [11]:
%%time
y_cuml = reg_cuml.predict(X_cudf)
y_cuml = to_nparray(y_cuml).ravel()
error_cuml = mean_squared_error(y,y_cuml)

CPU times: user 120 ms, sys: 20 ms, total: 140 ms
Wall time: 130 ms


In [12]:
print("SKL MSE(y):")
print(error_sk)
print("CUML MSE(y):")
print(error_cuml)

SKL MSE(y):
1.7849512e-14
CUML MSE(y):
9.758998e-12
