# Generalized Linear Models

This notebook includes code examples generalized linear models using RAPIDS cuDF and cuML. Right now, only the Ordinary Least Squares (OLS) method is included the notebook. Other linear models will be added in the coming versions.

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression as sklOLS
from cuml import LinearRegression as cumlOLS
import cudf
import os
# from mpl_toolkits.mplot3d import Axes3D
# from matplotlib import pyplot as plt

### Helper Functions

In [3]:
from timeit import default_timer

class Timer(object):
    def __init__(self):
        self._timer = default_timer
    
    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self):
        """Start the timer."""
        self.start = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds."""
        self.end = self._timer()
        self.interval = self.end - self.start

In [4]:
import gzip
def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):
    if os.path.exists(cached):
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # the 4th column is 'adj_remaining_months_to_maturity'
        # used as the label
        X = X[:,[i for i in range(X.shape[1]) if i!=4]]
        y = X[:,4:5]
        rindices = np.random.randint(0,X.shape[0]-1,nrows)
        X = X[rindices,:ncols]
        y = y[rindices]
    else:
        print('use random data')
        X = np.random.rand(nrows,ncols)
        
    df_X = pd.DataFrame({'fea%d'%i:X[:,i] for i in range(X.shape[1])})
    df_y = pd.DataFrame({'fea%d'%i:y[:,i] for i in range(y.shape[1])})
    
    return df_X, df_y

In [5]:
from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=2e-3,with_sign=True):
    a = to_nparray(a).ravel()
    b = to_nparray(b).ravel()
    if with_sign == False:
        a,b = np.abs(a),np.abs(b)
    error = mean_squared_error(a,b)
    res = error<threshold
    return res

def to_nparray(x):
    if isinstance(x,np.ndarray) or isinstance(x,pd.DataFrame):
        return np.array(x)
    elif isinstance(x,np.float64):
        return np.array([x])
    elif isinstance(x,cudf.DataFrame) or isinstance(x,cudf.Series):
        return x.to_pandas().values
    return x    

## OLS with Mortgage Data

In [6]:
%%time
nrows = 2**21
ncols = 399

X, y = load_data(nrows,ncols)
print('data',X.shape)
print('label',y.shape)

use mortgage data
data (2097152, 399)
label (2097152, 1)
CPU times: user 29.7 s, sys: 3.12 s, total: 32.9 s
Wall time: 30.4 s


Even though the OLS interface of cuML is very similar to Scikit-Learn's implemetation, cuML doesn't use some of the parameters such as "copy" and "n_jobs". Also, cuML includes two different implementation of OLS using SVD and Eigen decomposition. Eigen decomposition based implementation is very fast but causes very small errors in the coefficients which is negligible for most of the applications. SVD is stable but slower than eigen decomposition based implementation. 

In [7]:
fit_intercept = True
normalize = False
algorithm = "eig" # eig: eigen decomposition based method, svd: singular value decomposition based method.

In [8]:
%%time
reg_sk = sklOLS(fit_intercept=fit_intercept, normalize=normalize)
result_sk = reg_sk.fit(X, y)

CPU times: user 1min, sys: 10.9 s, total: 1min 11s
Wall time: 13.2 s


In [9]:
%%time
y_sk = reg_sk.predict(X)
error_sk = mean_squared_error(y,y_sk)

CPU times: user 960 ms, sys: 2.11 ms, total: 962 ms
Wall time: 415 ms


In [10]:
%%time
X_cudf = cudf.DataFrame.from_pandas(X)
y_cudf = np.array(y.as_matrix())
y_cudf = y_cudf[:,0]
y_cudf = cudf.Series(y_cudf)

CPU times: user 2.43 s, sys: 720 ms, total: 3.15 s
Wall time: 2.36 s


In [11]:
%%time
reg_cuml = cumlOLS(fit_intercept=fit_intercept, normalize=normalize, algorithm=algorithm)
result_cuml = reg_cuml.fit(X_cudf, y_cudf)

CPU times: user 1.51 s, sys: 215 ms, total: 1.72 s
Wall time: 769 ms


In [14]:
%%time
y_cuml = reg_cuml.predict(X_cudf)
y_cuml = to_nparray(y_cuml).ravel()
error_cuml = mean_squared_error(y,y_cuml)

CPU times: user 106 ms, sys: 19.1 ms, total: 125 ms
Wall time: 113 ms


In [16]:
print("SKL MSE(y):")
print(error_sk)
print("CUML MSE(y):")
print(error_cuml)

SKL MSE(y):
1.071529e-11
CUML MSE(y):
8.935713e-12
