In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD as skTSVD
from cuml import TruncatedSVD as cumlTSVD
import cudf
import os

# Helper Functions

In [2]:
from timeit import default_timer

class Timer(object):
    """Reusable wall-clock stopwatch, usable directly or as a context manager.

    Example::

        with Timer() as t:
            do_work()
        print(t.interval)

    Attributes:
        end: clock reading taken by the most recent ``stop()`` (None before).
        interval: seconds between the most recent ``start()`` and ``stop()``
            (None before the first ``stop()``).
    """

    def __init__(self):
        self._timer = default_timer
        # The start timestamp lives in a private attribute. Storing it as
        # ``self.start`` would shadow the ``start`` method on the instance and
        # make the timer unusable after its first run.
        self._started_at = None
        self.end = None
        self.interval = None

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        # Always record the elapsed time; exceptions from the with-body are
        # not suppressed because this returns None.
        self.stop()

    def start(self):
        """Start (or restart) the timer."""
        self._started_at = self._timer()

    def stop(self):
        """Stop the timer and compute the interval in seconds.

        Raises:
            RuntimeError: if ``start()`` has not been called yet.
        """
        if self._started_at is None:
            raise RuntimeError('Timer.stop() called before Timer.start()')
        self.end = self._timer()
        self.interval = self.end - self._started_at

In [3]:
import gzip
def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):
    """Load a random sample of the cached dataset as a pandas DataFrame.

    Args:
        nrows: number of rows to draw. Rows are drawn with replacement using
            NumPy's global random state, so duplicates are possible.
        ncols: number of leading columns to keep. If the cached array has
            fewer columns, all of them are returned.
        cached: path to a gzip-compressed ``.npy`` file holding a 2-D array.

    Returns:
        pandas.DataFrame with ``nrows`` rows and columns named
        ``fea0``, ``fea1``, ...

    Raises:
        FileNotFoundError: if ``cached`` does not exist.
    """
    if not os.path.exists(cached):
        raise FileNotFoundError('Please download the required dataset or check the path')

    print('use mortgage data')
    with gzip.open(cached) as f:
        X = np.load(f)

    # randint's upper bound is exclusive, so pass the row count itself.
    # Using shape[0] - 1 would never select the last row and would fail
    # outright on a one-row dataset.
    row_idx = np.random.randint(0, X.shape[0], nrows)
    X = X[row_idx, :ncols]

    return pd.DataFrame({'fea%d' % i: X[:, i] for i in range(X.shape[1])})

In [4]:
from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=5e-3,with_sign=True):
    """Report whether two array-likes agree within a mean-squared-error bound.

    Both inputs are passed through ``to_nparray`` first. When ``with_sign``
    compares equal to False, absolute values are compared instead, so the
    signs of the entries are ignored.

    Returns the result of ``mean_squared_error(a, b) < threshold``.
    """
    lhs = to_nparray(a)
    rhs = to_nparray(b)
    # Explicit comparison kept on purpose: only values equal to False
    # (False, 0) switch to the sign-insensitive comparison.
    if with_sign == False:
        lhs, rhs = np.abs(lhs), np.abs(rhs)
    return mean_squared_error(lhs, rhs) < threshold

def to_nparray(x):
    """Convert the supported container types to a NumPy array.

    * ``np.ndarray`` / ``pd.DataFrame`` -> ``np.array(x)`` (a new array)
    * ``np.float64`` scalar             -> one-element 1-D array
    * ``cudf.DataFrame`` / ``cudf.Series`` -> values of the pandas conversion

    Any other object is returned unchanged.
    """
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    if isinstance(x, np.float64):
        return np.array([x])
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        return x.to_pandas().values
    return x

# Run tests

In [5]:
%%time
# Size of the benchmark sample: 2**22 rows by 40 feature columns.
nrows = 2 ** 22
ncols = 40

# Draw the sample from the cached dataset and report its shape.
X = load_data(nrows, ncols)
print('data', X.shape)

use mortgage data
data (4194304, 40)
CPU times: user 8.92 s, sys: 956 ms, total: 9.87 s
Wall time: 9.87 s


In [6]:
# Settings shared by the scikit-learn and cuML TruncatedSVD runs below.
n_components = 10  # passed as n_components to both estimators
random_state = 42  # passed as random_state to both estimators

In [7]:
%%time
# Fit the scikit-learn TruncatedSVD with the ARPACK solver and keep the
# transformed data for the comparison against cuML.
algorithm = 'arpack'
tsvd_sk = skTSVD(
    n_components=n_components,
    algorithm=algorithm,
    random_state=random_state,
)
result_sk = tsvd_sk.fit_transform(X)

CPU times: user 32.5 s, sys: 840 ms, total: 33.3 s
Wall time: 2.49 s


In [8]:
%%time
# Convert the pandas DataFrame into a cuDF DataFrame for the cuML fit below.
# NOTE(review): this rebinds X, so the cell is not idempotent. Re-running it,
# or re-running the scikit-learn cell afterwards, will see the cuDF frame
# instead of the original pandas one.
X = cudf.DataFrame.from_pandas(X)

CPU times: user 4.62 s, sys: 192 ms, total: 4.81 s
Wall time: 684 ms


In [9]:
%%time
# Fit the cuML TruncatedSVD with the 'full' solver and keep the transformed
# data for the comparison against scikit-learn.
algorithm = 'full'
tsvd_cuml = cumlTSVD(
    n_components=n_components,
    algorithm=algorithm,
    random_state=random_state,
)
result_cuml = tsvd_cuml.fit_transform(X)

CPU times: user 2.06 s, sys: 228 ms, total: 2.28 s
Wall time: 1.1 s


In [10]:
# Compare the fitted attributes of both models. The threshold is looser than
# array_equal's default because the two runs use different solvers
# (arpack vs full).
for attr in ('singular_values_', 'components_'):
    passed = array_equal(
        getattr(tsvd_sk, attr),
        getattr(tsvd_cuml, attr),
        threshold=0.1,
    )
    message = 'compare tsvd: cuml vs sklearn {:>25} {}'.format(
        attr, 'equal' if passed else 'NOT equal')
    print(message)

compare tsvd: cuml vs sklearn          singular_values_ equal
compare tsvd: cuml vs sklearn               components_ equal


In [11]:
# Compare the transformed outputs. The threshold is looser than array_equal's
# default because the two runs use different solvers (arpack vs full).
passed = array_equal(result_sk, result_cuml, threshold=0.1)
message = 'compare tsvd: cuml vs sklearn transformed results %s' % (
    'equal' if passed else 'NOT equal')
print(message)

compare tsvd: cuml vs sklearn transformed results equal
