In [None]:
import gzip
import os

import cudf
from cuml import PCA as cumlPCA
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA as skPCA
from sklearn.metrics import mean_squared_error

# Helper Functions

In [None]:
def load_data(nrows, ncols, cached="data/mortgage.npy.gz", seed=None):
    """Load the mortgage dataset if available, otherwise use random data.

    Parameters
    ----------
    nrows : int
        Number of rows in the result. Rows of the cached dataset are drawn
        at random with replacement; random data is generated with exactly
        this many rows.
    ncols : int
        Number of columns requested. For cached data this is an upper
        bound: the column slice yields fewer columns when the file holds
        fewer than ``ncols``.
    cached : str
        Path of a gzip-compressed ``.npy`` file to read if it exists.
    seed : int, optional
        When given, a private ``np.random.RandomState(seed)`` is used so the
        result is reproducible. When ``None`` (default) the global
        ``np.random`` state is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        One column per feature, named ``fea0``, ``fea1``, ...
    """
    # Fall back to the module-level generator so callers that seed NumPy
    # globally keep getting the behaviour they had before.
    rng = np.random if seed is None else np.random.RandomState(seed)
    if os.path.exists(cached):
        print("Using mortgage data.")
        with gzip.open(cached) as f:
            X = np.load(f)
        # randint's upper bound is exclusive: passing the row count itself
        # keeps the last row reachable and also works for a one-row file.
        X = X[rng.randint(0, X.shape[0], nrows), :ncols]
    else:
        print("Using random data.")
        X = rng.rand(nrows, ncols)
    df = pd.DataFrame({"fea%d" % i: X[:, i] for i in range(X.shape[1])})
    return df

In [None]:
def array_equal(a, b, threshold=2e-3, with_sign=True):
    """Report whether two arrays agree to within a mean-squared-error bound.

    Both inputs are first passed through ``to_nparray``. When ``with_sign``
    is falsy the comparison is made on absolute values, so arrays that
    differ only in sign compare as equal. The result is the outcome of
    ``mean_squared_error(a, b) < threshold``.
    """
    lhs = to_nparray(a)
    rhs = to_nparray(b)
    if not with_sign:
        # Drop the sign so only magnitudes are compared.
        lhs = np.abs(lhs)
        rhs = np.abs(rhs)
    return mean_squared_error(lhs, rhs) < threshold

def to_nparray(x):
    """Convert a given array-like or NumPy scalar into a numpy array.

    * ``np.ndarray``, ``pd.DataFrame`` and ``pd.Series`` are converted with
      ``np.array`` (which copies an existing ndarray).
    * Any NumPy scalar (``np.float32``, ``np.float64``, ``np.int64``, ...)
      becomes a one-element array of the same dtype.
    * ``cudf.DataFrame`` and ``cudf.Series`` go through pandas and return
      the underlying values.

    Anything else is returned unchanged.
    """
    if isinstance(x, (np.ndarray, pd.DataFrame, pd.Series)):
        return np.array(x)
    if isinstance(x, np.generic):
        # np.generic is the base class of every NumPy scalar type, so
        # float32 results are wrapped the same way float64 ones are.
        return np.array([x])
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        return x.to_pandas().values
    return x

# Run tests

In [None]:
%%time
nrows = 2**20
ncols = 400

X = load_data(nrows, ncols)
print("Data shape:", X.shape)

In [None]:
# PCA settings passed to both the scikit-learn and the cuML estimator.
svd_solver = "full"
n_components = 10
random_state = 42
whiten = False

In [None]:
%%time
pca_sk = skPCA(n_components=n_components, svd_solver=svd_solver, 
               whiten=whiten, random_state=random_state)
result_sk = pca_sk.fit_transform(X)

In [None]:
%%time
# Convert the pandas DataFrame into a cuDF DataFrame for the cuML run.
# NOTE: this rebinds `X`; re-running the scikit-learn cell after this one
# would hand it the cuDF frame instead of the original pandas frame.
X = cudf.DataFrame.from_pandas(X)

In [None]:
%%time
pca_cuml = cumlPCA(n_components=n_components, svd_solver=svd_solver, 
                   whiten=whiten, random_state=random_state)
result_cuml = pca_cuml.fit_transform(X)

In [None]:
# Fitted attributes that should agree between the two implementations.
compared_attrs = (
    "singular_values_",
    "components_",
    "explained_variance_",
    "explained_variance_ratio_",
)
for attr in compared_attrs:
    sk_value = getattr(pca_sk, attr)
    cuml_value = getattr(pca_cuml, attr)
    passed = array_equal(sk_value, cuml_value)
    verdict = "equal" if passed else "NOT equal"
    message = "compare pca: cuml vs sklearn {} {}".format(attr, verdict)
    print(message)

In [None]:
# Compare the projected data produced by the two implementations.
passed = array_equal(result_sk, result_cuml)
verdict = "equal" if passed else "NOT equal"
message = "compare pca: cuml vs sklearn transformed results %s" % verdict
print(message)