# Truncated Singular Value Decomposition (TSVD) 
The TSVD algorithm is a linear dimensionality reduction algorithm that works really well for datasets in which samples are correlated in large groups. Unlike PCA, TSVD does not center the data before computation.

The model accepts array-like objects, either on the host as NumPy arrays or on the device (as Numba arrays or other `__cuda_array_interface__`-compliant objects), as well as cuDF DataFrames.

To convert your dataset to a cuDF DataFrame, please read the cuDF documentation at https://rapidsai.github.io/projects/cudf/en/latest/.

For additional information on the tsvd model please refer to the documentation on https://rapidsai.github.io/projects/cuml/en/0.6.0/api.html#truncated-svd

In [None]:
import os

import numpy as np

import pandas as pd
import cudf as gd

from sklearn.decomposition import TruncatedSVD as skTSVD
from cuml.decomposition import TruncatedSVD as cumlTSVD

In [None]:
# Problem size for the synthetic dataset.
n_samples = 100000
# Must stay strictly greater than n_components: a truncated SVD cannot extract
# more components than there are features, and the "arpack" solver used for
# the scikit-learn model requires n_components < n_features.
n_features = 200

# Number of singular vectors/values to keep, shared by both models.
n_components = 10
# Seed passed to both estimators so runs are repeatable.
random_state = 42

## Generate Data

In [5]:
# `datasets` is not imported by the notebook's import cell, so bring it into
# scope here; without it this cell fails with a NameError.
from sklearn import datasets

# Synthetic host-side data: n_samples points drawn around 5 blob centers.
# `labels` holds the blob index of each sample and is not used further below.
data, labels = datasets.make_blobs(
    n_samples=n_samples,
    n_features=n_features,
    centers=5,
    random_state=7,
)

## Fit Scikit-learn Model

In [None]:
%%time
tsvd_sk = skTSVD(n_components=n_components,
                 algorithm="arpack", 
                 random_state=random_state)

result_sk = tsvd_sk.fit_transform(data)

## Fit cuML Model

In [None]:
%%time
device_data = gd.DataFrame.from_pandas(data)

In [None]:
%%time
tsvd_cuml = cumlTSVD(n_components=n_components,
                     algorithm="full", 
                     random_state=random_state)

result_cuml = tsvd_cuml.fit_transform(device_data)

## Evaluate Results

In [None]:
# Check, attribute by attribute, whether the fitted scikit-learn and cuML
# models agree.
# NOTE(review): `array_equal` (called with a `threshold` keyword) is not
# defined or imported in the cells visible here -- confirm a helper cell or
# import provides it before running.
for attr in ('singular_values_', 'components_'):
    sk_value = getattr(tsvd_sk, attr)
    cuml_value = getattr(tsvd_cuml, attr)
    # Loose tolerance: the two models use different solvers (arpack vs full).
    passed = array_equal(sk_value, cuml_value, threshold=0.1)
    verdict = 'equal' if passed else 'NOT equal'
    message = 'compare tsvd: cuml vs sklearn {:>25} {}'.format(attr, verdict)
    print(message)

In [None]:
# Check whether the reduced matrices produced by the two models agree.
# NOTE(review): `array_equal` is not defined or imported in the cells visible
# here -- confirm a helper cell or import provides it before running.
# Loose tolerance: the two models use different solvers (arpack vs full).
passed = array_equal(result_sk, result_cuml, threshold=0.1)
verdict = 'equal' if passed else 'NOT equal'
message = 'compare tsvd: cuml vs sklearn transformed results %s' % verdict
print(message)