# TruncatedSVD benchmarks using the lobpcg solver

https://github.com/scikit-learn/scikit-learn/pull/12319

These benchmarks were run on a 4-core Xeon Skylake CPU on Linux with 64 GB RAM.

In [1]:
from sklearn.decomposition import TruncatedSVD

In [2]:
import numpy as np
import pandas as pd  # pandas is required for this notebook by the neurtu package
import scipy.sparse

from neurtu import delayed, timeit

In [3]:
# Module-level random state seeded with 43.
# NOTE(review): no cell visible here reads this name (the helpers create
# their own RandomState instances) -- confirm it is still needed.
rng = np.random.RandomState(43)

## Sparse data

In [7]:

def make_sparse(n_samples, n_features, density):
    """Build a reproducible random sparse matrix in CSR format.

    Parameters
    ----------
    n_samples : int
        Number of rows of the matrix.
    n_features : int
        Number of columns of the matrix.
    density : float
        Target fraction of non-zero entries;
        ``int(n_samples * n_features * density)`` coordinates are drawn.

    Returns
    -------
    scipy.sparse matrix in CSR format, of shape ``(n_samples, n_features)``.

    Notes
    -----
    Coordinates are sampled with replacement. Duplicate ``(row, col)`` pairs
    are summed when the COO matrix is converted to CSR, so the number of
    stored entries can be slightly lower than the number drawn.
    """
    shape = (n_samples, n_features)
    n_entries = int(n_samples*n_features*density)

    # Fixed seeds make every call with the same arguments return the same
    # matrix.
    row_rng = np.random.RandomState(42)
    col_rng = np.random.RandomState(43)

    row_idx = row_rng.randint(n_samples, size=n_entries)
    col_idx = col_rng.randint(n_features, size=n_entries)
    # The values are drawn from the row generator *after* the row indices;
    # this order must be kept for the output to stay the same.
    values = row_rng.rand(n_entries)

    coo = scipy.sparse.coo_matrix((values, (row_idx, col_idx)), shape=shape)
    return coo.tocsr()


def benchmark_sparse():
    """Yield delayed ``TruncatedSVD(...).fit_transform(X)`` calls on sparse data.

    Walks a grid of matrix shapes, densities, ``n_components`` values and the
    two solvers ('randomized' and 'lobpcg'). Every yielded delayed call is
    tagged with the parameters of its case. One matrix is built per
    (n_features, n_samples, density) combination and shared by all the
    n_components / solver cases for that combination.
    """
    feature_counts = [10000, 100000]
    sample_counts = [5000, 20000, 100000]
    densities = [0.01, 0.0001]
    component_counts = [2, 20, 100]
    solvers = ['randomized', 'lobpcg']

    for n_features in feature_counts:
        for n_samples in sample_counts:
            for density in densities:
                # The denser setting is not run for the widest matrices
                # (presumably to keep the benchmark size manageable).
                if n_features > 50000 and density == 0.01:
                    continue

                X = make_sparse(n_samples, n_features, density)

                for n_components in component_counts:
                    for algorithm in solvers:
                        # "nnz" is the stored count of X, not the requested
                        # number of entries.
                        tags = {
                            'n_components': n_components,
                            "n_samples": n_samples,
                            "n_features": n_features,
                            "nnz": X.nnz,
                            "density": density,
                            "algorithm": algorithm,
                        }
                        estimator = delayed(TruncatedSVD, tags=tags)(
                            n_components=n_components, algorithm=algorithm
                        )
                        yield estimator.fit_transform(X)

In [8]:
# Time every case produced by benchmark_sparse() with repeat=3 and keep only
# the wall_time results.
df = timeit(benchmark_sparse(), repeat=3).wall_time

162it [04:35,  5.65s/it]                       


Below is the wall time in seconds (the faster solver is highlighted in green):

In [9]:
def highlight_best(s):
    """Return one CSS string per element of ``s``, shading the smallest value(s).

    Parameters
    ----------
    s : pandas.Series
        Values to compare (here: timings, so smallest means fastest).

    Returns
    -------
    list of str
        ``'background-color: #206b3c80'`` for each element equal to the
        minimum of ``s`` (ties are all shaded), ``''`` for the others.
    """
    shade = 'background-color: #206b3c80'
    fastest = s.min()
    return [shade if value == fastest else '' for value in s]

# Take the 'mean' timings, pivot the innermost index level into columns
# (unstack), round to 2 decimals, then apply highlight_best to each row
# (axis=1).
df['mean'].unstack().round(2).style.apply(highlight_best, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,algorithm,lobpcg,randomized
n_components,n_samples,n_features,nnz,density,Unnamed: 5_level_1,Unnamed: 6_level_1
2,5000,10000,5000,0.0001,0.03,0.02
2,5000,10000,497497,0.01,0.1,0.07
2,5000,100000,49998,0.0001,0.08,0.12
2,20000,10000,19999,0.0001,0.06,0.04
2,20000,10000,1990158,0.01,0.33,0.27
2,20000,100000,199989,0.0001,0.2,0.2
2,100000,10000,99997,0.0001,0.12,0.15
2,100000,10000,9950411,0.01,1.6,1.32
2,100000,100000,999956,0.0001,0.84,0.52
20,5000,10000,5000,0.0001,0.07,0.05


## Dense data

In [14]:
def benchmark_dense():
    """Yield delayed ``TruncatedSVD(...).fit_transform(X)`` calls on dense data.

    Walks a grid of array shapes, ``n_components`` values and the two solvers
    ('randomized' and 'lobpcg'). Every yielded delayed call is tagged with the
    parameters of its case. Shapes with more than 5000 * 100000 elements are
    skipped, as are cases where ``n_components >= n_features``.
    """
    max_elements = 5000*100000
    feature_counts = [50, 500, 1000, 5000]
    sample_counts = [5000, 20000, 50000, 100000, 1000000]
    component_counts = [2, 20, 100]
    solvers = ['randomized', 'lobpcg']

    for n_features in feature_counts:
        for n_samples in sample_counts:
            if n_features * n_samples > max_elements:
                continue

            # A freshly seeded generator per shape, so each array is
            # reproducible on its own.
            data_rng = np.random.RandomState(42)
            X = data_rng.randn(n_samples, n_features)

            for n_components in component_counts:
                if n_components >= n_features:
                    continue

                for algorithm in solvers:
                    tags = {
                        'n_components': n_components,
                        "n_samples": n_samples,
                        "n_features": n_features,
                        "algorithm": algorithm,
                    }
                    estimator = delayed(TruncatedSVD, tags=tags)(
                        n_components=n_components, algorithm=algorithm
                    )
                    yield estimator.fit_transform(X)

In [15]:
# Time every case produced by benchmark_dense() with repeat=3 and keep only
# the wall_time results (this rebinds df from the sparse run).
df = timeit(benchmark_dense(), repeat=3).wall_time

294it [13:26, 10.85s/it]                       


Below is the wall time in seconds (the faster solver is highlighted in green):

In [16]:
# Take the 'mean' timings, pivot the innermost index level into columns
# (unstack), round to 2 decimals, then apply highlight_best to each row
# (axis=1).
df['mean'].unstack().round(2).style.apply(highlight_best, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,algorithm,lobpcg,randomized
n_components,n_samples,n_features,Unnamed: 3_level_1,Unnamed: 4_level_1
2,5000,50,0.0,0.01
2,5000,500,0.03,0.03
2,5000,1000,0.08,0.07
2,5000,5000,0.39,0.33
2,20000,50,0.01,0.03
2,20000,500,0.13,0.16
2,20000,1000,0.29,0.3
2,20000,5000,1.46,1.3
2,50000,50,0.04,0.08
2,50000,500,0.3,0.39
