In [1]:
import sys

# Put ../xrun on the module search path (presumably so that the
# `xrun.data.loader` import further down resolves -- confirm the layout).
# The membership check keeps the cell idempotent: re-running it does not
# keep appending duplicate entries to sys.path.
if '../xrun' not in sys.path:
    sys.path.append('../xrun')

In [2]:
import fractions

import numpy as np
import pandas as pd

from timeit import default_timer as timer

from scipy import linalg
from scipy.sparse import linalg as sparse_linalg, issparse
from sklearn.utils.sparsefuncs import mean_variance_axis
from sklearn.utils.extmath import svd_flip, safe_sparse_dot

import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's "whitegrid" style for the plots in this notebook.
sns.set(style="whitegrid")

from xrun.data.loader import load_dataset

In [3]:
# Path to the gzipped docword input. The file name suggests an 8192-row
# sketch of the Enron bag-of-words data (TODO confirm how it was produced).
input_path = "../data/input/sketch-docword.enron.8192.txt.gz"
# load_dataset comes from the project-local xrun.data.loader module.
X = load_dataset(input_path)

Loading BoW dataset from ../data/input/sketch-docword.enron.8192.txt.gz
Data shape: (8192, 28102)
Elapsed time: 20.48 secs


In [4]:
# Truncated SVD of X with ARPACK: keep the k=100 largest-magnitude
# singular triplets. `svds` is called through the public
# `scipy.sparse.linalg` namespace; the `scipy.sparse.linalg.eigen`
# sub-namespace is deprecated in recent SciPy releases.
start_time = timer()
U, Sigma, VT = sparse_linalg.svds(A=X, which='LM', k=100, solver='arpack')
end_time = timer()
duration_sec = end_time - start_time
print(f"SVD computed in {duration_sec/60:.1f} minutes.")

# svds doesn't abide by scipy.linalg.svd/randomized_svd conventions, so
# reverse its outputs (this assumes svds yields the singular values
# smallest-first, so that the largest comes first afterwards).
# svd_flip then normalises the signs of the singular vectors so the
# decomposition is deterministic.
Sigma = Sigma[::-1]
U, VT = svd_flip(U[:, ::-1], VT[::-1])

SVD computed in 0.2 minutes.


In [5]:
# Sanity check: left singular vectors, expected (rows of X, k).
U.shape

(8192, 100)

In [6]:
# Sanity check: one singular value per requested component (k=100).
Sigma.shape

(100,)

In [7]:
# Sanity check: right singular vectors, expected (k, columns of X).
VT.shape

(100, 28102)

In [8]:
# Low-rank reconstruction of X from the truncated SVD: U @ diag(Sigma) @ VT.
# Scaling the rows of VT by Sigma via broadcasting is equivalent to
# left-multiplying by np.diag(Sigma), but skips building the dense diagonal
# matrix and the extra matrix product. The association order U @ (S * VT)
# is kept so the result matches the np.dot(U, np.dot(diag, VT)) form.
# NOTE: the result is a dense array with the same shape as X, so it can be
# large in memory.
X_transformed = U @ (Sigma[:, np.newaxis] * VT)

In [9]:
# Sanity check: the reconstruction should have the same shape as X.
X_transformed.shape

(8192, 28102)