In [9]:
import sys
import numpy as np
import pandas as pd
import scipy
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.extmath import randomized_svd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from surprise import Reader, Dataset, SVD, dump
from surprise.model_selection import cross_validate  # thay cho evaluate
import surprise.accuracy as accuracy                 # giữ accuracy


# Custom libraries
sys.path.append('../Util')
from reduction import get_sparse
from loader import get_book_dataframe, get_book_features
from joiner import get_ratings, get_joint
from reduction import reduce_matrix, get_sparse

In [10]:
# Set this to where you save and load all data
data_path = '../goodbooks-10k/'

In [11]:
# Get dataframe from books
books = get_book_dataframe(data_path)

saving books_dataframe to file


In [12]:
tfidf = TfidfVectorizer(stop_words='english')

In [13]:
tfidf_matrix_description = tfidf.fit_transform(books['description'])
tfidf_matrix_description.shape

(10000, 59713)

In [14]:
tfidf_matrix_shelves = tfidf.fit_transform(books['popular_shelves'])
tfidf_matrix_shelves.shape

(10000, 11245)

In [15]:
tfidf_matrix_tags = tfidf.fit_transform(books['tags'])
tfidf_matrix_tags.shape

(10000, 11245)

In [16]:
# Weight the smaller matrices by ratio to largest column matrix
shelves_weight = tfidf_matrix_description.shape[1] / tfidf_matrix_shelves.shape[1]
tags_weight = tfidf_matrix_description.shape[1] / tfidf_matrix_tags.shape[1]

In [17]:
tfidf_matrix_shelves = tfidf_matrix_shelves.multiply(shelves_weight)
tfidf_matrix_tags = tfidf_matrix_tags.multiply(tags_weight)

In [18]:
feature_matrix = scipy.sparse.hstack([tfidf_matrix_description, tfidf_matrix_shelves, tfidf_matrix_tags])

print('printing feature_matrix to file')
scipy.sparse.save_npz('../.tmp/feature_matrix', feature_matrix)

printing feature_matrix to file


In [19]:
# SVD on full features to calculate sum of eigen values
U, E, V = reduce_matrix(feature_matrix, n_components=3000)

In [20]:
total_eigen_values = 0
for e in E:
    total_eigen_values += (e*e)
total_eigen_values

567667.4988303256

In [21]:
features_U, E_reduced, _ = reduce_matrix(feature_matrix, n_components=1000)

In [24]:
total_eigen_values = 567667.4988303256
reduced_eigen_values = 0
for e in E_reduced:
    reduced_eigen_values += (e*e)
reduced_eigen_values

554449.8748599396

In [25]:
information_loss = reduced_eigen_values/total_eigen_values
information_loss

0.9767159050013946

In [26]:
# Save the reduced feature matrix to save time elsewhere
filename = '../.tmp/feature_matrix_1000.npy'
np.save(filename, features_U)