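This notebook evaluates SVD-based matrix completion of the user–movie ratings table, using the cross-validation method outlined in the Towards Data Science article below.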
https://towardsdatascience.com/how-to-use-cross-validation-for-matrix-completion-2b14103d2c4c

#### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


Load the smaller even-selection pivot table (5k ratings from each rating type).

In [14]:
# Load the pickled ratings pivot table as a DataFrame. pd.read_pickle is the
# pandas-native reader (also used for the user-stats pickle later in this
# notebook); np.load's mmap_mode has no effect on pickled objects.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
user_ratings = pd.read_pickle("./Data/even_selection_5k_pivot.pkl")

In [64]:
# Flip the table so that rows are users and columns are movies.
# NOTE: this overwrites user_ratings with its own transpose, so running the
# cell a second time flips the orientation back.
user_ratings = user_ratings.transpose()

In [65]:
user_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17923 entries, 13000 to 99997323
Columns: 7816 entries, 2 to 261824
dtypes: float64(7816)
memory usage: 1.0 GB


In [66]:
user_ratings.head()

movie_id,2,6,7,10,11,12,13,14,15,16,...,254687,255254,255502,255725,255726,257143,257214,260195,260823,261824
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13000,,,,,,,,,,,...,,,,,,,,,,
21875,,,,,,,,,,,...,,,,,,,,,,
25831,,,,,,,,,,,...,,,,,,,,,,
27725,,,,,,,,,,,...,,,,,,,,,,
35040,,,,,,,,,,,...,,,,,,,,,,


In [17]:
# Boolean frame: True wherever a user has no rating for a movie.
null_mask = user_ratings.isna()

In [18]:
# Count missing cells: per-column counts first, then the grand total.
total_nans = null_mask.sum(axis=0).sum()
total_nans

140061273

In [19]:
# Total number of cells (rows x columns) in the ratings table.
total_entries = user_ratings.size
total_entries

140086168

In [20]:
# Fraction of user/movie cells that hold no rating.
sparsity = total_nans / total_entries
sparsity

0.9998222879506562

In [21]:
# Sanity check: number of observed (non-missing) ratings in the table.
total_entries - total_nans

24895

**TODO:** I expected exactly 25,000 ratings here (5k from each rating type), but there are only 24,895, i.e. 105 fewer. Need to explore further to understand where the missing ratings went.

In [59]:
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds

from sklearn.decomposition import TruncatedSVD
import time

In [47]:
def cv_matrices(X, fold):
    """
    Build one train/test split for 2x2 block cross-validation of a matrix.

    Adapted from the Towards Data Science article linked at the top of the
    notebook. X is cut at its middle row and middle column into four
    quadrants and the quadrant selected by `fold` is held out. When a
    dimension is odd, the extra row/column falls in the lower/right quadrants.

    Parameters
    ----------
    X : numpy.ndarray or pandas.DataFrame
        2-D matrix of values. X itself is not modified; copies are returned.
    fold : int
        Quadrant to hold out: 0 = top-left, 1 = top-right,
        2 = bottom-left, 3 = bottom-right.

    Returns
    -------
    X_train : same type as X
        Copy of X with the held-out quadrant set to 0.
    X_test : same type as X
        Copy of X with everything outside the held-out quadrant set to 0.
    train_mask : numpy.ndarray of int
        1 where an entry belongs to the training data, 0 in the held-out
        quadrant.
    test_mask : numpy.ndarray of int
        Complement of train_mask (1 only inside the held-out quadrant).

    Raises
    ------
    KeyError
        If `fold` is not one of 0, 1, 2, 3.
    """
    rows, cols = X.shape[0], X.shape[1]
    mid_rows = rows // 2
    mid_cols = cols // 2

    # (row slice, column slice) of the held-out quadrant for each fold.
    quadrants = {
        0: (slice(0, mid_rows), slice(0, mid_cols)),
        1: (slice(0, mid_rows), slice(mid_cols, cols)),
        2: (slice(mid_rows, rows), slice(0, mid_cols)),
        3: (slice(mid_rows, rows), slice(mid_cols, cols)),
    }
    if fold not in quadrants:
        # Same exception type as a plain dict lookup, but with a usable message.
        raise KeyError(
            "fold must be one of {}, got {!r}".format(sorted(quadrants), fold)
        )
    row_slice, col_slice = quadrants[fold]

    # Integer 0/1 masks, as callers compare them against 0 and 1.
    train_mask = np.ones((rows, cols), dtype=int)
    train_mask[row_slice, col_slice] = 0
    test_mask = 1 - train_mask

    # Compute the boolean selector once; it is reused for both copies.
    held_out = train_mask == 0

    # Training data: hide the held-out quadrant.
    X_train = X.copy()
    X_train[held_out] = 0

    # Test data: hide everything except the held-out quadrant.
    X_test = X.copy()
    X_test[~held_out] = 0

    return X_train, X_test, train_mask, test_mask

In [61]:
# Per-user statistics table; its `mean_score` column is used below to centre
# each user's ratings.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
user_stats = pd.read_pickle("./Data/moderate_user_stats.pkl")

In [69]:
# Mean score of every user present in the ratings table, in the same row order.
user_stats.loc[user_ratings.index, 'mean_score']

user_id
13000       3.219178
21875       3.613982
25831       3.600000
27725       4.000000
35040       3.544554
              ...   
99983123    3.511278
99986847    4.390476
99989323    3.782051
99996903    3.650794
99997323    3.970109
Name: mean_score, Length: 17923, dtype: float64

In [70]:
# Centre each user's ratings on that user's mean score, then fill the missing
# cells with 0. After centring, 0 means "exactly the user's mean", so a filled
# cell looks the same as a rating equal to the user's mean.
user_ratings_centered = (
    user_ratings
    .sub(user_stats.loc[user_ratings.index, 'mean_score'], axis=0)
    .fillna(0)
)

In [71]:
user_ratings_centered.head()

movie_id,2,6,7,10,11,12,13,14,15,16,...,254687,255254,255502,255725,255726,257143,257214,260195,260823,261824
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27725,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35040,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# Fold 0 holds out the top-left quadrant: first half of the users x first half
# of the movies.
X_train, X_test, train_mask, test_mask = cv_matrices(user_ratings_centered, fold=0)

In [80]:
test_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [81]:
np.where(test_mask==0,np.nan, test_mask)

array([[ 1.,  1.,  1., ..., nan, nan, nan],
       [ 1.,  1.,  1., ..., nan, nan, nan],
       [ 1.,  1.,  1., ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [123]:
# Fit a 5-component truncated SVD on the training matrix, project the test
# matrix onto the learned components, and report the wall-clock time taken by
# both steps together.
start = time.time()
svd = TruncatedSVD(n_components=5, random_state=0).fit(X_train)
y_pred = svd.transform(X_test)
end = time.time()

print("that took {}s".format(end - start))

that took 2.553506851196289s


In [124]:
y_pred.shape

(17923, 5)

In [125]:
# Component matrix learned by the SVD; multiplied with the projected scores
# below to rebuild a full users x movies matrix.
V = svd.components_

In [126]:
# Singular values of the retained components (only inspected for their shape
# in the cells shown here).
S = svd.singular_values_

In [127]:
S.shape

(5,)

In [129]:
# Rebuild the full matrix from the projected scores and the components.
# NOTE(review): values are rounded to one decimal before they are scored
# further down, which throws away precision -- confirm this is intended.
y_pred_remake = (y_pred @ V).round(1)

In [147]:
# Keep predictions only inside the held-out block: cells outside the test mask
# become NaN, then rows/columns that are entirely NaN are dropped.
y_pred_df = (
    pd.DataFrame(
        np.multiply(y_pred_remake, np.where(test_mask == 0, np.nan, test_mask)),
        index=X_test.index,
        columns=X_test.columns,
    )
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)

In [148]:
y_pred_df

movie_id,2,6,7,10,11,12,13,14,15,16,...,28907,28910,28921,28974,28976,28979,28998,29032,29053,29071
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13000,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,...,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0
21875,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,...,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0
25831,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,0.0
27725,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0
35040,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,...,-0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49515472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49515779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49523430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49530890,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,...,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0


In [169]:
# Put the predictions back on the original rating scale by re-adding each
# user's mean score; the true values are rescaled the same way before scoring,
# so both sides of the RMSE must be un-centred. A new name is used so that
# re-running this cell can never add the mean twice.
y_pred_rescaled = y_pred_df.add(
    user_stats.loc[y_pred_df.index, 'mean_score'], axis=0
)

# Build the "no original rating here" mask locally instead of relying on
# `orig_nan_mask`, which is only defined in a later cell and would raise a
# NameError on Restart & Run All.
pred_missing_mask = user_ratings.loc[y_pred_df.index, y_pred_df.columns].isna()

# Blank out cells that never had a real rating, then keep the remaining values
# as a flat array (row-major order). The masked frame is computed only once.
y_pred_observed = y_pred_rescaled.mask(pred_missing_mask).values
ypred_final = y_pred_observed[~np.isnan(y_pred_observed)]

In [149]:
# Centred ratings restricted to the held-out block: cells outside the test
# mask become NaN.
y_true = pd.DataFrame(
    user_ratings_centered.mul(np.where(test_mask == 0, np.nan, test_mask)),
    index=X_test.index,
    columns=X_test.columns,
)

In [150]:
# Drop the users (rows) and movies (columns) that lie entirely outside the
# held-out block.
y_true_df = (
    y_true
    .dropna(axis='index', how='all')
    .dropna(axis='columns', how='all')
)

In [151]:
y_true_df

movie_id,2,6,7,10,11,12,13,14,15,16,...,28907,28910,28921,28974,28976,28979,28998,29032,29053,29071
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27725,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35040,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49515472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49515779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49523430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49530890,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [165]:
# Undo the centring: add each user's mean score back onto that user's row.
# NOTE: y_true_df is reassigned from itself, so running this cell twice adds
# the mean twice.
y_true_df = y_true_df.add(user_stats.loc[y_true_df.index, 'mean_score'], axis='index')

In [166]:
# True wherever the original (un-filled) ratings table has no rating, limited
# to the users and movies of the held-out block.
orig_nan_mask = user_ratings.loc[y_true_df.index, y_true_df.columns].isnull()

In [170]:
# Blank out cells that never had a real rating, then keep the remaining true
# values as a flat array (row-major order). The masked frame is large, so it
# is computed once and reused rather than being built twice.
y_true_observed = y_true_df.mask(orig_nan_mask).values
ytrue_final = y_true_observed[~np.isnan(y_true_observed)]

In [171]:
# RMSE over the held-out, originally observed ratings. The `squared=False`
# keyword of mean_squared_error was deprecated in scikit-learn 1.4 and removed
# in 1.6; taking the square root explicitly works on every version.
np.sqrt(mean_squared_error(ytrue_final, ypred_final))

1.315725985489806

OK, an RMSE of about 1.3 is a pretty big error for a rating scale that only goes from 1 to 5.

Next steps: wrap all of these steps in functions, then test different `n_components` values.

Also test this using the "arpack" algorithm (which uses SciPy's `svds`).