In [None]:
import numpy as np
import numpy.linalg as LA
import pandas as pd
import sys, os
sys.path.insert(0, "../")
from Methods.linalg_utils import normalize_column_pair, project_error

def generate_training_and_prediction_mask(train_data_path, all_data_path):
    """Load the full and training rating arrays and derive their observation masks.

    An entry counts as observed when its stored value is strictly positive.

    Args:
        train_data_path: Path to the ``.npy`` file holding the training array.
        all_data_path: Path to the ``.npy`` file holding the complete array.

    Returns:
        A 4-tuple ``(known_mask, prediction_mask, all_data_mask, all_data)``:
        ``known_mask`` and ``all_data_mask`` are integer 0/1 arrays marking the
        positive entries of the training and complete arrays respectively,
        ``prediction_mask`` is the boolean element-wise XOR of those two masks
        (entries observed in exactly one of them), and ``all_data`` is the
        complete array as loaded.
    """
    full_ratings = np.load(all_data_path)
    training_ratings = np.load(train_data_path)

    observed_everywhere = np.greater(full_ratings, 0).astype(int)
    observed_in_training = np.greater(training_ratings, 0).astype(int)

    # XOR keeps the entries present in only one mask, i.e. the held-out part
    # when the training array is a subset of the complete array.
    held_out = np.logical_xor(observed_everywhere, observed_in_training)

    return observed_in_training, held_out, observed_everywhere, full_ratings

def create_df_from_result(data_dir, method_name, latent_dims, known_mask, prediction_mask, original_data, mode=None):
    """Tabulate training and prediction relative errors per latent dimension.

    For every ``latent_dim`` the result file
    ``<data_dir>/<method_name>/latent_dim_<latent_dim>/1/<method_name>_Movielens_r_<latent_dim>_default.npy``
    is loaded. It is expected to hold a pickled dict with at least the keys
    ``"X"`` and the two factor keys selected below. ``project_error`` is then
    called twice with the same factors: once with the stored ``"X"`` and
    ``known_mask``, once with ``original_data`` and ``prediction_mask``. Only the
    second value of each returned pair is kept.

    Args:
        data_dir: Root directory of the saved results.
        method_name: Sub-directory / file-name prefix of the method to evaluate.
        latent_dims: Iterable of latent dimensions to evaluate.
        known_mask: Mask passed to ``project_error`` for the training error.
        prediction_mask: Mask passed to ``project_error`` for the prediction error.
        original_data: Reference array used for the prediction error.
        mode: When ``"softImpute"`` and ``method_name == "ENMFC"``, the factors
            stored under ``"U_softimpute"`` / ``"V_softimpute"`` are evaluated
            instead of ``"U"`` / ``"V"``. Ignored for every other method.

    Returns:
        A ``pandas.DataFrame`` with columns ``latent_dim``, ``re_training`` and
        ``re_prediction``, one row per latent dimension.
    """
    # Only ENMFC results carry the alternative soft-impute factors.
    use_softimpute = method_name == "ENMFC" and mode == "softImpute"
    left_factor_name = "U_softimpute" if use_softimpute else "U"
    right_factor_name = "V_softimpute" if use_softimpute else "V"

    latent_dims = list(latent_dims)
    relative_error_training_list = []
    relative_error_prediction_list = []

    for latent_dim in latent_dims:
        data_path = os.path.join(
            data_dir,
            method_name,
            f"latent_dim_{latent_dim}",
            "1",
            f"{method_name}_Movielens_r_{latent_dim}_default.npy",
        )
        # NOTE(security): allow_pickle=True unpickles arbitrary objects; only
        # load result files produced by trusted runs.
        # The dict was saved as a 0-d object array; .item() unwraps it. The
        # previous `.all()` trick returns a bool on NumPy >= 2.0 and breaks.
        result = np.load(data_path, allow_pickle=True).item()
        left_factor = result[left_factor_name]
        right_factor = result[right_factor_name]

        # Training error: stored X restricted to the known entries.
        _, relative_error_training = project_error(result["X"], left_factor, right_factor, known_mask)
        # Prediction error: original data restricted to the prediction mask.
        _, relative_error_prediction = project_error(original_data, left_factor, right_factor, prediction_mask)

        relative_error_training_list.append(relative_error_training)
        relative_error_prediction_list.append(relative_error_prediction)
        # Sanity output: shape of the left factor that was actually evaluated.
        print(left_factor.shape)

    return pd.DataFrame(
        {
            "latent_dim": latent_dims,
            "re_training": relative_error_training_list,
            "re_prediction": relative_error_prediction_list,
        }
    )

In [None]:
# Locations of the saved factorization results and of the rating arrays.
dataset_dir = "ENMF/Dataset"
data_dir = "ENMF/Results/Movielens/"
latent_dims = list(range(5, 30, 5))  # 5, 10, 15, 20, 25

all_data_path = os.path.join(dataset_dir, "movielens1m.npy")
train_data_path = os.path.join(dataset_dir, "movielens1m_0.8training.npy")

# Observation masks: training entries, held-out entries, and all entries.
known_mask, prediction_mask, all_data_mask, all_movielens_data = generate_training_and_prediction_mask(
    train_data_path, all_data_path
)
# Number of marked entries in each mask.
print(known_mask.sum(), prediction_mask.sum(), all_data_mask.sum())

# ENMFC evaluated with its default factors ...
df_columns = create_df_from_result(
    data_dir, "ENMFC", latent_dims, known_mask, prediction_mask, all_movielens_data, mode=None
)
print(df_columns)

# ... and with the factors stored under the soft-impute keys.
df_columns_impute = create_df_from_result(
    data_dir, "ENMFC", latent_dims, known_mask, prediction_mask, all_movielens_data, mode="softImpute"
)
print(df_columns_impute)

797758 202451 1000209
(6040, 5)
(6040, 10)
(6040, 15)
(6040, 20)
(6040, 25)
   latent_dim  re_training  re_prediction
0           5     0.213549       0.244977
1          10     0.199806       0.247862
2          15     0.190218       0.252751
3          20     0.182325       0.259397
4          25     0.175138       0.264356
(6040, 5)
(6040, 10)
(6040, 15)
(6040, 20)
(6040, 25)
   latent_dim  re_training  re_prediction
0           5     0.219435       0.247008
1          10     0.207521       0.244459
2          15     0.199210       0.244417
3          20     0.192338       0.245827
4          25     0.186001       0.247081


In [8]:
# Run the same evaluation on the SCD, ADM and MUL result files, in that order.
# `df_columns` is overwritten on each pass, so it ends up holding the MUL table.
for method_name in ("SCD", "ADM", "MUL"):
    df_columns = create_df_from_result(
        data_dir, method_name, latent_dims, known_mask, prediction_mask, all_movielens_data
    )
    print(method_name, df_columns)

(6040, 5)
(6040, 10)
(6040, 15)
(6040, 20)
(6040, 25)
SCD    latent_dim  re_training  re_prediction
0           5     0.219602       0.247190
1          10     0.208943       0.245309
2          15     0.201888       0.245750
3          20     0.196196       0.246925
4          25     0.191176       0.248183
(6040, 5)
(6040, 10)
(6040, 15)
(6040, 20)
(6040, 25)
ADM    latent_dim  re_training  re_prediction
0           5     0.252985       0.278227
1          10     0.244707       0.266567
2          15     0.239040       0.261437
3          20     0.231451       0.255646
4          25     0.224205       0.251382
(6040, 5)
(6040, 10)
(6040, 15)
(6040, 20)
(6040, 25)
MUL    latent_dim  re_training  re_prediction
0           5     0.219973       0.247638
1          10     0.209449       0.245907
2          15     0.202461       0.246187
3          20     0.196937       0.247414
4          25     0.192126       0.248277
