In [6]:
import numpy as np
import numpy.linalg as LA
import pandas as pd
import sys, os
sys.path.insert(0, "../")
from Methods.linalg_utils import normalize_column_pair, project_error

def generate_training_and_prediction_mask(train_data_path, all_data_path):
    """Load the full and training rating arrays and derive observation masks.

    An entry is treated as observed when its value is strictly positive.

    Parameters
    ----------
    train_data_path : path of the ``.npy`` file holding the training array.
    all_data_path : path of the ``.npy`` file holding the complete array.

    Returns
    -------
    tuple
        ``(known_mask, prediction_mask, all_data_mask, all_movielens_data)``:
        the 0/1 integer mask of the training array, the boolean element-wise
        XOR of the two integer masks (entries observed in exactly one of the
        arrays), the 0/1 integer mask of the complete array, and the complete
        array itself.
    """
    def _observed(matrix):
        # Strictly positive entries count as observed; result is a 0/1 int array.
        return (matrix > 0).astype(int)

    full_matrix = np.load(all_data_path)
    training_matrix = np.load(train_data_path)

    full_mask = _observed(full_matrix)
    training_mask = _observed(training_matrix)
    held_out_mask = np.logical_xor(full_mask, training_mask)

    return training_mask, held_out_mask, full_mask, full_matrix

def create_df_from_result(data_dir, method_name, latent_dims, known_mask, prediction_mask, original_data, mode=None, verbose=True):
    """Tabulate training and prediction relative errors per latent dimension.

    For every latent dimension the stored result file
    ``<data_dir>/<method_name>/latent_dim_<r>/1/<method_name>_Movielens_r_<r>_default.npy``
    is loaded, and ``project_error`` is evaluated twice with the stored
    factors: once against the stored matrix ``X`` under ``known_mask`` and
    once against ``original_data`` under ``prediction_mask``. The second
    value returned by ``project_error`` is recorded as the relative error.

    Parameters
    ----------
    data_dir : root directory of the result files,
        e.g. /mnt/SSD6/qiujing/NMF/AI_exp/Clean_version/ENMF/Results/Movielens/
    method_name : name of the method sub-directory / file prefix.
    latent_dims : iterable of latent dimensions to evaluate.
    known_mask : mask passed to ``project_error`` for the training error.
    prediction_mask : mask passed to ``project_error`` for the prediction error.
    original_data : reference matrix used for the prediction error.
    mode : when ``method_name == "ENMFC"`` and ``mode == "softImpute"`` the
        factors are read from the ``U_softimpute`` / ``V_softimpute`` keys;
        in every other case the ``U`` / ``V`` keys are used.
    verbose : when true (default, matching the previous behaviour) the shape
        of the stored ``U`` factor is printed for every latent dimension.

    Returns
    -------
    pandas.DataFrame
        Columns ``latent_dim``, ``re_training`` and ``re_prediction``.
    """
    # Materialise once so a one-shot iterable is not exhausted before the loop.
    latent_dims = list(latent_dims)

    use_softimpute = method_name == "ENMFC" and mode == "softImpute"
    left_factor_name = "U_softimpute" if use_softimpute else "U"
    right_factor_name = "V_softimpute" if use_softimpute else "V"

    relative_error_training_list = []
    relative_error_prediction_list = []
    for latent_dim in latent_dims:
        data_path = os.path.join(
            data_dir,
            method_name,
            f"latent_dim_{latent_dim}",
            "1",
            f"{method_name}_Movielens_r_{latent_dim}_default.npy",
        )
        # NOTE(security): allow_pickle=True unpickles arbitrary Python objects;
        # only load result files that come from a trusted source.
        # The file stores a dict wrapped in a 0-d object array; .item() is the
        # canonical way to unwrap it (the old `.all()` only worked by accident
        # of how reductions behave on a single object element).
        result = np.load(data_path, allow_pickle=True).item()
        left_factor = result[left_factor_name]
        right_factor = result[right_factor_name]

        # Only the second (relative) value returned by project_error is kept.
        _, relative_error_training = project_error(
            result["X"], left_factor, right_factor, known_mask
        )
        _, relative_error_prediction = project_error(
            original_data, left_factor, right_factor, prediction_mask
        )
        relative_error_training_list.append(relative_error_training)
        relative_error_prediction_list.append(relative_error_prediction)

        if verbose:
            print(result["U"].shape)

    return pd.DataFrame(
        {
            "latent_dim": latent_dims,
            "re_training": relative_error_training_list,
            "re_prediction": relative_error_prediction_list,
        }
    )

In [9]:
# Locations of the stored factorization results and of the dataset arrays.
data_dir = "/mnt/SSD6/qiujing/NMF/AI_exp/Clean_version/ENMF/Results/Movielens/"
latent_dims = [5, 10, 15, 20, 25]
dataset_dir = "/mnt/SSD6/qiujing/NMF/AI_exp/Clean_version/ENMF/Dataset"
train_data_path, all_data_path = (
    os.path.join(dataset_dir, file_name)
    for file_name in ("movielens1m_0.8training.npy", "movielens1m.npy")
)

# Build the masks once and report how many entries each one selects.
masks = generate_training_and_prediction_mask(train_data_path, all_data_path)
known_mask, prediction_mask, all_data_mask, all_movielens_data = masks
print(*(np.sum(mask) for mask in (known_mask, prediction_mask, all_data_mask)))

# ENMFC evaluated with its default factors, then with the softImpute factors.
enmfc_args = (data_dir, "ENMFC", latent_dims, known_mask, prediction_mask, all_movielens_data)
df_columns = create_df_from_result(*enmfc_args)
print(df_columns)
df_columns_impute = create_df_from_result(*enmfc_args, mode="softImpute")
print(df_columns_impute)

797758 202451 1000209
(6040, 5)
(6040, 10)
(6040, 15)
(6040, 20)
(6040, 25)
   latent_dim  re_training  re_prediction
0           5     0.213549       0.244977
1          10     0.199806       0.247862
2          15     0.190218       0.252751
3          20     0.182325       0.259397
4          25     0.175138       0.264356
(6040, 5)
(6040, 10)
(6040, 15)
(6040, 20)
(6040, 25)
   latent_dim  re_training  re_prediction
0           5     0.219435       0.247008
1          10     0.207521       0.244459
2          15     0.199210       0.244417
3          20     0.192338       0.245827
4          25     0.186001       0.247081


In [8]:
# Evaluate each baseline method in turn and print its table, labelled by name.
# After the loop df_columns holds the table of the last method ("MUL").
for baseline_name in ("SCD", "ADM", "MUL"):
    df_columns = create_df_from_result(
        data_dir, baseline_name, latent_dims, known_mask, prediction_mask, all_movielens_data
    )
    print(baseline_name, df_columns)

(6040, 5)
(6040, 10)
(6040, 15)
(6040, 20)
(6040, 25)
SCD    latent_dim  re_training  re_prediction
0           5     0.219602       0.247190
1          10     0.208943       0.245309
2          15     0.201888       0.245750
3          20     0.196196       0.246925
4          25     0.191176       0.248183
(6040, 5)
(6040, 10)
(6040, 15)
(6040, 20)
(6040, 25)
ADM    latent_dim  re_training  re_prediction
0           5     0.252985       0.278227
1          10     0.244707       0.266567
2          15     0.239040       0.261437
3          20     0.231451       0.255646
4          25     0.224205       0.251382
(6040, 5)
(6040, 10)
(6040, 15)
(6040, 20)
(6040, 25)
MUL    latent_dim  re_training  re_prediction
0           5     0.219973       0.247638
1          10     0.209449       0.245907
2          15     0.202461       0.246187
3          20     0.196937       0.247414
4          25     0.192126       0.248277


In [5]:
# Display the results table currently bound to df_columns.
df_columns

Unnamed: 0,latent_dim,re_training,re_prediction
0,5,0.213549,0.244977
1,10,0.199806,0.247862
2,15,0.190218,0.252751
3,20,0.182325,0.259397
4,25,0.175138,0.264356


In [12]:
# Display the results table computed with mode="softImpute".
df_columns_impute

Unnamed: 0,latent_dim,re_training,re_prediction
0,5,0.219435,0.247008
1,10,0.207521,0.244459
2,15,0.19921,0.244417
3,20,0.192338,0.245827
4,25,0.186001,0.247081


In [3]:

# Scratch cell: start an empty summary frame and load one stored result file
# so that the following cells can inspect it through `data`.
df = pd.DataFrame({"latent_dim": latent_dims})

# The previous version of this cell could not run: it contained a stray
# undefined name and a `for latent_dim in latent_dims:` loop whose body was
# not indented. The path never depended on the loop variable, so the file is
# simply loaded once.
# NOTE(review): the path is hard-coded to latent_dim_20 and to an
# "ENMFC_Verb_..." file name inside the Movielens results directory — confirm
# this is the intended file.
data_path = "/mnt/SSD6/qiujing/NMF/AI_exp/Clean_version/ENMF/Results/Movielens/ENMFC/latent_dim_20/1/ENMFC_Verb_r_20_default.npy"
# NOTE(security): allow_pickle=True unpickles arbitrary Python objects; only
# load files from a trusted source.
data = np.load(data_path, allow_pickle=True)

958.2313551902771

In [6]:
# Unwrap the stored dict once, show the factor shapes, then evaluate
# project_error for the U/V factors under known_mask (value is displayed).
stored = data.all()
left, right = stored["U"], stored["V"]
print(left.shape, right.shape)
project_error(stored["X"], left, right, known_mask)

(6040, 20) (3706, 20)


(665.4989009986182, 0.19693699917937965)

In [51]:





# Norm of the stored matrix X, using numpy.linalg.norm with its default settings.
LA.norm(data.all()["X"])

(6040, 20) (3706, 20)


In [25]:
# Stored "enmf_error" value divided by the norm of the stored matrix X.
data.all()["enmf_error"]/LA.norm(data.all()["X"])

0.30443355853871384

In [52]:

#softImpute_error = data.all()["svd_error"]
# NOTE(review): this rebinds the notebook-level known_mask to the positive
# entries of the loaded X, replacing the mask that an earlier cell derived
# from the training file; any cell run afterwards sees this new mask.
known_mask = (data.all()["X"] > 0).astype(int)
# Evaluate project_error for the U_eig / V_eig factors under that mask.
project_error(data.all()["X"], data.all()["U_eig"], data.all()["V_eig"], known_mask)

ImportError: cannot import name 'project_error' from 'Methods.linalg_utils' (/mnt/SSD6/qiujing/NMF/AI_exp/Clean_version/ENMF/Experiments/../Methods/linalg_utils.py)

In [46]:
# Evaluate project_error for the U_nmf / V_nmf factors under the current known_mask.
project_error(data.all()["X"], data.all()["U_nmf"], data.all()["V_nmf"], known_mask)

(1028.7564016859758, 0.30443355853871384)

In [42]:
# Inspect the shape of the current known_mask.
known_mask.shape

(6040, 3706)

In [33]:
# Inspect the shape of the stored V_eig factor.
data.all()["V_eig"].shape

(3706, 20)

In [31]:
# List the keys available in the stored result dict.
data.all().keys()


dict_keys(['X', 'U', 'V', 'U_eig', 'V_eig', 't_svd', 'svd_error', 'U_rotate', 'V_rotate', 't_rotate', 'distance_po', 'U_mp', 'V_mp', 't_mp', 'hitmp_error', 'U_nmf', 'V_nmf', 't_nmf', 'enmf_error', 'total_time'])

In [9]:
# NOTE(review): this loop has no body, so the cell is not valid Python as
# written; the intended per-latent-dimension work is not visible here.
for latent_dim in [5, 10, 15, 20, 25]:
    

array({'X': array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.]]), 'U': array([[0.15216899, 0.21187553, 0.34761604, ..., 0.27589336, 0.13263873,
        0.14322028],
       [0.18835826, 0.24551857, 0.37599376, ..., 0.31523816, 0.1628822 ,
        0.18120407],
       [0.1480152 , 0.1656852 , 0.29339423, ..., 0.27723887, 0.12067121,
        0.14721339],
       ...,
       [0.09747469, 0.14694233, 0.2432369 , ..., 0.18108316, 0.0867908 ,
        0.08940028],
       [0.19528782, 0.30999783, 0.41756477, ..., 0.28647438, 0.18235901,
        0.17763575],
       [0.17816883, 0.45905639, 0.56183767, ..., 0.21073227, 0.20363659,
        0.12785469]]), 'V': array([[0.31134911, 0.36338928, 0.89240163, ..., 0.79665899, 0.241283  ,
        0.30206648],
       [0.33717146, 0.36246816, 0.51285884, ..., 0.51846138, 0.2

In [None]:
# Create a df with latent_r as row, [enmf MUL, ADM, softImputeALS] as column
# NOTE(review): unfinished stub. The summary frame described above is never
# built in this cell.

import pandas as pd

# NOTE(review): at notebook level `data` was bound to the result of np.load in
# an earlier cell, and NumPy arrays have no `to_csv` method — presumably a
# DataFrame was meant here. 'YOUR_FILE/HERE.csv' is a placeholder path.
data.to_csv('YOUR_FILE/HERE.csv')