### PMF5 Output Comparison
Summary: This notebook explores methods for comparing the outputs of PMF5 with those of NMF-PY and develops metrics for evaluating the output of NMF-PY.

In [1]:
import os
import sys
import copy
import logging
import time
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import permutations

# Put the parent of the current working directory on the import path so the
# `src` and `tests` packages imported below can be resolved.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.data.datahandler import DataHandler
from src.model.base_nmf import BaseSearch
from tests.factor_comparison import FactorComp
from src.model.optimization import ComponentSearch
from src.utils import calculate_Q

In [13]:
# Run configuration: solution size and the locations of every input/output file.
n_components = 4    # number of factors in the solutions being compared
features = 41       # number of species; passed to FactorComp as `species`

# Root data directory. Defaults to the original workstation location; set the
# NMF_PY_DATA_DIR environment variable to run the notebook on another machine
# without editing every path below.
data_path = os.environ.get("NMF_PY_DATA_DIR") or os.path.join("D:\\", "projects", "nmf_py", "data")
output_path = os.path.join(data_path, "factor_test")

# PMF5 outputs (the residuals file lives in the factor_test sub-directory).
pmf_profile_file = os.path.join(data_path, f"baton-rouge_{n_components}f_profiles.txt")
pmf_residuals_file = os.path.join(output_path, f"br{n_components}f_residuals.txt")
pmf_contribution_file = os.path.join(data_path, f"baton-rouge_{n_components}f_contributions.txt")

# NMF-PY output for the same number of factors.
nmf_file = f"nmf-br{n_components}-output.json"
nmf_output_file = os.path.join(output_path, nmf_file)

# Concentration and uncertainty tables used as model input.
input_file = os.path.join(data_path, "Dataset-BatonRouge-con.csv")
uncertainty_file = os.path.join(data_path, "Dataset-BatonRouge-unc.csv")

In [14]:
# Build the data handler from the concentration and uncertainty files,
# passing "Date" as the index column. No output path is configured.
index_col = "Date"

dh = DataHandler(
    input_path=input_file,
    uncertainty_path=uncertainty_file,
    output_path=None,
    index_col=index_col,
)

19-Apr-23 10:55:31 - Input and output configured successfully


In [15]:
# Load the PMF5 and NMF-PY outputs into a single comparison object.
pc = FactorComp(
    nmf_output_file=nmf_output_file,
    pmf_profile_file=pmf_profile_file,
    pmf_contribution_file=pmf_contribution_file,
    factors=n_components,
    species=features,
    residuals_path=pmf_residuals_file,
)

# Q for the PMF5 solution, from its residuals and the processed uncertainties,
# handed to the comparison so it can be reported next to the NMF-PY value.
pmf_q = calculate_Q(pc.pmf_residuals.values, dh.uncertainty_data_processed)
pc.compare(PMF_Q=pmf_q)

Calculating correlation between factors from each epoch: 100%|█████████████████████████| 20/20 [00:00<00:00, 56.83it/s]


Number of permutations for 4 factors: 24


Calculating average correlation for all permutations for each epoch: 100%|█████████████| 20/20 [00:07<00:00,  2.82it/s]

R2 - Model: 18, Best permutations: ['Factor 1', 'Factor 3', 'Factor 4', 'Factor 2'], Average R2: 0.9207010195701427, 
Profile R2 Avg: 0.9846012964113698, Contribution R2 Avg: 0.8568007427289156, 
Profile R2: [0.9614367437324146, 0.9996006824877424, 0.9812699869925912, 0.9960977724327312], 
Contribution R2: [0.8101476084787497, 0.703381937388954, 0.9493474624368172, 0.9643259626111415]
PMF5 Q(true): 86895.6875, NMF-PY Model 18 Q(true): 105543.5625





In [65]:
# Pick the first PMF5 factor and look up the NMF-PY factor it is mapped to.
i = 0
pmf_f = pc.factor_columns[i]
nmf_f = pc.factor_map[i]
print(f"PMF factor {pmf_f} is mapped to NMF factor {nmf_f}")

PMF factor Factor 1 is mapped to NMF factor Factor 1


In [66]:
# Single-factor reconstruction for the best NMF-PY model: the factor's W column
# (as a column vector) broadcast-multiplied by its H row.
best_nmf = pc.nmf_epochs_dfs[pc.best_model]
nmf_H_f = best_nmf['H'].loc[nmf_f].to_numpy()    # one value per feature (41 in this run)
nmf_W_f = best_nmf['W'][nmf_f].to_numpy()        # one value per sample (307 in this run)
nmf_W_f = nmf_W_f.reshape(-1, 1)
nmf_WH_f = nmf_W_f * nmf_H_f

In [67]:
# Single-factor reconstruction for PMF5: the factor's contribution column
# (as a column vector) broadcast-multiplied by its profile column.
pmf_H_f = pc.pmf_profiles_df[pmf_f].to_numpy()
pmf_W_f = pc.pmf_contribution_df[pmf_f].to_numpy().reshape(-1, 1)
pmf_WH_f = pmf_W_f * pmf_H_f

In [71]:
# Squared Pearson correlation between the flattened NMF-PY and PMF5
# single-factor reconstructions.
corr_matrix = np.corrcoef(nmf_WH_f.ravel(), pmf_WH_f.ravel())
corr = corr_matrix[0, 1]    # off-diagonal entry: correlation between the two inputs
r_sq = corr ** 2
r_sq

0.8790406753926603

In [72]:
# R2 between the NMF-PY and PMF5 single-factor reconstructions (W column times
# H profile), computed for every PMF5 factor and its mapped NMF-PY factor.
correlations = []
best_nmf = pc.nmf_epochs_dfs[pc.best_model]

for i, factor in enumerate(pc.factor_columns):
    nmf_f = pc.factor_map[i]
    pmf_f = factor

    # NMF-PY reconstruction for this factor: (samples, 1) * (features,)
    nmf_H_f = best_nmf['H'].loc[nmf_f].to_numpy()
    nmf_W_f = best_nmf['W'][nmf_f].to_numpy().reshape(-1, 1)
    nmf_WH_f = nmf_W_f * nmf_H_f

    # PMF5 reconstruction for the matching factor
    pmf_H_f = pc.pmf_profiles_df[pmf_f].to_numpy()
    pmf_W_f = pc.pmf_contribution_df[pmf_f].to_numpy().reshape(-1, 1)
    pmf_WH_f = pmf_W_f * pmf_H_f

    # Squared Pearson correlation of the two flattened reconstructions
    corr_matrix = np.corrcoef(nmf_WH_f.ravel(), pmf_WH_f.ravel())
    corr = corr_matrix[0, 1]
    r_sq = corr ** 2
    correlations.append(r_sq)
print(f"Prime Profile - Factor Sample Contributions R2 Avg: {np.mean(correlations)}, Factor R2: {correlations}")

Prime Profile - Factor Sample Contributions R2 Avg: 0.9161556865735174, Factor R2: [0.8790406753926603, 0.8620530004439384, 0.9552797191480776, 0.9682493513093933]


In [39]:
# Search every NMF-PY model and every ordering of its factors for the mapping
# whose factor contributions (W columns) correlate best, on average R2, with
# the PMF5 factor contributions.
factor_permutations = list(permutations(pc.factor_columns, len(pc.factor_columns)))
best_factor_mapping = None
best_model = -1
best_avg_r = 0
best_r = []

# The PMF5 contributions are the same for every model and permutation, so
# convert them once. They are read from the comparison object: no standalone
# `pmf_contributions_df` variable is defined anywhere in this notebook.
pmf_contributions = {
    factor: pc.pmf_contribution_df[factor].astype(float)
    for factor in pc.factor_columns
}

for model in range(len(pc.nmf_epochs_dfs)):
    nmf_W = pc.nmf_epochs_dfs[model]["W"]

    # R2 for every (PMF5 factor, NMF-PY factor) pair, computed once per model
    # instead of once per permutation.
    pair_r2 = {}
    for pmf_factor in pc.factor_columns:
        for nmf_factor in pc.factor_columns:
            nmf_contribution = nmf_W[nmf_factor].astype(float)
            corr_matrix = np.corrcoef(nmf_contribution, pmf_contributions[pmf_factor])
            pair_r2[(pmf_factor, nmf_factor)] = corr_matrix[0, 1] ** 2

    for factor_p in factor_permutations:
        # factor_p[i] is the NMF-PY factor assigned to the i-th PMF5 factor.
        r_list = [pair_r2[(factor, factor_p[i])] for i, factor in enumerate(pc.factor_columns)]
        r2_mean = np.mean(r_list)
        if r2_mean > best_avg_r:
            best_avg_r = r2_mean
            best_r = r_list
            best_factor_mapping = factor_p
            best_model = model
print(f"Best Contribution Profile: {best_factor_mapping}, Avg R2: {best_avg_r}, Factor R2: {best_r}")

Best Contribution Profile: ('Factor 4', 'Factor 2', 'Factor 1', 'Factor 3'), Avg R2: 0.872502700516915, Factor R2: [0.7857782125497458, 0.7481539668120997, 0.9732683258267482, 0.9828102968790666]


9

NameError: name 'nmf_df' is not defined

AttributeError: 'dict' object has no attribute 'iloc'

Unnamed: 0,Factor 1,Factor 2,Factor 3,Factor 4,Datetime
0,1.54410,0.449500,2.406800,1.59610,06/02/05 03:00
1,1.00530,0.990740,1.150000,1.98420,06/02/05 06:00
2,2.84740,0.516180,3.243700,1.23430,06/03/05 03:00
3,0.20761,2.094500,1.468700,0.17409,06/04/05 03:00
4,0.75381,0.903250,0.461180,0.23299,06/04/05 06:00
...,...,...,...,...,...
301,0.25549,0.633920,0.091281,3.89460,09/24/06 06:00
302,0.51440,0.402750,-0.027151,1.69450,09/25/06 06:00
303,0.30866,0.629770,-0.130690,0.10427,09/26/06 03:00
304,2.28460,0.094477,-0.055430,0.15904,09/26/06 06:00
