### PMF5 Output Comparison
Summary: This notebook is used to explore methods for comparing the outputs of PMF5 to NMF-PY and development of metrics for evaluating the output of NMF-PY.

In [1]:
import os
import sys
import copy
import logging
import time
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import permutations

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from python.data.datahandler import DataHandler
from python.model.base_nmf import BaseSearch
from tests.factor_comparison import FactorComp
from python.model.optimization import ComponentSearch
from python.utils import calculate_Q

In [6]:
n_components = 4
features = 41

pmf_profile_file = os.path.join("D:\\", "projects", "nmf_py", "data", f"baton-rouge_{n_components}f_profiles.txt")
pmf_residuals_file = os.path.join("D:\\", "projects", "nmf_py", "data", "factor_test", f"br{n_components}f_residuals.txt")
pmf_factor_contributions_file = os.path.join("D:\\", "projects", "nmf_py", "data", f"baton-rouge_{n_components}f_contributions.txt")
nmf_output_file = os.path.join("..", "tests", "test-save-04.json")

In [9]:
profile_comparison = FactorComp(nmf_output=nmf_output_file, pmf_output=pmf_profile_file, factors=n_components, species=features, residuals_path=pmf_residuals_file)
profile_comparison.compare()


Calculating correlation between factors from each epoch: 100%|████████████████████████| 20/20 [00:00<00:00, 113.61it/s]


Number of permutations for 4 factors: 24


Calculating average correlation for all permutations for each epoch: 100%|█████████████| 20/20 [00:07<00:00,  2.56it/s]

R2 - Model: 9, Best permutations: ['Factor 2', 'Factor 3', 'Factor 1', 'Factor 4'], Average: 0.9725850224237742, Factors: [0.927419178508057, 0.9926387149042535, 0.9721656577625054, 0.998116538520281]
PMF5 Q(true): None, NMF-PY Model 9 Q(true): 86071.34375





In [24]:
pmf_factor_contributions = os.path.join("D:\\", "projects", "nmf_py", "data", "baton-rouge_4f_contributions.txt")

pmf_contributions_df = None
pmf_contribution_data = []
pmf_contribution_columns = None

factors = 4
column_row = 4
data_start_row = 5

column_labels = None
dates = []

with open(pmf_factor_contributions, 'r') as open_file:
    contribution_strings = open_file.read()
    rows = contribution_strings.split('\n')
    for i, row in enumerate(rows):
        if i == column_row-1:
            pmf_contribution_columns = row.split('\t')[2:]
        elif i >= data_start_row-1:
            row_cells = row.split('\t')
            if len(row_cells) > 1:
                dates.append(row_cells[1])
                pmf_contribution_data.append(row_cells[2:])
          
pmf_contributions_df = pd.DataFrame(pmf_contribution_data, columns=pmf_contribution_columns)
pmf_contributions_df["Datetime"] = dates

factor_types = {}
for f in pmf_contribution_columns:
    factor_types[f] = 'float'
pmf_contributions_df = pmf_contributions_df.astype(factor_types)
pmf_contributions_df

Unnamed: 0,Factor 1,Factor 2,Factor 3,Factor 4,Datetime
0,0.39197,1.268000,0.144910,0.14991,06/01/05 06:00
1,1.54410,0.449500,2.406800,1.59610,06/02/05 03:00
2,1.00530,0.990740,1.150000,1.98420,06/02/05 06:00
3,2.84740,0.516180,3.243700,1.23430,06/03/05 03:00
4,0.20761,2.094500,1.468700,0.17409,06/04/05 03:00
...,...,...,...,...,...
302,0.25549,0.633920,0.091281,3.89460,09/24/06 06:00
303,0.51440,0.402750,-0.027151,1.69450,09/25/06 06:00
304,0.30866,0.629770,-0.130690,0.10427,09/26/06 03:00
305,2.28460,0.094477,-0.055430,0.15904,09/26/06 06:00


In [25]:
profile_comparison.nmf_epochs_dfs[profile_comparison.best_model]["W"]

Unnamed: 0,Factor 1,Factor 2,Factor 3,Factor 4
0,73.759895,9.135810,78.294731,33.144077
1,148.109833,200.284637,54.098850,138.775116
2,91.277672,106.003624,89.278114,169.000015
3,208.418274,276.008240,128.955261,77.562653
4,43.055752,138.476669,111.505524,40.481544
...,...,...,...,...
302,-0.000000,19.150635,98.196846,383.738068
303,79.703934,2.978098,15.627500,149.849884
304,21.526638,-0.000000,41.759487,14.172704
305,178.774796,-0.000000,35.775497,42.865414


In [27]:
correlations = []

# i = 0
# factor = profile_comparison.factor_columns[i]
# nmf_factor = profile_comparison.factor_map[i]
# pmf_contribution = pmf_contributions_df[factor]
# nmf_contribution = profile_comparison.nmf_epochs_dfs[profile_comparison.best_model]["W"][nmf_factor]
# pmf_contribution = pmf_contribution.astype(float)
# nmf_contribution = nmf_contribution.astype(float)
# corr_matrix = np.corrcoef(nmf_contribution, pmf_contribution)
# corr = corr_matrix[0, 1]
# r_sq = corr ** 2

for i, factor in enumerate(profile_comparison.factor_columns):
    nmf_factor = profile_comparison.factor_map[i]
    pmf_contribution = pmf_contributions_df[factor]
    nmf_contribution = profile_comparison.nmf_epochs_dfs[profile_comparison.best_model]["W"][nmf_factor]
    pmf_contribution = pmf_contribution.astype(float)
    nmf_contribution = nmf_contribution.astype(float)
    corr_matrix = np.corrcoef(nmf_contribution, pmf_contribution)
    corr = corr_matrix[0, 1]
    r_sq = corr ** 2
    correlations.append(r_sq)
print(f"Contribution R2 Avg: {np.mean(correlations)}, Factor R2: {correlations}")

Contribution R2 Avg: 0.4525592910647951, Factor R2: [0.061472776728167734, 0.6904192165547623, 0.07848946657700788, 0.9798557043992424]


In [28]:
factor_permutations = list(permutations(self.factor_columns, len(self.factor_columns)))
factor_permutations

NameError: name 'permutations' is not defined

NameError: name 'nmf_df' is not defined

AttributeError: 'dict' object has no attribute 'iloc'

Unnamed: 0,Factor 1,Factor 2,Factor 3,Factor 4,Datetime
0,1.54410,0.449500,2.406800,1.59610,06/02/05 03:00
1,1.00530,0.990740,1.150000,1.98420,06/02/05 06:00
2,2.84740,0.516180,3.243700,1.23430,06/03/05 03:00
3,0.20761,2.094500,1.468700,0.17409,06/04/05 03:00
4,0.75381,0.903250,0.461180,0.23299,06/04/05 06:00
...,...,...,...,...,...
301,0.25549,0.633920,0.091281,3.89460,09/24/06 06:00
302,0.51440,0.402750,-0.027151,1.69450,09/25/06 06:00
303,0.30866,0.629770,-0.130690,0.10427,09/26/06 03:00
304,2.28460,0.094477,-0.055430,0.15904,09/26/06 06:00
