## Modeling Profiler

Using the ESAT simulator to evaluate potential approaches for optimizing the modeling of very large datasets.

The first approach will look at implementing and validating the following workflow:
1. Create a subset dataset of the input by randomly selecting N values from the input/uncertainty.
2. Train a single model on that data until convergence.
3. Use the factor profile H matrix to calculate a W for the complete dataset.
4. Calculate Q(full)
5. Take a new subset of the data, restart training with the prior H.
6. Repeat until Q(full) is no longer decreasing.

Run a model on the full dataset with the same random seed and evaluate the difference in loss and factor profiles.

#### Code Imports

In [1]:
from esat.data.datahandler import DataHandler
from esat.model.batch_sa import BatchSA
from esat.model.sa import SA
from esat.data.analysis import ModelAnalysis, BatchAnalysis
from esat_eval.simulator import Simulator
from esat.estimator import FactorEstimator
from esat_eval.factor_catalog import FactorCatalog, Factor

from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import min_weight_full_bipartite_matching
from sklearn.decomposition import PCA

from tqdm.notebook import tqdm

import plotly.graph_objects as go
import plotly.colors as pc
import plotly.io as pio
import logging
import time
import pandas as pd
import numpy as np
import copy

logger = logging.getLogger(__name__)

#### Synthetic Dataset

Generate a synthetic dataset where the factor profiles and contributions are pre-determined for model output analysis.

In [2]:
# Synthetic dataset parameters (passed to the Simulator constructor in the next cell)
seed = 1
syn_factors = 6                # Number of factors in the synthetic dataset
syn_features = 40              # Number of features in the synthetic dataset
syn_samples = 50000             # Number of samples in the synthetic dataset
outliers = True                # Add outliers to the dataset
outlier_p = 0.10               # Decimal percent of outliers in the dataset
outlier_mag = 1.25                # Magnitude of outliers
contribution_max = 2           # Maximum value of the contribution matrix (W) (Randomly sampled from a uniform distribution)
noise_mean_min = 0.03          # Min value for the mean of noise added to the synthetic dataset, used to randomly determine the mean decimal percentage of the noise for each feature.
noise_mean_max = 0.05          # Max value for the mean of noise added to the synthetic dataset, used to randomly determine the mean decimal percentage of the noise for each feature.
noise_scale = 0.1              # Scale of the noise added to the synthetic dataset
uncertainty_mean_min = 0.04    # Min value for the mean uncertainty of a data feature, used to randomly determine the mean decimal percentage for each feature in the uncertainty dataset.
uncertainty_mean_max = 0.06    # Max value for the mean uncertainty of a data feature, used to randomly determine the mean decimal percentage for each feature in the uncertainty dataset.
uncertainty_scale = 0.01       # Scale of the uncertainty matrix

In [3]:
# Initialize the simulator with the synthetic dataset parameters defined in the previous cell.
# Every keyword is forwarded unchanged from the same-named variable.
simulator = Simulator(seed=seed,
                      factors_n=syn_factors,
                      features_n=syn_features,
                      samples_n=syn_samples,
                      outliers=outliers,
                      outlier_p=outlier_p,
                      outlier_mag=outlier_mag,
                      contribution_max=contribution_max,
                      noise_mean_min=noise_mean_min,
                      noise_mean_max=noise_mean_max,
                      noise_scale=noise_scale,
                      uncertainty_mean_min=uncertainty_mean_min,
                      uncertainty_mean_max=uncertainty_mean_max,
                      uncertainty_scale=uncertainty_scale
                     )

29-Apr-25 15:14:28 - Synthetic profiles generated


In [4]:
# Example command for passing in a custom factor profile matrix, instead of the randomly generated profile matrix.
# my_profile = np.ones(shape=(syn_factors, syn_features))
# simulator.generate_profiles(profiles=my_profile)

In [5]:
# Example of how to customize the factor contributions. Curve_type options: 'uniform', 'decreasing', 'increasing', 'logistic', 'periodic'
# simulator.update_contribution(factor_i=0, curve_type="logistic", scale=0.1, frequency=0.5)
# simulator.update_contribution(factor_i=1, curve_type="periodic", minimum=0.0, maximum=1.0, frequency=0.5, scale=0.1)
# simulator.update_contribution(factor_i=2, curve_type="increasing", minimum=0.0, maximum=1.0, scale=0.1)
# simulator.update_contribution(factor_i=3, curve_type="decreasing", minimum=0.0, maximum=1.0, scale=0.1)
# simulator.plot_synthetic_contributions()

#### Load Data
Assign the processed data and uncertainty datasets to the variables V and U. These steps will be simplified/streamlined in a future version of the code.

In [6]:
# Fetch the synthetic input and uncertainty tables from the simulator
# (presumably pandas DataFrames, given the names and the load_dataframe call below - confirm).
syn_input_df, syn_uncertainty_df = simulator.get_data()

29-Apr-25 15:14:28 - Synthetic data generated
29-Apr-25 15:14:28 - Synthetic uncertainty data generated
29-Apr-25 15:14:28 - Synthetic dataframes completed
29-Apr-25 15:14:29 - Synthetic source apportionment instance created.


In [7]:
# Wrap the synthetic tables in a DataHandler and pull out the data (V) and uncertainty (U)
# that the rest of the notebook works with.
data_handler = DataHandler.load_dataframe(input_df=syn_input_df, uncertainty_df=syn_uncertainty_df)
V, U = data_handler.get_data()

#### Input Parameters

In [8]:
# Model training parameters.
# NOTE: `seed` is rebound here (42); the simulator above was built with the earlier value (1).
index_col = "Date"                  # the index of the input/uncertainty datasets
# factors = syn_factors               # the number of factors
factors = 6
method = "ls-nmf"                   # "ls-nmf", "ws-nmf"
models = 20                         # the number of models to train
init_method = "col_means"           # default is column means "col_means", "kmeans", "cmeans"
init_norm = True                    # if init_method=kmeans or cmeans, normalize the data prior to clustering.
seed = 42                           # random seed for initialization
max_iterations = 20000              # the maximum number of iterations for fitting a model
converge_delta = 0.1                # convergence criteria for the change in loss, Q
converge_n = 25                     # convergence criteria for the number of steps where the loss changes by less than converge_delta
verbose = True                      # adds more verbosity to the algorithm workflow on execution.
optimized = True                    # use the Rust code if possible
parallel = True                     # execute the model training in parallel, multiple models at the same time

### Train Batch Profile

In [9]:
def calculate_W(V, U, H):
    """Project the data onto a fixed factor profile matrix to obtain W.

    Computes ``(V / U) @ H_safe.T`` where ``H_safe`` is ``H`` with every
    non-positive entry replaced by ``1e-8``.

    Args:
        V: data matrix, samples x features.
        U: uncertainty matrix with the same shape as ``V``.
        H: factor profile matrix, factors x features. It is NOT modified;
            the clamping is applied to a copy so the caller's profiles stay
            intact between repeated calls.

    Returns:
        The contribution matrix W, samples x factors.
    """
    # np.where builds a new array, leaving the caller's H untouched.
    H_safe = np.where(H <= 0.0, 1e-8, H)
    return np.matmul(V * np.divide(1, U), H_safe.T)

def q_loss(V, U, H, W):
    """Return the Q loss: the sum of squared, uncertainty-scaled residuals.

    Args:
        V: data matrix, samples x features.
        U: uncertainty matrix with the same shape as ``V``.
        H: factor profile matrix, factors x features.
        W: factor contribution matrix, samples x factors.

    Returns:
        ``sum(((V - W @ H) / U) ** 2)`` as a scalar.
    """
    residuals = (V - np.matmul(W, H)) / U
    # The residuals must be squared before summing; otherwise positive and
    # negative errors cancel and a poor fit can report a loss near zero.
    return np.sum(residuals ** 2)

def mse(V, U, H, W):
    """Mean of the squared, uncertainty-scaled residuals of the W @ H fit.

    The total is divided by ``V.size`` (the number of cells in ``V``).
    """
    scaled_error = (V - np.matmul(W, H)) / U
    total = np.sum(scaled_error ** 2)
    return total / V.size

def compare_H(H1, H2):
    """Return the pairwise r-squared between the rows of two profile matrices.

    Entry ``[i, j]`` of the result is the squared Pearson correlation between
    ``H1[i]`` and ``H2[j]``.
    """
    n_first, n_second = H1.shape[0], H2.shape[0]
    r_squared = np.zeros((n_first, n_second))
    for i, j in np.ndindex(n_first, n_second):
        row_a = H1[i].astype(float)
        row_b = H2[j].astype(float)
        r_squared[i, j] = np.corrcoef(row_b, row_a)[0, 1] ** 2
    return r_squared
            
def plot_correlations(matrix):
    """Display ``matrix`` as a Plotly table with one "Factor i" header per row of the matrix."""
    column_titles = [f"Factor {idx}" for idx in range(matrix.shape[0])]
    table = go.Table(header=dict(values=column_titles), cells=dict(values=matrix))
    go.Figure(data=[table]).show()

def prepare_data(V, U, i_selection):
    """Extract the selected rows of V and U as numeric numpy arrays.

    Both matrices are copied, subset to the rows in ``i_selection`` and
    converted column by column with ``pd.to_numeric``.

    Returns:
        A ``(V_subset, U_subset)`` tuple of numpy arrays.
    """
    frames = [pd.DataFrame(source.copy()[i_selection, :]) for source in (V, U)]
    values, uncertainties = frames
    # Columns are driven by the data frame; the uncertainty frame follows it.
    for column in values.columns:
        for frame in frames:
            frame[column] = pd.to_numeric(frame[column])
    return values.to_numpy(), uncertainties.to_numpy()

### Version 2 of the FactorCatalog

FactorCatalog V2 takes a more robust approach to grouping factor profiles from multiple models, with the grouping occurring after all factor profiles have been collected. The updated procedure
is designed to investigate potential solutions for creating models for very large datasets using subsets of the data. The algorithm is described as:
1. Specify your hyper-parameters: samples_n, batches_n, models_n, random_seed, correlation_threshold, factors_k
2. For each batch in batches_n.
3. Create a subset dataset using samples_n randomly selected values from V/U.
4. Create a BatchSA instance of models_n models, using random_seed and factors_k.
5. Add each output factor to the FactorCatalog (factor model_i, fit_rmse, factor_i, H)
6. Once all batches are completed, cluster the factor collection using a constrained k-means cluster function.
7. Score the models based upon a heuristic, such as the sum of (cluster_cor_avg*cluster_members)
8. Evaluate the clustered profile matrix, using the cluster centroid values, using the complete dataset.

The primary modification of the FactorCatalog is to use the constrained k-means clustering function for grouping 'like' factor profiles. The procedure will work by:
1. Starting with the factors_k=clusters_n, calculate the correlation of all factors/points to the centroids, by model.
2. Initialize the clusters by randomly creating clusters or by selecting clusters_n 'dissimilar' factors.
3. Randomly shuffle the model order for factor assignment:
   1. Assign the factors to the clusters by order of correlation, closer to 1.0 goes first.
   2. If more than one factor in a model would be assigned to the same cluster assign the factor with the highest cor to that cluster and then repeat excluding that cluster(s) until all factors are assigned.
   3. If any given factor does not have a correlation above the specified threshold, create a new cluster centered at that point.
4. At the end of each assignment iteration, remove any cluster which has no members.
5. Stop once max_assignment_n iterations are reached, or when a reassignment pass no longer changes the assignments.

The constrained k-means clustering is a standard clustering approach, except that the distance is calculated as 1/r2 of the point to the cluster centroid. Two other differences are that a) the number of clusters can increase and decrease depending on the correlation threshold, and b) a model can only contribute one factor to any given cluster.

Once all the factors for all models have been clustered, the FactorCatalog models can be scored based upon the heuristic stated in step 7 of the algorithm above. The factor profile matrix (H) of the best model, or of any selected model, can then be chosen for final evaluation. This factor profile H is not what was produced by the model, but the mean values of the FactorCatalog's factors (the centroids of the clusters that those factors were assigned to). This approach allows the factor profile values to be provided as a distribution of possible values for each feature, demonstrating the potential uncertainty in the factor profile. The clustered factor profile is then used to fit the full dataset while keeping H constant. The loss can then be evaluated against what is calculated for a long-running brute force approach.

An evaluation of the impact on the model/W matrix and loss given a random selection, MC, simulation of the factor profile would be an interesting next step.

In [None]:
class Factor:
    """A single factor profile produced by one model, plus its cluster assignment."""

    def __init__(self, factor_id, profile, model_id):
        # Identity of the factor and of the model that produced it.
        self.factor_id = factor_id
        self.model_id = model_id
        # Feature vector of the factor.
        self.profile = profile
        # Assignment state; both stay None until assign() is called.
        self.cluster_id = None
        self.cor = None

    def assign(self, cluster_id, cor):
        """Record the cluster this factor belongs to and its correlation with it."""
        self.cluster_id, self.cor = cluster_id, cor

    def deallocate(self):
        """Clear the cluster assignment."""
        self.cluster_id, self.cor = None, None

    def distance(self, cluster):
        """Return the squared Pearson correlation between this profile and ``cluster``.

        ``cluster`` is any vector with the same length as the profile
        (e.g. a centroid). Higher values mean more similar.
        """
        own_profile = np.array(self.profile).astype(float)
        other = np.array(cluster).astype(float)
        pearson = np.corrcoef(other, own_profile)[0, 1]
        return pearson ** 2


class Model:
    """Container for the factors that one trained model contributed to the catalog."""

    def __init__(self, model_id):
        self.model_id = model_id
        # Populated by the catalog once clustering has been scored.
        self.score = None
        # Factor objects, in the order they were added.
        self.factors = []

    def add_factor(self, factor):
        """Append ``factor`` to this model's factor list."""
        self.factors.append(factor)


class Cluster:
    """A group of similar factor profiles summarised by a centroid.

    Attributes are updated incrementally by :meth:`add` and reset by
    :meth:`purge`; the centroid only changes when :meth:`recalculate` is
    called (or when it is set at construction).
    """

    def __init__(self,
                 cluster_id,
                 centroid: np.ndarray
                ):
        self.cluster_id = cluster_id
        self.centroid = centroid
        self.factors = []
        self.count = 0

        self.mean_r2 = 0
        self.std = 0
        # NaN marks "no member seen yet"; add() uses NaN-ignoring min/max.
        self.min_values = np.full(len(centroid), np.nan)
        self.max_values = np.full(len(centroid), np.nan)

    def __len__(self):
        """Number of factors currently assigned to the cluster."""
        return self.count

    def add(self, factor: "Factor", cor: float):
        """Assign ``factor`` to this cluster and refresh the summary statistics.

        Args:
            factor: the factor to add; its ``assign`` method is called with
                this cluster's id and ``cor``.
            cor: correlation (r-squared) of the factor with the cluster.
        """
        factor.assign(cluster_id=self.cluster_id, cor=cor)
        self.factors.append(factor)
        self.count += 1
        # np.minimum/np.maximum propagate the NaN placeholders, which would
        # leave the ranges NaN forever; fmin/fmax ignore NaN operands.
        self.min_values = np.fmin(self.min_values, factor.profile)
        self.max_values = np.fmax(self.max_values, factor.profile)
        self.mean_r2 = np.mean([member.cor for member in self.factors])
        self.std = np.std([member.profile for member in self.factors], axis=0)

    def purge(self):
        """Release every member factor and reset the statistics (the centroid is kept)."""
        for factor in self.factors:
            factor.deallocate()
        self.factors = []
        self.count = 0
        self.mean_r2 = 0
        self.std = 0
        self.min_values = np.full(len(self.centroid), np.nan)
        self.max_values = np.full(len(self.centroid), np.nan)

    def recalculate(self):
        """Move the centroid to the mean of the member profiles; no-op when empty."""
        if len(self.factors) > 0:
            factor_matrix = np.array([factor.profile for factor in self.factors])
            self.centroid = np.mean(factor_matrix, axis=0)

    def plot(self, fillcolor=None):
        """Show a box plot of the member profiles per feature with the centroid overlaid.

        Args:
            fillcolor: optional fill colour for the boxes; ``None`` keeps the
                Plotly default. (The original code referenced an undefined
                name here.)
        """
        n_features = len(self.centroid)
        # reshape keeps the matrix 2-D even when the cluster has no members.
        factor_matrix = np.array([factor.profile for factor in self.factors], dtype=float).reshape(-1, n_features)
        feature_labels = [f"Feature {i+1}" for i in range(n_features)]

        box_plot = go.Figure()
        for i, label in enumerate(feature_labels):
            box_plot.add_trace(go.Box(
                y=factor_matrix[:, i],
                boxpoints="all",
                jitter=0.5,
                whiskerwidth=0.2,
                fillcolor=fillcolor,
                marker_size=2,
                line_width=1,
                name=label)
            )
        # Use the box labels as x positions so each centroid marker sits on its box.
        box_plot.add_trace(go.Scatter(
            x=feature_labels,
            y=self.centroid,
            name="Centroid",
            mode='markers',
            marker=dict(color='red', size=10)
        ))
        box_plot.update_layout(title="Clustered Factor Profile", width=1200, height=800)
        box_plot.show()

        
class BatchFactorCatalog:
    def __init__(self,
                 n_factors: int,
                 n_features: int,
                 threshold: float = 0.8,
                 seed: int = 42
                ):
        self.n_factors = n_factors
        self.n_features = n_features
        self.threshold = threshold

        self.rng = np.random.default_rng(seed)

        self.models = {}
        self.model_count = 0
        self.factors = {}
        self.factor_count = 0

        # Min and max values for all factor vectors, used for random initialization of the centroids in clustering
        self.factor_min = None
        self.factor_max = None

        self.clusters = []
        self.dropped_clusters = []

        self.state = {}

    def results(self):
        results = {}
        for cluster in self.clusters:
            results[cluster.cluster_id] = {
                "count": len(cluster),
                "mean_r2": cluster.mean_r2,
                "std": cluster.std
            }
        return results

    def plot(self):
        palette = pc.qualitative.Plotly  # You can replace 'Plotly' with 'Set3', 'Pastel', etc.
        colors = [palette[i % len(palette)] for i in range(len(self.clusters))]

        all_factors = np.array([v.profile for k, v in self.factors.items()])
        factor_assignment = np.array([v.cluster_id for k, v in self.factors.items()])
        all_centroids = np.array([cluster.centroid for cluster in self.clusters])

        factor_pca = PCA(n_components=3)
        pca_results = factor_pca.fit_transform(all_factors)
        
        pca_centroids = factor_pca.transform(all_centroids)
        
        df_pca = pd.DataFrame(pca_results, columns=['PC1', 'PC2', 'PC3'])
        df_pca["Cluster"] = factor_assignment

        n_clusters = len(all_centroids)

        fig = go.Figure()

        for cluster in range(n_clusters):
            cluster_data = df_pca[df_pca['Cluster'] == cluster]
            if len(cluster_data) == 0:
                continue
            color = colors[cluster]
            fig.add_trace(go.Scatter3d(
                x=cluster_data['PC1'],
                y=cluster_data['PC2'],
                z=cluster_data['PC3'],
                mode='markers',
                marker=dict(
                    size=5,
                    color=color,  # Assign unique color to each cluster
                    opacity=0.8
                ),
                name=f"Cluster {cluster}"
            ))
            fig.add_trace(go.Scatter3d(
                x=[pca_centroids[cluster, 0]],
                y=[pca_centroids[cluster, 1]],
                z=[pca_centroids[cluster, 2]],
                mode='markers',
                marker=dict(
                    size=5,
                    color=color,  # Same color as the cluster
                    symbol='x',
                    opacity=0.8
                ),
                name=f"Centroid {cluster}"
            ))
        
        # Update layout for better visualization
        fig.update_layout(
            title="Cluster Centroids",
            scene=dict(
                xaxis_title="PCA1",
                yaxis_title="PCA2",
                zaxis_title="PCA3"
            ),
            showlegend=False,
            height=1200,
            margin=dict(l=0, r=0, b=0, t=40)
        )
        
        # Show the plot
        fig.show()

    def animiate(self, to_file: bool=False, base_matrix = None):
        palette = pc.qualitative.Plotly  # You can replace 'Plotly' with 'Set3', 'Pastel', etc.
        colors = [palette[i % len(palette)] for i in range(len(self.clusters))]
        
        all_factors = np.array([v.profile for k, v in self.factors.items()])
        factor_assignment = np.array([v.cluster_id for k, v in self.factors.items()])
        all_centroids = np.array([cluster.centroid for cluster in self.clusters])
        
        factor_pca = PCA(n_components=3)
        pca_results = factor_pca.fit_transform(all_factors)
        
        pca_centroids = factor_pca.transform(all_centroids)

        plot_base = False
        if base_matrix not None:
            pcs_base = factor_pca.transform(base_matrix)
            df_base = pd.DataFrame(pcs_base, columns=['PCA1', 'PCA2', 'PCA3'])
            plot_base = True
        
        df_pca = pd.DataFrame(pca_results, columns=['PCA1', 'PCA2', 'PCA3'])
        df_pca["Cluster"] = factor_assignment

        frames = []
        state0 = None
        color_map = {}
        total_clusters = len(self.clusters)
        color_palette = generate_cluster_colors(total_clusters)
        
        for i in range(len(self.state)):
            i_centroids = self.state[i]["centroids"]
            factor_assignments = self.state[i]["assignment"]
            pca_centroids = factor_pca.transform(i_centroids)
            df_pca = pd.DataFrame(pca_results, columns=['PCA1', 'PCA2', 'PCA3'])
            df_pca["Cluster"] = factor_assignments
            df_pca["text"] = "Factor Profile"
        
            df_centroids = pd.DataFrame(pca_centroids, columns=['PCA1', 'PCA2', 'PCA3'])
            assigned_centroids, cluster_size = np.unique(factor_assignments, return_counts=True)
            df_centroids = df_centroids.iloc[assigned_centroids]
            df_centroids["Cluster"] = np.arange(len(assigned_centroids))
            df_centroids["text"] = "Centroid"
        
            # Update color map for new clusters
            for cluster in np.unique(assigned_centroids):
                if cluster not in color_map:
                    color_map[cluster] = color_palette[len(color_map) % len(color_palette)]
        
            # Assign colors to points based on the fixed color map
            colors = [color_map[cluster] for cluster in assigned_centroids]
            
            data = []
            data.append(go.Scatter3d(
                x=df_pca["PCA1"],
                y=df_pca["PCA2"],
                z=df_pca["PCA3"],
                mode='markers',
                marker=dict(
                    size=5,
                    color=df_pca["Cluster"],  # Color points by class
                    colorscale=colors,  # Choose a colorscale
                    colorbar=dict(title="Cluster")  # Add a colorbar
                ),
                text=df_pca["text"],
                hoverinfo='text'
            ))
            assigned_cluster_size = cluster_size * 5
            data.append(go.Scatter3d(
                x=df_centroids['PCA1'],
                y=df_centroids['PCA2'],
                z=df_centroids['PCA3'],
                mode='markers',
                marker=dict(
                    size=assigned_cluster_size,
                    color=df_centroids["Cluster"],
                    colorscale=colors,  # Same color as the cluster
                    opacity=0.3
                ),
                text=df_centroids["text"],
                hoverinfo='text'
            ))
            data.append(go.Scatter3d(
                x=df_centroids['PCA1'],
                y=df_centroids['PCA2'],
                z=df_centroids['PCA3'],
                mode='markers',
                marker=dict(
                    size=3,
                    color=df_centroids["Cluster"],
                    colorscale=colors,  # Same color as the cluster
                    symbol='x',
                ),
                text=df_centroids["text"],
                hoverinfo='text'
            ))
            if plot_base:
                data.append(go.Scatter3d(
                    x=df_base['PCA1'],
                    y=df_base['PCA2'],
                    z=df_base['PCA3'],
                    mode='markers',
                    marker=dict(
                        size=3,
                        color="red,
                    )
                ))
            
            if i == 0:
                state0 = data
            frames.append(
                go.Frame(data=data, 
                         layout=go.Layout(
                             annotations=[
                                dict(
                                    x=1,
                                    y=1,
                                    showarrow=False,
                                    text=f"Iteration: {i + 1}/{len(factor_catalog.state)}",
                                    xref="paper",
                                    yref="paper",
                                    font=dict(size=14)
                                )
                            ]
                        ),
                         name=str(i)
                        )
            )
        df_pca0 = pd.DataFrame(pca_results, columns=['PC1', 'PC2', 'PC3'])
        camera = dict(
            eye=dict(x=1.25, y=1.25, z=1.25)  
        )
        fig = go.Figure(
            data=state0,
            layout=go.Layout(
                title="Factor Profile Clustering",
                height=1000,
                width=1000,
                scene=dict(
                    xaxis=dict(range=[df_pca0['PC1'].min()-0.25, df_pca0['PC1'].max()+0.25], autorange=False),
                    yaxis=dict(range=[df_pca0['PC2'].min()-0.25, df_pca0['PC2'].max()+0.25], autorange=False),
                    zaxis=dict(range=[df_pca0['PC3'].min()-0.25, df_pca0['PC3'].max()+0.25], autorange=False),
                    aspectmode="manual",
                    aspectratio=dict(x=1, y=1, z=1)
                ),
                updatemenus=[dict(
                    type="buttons",
                    showactive=False,
                    buttons=[
                        dict(label="Play",
                             method="animate",
                             args=[None, dict(frame=dict(duration=500, redraw=True), fromcurrent=True, mode="immediate")]),
                        dict(label="Pause",
                             method="animate",
                             args=[[None], dict(frame=dict(duration=0, redraw=False), mode="immediate")]),
                        dict(
                            args=[[0], dict(frame=dict(duration=0, redraw=True), mode="immediate")],
                            label="Reset",
                            method="animate"),
                    ]
                )],
                annotations=[
                    dict(
                        x=1,
                        y=1,
                        showarrow=False,
                        text="Iteration: NA",
                        xref="paper",
                        yref="paper",
                        font=dict(size=14)
                    )
                ],
                showlegend=False
            ),
            frames=frames
        )
        # Show the figure
        if to_file:
            pio.write_html(fig, file="factor_clustering.html", auto_open=False)
        else:
            fig.show()
        
    def add_model(self, model: SA, norm: bool = True):
        model_id = self.model_count
        model_factor_ids = []
        norm_H = model.H / np.sum(model.H, axis=0)
        i_model = Model(model_id=model_id)
        for i in range(model.H.shape[0]):
            factor_id = self.factor_count
            self.factor_count += 1
            model_factor_ids.append(factor_id)
            i_H = norm_H if norm else model.H 
            factor = Factor(factor_id=factor_id, profile=i_H[i], model_id=model_id)
            
            i_model.add_factor(factor)
            self.factors[factor_id] = factor
            self.update_ranges(i_H[i])
            
        self.models[str(model_id)] = i_model
        self.model_count += 1

    def compare(self, matrix):
        compare_results = {}
        for i in range(matrix.shape[0]):
            i_H = matrix[i]
            i_cor = 0.0
            best_cluster = None
            for cluster in self.clusters:
                cluster_cor = self.distance(i_H, cluster.centroid)
                if cluster_cor > i_cor:
                    i_cor = cluster_cor
                    best_cluster = cluster.cluster_id
            compare_results[i] = {"cluster_id": best_cluster, "r2": i_cor}
        return compare_results

    def score(self):
        # iterate over all models, get the membership count the cluster that each factor is mapped to.
        for model_id, model in self.models.items():
            model_score = 0.0
            for factor in model.factors:
                factor_score = len(self.clusters[factor.cluster_id])
                model_score += factor_score
            model.score = model_score

    def update_ranges(self, factor):
        if self.factor_min is None and self.factor_max is None:
            self.factor_min = copy.copy(factor)
            self.factor_max = copy.copy(factor)
        else:
            self.factor_min = np.minimum(self.factor_min, factor)
            self.factor_max = np.maximum(self.factor_max, factor)

    def initialize_clusters(self):
        for k in range(self.n_factors):
            new_centroid = np.zeros(self.n_features)
            for i in range(self.n_features):
                i_v = self.rng.uniform(low=self.factor_min[i], high=self.factor_max[i])
                new_centroid[i] = i_v
            cluster = Cluster(cluster_id=k, centroid=new_centroid)
            self.clusters.append(cluster)

    def purge_clusters(self):
        for cluster in self.clusters:
            cluster.purge()

    def distance(self, factor1, factor2):
        f1 = np.array(factor1).astype(float)
        f2 = np.array(factor2).astype(float)
        corr_matrix = np.corrcoef(f2, f1)
        corr = corr_matrix[0, 1]
        r_sq = corr ** 2
        return r_sq

    def calculate_centroids(self):
        new_centroid_matrix = []
        for cluster in self.clusters: 
            if cluster is not None:
                cluster.recalculate()
                new_centroid_matrix.append(cluster.centroid)
        return np.array(new_centroid_matrix)

    def cluster_cleanup(self):
        drop_clusters = []
        for i in range(len(self.clusters)):
            cluster_i = self.clusters[i]
            if cluster_i is None:
                for j in range(i+1, len(self.clusters)):
                    cluster_j = self.clusters[j]
                    ij_cor = self.distance(cluster_i.centroid, cluster_j.centroid)
                    if ij_cor > self.threshold:
                        smaller_cluster = i if len(cluster_i) < len(cluster_j) else j
                        if smaller_cluster not in drop_clusters:
                            drop_clusters.append(smaller_cluster)
        new_clusters = []
        new_centroid_matrix = []
        for i in range(len(self.clusters)):
            if i not in drop_clusters:
                new_clusters.append(self.clusters[i])
                new_centroid_matrix.append(self.clusters[i].centroid)
            else:
                new_clusters.append(None)
                self.dropped_clusters.append(i)
        self.clusters = new_clusters
        return np.array(new_centroid_matrix)

    def save_state(self, iteration):
        factor_assignment = np.array([v.cluster_id for k, v in factor_catalog.factors.items()])
        all_centroids = np.array([cluster.centroid for cluster in factor_catalog.clusters])
        self.state[iteration] = {"assignment":factor_assignment, "centroids": all_centroids}

    def run(self, max_iterations: int = 20):
        self.initialize_clusters()        
        centroids = self.calculate_centroids()
        converged = False
        current_iter = 0
        with tqdm(total=max_iterations*len(self.models), desc="Running clustering. N Clusters: NA, Added: NA") as pbar:
            while not converged:
                if current_iter >= max_iterations:
                    logger.info(f"Factor clustering did not converge after {max_iterations} iterations.")
                    break
                self.purge_clusters()
    
                model_list = self.rng.permutation(list(self.models.keys()))
                for model_i in model_list:
                    pbar.update(1)
                    model_factors = [factor.factor_id for factor in self.models[model_i].factors]
                    factor_dist = {}
                    factor_hi = {}
                    # Calculate distances for all factors in the model to all centroids and then order the distances.
                    for factor_i in model_factors:
                        distances = [(j, self.distance(self.factors[factor_i].profile, cluster.centroid)) for j, cluster in enumerate(self.clusters) if cluster is not None]
                        distances.sort(key=lambda x: x[1], reverse=True)
                        factor_dist[str(factor_i)] = distances
                        factor_hi[str(factor_i)] = distances[0]
                    already_assigned = []
                    factor_hi = dict(sorted(factor_hi.items(), key=lambda x: x[1], reverse=True))
                    # Assign factors to clusters, if model hasn't contributed to the cluster already and if the correlation is above the threshold
                    for factor_id in factor_hi.keys():
                        # iterate through list of clusters in order of highest correlation.
                        cluster_idx = -1
                        for cluster_i, correlation_i in factor_dist[factor_id]:
                            if cluster_i not in already_assigned and correlation_i >= self.threshold:
                                cluster_idx = cluster_i
                                break
                        if cluster_idx != -1:
                            self.clusters[cluster_idx].add(factor=self.factors[int(factor_id)], cor=factor_hi[factor_id][1])
                            already_assigned.append(cluster_idx)
                        else:
                            new_cluster_id = self.dropped_clusters.pop() if len(self.dropped_clusters) > 0 else len(self.clusters)
                                
                            new_cluster = Cluster(cluster_id=new_cluster_id, centroid=self.factors[int(factor_id)].profile)
                            new_cluster.add(factor=self.factors[int(factor_id)], cor=1.0)
                            if new_cluster_id == len(self.clusters):
                                self.clusters.append(new_cluster)
                            else:
                                self.clusters[new_cluster_id] = new_cluster
                            already_assigned.append(new_cluster_id)
    
                    # Recalculate centroids of clusters
                    self.save_state(iteration=current_iter)
                    new_centroids = self.calculate_centroids()
                    new_centroids = self.cluster_cleanup()

                    if new_centroids.shape == centroids.shape:
                        if np.sum(new_centroids - centroids) == 0.0:
                            converge = True
                    pbar.set_description(f"Running clustering. N Clusters: {len(centroids)}, Added: {len(new_centroids) - len(centroids)}")
                    centroids = new_centroids
                current_iter += 1
        self.score()

In [None]:
%%capture

# Train `n_models` models on each of `max_batches` random subsets of the data
# and register every resulting factor profile in the catalog.
rng = np.random.default_rng(seed)
batch_size = 2000
max_batches = 10
i_batches = 0
n_models = 10
max_iter = 20000

i_H = None
# Initial subset: batch_size distinct sample indices.
i_selection = rng.choice(syn_samples, size=batch_size, replace=False, shuffle=True)
i_V, i_U = prepare_data(V=V, U=U, i_selection=i_selection)

factor_catalog = BatchFactorCatalog(n_factors=factors, n_features=i_V.shape[1], threshold=0.8, seed=42)

t0 = time.time()
change_p = 0.1
# The bar advances once per batch, so its total is max_batches (it was
# max_batches*2 and therefore never reached 100%).
with tqdm(total=max_batches, desc="Generating subset profiles") as pbar:
    for i in range(max_batches):
        if i > 0:
            # Swap a fraction `change_p` of the subset for freshly drawn samples.
            # NOTE(review): the new indices are drawn from the whole dataset, so a
            # few may duplicate samples that are already in the subset.
            j_selection = rng.choice(syn_samples, size=int(batch_size*change_p), replace=False, shuffle=True)
            idx_change = rng.choice(batch_size, size=int(batch_size*change_p), replace=False, shuffle=True)
            i_selection[idx_change] = j_selection
            i_V, i_U = prepare_data(V=V, U=U, i_selection=i_selection)

        # rng.integers expects integer bounds; 1e8 is a float.
        batch_sa = BatchSA(V=i_V, U=i_U, factors=factors, models=n_models, method=method, seed=rng.integers(low=0, high=int(1e8)), max_iter=max_iter,
                            converge_delta=converge_delta, converge_n=converge_n, verbose=False)
        _ = batch_sa.train()
        pbar.update(1)
        for sa in batch_sa.results:
            factor_catalog.add_model(model=sa, norm=True)

        pbar.set_description("Generating subset profiles.")
t1 = time.time()
logger.info(f"Runtime: {((t1-t0)/60):.2f} min(s)")

In [None]:
# Cluster the catalogued factor profiles (at most 50 passes); t2/t3 bracket the call for timing.
t2 = time.time()
factor_catalog.run(max_iterations=50)
t3 = time.time()

In [13]:
# factor_catalog.plot()

In [None]:
# Summarise, for every catalogued model, its score and the cluster each of its factors was assigned to.
model_scores = {}
for model_id, model in factor_catalog.models.items():
    model_factors = []
    for factor in model.factors:
        # cluster_count is the number of factors in the cluster this factor belongs to.
        model_factors.append({"id": factor.factor_id, "cluster_id": factor.cluster_id, "cluster_count": len(factor_catalog.clusters[factor.cluster_id])})
    model_scores[model_id] = {"score": model.score, "factors": model_factors}
# Order the models from highest to lowest score.
model_scores = dict(sorted(model_scores.items(), key=lambda item: item[1]["score"], reverse=True))
model_scores

In [15]:
def select_unique(fg, count_threshold: int = 1):
    """
    Select the models whose combination of factor clusters is genuinely new.

    Models are visited from highest to lowest score. A model is kept only if, compared with
    EVERY model already kept, it uses at least one cluster that the kept model does not use
    and whose member count is greater than ``count_threshold``. The highest scoring model is
    therefore always kept, and a model that repeats the cluster combination of any kept
    model is never kept again.

    Parameters
    ----------
    fg
        Catalog object exposing ``models`` (mapping of model id to an object with ``score``
        and ``factors``) and ``clusters`` (indexable by cluster id, each entry supporting
        ``len``). Every factor exposes ``factor_id`` and ``cluster_id``.
    count_threshold : int
        A differing cluster only counts as significant when it has more than this many members.

    Returns
    -------
    dict
        ``{model_id: {factor_id: cluster_id}}`` for the kept models, in descending score order.
    """
    cluster_count = {}
    model_scores = {}
    for model_id, model in fg.models.items():
        model_factors = []
        for factor in model.factors:
            cluster_id = factor.cluster_id
            # Look up each cluster size once; it is reused for the significance test below.
            if cluster_id not in cluster_count:
                cluster_count[cluster_id] = len(fg.clusters[cluster_id])
            model_factors.append(
                {"id": factor.factor_id, "cluster_id": cluster_id, "cluster_count": cluster_count[cluster_id]}
            )
        model_scores[model_id] = {"score": model.score, "factors": model_factors}
    # Stable sort: models with equal scores keep their catalog order.
    ranked = sorted(model_scores.items(), key=lambda item: item[1]["score"], reverse=True)

    kept_cluster_sets = []
    unique_models = {}
    for model_id, model in ranked:
        cluster_mapping = {factor["id"]: factor["cluster_id"] for factor in model["factors"]}
        cluster_set = set(cluster_mapping.values())
        # The candidate must differ significantly from every kept model, not just from one of
        # them; otherwise a duplicate of an earlier kept model is re-added as soon as a second,
        # different model has been kept. all() over an empty list keeps the first model.
        is_new = all(
            any(cluster_count[cluster_id] > count_threshold for cluster_id in cluster_set - kept)
            for kept in kept_cluster_sets
        )
        if is_new:
            kept_cluster_sets.append(cluster_set)
            unique_models[model_id] = cluster_mapping
    return unique_models
            

In [None]:
# Reduce the catalog to the models with distinct cluster combinations (default count_threshold)
# and display the resulting {model_id: {factor_id: cluster_id}} mapping.
unique_models = select_unique(fg=factor_catalog)
unique_models

In [None]:
# Build the seed profile matrix from the first entry of unique_models:
# collect the centroid of every catalog cluster that this model's factors map to.
i = list(unique_models)[0]
i_factors = list(unique_models[i].values())
i_H = np.array(
    [c.centroid for c in factor_catalog.clusters if c.cluster_id in i_factors]
)
# Rescale by the column sums (sum taken over axis 0 of the stacked centroids).
i_H = np.divide(i_H, i_H.sum(axis=0))

In [18]:
%%time
# Baseline run: a single SA model trained on V/U with no supplied H or W
# (initialize is called with H=None, W=None), used as the reference for the profiler results below.
base_sa = SA(factors=factors, method=method, V=V, U=U, seed=seed, verbose=True)
base_sa.initialize(H=None, W=None)
# The training result is discarded; the model state is inspected through summary().
_ = base_sa.train(max_iter=50000, converge_delta=converge_delta, converge_n=converge_n)
base_sa.summary()

29-Apr-25 16:28:45 - ------------		Model Details		-----------
29-Apr-25 16:28:45 - 	Method: ls-nmf				Factors: 6
29-Apr-25 16:28:45 - 	Number of Features: 40			Number of Samples: 50000
29-Apr-25 16:28:45 - 	Random Seed: 42
29-Apr-25 16:28:45 - ---------------		Model Results		--------------
29-Apr-25 16:28:45 - 	Q(true): 13857819.0000			Q(robust): 11416581.0000
29-Apr-25 16:28:45 - 	MSE(true): 6.9289			MSE(robust): 5.7083
29-Apr-25 16:28:45 - 	Converged: True				Converge Steps: 11786
29-Apr-25 16:28:45 - ------------------------------------------------------


CPU times: total: 48min 12s
Wall time: 1h 1min 42s


In [None]:
%%time
# Profiler stage 1: start from the catalog-derived profile matrix i_H and train with hold_h=True.
# NOTE(review): hold_h=True presumably keeps H fixed so only W is fitted - confirm against SA.train.
# t4/t5 bracket the run so its wall time can be included in the total profiler runtime.
t4 = time.time()
final_sa1 = SA(factors=factors, method=method, V=V, U=U, seed=seed, verbose=True)
final_sa1.initialize(H=i_H, W=None)
_ = final_sa1.train(max_iter=50000, converge_delta=converge_delta, converge_n=converge_n, hold_h=True)
t5 = time.time()
final_sa1.summary()

In [None]:
%%time
# Profiler stage 2: continue from the H and W of final_sa1, now training with hold_h=False.
# NOTE(review): hold_h=False presumably lets H be updated again - confirm against SA.train.
# t6/t7 bracket the run so its wall time can be included in the total profiler runtime.
t6 = time.time()
final_sa2 = SA(factors=factors, method=method, V=V, U=U, seed=seed, verbose=True)
final_sa2.initialize(H=final_sa1.H, W=final_sa1.W)
_ = final_sa2.train(max_iter=50000, converge_delta=converge_delta, converge_n=converge_n, hold_h=False)
t7 = time.time()
final_sa2.summary()

In [None]:
# Total profiler wall time in minutes: subset training (t0-t1), catalog run (t2-t3),
# and the two full-data training stages (t4-t5, t6-t7).
total_runtime = sum(
    stop - start
    for start, stop in ((t0, t1), (t2, t3), (t4, t5), (t6, t7))
) / 60
print(f"Total Profiler Runtime: {total_runtime:.2f} min(s)")

In [None]:
# Build a second seed profile matrix, this time from the second entry of unique_models:
# collect the centroid of every catalog cluster that this model's factors map to.
i1 = list(unique_models)[1]
i_factors = list(unique_models[i1].values())
i_H1 = np.array(
    [c.centroid for c in factor_catalog.clusters if c.cluster_id in i_factors]
)
# Rescale by the column sums (sum taken over axis 0 of the stacked centroids).
i_H1 = np.divide(i_H1, i_H1.sum(axis=0))

In [None]:
%%time
# Second candidate, stage 1: start from the profile matrix i_H1 and train with hold_h=True.
# NOTE(review): hold_h=True presumably keeps H fixed so only W is fitted - confirm against SA.train.
# t4/t5 are overwritten here, so they no longer hold the timings of the final_sa1 run.
t4 = time.time()
final_sa3 = SA(factors=factors, method=method, V=V, U=U, seed=seed, verbose=True)
final_sa3.initialize(H=i_H1, W=None)
_ = final_sa3.train(max_iter=50000, converge_delta=converge_delta, converge_n=converge_n, hold_h=True)
t5 = time.time()
final_sa3.summary()

In [None]:
%%time
# Second candidate, stage 2: continue from the H and W of final_sa3, now training with hold_h=False.
# NOTE(review): hold_h=False presumably lets H be updated again - confirm against SA.train.
# t6/t7 are overwritten here, so they no longer hold the timings of the final_sa2 run.
t6 = time.time()
final_sa4 = SA(factors=factors, method=method, V=V, U=U, seed=seed, verbose=True)
final_sa4.initialize(H=final_sa3.H, W=final_sa3.W)
_ = final_sa4.train(max_iter=50000, converge_delta=converge_delta, converge_n=converge_n, hold_h=False)
t7 = time.time()
final_sa4.summary()