## Environmental Source Apportionment Toolkit (ESAT) Workflow

The source apportionment workflow that is available in ESAT replicates the functionality that is found in PMF5. The workflow covers all of the steps for each feature that is available in PMF5:
 1. Data preprocessing, cleanup and analysis.
 2. Source apportionment model creation.
 3. Post-processing analysis and visualizations.
 4. Error estimation analysis of the source apportionment model.
    1. Bootstrap (BS)
    2. Displacement (DISP)
    3. Bootstrap-Displacement (BS-DISP)
 5. Constrained source apportionment model. 

The code provided in this notebook is intended to serve as an example of how to implement the ESAT workflow programmatically.


In [None]:
# Notebook imports
import os
import sys
import json

In [None]:
# When running from Google Colab or other Jupyter notebook cloud environment, the esat python package may need to be installed.
# If the python package file is available locally run a pip install for the specific wheel for your current OS/Arch
#! pip install esat

#### Code Imports

In [None]:
from esat.data.datahandler import DataHandler
from esat.model.sa import SA
from esat.model.batch_sa import BatchSA
from esat.data.analysis import ModelAnalysis, BatchAnalysis

#### Sample Dataset
The three sample datasets from PMF5 are available for use, but a new dataset can be used in their place.

In [None]:
# All sample datasets live in a "data" directory that sits next to the
# current working directory; each dataset gets its own folder under "output".
cwd = os.getcwd()
data_dir = os.path.join(cwd, "..", "data")

# Baton Rouge Dataset: (concentrations, uncertainties, output folder)
br_input_file, br_uncertainty_file, br_output_path = (
    os.path.join(data_dir, "Dataset-BatonRouge-con.csv"),
    os.path.join(data_dir, "Dataset-BatonRouge-unc.csv"),
    os.path.join(data_dir, "output", "BatonRouge"),
)

# Baltimore Dataset: (concentrations, uncertainties, output folder)
b_input_file, b_uncertainty_file, b_output_path = (
    os.path.join(data_dir, "Dataset-Baltimore_con.txt"),
    os.path.join(data_dir, "Dataset-Baltimore_unc.txt"),
    os.path.join(data_dir, "output", "Baltimore"),
)

# Saint Louis Dataset: (concentrations, uncertainties, output folder)
sl_input_file, sl_uncertainty_file, sl_output_path = (
    os.path.join(data_dir, "Dataset-StLouis-con.csv"),
    os.path.join(data_dir, "Dataset-StLouis-unc.csv"),
    os.path.join(data_dir, "output", "StLouis"),
)

#### Input Parameters

In [None]:
index_col = "Date"                  # name of the index column used for both the input and uncertainty datasets
factors = 6                         # the number of factors
method = "ls-nmf"                   # algorithm to use: "ls-nmf" or "ws-nmf"
models = 20                         # the number of models to train
init_method = "col_means"           # initialization method: "col_means" (column means, the default), "kmeans" or "cmeans"
init_norm = True                    # if init_method is "kmeans" or "cmeans", normalize the data prior to clustering
seed = 42                           # random seed for initialization
max_iterations = 20000              # the maximum number of iterations for fitting a model
converge_delta = 0.01               # convergence criterion: threshold on the change in the loss, Q
converge_n = 50                     # convergence criterion: number of steps where the loss changes by less than converge_delta
verbose = True                      # adds more verbosity to the algorithm workflow on execution

#### Dataset Selection
One of the three sample datasets can be selected or a new cleaned dataset can be used. Datasets should be cleaned, containing no missing data (either dropping missing/NaNs, or interpolating the missing values).

In [None]:
# Loading the Baton Rouge dataset.
# All three variables must point at the same dataset: to switch datasets,
# change the prefix on every line (br_ = Baton Rouge, b_ = Baltimore,
# sl_ = Saint Louis) so that results are not written to another dataset's
# output folder.
input_file = br_input_file
uncertainty_file = br_uncertainty_file
output_path = br_output_path

#### Load Data
Assign the processed data and uncertainty datasets to the variables V and U. These steps will be simplified/streamlined in a future version of the code.

In [None]:
# Build the handler from the selected concentration/uncertainty files, indexed by `index_col`.
data_handler = DataHandler(input_path=input_file, uncertainty_path=uncertainty_file, index_col=index_col)

In [None]:
# Show the feature (column) names of the loaded input data; these are the names
# that the plotting cells below refer to.
data_handler.input_data.columns

#### Input/Uncertainty Data Metrics and Visualizations

In [None]:
import plotly.graph_objects as go
import plotly.colors

import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import gaussian_kde

In [None]:
def plot_feature_correlation_heatmap(df: pd.DataFrame, method: str = "pearson", show: bool = True):
    """
    Draw the pairwise feature correlation matrix of a DataFrame as a heatmap.

    Parameters
    ----------
    df : pd.DataFrame
        Data whose columns are the features to correlate.
    method : str
        Correlation method handed to ``DataFrame.corr``: 'pearson', 'spearman', or 'kendall'.
    show : bool
        When True, ``fig.show()`` is called before the figure is returned.

    Returns
    -------
    plotly.graph_objects.Figure
        The heatmap figure.
    """
    corr_matrix = df.corr(method=method)
    feature_labels = corr_matrix.columns

    # The color range is pinned to [-1, 1] so the scale covers the full correlation range.
    heatmap = go.Heatmap(
        z=corr_matrix.values,
        x=feature_labels,
        y=feature_labels,
        colorscale="rdylbu",
        reversescale=True,
        colorbar=dict(title="Correlation"),
        zmin=-1,
        zmax=1,
    )
    layout_options = dict(
        title=dict(x=0.5, xanchor="center", text=f"Feature Correlation Heatmap ({method.title()})"),
        xaxis_title="Features",
        yaxis_title="Features",
        width=800,
        height=800,
        margin=dict(l=0, r=0, t=50, b=0),
    )

    fig = go.Figure(data=heatmap)
    fig.update_layout(**layout_options)
    if show:
        fig.show()
    return fig

In [None]:
# Correlation heatmap of the input features using the function's default method ("pearson").
# The figure is displayed by the function itself (show=True); binding the return
# value to `_` keeps it from also being echoed as the cell's output.
_ = plot_feature_correlation_heatmap(data_handler.input_data)

In [None]:
def plot_superimposed_histograms(df, show: bool=True, nbins: int=50):
    """
    Overlay one semi-transparent histogram per column of ``df`` on a shared axis.

    Parameters
    ----------
    df : pd.DataFrame
        Data whose columns are plotted as individual histograms.
    show : bool
        When True, ``fig.show()`` is called before the figure is returned.
    nbins : int
        Value passed as ``nbinsx`` to every histogram trace.

    Returns
    -------
    plotly.graph_objects.Figure
        The overlaid histogram figure.
    """
    # Plotly's qualitative palette; it is reused cyclically when there are
    # more columns than colors.
    palette = plotly.colors.qualitative.Plotly
    palette_size = len(palette)

    fig = go.Figure()
    for position, column in enumerate(df.columns):
        trace = go.Histogram(
            x=df[column],
            name=str(column),
            opacity=0.5,
            nbinsx=nbins,
            marker_color=palette[position % palette_size],
        )
        fig.add_trace(trace)

    layout_options = dict(
        barmode='overlay',
        title=dict(x=0.5, xanchor="center", text='Histograms of Features'),
        xaxis_title='Value',
        yaxis_title='Count',
        width=1200,
        height=800,
        margin=dict(l=10, r=5, t=50, b=10),
        hovermode='x unified',
    )
    fig.update_layout(**layout_options)
    if show:
        fig.show()
    return fig

In [None]:
# Overlaid histograms of every input feature with the default bin count.
_ = plot_superimposed_histograms(data_handler.input_data)

In [None]:
def plot_2d_histogram(df, x_col, y_col, show=True, nbins=100):
    """
    Plot the joint distribution of two columns as a 2D histogram.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing both columns.
    x_col, y_col : str
        Column names plotted on the x and y axes; also used as axis titles.
    show : bool
        When True, ``fig.show()`` is called before the figure is returned.
    nbins : int
        Value passed as both ``nbinsx`` and ``nbinsy``.

    Returns
    -------
    plotly.graph_objects.Figure
        The 2D histogram figure.
    """
    density = go.Histogram2d(
        x=df[x_col],
        y=df[y_col],
        nbinsx=nbins,
        nbinsy=nbins,
        colorscale='Blues',
    )
    layout_options = dict(
        title=dict(x=0.5, xanchor="center", text=f'2D Histogram: {x_col} vs {y_col}'),
        xaxis_title=x_col,
        yaxis_title=y_col,
        width=800,
        height=800,
        margin=dict(l=20, r=20, t=60, b=20),
    )

    fig = go.Figure(data=density)
    fig.update_layout(**layout_options)
    if show:
        fig.show()
    return fig

In [None]:
# Joint distribution of two features. Both names must be columns of the selected
# dataset (presumably Baton Rouge features - confirm against the column listing above).
_ = plot_2d_histogram(data_handler.input_data, "124-Trimethylbenzene", "224-Trimethylpentane")

In [None]:
def plot_3d_histogram_ribbons_log(df, nbins=50, show=True):
    """
    Draw one 3D line ("ribbon") per column: x is log10 of the histogram bin
    centers, y is the column's position, z is the bin count.

    Only strictly positive values are histogrammed, and a column without any
    positive value contributes no ribbon. The bins are computed on the raw
    values; the log10 transform is applied to the bin centers afterwards.

    Parameters
    ----------
    df : pd.DataFrame
        Data whose columns are plotted as individual ribbons.
    nbins : int
        Number of bins passed to ``np.histogram``.
    show : bool
        When True, ``fig.show()`` is called before the figure is returned.

    Returns
    -------
    plotly.graph_objects.Figure
        The 3D ribbon figure.
    """
    fig = go.Figure()
    for position, column in enumerate(df.columns):
        values = df[column].dropna().values
        positive = values[values > 0]
        if positive.size == 0:
            continue

        counts, edges = np.histogram(positive, bins=nbins)
        centers = 0.5 * (edges[:-1] + edges[1:])
        # log10 is only defined for positive centers, so drop any others.
        keep = centers > 0
        centers, counts = centers[keep], counts[keep]

        ribbon = go.Scatter3d(
            x=np.log10(centers),
            y=np.full_like(centers, position, dtype=float),
            z=counts,
            mode='lines',
            name=str(column),
            line=dict(width=6),
            hovertemplate=f'Feature: {column}<br>log10(Bin): %{{x}}<br>Count: %{{z}}<extra></extra>',
        )
        fig.add_trace(ribbon)

    # The y axis is labelled with the column names at their integer positions.
    tick_positions = list(range(len(df.columns)))
    tick_labels = [str(name) for name in df.columns]
    fig.update_layout(
        scene=dict(
            xaxis_title='log10(Value)',
            yaxis_title='Feature',
            yaxis=dict(
                tickvals=tick_positions,
                ticktext=tick_labels
            ),
            zaxis_title='Count',
            xaxis_type='linear'  # log scale is handled by log10 transform
        ),
        title='3D Histogram Ribbon (Log Scale)',
        width=900,
        height=600
    )
    if show:
        fig.show()
    return fig

In [None]:
# One log10-scaled histogram ribbon per input feature, with the default bin count.
_ = plot_3d_histogram_ribbons_log(data_handler.input_data)

In [None]:
def plotly_ridgeline(df, log_x=False, fill=True, max_height=900, min_spacing=0.5, max_spacing=1.5, nbins=500, show=True):
    """
    Plot a ridgeline chart: one Gaussian KDE curve per column, stacked vertically.

    Columns for which a KDE cannot be estimated are skipped: fewer than two
    usable samples, or all usable samples identical. A skipped column keeps
    its vertical slot but gets no curve and no tick label.

    Parameters
    ----------
    df : pd.DataFrame
        Data whose columns are plotted as individual density curves.
    log_x : bool
        When True, the density is estimated on log10 of the strictly positive
        values and the x axis shows log10(Value).
    fill : bool
        When True, each curve is drawn with ``fill='tozeroy'``.
    max_height : int
        Figure height in pixels; also used to derive the vertical spacing.
    min_spacing, max_spacing : float
        Lower and upper bounds for the vertical offset between adjacent curves.
    nbins : int
        Number of grid points at which each density is evaluated.
    show : bool
        When True, ``fig.show()`` is called before the figure is returned.

    Returns
    -------
    plotly.graph_objects.Figure
        The ridgeline figure.
    """
    n = len(df.columns)
    # max(n, 1) keeps a frame without columns from raising ZeroDivisionError.
    spacing = min(max_spacing, max(min_spacing, (max_height - 100) / max(n, 1) / 50))
    fig = go.Figure()
    y_ticks = []
    y_labels = []
    for i, col in enumerate(df.columns):
        data = df[col].dropna().values
        if log_x:
            # log10 is only defined for positive values.
            data = np.log10(data[data > 0])
        # gaussian_kde needs at least two samples and a non-zero variance;
        # apply the same guard to both the linear and the log branch.
        if data.size < 2 or data.min() == data.max():
            continue

        x_grid = np.linspace(data.min(), data.max(), nbins)
        y = gaussian_kde(data)(x_grid)

        if log_x:
            # The grid is already in log10 space; recover the raw values for the hover text.
            raw_values = 10 ** x_grid
            log_values = x_grid
        else:
            raw_values = x_grid
            # Non-positive grid points have no log10; let them become NaN/-inf
            # in the hover text without emitting runtime warnings.
            with np.errstate(divide="ignore", invalid="ignore"):
                log_values = np.log10(x_grid + 1e-12)
        feature_names = np.full_like(x_grid, col, dtype=object)
        customdata = np.stack([raw_values, log_values, feature_names], axis=-1)

        y_offset = i * spacing
        y_ticks.append(y_offset)
        y_labels.append(str(col))
        fig.add_trace(go.Scatter(
            x=x_grid,
            y=y + y_offset,
            mode='lines',
            fill='tozeroy' if fill else None,
            name=str(col),
            line=dict(width=2),
            customdata=customdata,
            hovertemplate=(
                "Feature: %{customdata[2]}<br>"
                "log10(Value): %{customdata[1]:.3f}<br>"
                "Value: %{customdata[0]:.3g}<br>"
                "Density: %{y:.3g}<extra></extra>"
            )
        ))
    fig.update_layout(
        yaxis=dict(
            tickvals=y_ticks,
            ticktext=y_labels,
            title='Feature'
        ),
        xaxis_title='log10(Value)' if log_x else 'Value',
        title='Ridgeline Plot of Feature Distributions',
        showlegend=False,
        height=max_height,
        width=900,
        margin=dict(l=80, r=40, t=60, b=40)
    )
    if show:
        fig.show()
    return fig

In [None]:
# Ridgeline of the input features on a log10 x axis, with unfilled curves and
# the vertical spacing between curves bounded to [0.5, 1.0].
_ = plotly_ridgeline(data_handler.input_data, log_x=True, fill=False, max_height=800, min_spacing=0.5, max_spacing=1.0, nbins=500, show=True)

In [None]:
def detect_outliers(
    df: pd.DataFrame,
    dim: int,
    threshold: float = 3.0,
    random_state: int = 42,
    max_iter: int = 200
):
    """
    Flag outliers by how poorly a small autoencoder reconstructs them.

    An MLPRegressor with a single ReLU hidden layer of ``dim`` units is fit to
    reproduce its own input. Squared reconstruction errors larger than
    ``mean + threshold * std`` are flagged, once per row (mean error across
    the row) and once per individual value.

    Parameters
    ----------
    df : pd.DataFrame
        Data to analyze.
    dim : int
        Number of units in the hidden layer.
    threshold : float
        Number of standard deviations above the mean error at which a row or
        value is flagged.
    random_state : int
        Seed passed to MLPRegressor.
    max_iter : int
        Maximum number of training iterations passed to MLPRegressor.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        - Row-wise result indexed like ``df`` with the columns
          "reconstruction_error" and "is_outlier".
        - Value-wise result shaped like ``df`` whose cells are
          ``(is_outlier, squared_error)`` tuples.
    """
    def _exceeds_cutoff(err):
        # Flag entries more than `threshold` standard deviations above the mean error.
        return err > (np.mean(err) + threshold * np.std(err))

    samples = df.values
    model = MLPRegressor(
        hidden_layer_sizes=(dim,),
        activation='relu',
        max_iter=max_iter,
        random_state=random_state,
    )
    model.fit(samples, samples)

    # Manual forward pass through the fitted network: input -> hidden (ReLU) -> output.
    encoder_weights, decoder_weights = model.coefs_[0], model.coefs_[1]
    encoder_bias, decoder_bias = model.intercepts_[0], model.intercepts_[1]
    hidden = np.maximum(samples @ encoder_weights + encoder_bias, 0)
    reconstructed = hidden @ decoder_weights + decoder_bias

    # Row-wise (sample) outlier detection on the mean squared error of each row.
    errors = np.mean((samples - reconstructed) ** 2, axis=1)
    row_result = pd.DataFrame(
        {"reconstruction_error": errors, "is_outlier": _exceeds_cutoff(errors)},
        index=df.index,
    )

    # Value-wise outlier detection; the cutoff is computed over all cells at once.
    value_errors = (samples - reconstructed) ** 2
    value_outlier_mask = _exceeds_cutoff(value_errors)
    cells = [
        [(bool(flag), float(err)) for flag, err in zip(mask_row, error_row)]
        for mask_row, error_row in zip(value_outlier_mask, value_errors)
    ]
    value_result = pd.DataFrame(cells, index=df.index, columns=df.columns)
    return row_result, value_result

In [None]:
def _inlier_outlier_figure(inlier_x, inlier_y, outlier_x, outlier_y, title, yaxis_title):
    """Build a scatter figure with blue inlier markers and red 'x' outlier markers."""
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=inlier_x,
        y=inlier_y,
        mode='markers',
        name='Inlier',
        marker=dict(color='blue', size=6),
        showlegend=True
    ))
    fig.add_trace(go.Scatter(
        x=outlier_x,
        y=outlier_y,
        mode='markers',
        name='Outlier',
        marker=dict(color='red', size=8, symbol='x'),
        showlegend=True
    ))
    fig.update_layout(
        title=title,
        xaxis_title='Sample Index (Date)',
        yaxis_title=yaxis_title,
        width=800,
        height=500
    )
    return fig


def plot_outlier_detection_results(results_df, value_results=None, feature=None, show=True):
    """
    Scatter-plot reconstruction errors with outliers highlighted.

    Parameters
    ----------
    results_df : pd.DataFrame
        Row-wise result with "reconstruction_error" and "is_outlier" columns.
        Used only when ``feature`` is None.
    value_results : pd.DataFrame, optional
        Value-wise result whose cells are ``(is_outlier, loss)`` tuples.
        Required when ``feature`` is given.
    feature : str, optional
        When None, the sample-wise errors are plotted (sorted by index).
        Otherwise the value-wise losses of that column are plotted.
    show : bool
        When True, ``fig.show()`` is called before the figure is returned.

    Returns
    -------
    plotly.graph_objects.Figure

    Raises
    ------
    ValueError
        If ``feature`` is given but ``value_results`` is None or lacks that column.
    """
    if feature is None:
        # Best effort: show the index as dates when it can be parsed as such.
        try:
            results_df = results_df.copy()
            results_df.index = pd.to_datetime(results_df.index)
        except Exception:
            pass

        ordered = results_df.sort_index()
        inliers = ordered[~ordered['is_outlier']]
        outliers = ordered[ordered['is_outlier']]
        fig = _inlier_outlier_figure(
            inliers.index,
            inliers['reconstruction_error'],
            outliers.index,
            outliers['reconstruction_error'],
            title='Reconstruction Error and Outlier Detection',
            yaxis_title='Reconstruction Error',
        )
    else:
        if value_results is None or feature not in value_results.columns:
            raise ValueError("Provide value_results and a valid feature name.")

        # Best effort: show the index as dates when it can be parsed as such.
        idx = value_results.index
        try:
            idx = pd.to_datetime(idx)
        except Exception:
            pass

        # Each cell is an (is_outlier, loss) tuple; split it into two Series.
        cells = value_results[feature]
        outlier_mask = cells.apply(lambda cell: cell[0])
        loss_values = cells.apply(lambda cell: cell[1])

        fig = _inlier_outlier_figure(
            idx[~outlier_mask],
            loss_values[~outlier_mask],
            idx[outlier_mask],
            loss_values[outlier_mask],
            title=f'Value-wise Outlier Detection for {feature}',
            yaxis_title=f'Reconstruction Loss ({feature})',
        )

    if show:
        fig.show()
    return fig

In [None]:
# Sweep the autoencoder hidden-layer size and keep the run that flags the
# fewest sample-wise outliers (the first such run wins on ties).
best_dim = 0
best_results = None
least_outliers = float("inf")
for dim in range(3, 11):
    # `detect_outliers` is the function defined in this notebook; there is no
    # `detect_outliers_mlp`, so calling that name raised a NameError.
    outliers_i, outliers_fi = detect_outliers(data_handler.input_data, dim=dim, max_iter=5000)
    # Summing the boolean mask counts the flagged rows.
    outlier_count = int(outliers_i["is_outlier"].sum())
    print(f"Dim: {dim} - Outliers: {outlier_count}")
    if outlier_count < least_outliers:
        least_outliers = outlier_count
        best_dim = dim
        best_results = (outliers_i, outliers_fi)

In [None]:
# Value-wise outlier plot for one feature, using the sweep run with the fewest
# sample-wise outliers (best_results = (row-wise result, value-wise result)).
_ = plot_outlier_detection_results(best_results[0], best_results[1], feature="124-Trimethylbenzene")

In [None]:
# Single run with a fixed hidden-layer size of 6. This rebinds outliers_i and
# outliers_fi, replacing whatever the sweep cell last assigned to them.
outliers_i, outliers_fi = detect_outliers(data_handler.input_data, dim=6, max_iter=5000)
_ = plot_outlier_detection_results(outliers_i, outliers_fi, feature="Ethane")

In [None]:
# Count the flagged values per feature: take the boolean flag (first tuple
# element) from every cell, then sum each column.
# NOTE(review): DataFrame.map is the pandas >= 2.1 name for the element-wise
# applymap; older pandas versions raise AttributeError here - confirm the
# pandas version of the target environment.
outlier_counts = outliers_fi.map(lambda x: x[0]).sum()
print(outlier_counts)