From 0202ce494f320b1cc6096bc323479ff3a35b3f37 Mon Sep 17 00:00:00 2001 From: jwhite Date: Thu, 12 Dec 2024 17:58:44 -0500 Subject: [PATCH 01/58] added option to skip metadata parsing as this can be slow for large npar/nobs --- pyemu/pst/pst_handler.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pyemu/pst/pst_handler.py b/pyemu/pst/pst_handler.py index c0d50fa9b..c1d9e256a 100644 --- a/pyemu/pst/pst_handler.py +++ b/pyemu/pst/pst_handler.py @@ -61,7 +61,7 @@ class Pst(object): """ - def __init__(self, filename, load=True, resfile=None): + def __init__(self, filename, load=True, resfile=None, parse_metadata=True): self.parameter_data = None """pandas.DataFrame: '* parameter data' information. Columns are @@ -136,7 +136,7 @@ def __init__(self, filename, load=True, resfile=None): if not os.path.exists(filename): raise Exception("pst file not found:{0}".format(filename)) - self.load(filename) + self.load(filename, parse_metadata=parse_metadata) def __setattr__(self, key, value): if key == "model_command": @@ -1238,7 +1238,7 @@ def _load_version2(self, filename): "'* model input/output cant be used with '* model input' or '* model output'" ) - def load(self, filename): + def load(self, filename, parse_metadata=True): """entry point load the pest control file. Args: @@ -1271,7 +1271,8 @@ def load(self, filename): self._load_version2(filename) self._try_load_longnames() - self.try_parse_name_metadata() + if parse_metadata: + self.try_parse_name_metadata() self._reset_file_paths_os() def _reset_file_paths_os(self): From ae00d9a13597c6505cd8e36616230cea57b7ac70 Mon Sep 17 00:00:00 2001 From: jwhite Date: Thu, 24 Apr 2025 15:56:30 -0600 Subject: [PATCH 02/58] more tune ups in pypestworker --- pyemu/utils/os_utils.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pyemu/utils/os_utils.py b/pyemu/utils/os_utils.py index 94990cab8..50801208c 100644 --- a/pyemu/utils/os_utils.py +++ b/pyemu/utils/os_utils.py @@ -651,7 +651,7 @@ def send(self,s,mtype,group,runid,desc,data): full_desc = desc + fill_desc buf += full_desc.encode() buf += sdata - s.send(buf) + s.sendall(buf) def _check_sec_message(self,recv_sec_message): @@ -662,7 +662,7 @@ def _check_sec_message(self,recv_sec_message): class PyPestWorker(object): - def __init__(self, pst, host, port, timeout=0.1,verbose=True): + def __init__(self, pst, host, port, timeout=0.25,verbose=True): self.host = host self.port = port self._pst_arg = pst @@ -695,23 +695,19 @@ def _process_pst(self): def connect(self,is_reconnect=False): - self.message("trying to connect to {0}:{1}...".format(self.host,self.port)) + self.message("trying to connect to {0}:{1}...".format(self.host,self.port),echo=True) self.s = None c = 0 while True: try: time.sleep(self.timeout) - print(".", end='') c += 1 - if c % 75 == 0: - print('') - print(c) if is_reconnect and c > self.max_reconnect_attempts: print("max reconnect attempts reached...") return False self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self.s.connect((self.host, self.port)) - self.message("connected to {0}:{1}".format(self.host,self.port)) + self.message("connected to {0}:{1}".format(self.host,self.port),echo=True) break except ConnectionRefusedError: @@ -723,8 +719,8 @@ def connect(self,is_reconnect=False): return True - def message(self,msg): - if self.verbose: + def message(self,msg,echo=False): + if self.verbose or echo: print(str(datetime.now())+" : "+msg) @@ -757,9 +753,13 @@ def listen(self,lock=None,send_lock=None): if not 
success:
                    print("...exiting")
                    time.sleep(self.timeout)
+                    # set the terminate flag so that the get_pars() loop will exit
+                    self._lock.acquire()
+                    self.net_pack.mtype = 14
+                    self._lock.release()
                    return
                else:
-                    print("...reconnect successfully...")
+                    print("...reconnected successfully...")
                    continue
 
            if n > 0:

From 33e368f9bed12650c7ec2d5a6541ce0f717c6a59 Mon Sep 17 00:00:00 2001
From: jwhite
Date: Thu, 24 Apr 2025 16:16:48 -0600
Subject: [PATCH 03/58] more worker stuff

---
 pyemu/utils/os_utils.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/pyemu/utils/os_utils.py b/pyemu/utils/os_utils.py
index 50801208c..1247cf457 100644
--- a/pyemu/utils/os_utils.py
+++ b/pyemu/utils/os_utils.py
@@ -660,9 +660,21 @@ def _check_sec_message(self,recv_sec_message):
                             format(recv_sec_message,self.sec_message))


 class PyPestWorker(object):
+    """a pure python worker for pest++. the pest++ master doesn't even know...
+    Args:
+        pst (str or pyemu.Pst): the pest control file (or a file name) used to get parameter and observation information
+        host (str): master hostname or IPv4 address
+        port (int): port number that the master is listening on
+        timeout (float): number of seconds to sleep at different points in the process.
+            if you have lots of pars and/or obs, a longer sleep can be helpful, but if you make this smaller,
+            the worker responds faster...'it depends'
+        verbose (bool): flag to echo what's going on to stdout
+        socket_timeout (float): number of seconds that the socket should wait before giving up.
+            generally, this can be a big number...
+    """
-    def __init__(self, pst, host, port, timeout=0.25,verbose=True):
+    def __init__(self, pst, host, port, timeout=0.25,verbose=True, socket_timeout=None):
         self.host = host
         self.port = port
         self._pst_arg = pst
@@ -673,7 +685,9 @@ def __init__(self, pst, host, port, timeout=0.25,verbose=True):
         self.verbose = bool(verbose)
         self.par_names = None
         self.obs_names = None
-
+        if socket_timeout is None:
+            socket_timeout = timeout * 100
+        self.socket_timeout = socket_timeout
         self.par_values = None
         self.max_reconnect_attempts = 10
         self._process_pst()
@@ -741,7 +755,7 @@ def send(self,mtype,group,runid,desc="",data=0):
         return True
 
     def listen(self,lock=None,send_lock=None):
-        self.s.settimeout(self.timeout)
+        self.s.settimeout(self.socket_timeout)
         failed_reconnect = False
         while True:
             time.sleep(self.timeout)

From 858ee5d43ed8b4d0aa94a68b96ac26023be20a8f Mon Sep 17 00:00:00 2001
From: jwhite
Date: Thu, 22 May 2025 16:51:41 -0600
Subject: [PATCH 04/58] added some sugar to results handler to help with programmatic access to a sequence of ensemble/population files

---
 autotest/pst_tests_2.py     |  7 ++++---
 pyemu/pst/result_handler.py | 14 ++++++++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/autotest/pst_tests_2.py b/autotest/pst_tests_2.py
index eef36c0b0..c82e2db5b 100644
--- a/autotest/pst_tests_2.py
+++ b/autotest/pst_tests_2.py
@@ -915,6 +915,7 @@ def results_ies_1_test():
 
     pst = pyemu.Pst(os.path.join(m_d, "pest.pst"),result_dir=m_d)
 
+    df = pst.ies.get("paren",0)
     df = r.ies.rmr
     print(df)
     assert df is not None
@@ -1096,10 +1097,10 @@ def results_mou_1_test():
     assert df is not None
 
 if __name__ == "__main__":
-    results_ies_3_test()
+    #results_ies_3_test()
     results_ies_1_test()
-    results_ies_2_test()
-    results_mou_1_test()
+    #results_ies_2_test()
+    #results_mou_1_test()
 
     #at_bounds_test()
     #pst_from_flopy_geo_draw_test()
diff --git a/pyemu/pst/result_handler.py b/pyemu/pst/result_handler.py
index 033341969..7d868afb3 100644
--- a/pyemu/pst/result_handler.py
+++ b/pyemu/pst/result_handler.py
@@ -181,6 +181,20 @@ def get_files(self,tag):
             files.append(f)
         return files
 
+    def get(self,tag,*args):
+        """helper to call __getattr__() with programmatic args
+
+        Args:
+            tag (str): string for the item of interest (e.g. "paren", "dvpop", etc)
+            *args (list): optional args to str concatenate with tag when passed to
+                __getattr__(). for example tag could be "paren" and args could be 0,
+                so that what is passed to __getattr__() is "paren0".
+        Returns:
+            "it depends"
+
+        """
+        ttag = tag + "".join([str(a) for a in args])
+        return self.__getattr__(ttag)
 
     def __getattr__(self,tag):
         """overload of the get-attribute class method to make things super

From ce5dff34a2f2704e8e7c0f816a8224eb6b7cc538 Mon Sep 17 00:00:00 2001
From: Rui Hugman
Date: Mon, 16 Jun 2025 15:13:10 +0100
Subject: [PATCH 05/58] introducing transformer classes and pipeline

---
 pyemu/__init__.py               |  12 +-
 pyemu/emulators/__init__.py     |  21 +
 pyemu/emulators/base.py         | 187 ++++
 pyemu/emulators/transformers.py | 736 ++++++++++++++++++++++++++++++++
 4 files changed, 955 insertions(+), 1 deletion(-)
 create mode 100755 pyemu/emulators/__init__.py
 create mode 100755 pyemu/emulators/base.py
 create mode 100755 pyemu/emulators/transformers.py

diff --git a/pyemu/__init__.py b/pyemu/__init__.py
index db0e00960..9b88113c7 100644
--- a/pyemu/__init__.py
+++ b/pyemu/__init__.py
@@ -20,7 +20,9 @@
 from .sc import Schur
 from .utils import (geostats, gw_utils, helpers, metrics, optimization,
                     os_utils, pp_utils, smp_utils)
-
+from .emulators import (Emulator, BaseTransformer, Log10Transformer,
+                        RowWiseMinMaxScaler, StandardScalerTransformer, NormalScoreTransformer,
+                        TransformerPipeline, AutobotsAssemble)
 #from .prototypes import *
 try:
     from .legacy import *
@@ -53,5 +55,13 @@
     "smp_utils",
     "plot_utils",
     "metrics",
+    "Emulator",
+    "BaseTransformer",
+    "Log10Transformer",
+    "RowWiseMinMaxScaler",
+    "StandardScalerTransformer",
+    "NormalScoreTransformer",
+    "TransformerPipeline",
+    "AutobotsAssemble",
 ]
 # del get_versions
diff --git a/pyemu/emulators/__init__.py b/pyemu/emulators/__init__.py
new file mode 100755
index 000000000..3bd39b1da
--- /dev/null
+++ b/pyemu/emulators/__init__.py
@@ -0,0 +1,21 @@
+from .transformers import (
+    BaseTransformer,
+    Log10Transformer,
+    RowWiseMinMaxScaler,
+    StandardScalerTransformer,
+    NormalScoreTransformer,
+    TransformerPipeline,
+    AutobotsAssemble
+)
+from .base import Emulator
+
+__all__ = [
+    'Emulator', #base Emulator Class
+    'BaseTransformer',
+    'Log10Transformer',
+    'RowWiseMinMaxScaler',
+    'StandardScalerTransformer',
+    'NormalScoreTransformer',
+    'TransformerPipeline',
+    'AutobotsAssemble'
+]
diff --git a/pyemu/emulators/base.py b/pyemu/emulators/base.py
new file mode 100755
index 000000000..f088d91ee
--- /dev/null
+++ b/pyemu/emulators/base.py
@@ -0,0 +1,187 @@
+"""
+Base class for emulators.
+"""
+from __future__ import print_function, division
+import pickle
+import numpy as np
+import pandas as pd
+from ..logger import Logger
+
+class Emulator:
+    """
+    Base class for emulators.
+
+    This class defines the common interface for all emulator implementations
+    and provides shared functionality used by multiple emulator types.
+
+    Parameters
+    ----------
+    verbose : bool, optional
+        If True, enable verbose logging. Default is True.
+    """
+
+    def __init__(self, verbose=True):
+        """
+        Initialize the Emulator base class.
+
+        Parameters
+        ----------
+        verbose : bool, optional
+            If True, enable verbose logging. Default is True.
+ """ + self.logger = Logger(verbose) + self.log = self.logger.log + self.fitted = False + self.data = None + self.data_transformed = None + self.feature_scaler = None + self.energy_threshold = 1.0 + self.feature_transformer = None + + def fit(self, X, y=None): + """ + Fit the emulator to training data. + + Parameters + ---------- + X : pandas.DataFrame + Input features for training. + y : pandas.DataFrame or None, optional + Target values for training if separate from X. + + Returns + ------- + self : Emulator + Returns self for method chaining. + """ + raise NotImplementedError("Subclasses must implement fit method") + + def predict(self, X): + """ + Generate predictions using the fitted emulator. + + Parameters + ---------- + X : pandas.DataFrame + Input data to generate predictions for. + + Returns + ------- + pandas.DataFrame or pandas.Series + Predictions for the input data. + """ + if not self.fitted: + raise ValueError("Emulator must be fitted before prediction") + raise NotImplementedError("Subclasses must implement predict method") + + def prepare_training_data(self, data=None): + """ + Prepare and transform training data for model fitting. + + Parameters + ---------- + data : pandas.DataFrame, optional + Raw training data. If None, uses self.data. + + Returns + ------- + tuple + Processed data ready for model fitting. + """ + if data is None: + if self.data is None: + raise ValueError("No data provided and no data stored in the emulator") + data = self.data + + # Common preprocessing logic could go here + return data + + def apply_feature_transforms(self, data=None, transforms=None): + """ + Apply feature transformations to data with customizable transformer sequence. + + Parameters + ---------- + data : pandas.DataFrame, optional + Data to transform. If None, uses self.data. + transforms : list of dict, optional + List of transformation specifications. Each dict should have: + - 'type': str - Type of transformation (e.g., 'log10', 'normal_score') + - 'columns': list - Columns to apply the transformation to (optional) + - Additional kwargs specific to the transformer + If None, no transformations are applied. + + Returns + ------- + pandas.DataFrame + Transformed data. + + Examples + -------- + # Using the transforms parameter: + emulator.apply_feature_transforms( + transforms=[ + {'type': 'log10', 'columns': ['flow', 'heads']}, + {'type': 'normal_score', 'columns': None, 'quadratic_extrapolation': True} + ] + ) + """ + if data is None: + data = self.data + + if data is None: + raise ValueError("No data provided and no data stored in the emulator") + + self.logger.statement("applying feature transforms") + # Import AutobotsAssemble here to avoid circular import + from .transformers import AutobotsAssemble + + ft = AutobotsAssemble(data.copy()) + + # Process the transforms parameter if provided + if transforms: + for transform in transforms: + transform_type = transform.get('type') + columns = transform.get('columns') + # Extract transformer-specific kwargs + kwargs = {k: v for k, v in transform.items() + if k not in ('type', 'columns')} + + self.logger.statement(f"applying {transform_type} transform") + ft.apply(transform_type, columns=columns, **kwargs) + + transformed_data = ft.df.copy() + self.feature_transformer = ft + self.data_transformed = transformed_data + + return transformed_data + + def save(self, filename): + """ + Save the fitted emulator to a file. + + Parameters + ---------- + filename : str + Path to save the emulator. 
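+
+        Examples
+        --------
+        # a minimal sketch; `emu` is assumed to be an already-fitted Emulator
+        # subclass instance and the file name is illustrative:
+        emu.save("emulator.pickle")
+        restored = Emulator.load("emulator.pickle")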
+ """ + with open(filename, "wb") as f: + pickle.dump(self, f) + + @classmethod + def load(cls, filename): + """ + Load a fitted emulator from a file. + + Parameters + ---------- + filename : str + Path to the saved emulator file. + + Returns + ------- + Emulator + The loaded emulator instance. + """ + with open(filename, "rb") as f: + return pickle.load(f) \ No newline at end of file diff --git a/pyemu/emulators/transformers.py b/pyemu/emulators/transformers.py new file mode 100755 index 000000000..22c1bbb02 --- /dev/null +++ b/pyemu/emulators/transformers.py @@ -0,0 +1,736 @@ +""" +Transformer classes for data transformations in emulators. +""" +from __future__ import print_function, division +import numpy as np +import pandas as pd + +class BaseTransformer: + """Base class for all transformers providing a consistent interface.""" + + def fit(self, X): + """Learn parameters from data if needed.""" + return self + + def transform(self, X): + """Apply transformation to X.""" + raise NotImplementedError + + def fit_transform(self, X): + """Fit and transform in one step.""" + return self.fit(X).transform(X) + + def inverse_transform(self, X): + """Inverse transform X back to original space.""" + raise NotImplementedError + +class Log10Transformer(BaseTransformer): + """Apply log10 transformation.""" + + def __init__(self): + self.shifts = {} + + def transform(self, X): + result = X.copy() + for col in X.columns: + min_val = X[col].min() + shift = -min_val + 1e-6 if min_val <= 0 else 0 + self.shifts[col] = shift + result[col] = np.log10(X[col] + shift) + return result + + def inverse_transform(self, X): + result = X.copy() + for col in X.columns: + shift = self.shifts.get(col, 0) + result[col] = (10 ** X[col]) - shift + return result + +class RowWiseMinMaxScaler(BaseTransformer): + """Scale each row of a DataFrame to a specified range. + + Parameters + ---------- + feature_range : tuple (min, max), default=(-1, 1) + The range to scale features into. + groups : dict or None, default=None + Dict mapping group names to lists of column names to be scaled together (entire timeseries for that group). + If None, all columns will be treated as a single group. + fit_groups : dict or None, default=None + Dict mapping group names to lists of column names (subset of groups) used to compute row-wise min and max. + If None, defaults to using the same columns as in groups. + """ + + def __init__(self, feature_range=(-1, 1), groups=None, fit_groups=None): + self.feature_range = feature_range + self.groups = groups + self.fit_groups = fit_groups if fit_groups is not None else groups + self.row_params = {} # Will store per-row (min, max) for each group + + def fit(self, X): + """Compute row-wise min and max for each group. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to fit the scaler on. + + Returns + ------- + self : object + Returns self. 
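+
+        Examples
+        --------
+        # a small sketch; `df` and the group/column names are illustrative:
+        scaler = RowWiseMinMaxScaler(feature_range=(-1, 1),
+                                     groups={"flow": ["q1", "q2"], "head": ["h1", "h2"]})
+        scaled = scaler.fit_transform(df)
+        restored = scaler.inverse_transform(scaled)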
+ """ + # If groups not specified, treat all columns as one group + if self.groups is None: + self.groups = {"all": X.columns.tolist()} + + if self.fit_groups is None: + self.fit_groups = self.groups.copy() + + # Calculate and store row-wise min and max for each group + self.row_params = {} + for group_name, group_cols in self.groups.items(): + # Determine which columns to use for computing min/max for each row + fit_cols = self.fit_groups.get(group_name, group_cols) + # Keep only columns that exist in the DataFrame + fit_cols = [col for col in fit_cols if col in X.columns] + if not fit_cols: + continue + + # Compute row-wise min and max using the fit columns + row_min = X[fit_cols].min(axis=1) + row_max = X[fit_cols].max(axis=1) + self.row_params[group_name] = (row_min, row_max) + + return self + + def transform(self, X): + """Scale each row of data to the specified range. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to transform. + + Returns + ------- + pandas.DataFrame + The transformed DataFrame. + """ + result = X.copy() + f_min, f_max = self.feature_range + + # Auto-fit if not already fitted or if groups weren't specified + if not self.row_params or self.groups is None: + self.fit(X) + + # Transform each group + for group_name, group_cols in self.groups.items(): + # Keep only columns that exist in the DataFrame + valid_cols = [col for col in group_cols if col in X.columns] + if not valid_cols: + continue + + # Get the min and max for each row in this group + row_min, row_max = self.row_params[group_name] + + # Calculate the row range, avoiding division by zero + row_range = row_max - row_min + row_range[row_range == 0] = 1.0 # Set to 1 where range is 0 + + # For all columns in the group, scale using the row-wise parameters + group_data = X[valid_cols] + # First scale to [0, 1] + group_std = group_data.sub(row_min, axis=0).div(row_range, axis=0) + # Then scale to the desired feature range + result[valid_cols] = group_std * (f_max - f_min) + f_min + + return result + + def inverse_transform(self, X): + """Inverse transform data back to the original scale. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to inverse transform. + + Returns + ------- + pandas.DataFrame + The inverse-transformed DataFrame. + """ + if not self.row_params: + raise ValueError("This RowWiseMinMaxScaler instance is not fitted yet. " + "Call 'fit' before using this method.") + + result = X.copy() + f_min, f_max = self.feature_range + + # Inverse transform each group + for group_name, group_cols in self.groups.items(): + # Keep only columns that exist in the DataFrame + valid_cols = [col for col in group_cols if col in X.columns] + if not valid_cols: + continue + + # Get the min and max for each row in this group + row_min, row_max = self.row_params[group_name] + row_range = row_max - row_min + row_range[row_range == 0] = 1.0 # Avoid division by zero + + # Get the scaled data for this group + group_data = X[valid_cols] + + # First convert from feature_range to [0, 1] + group_std = (group_data - f_min) / (f_max - f_min) + + # Then recover original values + result[valid_cols] = group_std.mul(row_range, axis=0).add(row_min, axis=0) + + return result + +class MinMaxScaler(BaseTransformer): + """Scale each column of a DataFrame to a specified range. + + Parameters + ---------- + feature_range : tuple (min, max), default=(0, 1) + The range to scale features into. + columns : list, optional + List of column names to be scaled. If None, all columns will be scaled. 
+ skip_constant : bool, optional + If True, columns with constant values will be skipped. Default is True. + """ + + def __init__(self, feature_range=(-1, 1), columns=None, skip_constant=True): + self.feature_range = feature_range + self.columns = columns + self.skip_constant = skip_constant + self.min_ = {} + self.scale_ = {} + + def fit(self, X): + """Learn min and max values for scaling. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to fit the scaler on. + + Returns + ------- + self : object + Returns self. + """ + columns = self.columns if self.columns is not None else X.columns + + # Ensure we only work with columns that exist in the DataFrame + columns = [col for col in columns if col in X.columns] + + for col in columns: + col_min = X[col].min() + col_max = X[col].max() + + # If the column has constant values and skip_constant is True, store the values but don't transform + if self.skip_constant and col_min == col_max: + self.min_[col] = col_min + self.scale_[col] = 0 # Flag for constant column + else: + # Store min and calculate scale factor for non-constant columns + self.min_[col] = col_min + # Avoid division by zero for nearly constant columns + if col_max - col_min > 1e-10: + self.scale_[col] = (self.feature_range[1] - self.feature_range[0]) / (col_max - col_min) + else: + # For nearly constant columns, set scale to 0 to keep original value + self.scale_[col] = 0 + + return self + + def transform(self, X): + """Scale features according to feature_range. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to transform. + + Returns + ------- + pandas.DataFrame + The transformed DataFrame. + """ + if not self.min_: + self.fit(X) + + result = X.copy() + + f_min, f_max = self.feature_range + + for col in self.min_.keys(): + if col not in X.columns: + continue + + # Skip columns marked as constant + if self.scale_[col] == 0: + continue + + # Apply scaling: X_std = (X - X.min) / (X.max - X.min) -> X_scaled = X_std * (max - min) + min + result[col] = (X[col] - self.min_[col]) * self.scale_[col] + f_min + + return result + + def inverse_transform(self, X): + """Undo the scaling of X according to feature_range. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to inverse transform. + + Returns + ------- + pandas.DataFrame + The inverse-transformed DataFrame. + """ + if not self.min_: + raise ValueError("This MinMaxScaler instance is not fitted yet. 
Call 'fit' before using this method.") + + result = X.copy() + + f_min, f_max = self.feature_range + + for col in self.min_.keys(): + if col not in X.columns: + continue + + # Skip columns marked as constant + if self.scale_[col] == 0: + continue + + # Apply inverse scaling: X_original = (X_scaled - min) / (max - min) * (X.max - X.min) + X.min + result[col] = (X[col] - f_min) / self.scale_[col] + self.min_[col] + + return result + +class StandardScalerTransformer(BaseTransformer): + """Apply standard scaling (zero mean, unit variance) to data.""" + + def __init__(self): + self.means = {} + self.stds = {} + + def fit(self, X): + """Compute mean and standard deviation for each feature.""" + for col in X.columns: + self.means[col] = X[col].mean() + self.stds[col] = X[col].std() + if self.stds[col] == 0: + self.stds[col] = 1.0 # Avoid division by zero + return self + + def transform(self, X): + """Transform the data using mean and std from fit.""" + result = X.copy() + for col in X.columns: + if col in self.means: + mean = self.means[col] + std = self.stds[col] + result[col] = (X[col] - mean) / std + return result + + def inverse_transform(self, X): + """Inverse transform data back to original scale.""" + result = X.copy() + for col in X.columns: + if col in self.means: + mean = self.means[col] + std = self.stds[col] + result[col] = (X[col] * std) + mean + return result + +class NormalScoreTransformer(BaseTransformer): + """A transformer for normal score transformation.""" + + def __init__(self, tol=1e-7, max_samples=1000000, quadratic_extrapolation=False): + self.tol = tol + self.max_samples = max_samples + self.quadratic_extrapolation = quadratic_extrapolation + self.column_parameters = {} + self.shared_z_scores = {} + + def fit(self, X): + """Fit the transformer to the data.""" + for col in X.columns: + values = X[col].values + sorted_vals = np.sort(values) + smoothed_vals = self._moving_average_with_endpoints(sorted_vals) + + n_points = len(smoothed_vals) + if n_points not in self.shared_z_scores: + self.shared_z_scores[n_points] = self._randrealgen_optimized(n_points) + + z_scores = self.shared_z_scores[n_points] + + self.column_parameters[col] = { + 'z_scores': z_scores, + 'originals': smoothed_vals, + } + return self + + def transform(self, X): + """Transform the data using normal score transformation. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to transform. + + Returns + ------- + pandas.DataFrame + The transformed DataFrame with normal scores. 
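+
+        Examples
+        --------
+        # a round-trip sketch; `df` is an illustrative DataFrame of (possibly skewed) values:
+        nst = NormalScoreTransformer()
+        scores = nst.fit(df).transform(df)
+        back = nst.inverse_transform(scores)  # approximately recovers df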
+ """ + result = X.copy() + for col in X.columns: + params = self.column_parameters.get(col, {}) + z_scores = params.get('z_scores', []) + originals = params.get('originals', []) + + if len(z_scores) == 0 or len(originals) == 0: + continue + + values = X[col].values + + # Handle values outside the original range + min_orig, max_orig = np.min(originals), np.max(originals) + min_z, max_z = np.min(z_scores), np.max(z_scores) + + # For values within range, use interpolation + within_range = (values >= min_orig) & (values <= max_orig) + if within_range.any(): + result.loc[within_range, col] = np.interp( + values[within_range], originals, z_scores + ) + + # For values outside range, use extrapolation if enabled or clamp to bounds + below_min = values < min_orig + above_max = values > max_orig + + if below_min.any(): + if self.quadratic_extrapolation: + # Use linear extrapolation below minimum + slope = (z_scores[1] - z_scores[0]) / (originals[1] - originals[0]) + result.loc[below_min, col] = min_z + slope * (values[below_min] - min_orig) + else: + # Otherwise clamp to minimum z-score + result.loc[below_min, col] = min_z + + if above_max.any(): + if self.quadratic_extrapolation: + # Use linear extrapolation above maximum + slope = (z_scores[-1] - z_scores[-2]) / (originals[-1] - originals[-2]) + result.loc[above_max, col] = max_z + slope * (values[above_max] - max_orig) + else: + # Otherwise clamp to maximum z-score + result.loc[above_max, col] = max_z + + return result + + def inverse_transform(self, X): + """Inverse transform data back to original space. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame with transformed data to inverse transform. + + Returns + ------- + pandas.DataFrame + The inverse-transformed DataFrame. + """ + result = X.copy() + for col in X.columns: + params = self.column_parameters.get(col, {}) + z_scores = params.get('z_scores', []) + originals = params.get('originals', []) + if len(z_scores) == 0 or len(originals) == 0: + continue + + # Get values to inverse transform + values = X[col].values + min_z, max_z = np.min(z_scores), np.max(z_scores) + min_orig, max_orig = np.min(originals), np.max(originals) + + # For values within the z-score range, use interpolation + within_range = (values >= min_z) & (values <= max_z) + if within_range.any(): + result.loc[within_range, col] = np.interp(values[within_range], z_scores, originals) + + # For values outside the z-score range, use extrapolation if enabled + below_min = values < min_z + above_max = values > max_z + + if below_min.any(): + if self.quadratic_extrapolation: + # Use linear extrapolation below minimum z-score + slope = (originals[1] - originals[0]) / (z_scores[1] - z_scores[0]) + intercept = originals[0] - slope * z_scores[0] + result.loc[below_min, col] = slope * values[below_min] + intercept + else: + # Otherwise clamp to minimum original value + result.loc[below_min, col] = min_orig + + if above_max.any(): + if self.quadratic_extrapolation: + # Use linear extrapolation above maximum z-score + slope = (originals[-1] - originals[-2]) / (z_scores[-1] - z_scores[-2]) + intercept = originals[-1] - slope * z_scores[-1] + result.loc[above_max, col] = slope * values[above_max] + intercept + else: + # Otherwise clamp to maximum original value + result.loc[above_max, col] = max_orig + + return result + + def _randrealgen_optimized(self, nreal): + rval = np.zeros(nreal) + nsamp = 0 + numsort = (nreal + 1) // 2 if nreal % 2 == 0 else nreal // 2 + + while nsamp < self.max_samples: + nsamp += 1 + work1 = 
np.random.normal(size=nreal) + work1.sort() + + if nsamp > 1: + previous_mean = rval[:numsort] / (nsamp - 1) + rval[:numsort] += work1[:numsort] + current_mean = rval[:numsort] / nsamp + max_diff = np.max(np.abs(current_mean - previous_mean)) + + if max_diff <= self.tol: + break + else: + rval[:numsort] = work1[:numsort] + + rval[:numsort] /= nsamp + rval[numsort:] = -rval[:numsort][::-1] if nreal % 2 == 0 else np.concatenate(([-rval[numsort]], -rval[:numsort][::-1])) + return rval + + def _moving_average_with_endpoints(self, y_values): + """Apply a moving average smoothing to an array while preserving endpoints.""" + window_size = 3 + if y_values.shape[0] > 40: + window_size = 5 + if y_values.shape[0] > 90: + window_size = 7 + if y_values.shape[0] > 200: + window_size = 9 + + if window_size % 2 == 0: + raise ValueError("window_size must be odd") + half_window = window_size // 2 + smoothed_y = np.zeros_like(y_values) + + # Handle start points correctly + for i in range(0, half_window): + smoothed_y[i] = np.mean(y_values[:i + half_window + 1]) + + # Handle end points correctly + for i in range(1, half_window + 1): + smoothed_y[-i] = np.mean(y_values[-(i + half_window):]) + + # Middle points + for i in range(half_window, len(y_values) - half_window): + smoothed_y[i] = np.mean(y_values[i - half_window:i + half_window + 1]) + + # Preserve original endpoints exactly + smoothed_y[0] = y_values[0] + smoothed_y[-1] = y_values[-1] + + # Ensure monotonicity + for i in range(1, len(smoothed_y)): + if smoothed_y[i] <= smoothed_y[i - 1]: + smoothed_y[i] = smoothed_y[i - 1] + 1e-16 + + return smoothed_y + +class TransformerPipeline: + """Apply a sequence of transformers in order.""" + + def __init__(self): + self.transformers = [] + self.fitted = False + + def add(self, transformer, columns=None): + """Add a transformer to the pipeline, optionally for specific columns.""" + self.transformers.append((transformer, columns)) + return self + + def fit(self, X): + """Fit all transformers in the pipeline.""" + for transformer, columns in self.transformers: + cols_to_transform = columns if columns is not None else X.columns + sub_X = X[cols_to_transform] + transformer.fit(sub_X) + self.fitted = True + return self + + def transform(self, X): + """Transform data using all transformers in the pipeline. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to transform. + + Returns + ------- + pandas.DataFrame + The transformed DataFrame. + """ + result = X.copy() + for transformer, columns in self.transformers: + cols_to_transform = columns if columns is not None else X.columns + # Only use columns that exist in the input data + valid_cols = [col for col in cols_to_transform if col in X.columns] + if not valid_cols: + continue + sub_X = result[valid_cols] + result[valid_cols] = transformer.transform(sub_X) + return result + + def fit_transform(self, X): + """Fit all transformers and transform data in one operation.""" + self.fit(X) + return self.transform(X) + + def inverse_transform(self, X): + """Apply inverse transformations in reverse order. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to inverse transform. + + Returns + ------- + pandas.DataFrame + The inverse-transformed DataFrame. 
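+
+        Examples
+        --------
+        # a sketch; `df` and the column name are illustrative. the inverse is
+        # applied in reverse order: the scaler is undone first, then the log10:
+        pipe = TransformerPipeline()
+        pipe.add(Log10Transformer(), columns=["flux"])
+        pipe.add(StandardScalerTransformer())
+        z = pipe.fit_transform(df)
+        original = pipe.inverse_transform(z)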
+ """ + + if isinstance(X, pd.Series): + result = X.copy().to_frame().T + else: + result = X.copy() + # Need to reverse the order of transformers for inverse + for transformer, columns in reversed(self.transformers): + cols_to_transform = columns if columns is not None else result.columns + # Only use columns that exist in the input data + valid_cols = [col for col in cols_to_transform if col in result.columns] + if not valid_cols: + continue + sub_X = result[valid_cols].copy() # Create a copy to avoid reference issues + inverted = transformer.inverse_transform(sub_X) + result.loc[:, valid_cols] = inverted # Use loc for proper assignment + if isinstance(X, pd.Series): + result = result.iloc[0] + return result + +class AutobotsAssemble: + """Class for transforming features in a DataFrame using a pipeline approach.""" + + def __init__(self, df=None): + self.df = df.copy() if df is not None else None + self.pipeline = TransformerPipeline() + + def apply(self, transform_type, columns=None, **kwargs): + """Apply a transformation to specified columns.""" + transformer = self._create_transformer(transform_type, **kwargs) + if columns is None: + columns = list(self.df.columns) # Convert to list to avoid pandas index issues + + # Fit transformer to data if needed + if hasattr(transformer, 'fit') and callable(transformer.fit): + if self.df is not None: + df_subset = self.df[columns] + transformer.fit(df_subset) + + # Add to pipeline + self.pipeline.add(transformer, columns) + + # Apply transformation to current df if available + if self.df is not None: + # Use transform directly to ensure correct application + df_subset = self.df[columns].copy() + transformed = transformer.transform(df_subset) + self.df[columns] = transformed + + return self + + def transform(self, df): + """Transform an external DataFrame using the pipeline. + + Parameters + ---------- + df : pandas.DataFrame + The DataFrame to transform. + + Returns + ------- + pandas.DataFrame + The transformed DataFrame. + """ + if self.pipeline.transformers: + return self.pipeline.transform(df) + return df.copy() + + def inverse(self, df=None): + """Apply inverse transformations in reverse order.""" + to_transform = df if df is not None else self.df + result = self.pipeline.inverse_transform(to_transform) + if df is None: + self.df = result + return result + + def inverse_on_external_df(self, df, columns=None): + """Apply inverse transformations to an external DataFrame. + + Parameters + ---------- + df : pandas.DataFrame + The DataFrame to inverse transform. + columns : list, optional + Specific columns to inverse transform. If None, all columns are processed. + + Returns + ------- + pandas.DataFrame + The inverse-transformed DataFrame. 
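+
+        Examples
+        --------
+        # a sketch; `training_df` and `external_df` are illustrative DataFrames
+        # that share the same column names:
+        aa = AutobotsAssemble(training_df)
+        aa.apply("log10", columns=["flux"])
+        aa.apply("normal_score")
+        back = aa.inverse_on_external_df(external_df)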
+ """ + to_transform = df.copy() + if columns is not None: + # Ensure we only process specified columns + missing_cols = [col for col in columns if col not in df.columns] + if missing_cols: + raise ValueError(f"Columns not found in DataFrame: {missing_cols}") + + return self.pipeline.inverse_transform(to_transform) + + def _create_transformer(self, transform_type, **kwargs): + """Factory method to create appropriate transformer.""" + if transform_type == "log10": + return Log10Transformer() + elif transform_type == "normal_score": + return NormalScoreTransformer(**kwargs) + elif transform_type == "row_wise_minmax": + return RowWiseMinMaxScaler(**kwargs) + elif transform_type == "standard_scaler": + return StandardScalerTransformer() + elif transform_type == "minmax_scaler": + return MinMaxScaler(**kwargs) + else: + raise ValueError(f"Unknown transform type: {transform_type}") \ No newline at end of file From f3c45acb59868e142b86b3a3f5ce33c78cb6a24f Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 16 Jun 2025 15:14:54 +0100 Subject: [PATCH 06/58] transformer tests --- autotest/transformer_tests.py | 462 ++++++++++++++++++++++++++++++++++ 1 file changed, 462 insertions(+) create mode 100755 autotest/transformer_tests.py diff --git a/autotest/transformer_tests.py b/autotest/transformer_tests.py new file mode 100755 index 000000000..493be1179 --- /dev/null +++ b/autotest/transformer_tests.py @@ -0,0 +1,462 @@ +import os +import sys +import shutil +import pytest +import numpy as np +import pandas as pd +import platform +sys.path.append("..") +import pyemu + +def test_base_transformer(): + """Test the BaseTransformer abstract class functionality""" + bt = pyemu.emulators.BaseTransformer() + + # fit should return self + assert bt.fit(None) is bt + + # fit_transform should call fit and transform + with pytest.raises(NotImplementedError): + bt.fit_transform(None) + + # transform should raise NotImplementedError + with pytest.raises(NotImplementedError): + bt.transform(None) + + # inverse_transform should raise NotImplementedError + with pytest.raises(NotImplementedError): + bt.inverse_transform(None) + +def test_log10_transformer(): + """Test the Log10Transformer functionality""" + # Create test dataframe with positive and negative values + df = pd.DataFrame({ + 'pos': [1, 10, 100, 1000], + 'zero': [0, 0.1, 0.01, 0.001], + 'neg': [-1, -10, -100, -1000] + }) + + # Initialize and test transformer + lt = pyemu.emulators.Log10Transformer() + + # Transform data + transformed = lt.transform(df) + + # Check that positive values are properly transformed + np.testing.assert_allclose( + transformed['pos'].values, + np.log10(df['pos'].values) + ) + + # Check that zeros/small values are handled correctly + assert not np.any(np.isinf(transformed['zero'].values)) + + # Check that negative values are handled correctly + assert not np.any(np.isnan(transformed['neg'].values)) + + # Test inverse transform + back_transformed = lt.inverse_transform(transformed) + + # Check that we get back very close to original values + np.testing.assert_allclose( + back_transformed['pos'].values, + df['pos'].values + ) + + # For zero/very small values + np.testing.assert_allclose( + back_transformed['zero'].values, + df['zero'].values , + rtol=1e-6 + ) + + # For negative values + np.testing.assert_allclose( + back_transformed['neg'].values, + df['neg'].values , + rtol=1e-6 + ) + +def test_row_wise_minmax_scaler(): + """Test the RowWiseMinMaxScaler functionality""" + # Test data + df = pd.DataFrame({ + 'a': [1, 2, 3, 4], + 'b': [10, 
20, 30, 40], + 'c': [100, 200, 300, 400] + }) + + # Initialize scaler + scaler = pyemu.emulators.RowWiseMinMaxScaler() + + # Fit and transform + transformed = scaler.fit_transform(df) + + # Check each row is scaled to [0, 1] + for i in range(len(df)): + row_min = transformed.iloc[i].min() + row_max = transformed.iloc[i].max() + assert np.isclose(row_min, -1.0) + assert np.isclose(row_max, 1.0) + + # Test inverse transform + back_transformed = scaler.inverse_transform(transformed) + + # Check we get back original values + np.testing.assert_allclose(back_transformed.values, df.values) + +def test_normal_score_transformer(): + """Test the NormalScoreTransformer functionality""" + # Create test data with various distributions + np.random.seed(42) + n = 200 + + # Uniform data + uniform_data = np.random.uniform(0, 10, n) + + # Log-normal data + lognormal_data = np.exp(np.random.normal(0, 1, n)) + + # Bimodal data + bimodal_data = np.concatenate([ + np.random.normal(-3, 1, n//2), + np.random.normal(3, 1, n//2) + ]) + + df = pd.DataFrame({ + 'uniform': uniform_data, + 'lognormal': lognormal_data, + 'bimodal': bimodal_data + }) + + # Initialize transformer + nst = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=False) + + # Transform data + transformed = nst.fit_transform(df) + + # Check transformed distributions are more normal + # For each column, check skewness and kurtosis are closer to normal + for col in df.columns: + # Calculate statistics of original and transformed data + orig_skew = skewness(df[col].values) + trans_skew = skewness(transformed[col].values) + + orig_kurt = kurtosis(df[col].values) + trans_kurt = kurtosis(transformed[col].values) + + # Transformed data should have skewness closer to 0 + assert abs(trans_skew) < abs(orig_skew) or np.isclose(abs(trans_skew), 0, atol=0.5) + + # Transformed data should have kurtosis closer to 3 (normal distribution) + assert abs(trans_kurt - 3) < abs(orig_kurt - 3) or np.isclose(trans_kurt, 3, atol=1.0) + + # Test inverse transform + back_transformed = nst.inverse_transform(transformed) + + # Check we get back close to original values + # (not exact due to binning and smoothing) + np.testing.assert_allclose( + back_transformed.values, + df.values, + rtol=0.1, + atol=0.1 + ) + + # Test with quadratic extrapolation + nst_quad = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=True) + transformed_quad = nst_quad.fit_transform(df) + + # Create data outside the original range for extrapolation test + # Transform should not fail for out-of-range values when using quadratic extrapolation + extreme_transformed = transformed_quad.copy() + extreme_transformed.loc[0] = transformed_quad.min() - 1 + extreme_transformed.loc[1] = transformed_quad.max() + 1 + + back_extreme = nst_quad.inverse_transform(extreme_transformed) + assert not np.any(np.isnan(back_extreme.values)) + assert not np.any(np.isinf(back_extreme.values)) + +def test_transformer_pipeline(): + """Test the TransformerPipeline functionality""" + # Create test data + df = pd.DataFrame({ + 'a': [1, 2, 3, 4], + 'b': [10, 20, 30, 40], + 'c': [100, 200, 300, 400] + }) + + # Create pipeline with multiple transformers + pipeline = pyemu.emulators.TransformerPipeline() + + # Add log transformer for all columns + log_trans = pyemu.emulators.Log10Transformer() + pipeline.add(log_trans) + + # Add row-wise min-max scaler for specific columns + minmax_trans = pyemu.emulators.RowWiseMinMaxScaler() + pipeline.add(minmax_trans, columns=['a', 'b']) + + # Transform data + transformed = 
pipeline.transform(df) + + # Check log was applied to all columns + np.testing.assert_allclose( + transformed['c'].values, + np.log10(df['c'].values) + ) + + # Check minmax was applied only to a and b + for i in range(len(df)): + row_subset = transformed.iloc[i][['a', 'b']] + assert np.isclose(row_subset.min(), 0.0) or np.isclose(row_subset.max(), 1.0) + + # Test inverse transform + back_transformed = pipeline.inverse_transform(transformed) + + # Check we get back close to original values + np.testing.assert_allclose(back_transformed.values, df.values, rtol=1e-5) + +def test_autobots_assemble(): + """Test the AutobotsAssemble class functionality""" + # Create test data + df = pd.DataFrame({ + 'a': [1, 2, 3, 4], + 'b': [10, 20, 30, 40], + 'c': [-10, -20, -30, -40] + }) + + # Save original data for comparison + original_df = df.copy() + + # Initialize with data + aa = pyemu.emulators.AutobotsAssemble(df) + + # Apply log transform to positive columns + aa.apply('log10', columns=['a', 'b']) + + # Check the transform was applied correctly + np.testing.assert_allclose( + aa.df[['a', 'b']].values, + np.log10(original_df[['a', 'b']].values) + ) + + # Check that column c is unchanged + np.testing.assert_array_equal(aa.df['c'].values, original_df['c'].values) + + # Save intermediate state after log transform + log_transformed = aa.df.copy() + + # Apply normal score transform to all columns + aa.apply('normal_score') + + # Save state after normal score transform + normal_transformed = aa.df.copy() + + # Verify both transforms were applied (data should be different from log transform) + assert not np.allclose(normal_transformed.values, log_transformed.values) + + # Apply the inverse transformation + back_transformed = aa.inverse() + + # Check we get back close to original values + np.testing.assert_allclose(back_transformed.values, original_df.values, rtol=0.1) + + # Test with external already-transformed data + external_transformed = pd.DataFrame({ + 'a': [-0.5, 0.0, 0.5], # Already transformed data in normal score space + 'b': [0.5, 0.0, -0.5], # (approximately in the normal distribution range) + 'c': [1.0, 0.0, -1.0] + }) + + # Test inverse transform on external transformed data + back_external = aa.inverse(external_transformed) + + # Check that shape is preserved + assert back_external.shape == external_transformed.shape + + # Verify output has reasonable values (should be in the range of original data) + for col in ['a', 'b']: + # These columns had log transform applied, so should be positive + assert np.all(back_external[col] > 0) + + # Column c should have values in the range of the original data + assert np.min(back_external['c']) >= -40 + assert np.max(back_external['c']) <= -10 + + # Apply transform again to verify roundtrip accuracy + roundtrip = aa.transform(back_external) + + # Check roundtrip accuracy for values within standard normal range (-2 to 2) + for col in external_transformed.columns: + # Find values within the normal range + mask = (external_transformed[col] >= -2) & (external_transformed[col] <= 2) + if mask.any(): + # Get the values to compare + expected = external_transformed.loc[mask, col].values + actual = roundtrip.loc[mask, col].values + + # Handle zeros and near-zeros with absolute tolerance instead of relative + zero_mask = np.isclose(expected, 0, atol=1e-10) + if zero_mask.any(): + # For zeros, use absolute tolerance + np.testing.assert_allclose( + actual[zero_mask], + expected[zero_mask], + atol=0.1 # Absolute tolerance for zeros + ) + + # For non-zeros, use relative 
tolerance + if (~zero_mask).any(): + np.testing.assert_allclose( + actual[~zero_mask], + expected[~zero_mask], + rtol=0.1 # Relative tolerance for non-zeros + ) + else: + # No zeros, use normal comparison + np.testing.assert_allclose( + actual, + expected, + rtol=0.1 + ) + + # Additional test to verify pipeline order is maintained + # Create a new pipeline with transforms in different order + bb = pyemu.emulators.AutobotsAssemble(original_df.copy()) + + # First normal score, then log10 + bb.apply('normal_score') + bb.apply('log10', columns=['a', 'b']) + + # Apply inverse - should revert log10 first, then normal_score + back_bb = bb.inverse() + + # Check we get back close to original values + np.testing.assert_allclose(back_bb.values, original_df.values, rtol=0.1) + + + +def skewness(x): + """Calculate skewness of a distribution""" + n = len(x) + x_mean = np.mean(x) + return (np.sum((x - x_mean) ** 3) / n) / ((np.sum((x - x_mean) ** 2) / n) ** 1.5) + +def kurtosis(x): + """Calculate kurtosis of a distribution""" + n = len(x) + x_mean = np.mean(x) + return (np.sum((x - x_mean) ** 4) / n) / ((np.sum((x - x_mean) ** 2) / n) ** 2) + + + + +def test_normal_score_with_external_data(): + """Test NormalScoreTransformer with external already-transformed data""" + # Create training data with a specific distribution + np.random.seed(42) + n = 100 + training_data = pd.DataFrame({ + 'normal': np.random.normal(5, 2, n), + 'lognormal': np.exp(np.random.normal(1, 0.5, n)), + 'uniform': np.random.uniform(0, 10, n) + }) + + # Create "external" data that we'll pretend is already transformed + # For this test, we'll generate values in the typical normal score range (-3 to 3) + external_transformed = pd.DataFrame({ + 'normal': np.random.normal(0, 1, 1), # Already in normal score space + 'lognormal': np.random.normal(0, 1, 1), + 'uniform': np.random.normal(0, 1, 1) + }) + + # Initialize and fit transformer on training data + nst = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=True) + nst.fit(training_data) + + # Transform training data to verify transformation works + transformed_training = nst.transform(training_data) + + # Check that transformed data has properties of normal distribution + for col in training_data.columns: + # Mean should be close to 0 + assert abs(transformed_training[col].mean()) < 0.3 + # Standard deviation should be close to 1 + assert abs(transformed_training[col].std() - 1.0) < 0.3 + + # Store column parameters for inspection + z_scores = {} + originals = {} + for col in training_data.columns: + params = nst.column_parameters.get(col, {}) + z_scores[col] = params.get('z_scores', []) + originals[col] = params.get('originals', []) + + # Verify column parameters were created + assert len(z_scores[col]) > 0 + assert len(originals[col]) > 0 + + # Apply inverse transform to external transformed data directly + back_external = nst.inverse_transform(external_transformed) + + # Verify the shape matches + assert back_external.shape == external_transformed.shape + + # Apply the transform to back_external to check if it recovers external_transformed + re_transformed = nst.transform(back_external) + + # Check that re-transforming recovers values close to the external_transformed + # Note: exact recovery isn't expected due to interpolation/extrapolation + for col in external_transformed.columns: + # Values inside the normal range (-2 to 2) should be very close + inside_range = (external_transformed[col] >= -2) & (external_transformed[col] <= 2) + if inside_range.any(): + 
np.testing.assert_allclose( + re_transformed.loc[inside_range, col].values, + external_transformed.loc[inside_range, col].values, + rtol=0.2 + ) + + # Test external values that are far outside the z-score range + extreme_transformed = pd.DataFrame({ + 'normal': np.array([-5, 0, 5],dtype=float), # Includes extreme values + 'lognormal': np.array([-5, 0, 5],dtype=float), + 'uniform': np.array([-5, 0, 5],dtype=float) + }) + + # Test with extrapolation first + nst_extrap = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=True) + nst_extrap.fit(training_data) + back_extreme_extrap = nst_extrap.inverse_transform(extreme_transformed) + + # Test without extrapolation + nst_no_extrap = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=False) + nst_no_extrap.fit(training_data) + back_extreme_no_extrap = nst_no_extrap.inverse_transform(extreme_transformed) + + # With extrapolation, extreme values should be outside the original data range + for col in training_data.columns: + min_orig = training_data[col].min() + max_orig = training_data[col].max() + + # Check extrapolation is working (values outside original range) + assert back_extreme_extrap[col].min() < min_orig or back_extreme_extrap[col].max() > max_orig + + # Without extrapolation, values should be clamped to original range + assert back_extreme_no_extrap[col].min() >= min_orig - 1e-10 # Allow for floating point error + assert back_extreme_no_extrap[col].max() <= max_orig + 1e-10 + + # Test with AutobotsAssemble to ensure the pipeline works with external transformed data + aa = pyemu.emulators.AutobotsAssemble(training_data.copy()) + aa.apply('normal_score') + + # Test applying inverse transform to external data + back_from_aa = aa.inverse(external_transformed.copy()) + + # Verify results with direct inverse transform + np.testing.assert_allclose( + back_from_aa.values, + nst.inverse_transform(external_transformed).values, + rtol=1e-3 + ) \ No newline at end of file From d1d684e3a0dfb2692fa08b90d53e42dc4663af5a Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 16 Jun 2025 16:17:05 +0100 Subject: [PATCH 07/58] dsi initial commit --- pyemu/emulators/dsi.py | 598 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 598 insertions(+) create mode 100755 pyemu/emulators/dsi.py diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py new file mode 100755 index 000000000..6891f18e8 --- /dev/null +++ b/pyemu/emulators/dsi.py @@ -0,0 +1,598 @@ +""" +Data Space Inversion (DSI) emulator implementation. +""" +from __future__ import print_function, division +import numpy as np +import pandas as pd +import inspect +from pyemu.utils.helpers import dsi_forward_run, series_to_insfile +import pickle +import os +import shutil +from pyemu.pst.pst_handler import Pst +from pyemu.en import ObservationEnsemble,ParameterEnsemble +from .base import Emulator + +class DSI(Emulator): + """ + Data Space Inversion emulator class. + + #TODO: add more docstring details + + Parameters + ---------- + pst : Pst, optional + A Pst object. If provided, the emulator will be initialized with the + information from the Pst object. + sim_ensemble : ObservationEnsemble, optional + An ensemble of simulated observations. If provided, the emulator will + be initialized with the information from the ensemble. + transforms : list of dict, optional + List of transformation specifications. Each dict should have: + - 'type': str - Type of transformation (e.g.,'log10', 'normal_score'). + - 'columns': list of str,optional - Columns to apply the transformation to. 
If not supplied, transformation is applied to all columns. + - Additional kwargs for the transformation (e.g., 'quadratic_extrapolation' for normal score transform). + Example: + transforms = [ + {'type': 'log10', 'columns': ['obs1', 'obs2']}, + {'type': 'normal_score', 'quadratic_extrapolation': True} + ] + Default is None, which means no transformations will be applied. + energy_threshold : float, optional + The energy threshold for the SVD. Default is 1.0, no truncation. + verbose : bool, optional + If True, enable verbose logging. Default is False. + """ + + def __init__(self, + pst=None, + sim_ensemble=None, + transforms=None, + energy_threshold=1.0, + verbose=False): + """ + Initialize the DSI emulator. + + Parameters + ---------- + pst : Pst, optional + A Pst object. If provided, the emulator will be initialized with the + information from the Pst object. + sim_ensemble : ObservationEnsemble, optional + An ensemble of simulated observations. If provided, the emulator will + be initialized with the information from the ensemble. + transforms : list of dict, optional + List of transformation specifications. Each dict should have: + - 'type': str - Type of transformation (e.g.,'log10', 'normal_score'). + - 'columns': list of str,optional - Columns to apply the transformation to. If not supplied, transformation is applied to all columns. + - Additional kwargs for the transformation (e.g., 'quadratic_extrapolation' for normal score transform). + Example: + transforms = [ + {'type': 'log10', 'columns': ['obs1', 'obs2']}, + {'type': 'normal_score', 'quadratic_extrapolation': True} + ] + Default is None, which means no transformations will be applied. + energy_threshold : float, optional + The energy threshold for the SVD. Default is 1.0, no truncation. + verbose : bool, optional + If True, enable verbose logging. Default is False. + """ + + super().__init__(verbose=verbose) + + self.__org_observation_data = pst.observation_data.copy() if pst is not None else None + #self.__org_parameter_data = pst.parameter_data.copy() if pst is not None else None + #self.__org_control_data = pst.control_data.copy() #breaks pickling + if isinstance(sim_ensemble, ObservationEnsemble): + sim_ensemble = sim_ensemble._df.copy() + #self.__org_sim_ensemble = sim_ensemble.copy() if sim_ensemble is not None else None + self.data = sim_ensemble.copy() if sim_ensemble is not None else None + #self.feature_scaler = None + self.energy_threshold = energy_threshold + assert isinstance(transforms, list) or transforms is None, "transforms must be a list of dicts or None" + if transforms is not None: + for t in transforms: + assert isinstance(t, dict), "each transform must be a dict" + assert 'type' in t, "each transform dict must have a 'type' key" + if 'columns' in t: + assert isinstance(t['columns'], list), "'columns' must be a list of column names" + #all columns must be in the data + assert all([col in self.data.columns for col in t['columns']]), "some columns in 'columns' are not in the data" + if t['type'] == 'normal_score': + # check for quadratic_extrapolation + if 'quadratic_extrapolation' in t: + assert isinstance(t['quadratic_extrapolation'], bool), "'quadratic_extrapolation' must be a boolean" + self.transforms = transforms + self.fitted = False + self.data_transformed = None + self.decision_variable_names = None #used for DSIVC + + def prepare_training_data(self, data=None): + """ + Prepare training data by applying transformations and computing the projection matrix. + + This method follows these steps: + 1. 
Apply feature transformations (log transform, normal score transform) + 2. Compute projection matrix using SVD + + Parameters + ---------- + data : pandas.DataFrame, optional + Data to prepare. If None, uses self.data. + + Returns + ------- + pandas.DataFrame + The prepared data. + """ + if data is None: + data = self.data + + if data is None: + raise ValueError("No data provided and no data stored in the emulator") + + # Always use the base class transformation method for consistency + if self.transforms is not None: + self.data_transformed = self.apply_feature_transforms(data, self.transforms) + else: + # Still need to set up a dummy transformer for inverse operations + from .transformers import AutobotsAssemble + self.feature_transformer = AutobotsAssemble(data.copy()) + self.data_transformed = data.copy() + + return self.data_transformed + + def compute_projection_matrix(self, energy_threshold=None): + """ + Compute the projection matrix using SVD. + + Parameters + ---------- + energy_threshold : float, optional + Energy threshold for truncation. Default is None, which uses the threshold from initialization. + + Returns + ------- + None + """ + self.logger.statement("normalizing data") + # normalize the data by subtracting the mean and dividing by the standard deviation + X = self.data_transformed.copy() + deviations = X - X.mean() + z = deviations / np.sqrt(float(X.shape[0] - 1)) + if isinstance(z, pd.DataFrame): + z = z.values + + self.logger.statement("undertaking SVD") + u, s, v = np.linalg.svd(z, full_matrices=False) + us = np.dot(v.T, np.diag(s)) + if energy_threshold is None: + energy_threshold = self.energy_threshold + if energy_threshold<1.0: + self.logger.statement("applying energy truncation") + # compute the cumulative energy of the singular values + cumulative_energy = np.cumsum(s**2) / np.sum(s**2) + print(cumulative_energy) + # find the number of components needed to reach the energy threshold + num_components = np.argmax(cumulative_energy >= energy_threshold) + 1 + # keep only the first num_components singular values and vectors + us = us[:, :num_components] + s = s[:num_components] + u = u[:, :num_components] + print(f"Truncated from {len(s)} to {num_components} components while retaining {energy_threshold*100:.1f}% of variance") + if num_components<=1: + print(f"Warning: only {num_components} component retained, you may need to check the data") + + self.logger.statement("calculating us matrix") + + # store components needed for forward run + # store mean vector + self.ovals = self.data_transformed.mean(axis=0) + # store proj matrix and singular values + self.pmat = us + self.s = s + return + + def fit(self, X=None, y=None): + """ + Fit the emulator to training data. + + Parameters + ---------- + X : pandas.DataFrame + Input data to fit the emulator on. + y : None + Not used, present for API consistency. + + Returns + ------- + self : DSI + The fitted emulator. + """ + if X is not None: + self.data = X + self.logger.statement("transforming new training data") + self.data_transformed = self.prepare_training_data() + + if self.data_transformed is None: + self.logger.statement("transforming training data") + self.data_transformed = self.prepare_training_data() + + # Compute projection matrix + self.compute_projection_matrix() + self.fitted = True + return self + + def predict(self, pvals): + """ + Generate predictions from the emulator. + + Parameters + ---------- + pvals : numpy.ndarray or pandas.Series + Parameter values for prediction. 
+ + Returns + ------- + pandas.Series + Predicted observation values. + """ + if not self.fitted: + raise ValueError("Emulator must be fitted before prediction") + + if not hasattr(self, 'feature_transformer') or self.feature_transformer is None: + raise ValueError("Emulator must be fitted and have valid transformations before prediction") + + if isinstance(pvals, pd.Series): + pvals = pvals.values.flatten() + assert pvals.shape[0] == self.s.shape[0], "pvals must be the same length as the number of singular values" + assert pvals.shape[0] == self.pmat.shape[1], "pvals must be the same length as the number of singular values" + pmat = self.pmat + ovals = self.ovals + sim_vals = ovals + np.dot(pmat,pvals) + ft = self.feature_transformer + sim_vals = ft.inverse(sim_vals) + sim_vals.index.name = 'obsnme' + sim_vals.name = "obsval" + self.sim_vals = sim_vals + return sim_vals + + def check_for_pdc(self): + """Check for Prior data conflict.""" + #TODO + return + + def prepare_pestpp(self, t_d=None, observation_data=None): + """ + Prepare PEST++ control files for the emulator. + + Parameters + ---------- + t_d : str, optional + Template directory path. Must be provided. + observation_data : pandas.DataFrame, optional + Observation data to use. If None, uses the data from initialization. + + Returns + ------- + Pst + PEST++ control file object. + """ + + assert t_d is not None, "template directory must be provided" + self.template_dir = t_d + + if os.path.exists(t_d): + shutil.rmtree(t_d) + os.makedirs(t_d) + self.logger.statement("creating template directory {0}".format(t_d)) + + self.logger.log("creating tpl files") + dsi_in_file = os.path.join(t_d, "dsi_pars.csv") + dsi_tpl_file = dsi_in_file + ".tpl" + ftpl = open(dsi_tpl_file, 'w') + fin = open(dsi_in_file, 'w') + ftpl.write("ptf ~\n") + fin.write("parnme,parval1\n") + ftpl.write("parnme,parval1\n") + npar = self.s.shape[0] + assert npar>0, "no parameters found in the DSI emulator" + dsi_pnames = [] + for i in range(npar): + pname = "dsi_par{0:04d}".format(i) + dsi_pnames.append(pname) + fin.write("{0},0.0\n".format(pname)) + ftpl.write("{0},~ {0} ~\n".format(pname, pname)) + fin.close() + ftpl.close() + self.logger.log("creating tpl files") + + # run once to get the dsi_pars.csv file + pvals = np.zeros_like(self.s) + sim_vals = self.predict(pvals) + + self.logger.log("creating ins file") + out_file = os.path.join(t_d,"dsi_sim_vals.csv") + sim_vals.to_csv(out_file,index=True) + + ins_file = out_file + ".ins" + sdf = pd.read_csv(out_file,index_col=0) + with open(ins_file,'w') as f: + f.write("pif ~\n") + f.write("l1\n") + for oname in sdf.index.values: + f.write("l1 ~,~ !{0}!\n".format(oname)) + self.logger.log("creating ins file") + + self.logger.log("creating Pst") + pst = Pst.from_io_files([dsi_tpl_file],[dsi_in_file],[ins_file],[out_file],pst_path=".") + + par = pst.parameter_data + dsi_pars = par.loc[par.parnme.str.startswith("dsi_par"),"parnme"] + par.loc[dsi_pars,"parval1"] = 0 + par.loc[dsi_pars,"parubnd"] = 10.0 + par.loc[dsi_pars,"parlbnd"] = -10.0 + par.loc[dsi_pars,"partrans"] = "none" + with open(os.path.join(t_d,"dsi.unc"),'w') as f: + f.write("START STANDARD_DEVIATION\n") + for p in dsi_pars: + f.write("{0} 1.0\n".format(p)) + f.write("END STANDARD_DEVIATION") + pst.pestpp_options['parcov'] = "dsi.unc" + + obs = pst.observation_data + + if observation_data is None: + observation_data = self.__org_observation_data + assert isinstance(observation_data, pd.DataFrame), "observation_data must be a pandas DataFrame" + for col in 
observation_data.columns: + obs.loc[sim_vals.index,col] = observation_data.loc[:,col] + + # check if any observations are missing + missing_obs = list(set(obs.index) - set(observation_data.index)) + assert len(missing_obs) == 0, "missing observations: {0}".format(missing_obs) + + pst.control_data.noptmax = 0 + pst.model_command = "python forward_run.py" + self.logger.log("creating Pst") + + + function_source = inspect.getsource(dsi_forward_run) + with open(os.path.join(t_d,"forward_run.py"),'w') as file: + file.write(function_source) + file.write("\n\n") + file.write("if __name__ == \"__main__\":\n") + file.write(f" {function_source.split('(')[0].split('def ')[1]}()\n") + self.logger.log("creating Pst") + + pst.pestpp_options["save_binary"] = True + pst.pestpp_options["overdue_giveup_fac"] = 1e30 + pst.pestpp_options["overdue_giveup_minutes"] = 1e30 + pst.pestpp_options["panther_agent_freeze_on_fail"] = True + pst.pestpp_options["ies_no_noise"] = False + pst.pestpp_options["ies_subset_size"] = -10 # the more the merrier + #pst.pestpp_options["ies_bad_phi_sigma"] = 2.0 + #pst.pestpp_options["save_binary"] = True + + pst.write(os.path.join(t_d,"dsi.pst"),version=2) + self.logger.statement("saved pst to {0}".format(os.path.join(t_d,"dsi.pst"))) + + #self.pst_dsi = pst #breaks pickling #TODO: add save/load methods to Emulator class + with open(os.path.join(t_d,"dsi.pickle"),"wb") as f: + pickle.dump(self,f) + return pst + + def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=False, dsi_args=None, percentiles=[0.25,0.75,0.5], mou_population_size=None): + """ + Prepare Data Space Inversion Variable Control (DSIVC) control files. + + Parameters + ---------- + decvar_names : list or str + Names of decision variables. + t_d : str, optional + Template directory path. + pst : Pst, optional + PST control file object. + oe : ObservationEnsemble, optional + Observation ensemble. + track_stack : bool, optional + Whether to track the stack. Default is False. + dsi_args : dict, optional + Arguments for DSI. + percentiles : list, optional + Percentiles to calculate. Default is [0.25, 0.75, 0.5]. + mou_population_size : int, optional + Population size for multi-objective optimization. + + Returns + ------- + Pst + PEST++ control file object for DSIVC. + """ + # check that percentiles is a list or array of floats between 0 and 1. 
+        assert isinstance(percentiles, (list, np.ndarray)), "percentiles must be a list or array of floats"
+        assert all([isinstance(i, (float, int)) for i in percentiles]), "percentiles must be a list or array of floats"
+        assert all([0 <= i <= 1 for i in percentiles]), "percentiles must be between 0 and 1"
+        # ensure that percentiles are unique
+        percentiles = np.unique(percentiles)
+
+
+        #track dsivc args for forward run
+        self.dsivc_args = {"percentiles":percentiles,
+                           "decvar_names":decvar_names,
+                           "track_stack":track_stack,
+                           }
+
+        if t_d is None:
+            self.logger.statement("using existing DSI template dir...")
+            t_d = self.template_dir
+        self.logger.statement(f"using {t_d} as template directory...")
+        assert os.path.exists(t_d), f"template directory {t_d} does not exist"
+
+        if pst is None:
+            self.logger.statement("no pst provided...")
+            self.logger.statement("using dsi.pst in DSI template dir...")
+            assert os.path.exists(os.path.join(t_d,"dsi.pst")), f"dsi.pst not found in {t_d}"
+            pst = Pst(os.path.join(t_d,"dsi.pst"))
+        if oe is None:
+            self.logger.statement("no posterior DSI observation ensemble provided...")
+            self.logger.statement(f"using dsi.{dsi_args['noptmax']}.obs.jcb in DSI template dir...")
+            assert os.path.exists(os.path.join(t_d,f"dsi.{dsi_args['noptmax']}.obs.jcb")), f"dsi.{dsi_args['noptmax']}.obs.jcb not found in {t_d}"
+            oe = ObservationEnsemble.from_binary(pst,os.path.join(t_d,f"dsi.{dsi_args['noptmax']}.obs.jcb"))
+        else:
+            assert isinstance(oe, ObservationEnsemble), "oe must be an ObservationEnsemble"
+
+        #check if decvar_names is a str
+        if isinstance(decvar_names, str):
+            decvar_names = [decvar_names]
+        # check that decvars are in the oe columns
+        missing = [col for col in decvar_names if col not in oe.columns]
+        assert len(missing) == 0, f"The following decvars are missing from the DSI obs ensemble: {missing}"
+        # check that decvars are in the pst observation data
+        missing = [col for col in decvar_names if col not in pst.obs_names]
+        assert len(missing) == 0, f"The following decvars are missing from the DSI pst control file: {missing}"
+
+
+        # handle DSI args
+        default_dsi_args = {"noptmax":pst.control_data.noptmax,
+                            "decvar_weight":1.0,
+                            #"decvar_phi_factor":0.5,
+                            "num_pyworkers":1,
+                            }
+        # ensure it's a dict
+        if dsi_args is None:
+            dsi_args = {}
+        elif not isinstance(dsi_args, dict):
+            raise TypeError("Expected a dictionary for 'dsi_args'")
+        # merge with defaults (user values override defaults)
+        dsi_args = {**default_dsi_args, **dsi_args}
+
+
+        out_files = []
+
+        self.logger.statement(f"preparing stack stats observations...")
+        assert isinstance(oe, ObservationEnsemble), "oe must be an ObservationEnsemble"
+        if oe.index.name is None:
+            id_vars="index"
+        else:
+            id_vars=oe.index.name
+        stack_stats = oe._df.describe(percentiles=percentiles).reset_index().melt(id_vars=id_vars)
+        stack_stats.rename(columns={"value":"obsval","index":"stat"},inplace=True)
+        stack_stats['obsnme'] = stack_stats.apply(lambda x: x.variable+"_stat:"+x.stat,axis=1)
+        stack_stats.set_index("obsnme",inplace=True)
+        stack_stats = stack_stats.obsval
+        self.logger.statement(f"stack obs recorded to dsi.stack_stats.csv...")
+        out_file = os.path.join(t_d,"dsi.stack_stats.csv")
+        out_files.append(out_file)
+        stack_stats.to_csv(out_file,float_format="%.6e")
+        series_to_insfile(out_file,ins_file=None)
+
+
+        if track_stack:
+            self.logger.statement(f"including {oe.values.flatten().shape[0]} stack observations...")
+
+            stack = oe._df.reset_index().melt(id_vars=id_vars)
+            stack.rename(columns={"value":"obsval"},inplace=True)
+            stack['obsnme'] = stack.apply(lambda x: x.variable+"_real:"+str(x[id_vars]),axis=1)
+            stack.set_index("obsnme",inplace=True)
+            stack = stack.obsval
+            out_file = os.path.join(t_d,"dsi.stack.csv")
+            out_files.append(out_file)
+            stack.to_csv(out_file,float_format="%.6e")
+            series_to_insfile(out_file,ins_file=None)
+
+
+
+        self.logger.statement(f"prepare DSIVC template files...")
+        dsi_in_file = os.path.join(t_d, "dsivc_pars.csv")
+        dsi_tpl_file = dsi_in_file + ".tpl"
+        ftpl = open(dsi_tpl_file, 'w')
+        fin = open(dsi_in_file, 'w')
+        ftpl.write("ptf ~\n")
+        fin.write("parnme,parval1\n")
+        ftpl.write("parnme,parval1\n")
+        for pname in decvar_names:
+            val = oe._df.loc[:,pname].mean()
+            fin.write(f"{pname},{val:.6e}\n")
+            ftpl.write(f"{pname},~ {pname} ~\n")
+        fin.close()
+        ftpl.close()
+
+
+        self.logger.statement(f"building DSIVC control file...")
+        pst_dsivc = Pst.from_io_files([dsi_tpl_file],[dsi_in_file],[i+".ins" for i in out_files],out_files,pst_path=".")
+
+        self.logger.statement(f"setting dec var bounds...")
+        par = pst_dsivc.parameter_data
+        # set all parameters fixed
+        par.loc[:,"partrans"] = "fixed"
+        # constrain decvar pars to training data bounds
+        par.loc[decvar_names,"pargp"] = "decvars"
+        par.loc[decvar_names,"partrans"] = "none"
+        par.loc[decvar_names,"parubnd"] = self.data.loc[:,decvar_names].max()
+        par.loc[decvar_names,"parlbnd"] = self.data.loc[:,decvar_names].min()
+
+        self.logger.statement(f"zero-weighting observation data...")
+        # preemptively set obs weights to 0.0
+        obs = pst_dsivc.observation_data
+        obs.loc[:,"weight"] = 0.0
+
+        self.logger.statement(f"getting obs metadata from DSI observation_data...")
+        obsorg = pst.observation_data.copy()
+        columns = [i for i in obsorg.columns if i !='obsnme']
+        for o in obsorg.obsnme.values:
+            obs.loc[obs.obsnme.str.startswith(o), columns] = obsorg.loc[obsorg.obsnme==o, columns].values
+
+        obs.loc[stack_stats.index,"obgnme"] = "stack_stats"
+        #obs.loc[stack.index,"obgnme"] = "stack"
+
+        self.logger.statement(f"building dsivc_forward_run.py...")
+        pst_dsivc.model_command = "python dsivc_forward_run.py"
+        from pyemu.utils.helpers import dsivc_forward_run
+        function_source = inspect.getsource(dsivc_forward_run)
+        with open(os.path.join(t_d,"dsivc_forward_run.py"),'w') as file:
+            file.write(function_source)
+            file.write("\n\n")
+            file.write("if __name__ == \"__main__\":\n")
+            file.write(f"    {function_source.split('(')[0].split('def ')[1]}()\n")
+
+        self.logger.statement(f"preparing nominal initial population...")
+        if mou_population_size is None:
+            # set the population size to 2 * number of decision variables
+            # this is a good rule of thumb for MOU
+            mou_population_size = 2 * len(decvar_names)
+        # these should generally be at least twice the number of decision variables
+        if mou_population_size < 2 * len(decvar_names):
+            self.logger.statement(f"mou population is less than 2x number of decision variables, this may be too small...")
+        # sample mou_population_size sets of decision variables from a uniform distribution
+        dvpop = ParameterEnsemble.from_uniform_draw(pst_dsivc,num_reals=mou_population_size)
+        # record to external file for PESTPP-MOU
+        dvpop.to_binary(os.path.join(t_d,"initial_dvpop.jcb"))
+        # tell PESTPP-MOU about the new file
+        pst_dsivc.pestpp_options["mou_dv_population_file"] = 'initial_dvpop.jcb'
+
+
+        # some additional PESTPP-MOU options:
+        pst_dsivc.pestpp_options["mou_population_size"] = mou_population_size #twice the number of decision variables
+        
pst_dsivc.pestpp_options["mou_save_population_every"] = 1 # save lots of files! + + pst_dsivc.control_data.noptmax = 0 #just for a test run + pst_dsivc.write(os.path.join(t_d,"dsivc.pst"),version=2) + + # updating the DSI pst control file + self.logger.statement(f"updating DSI pst control file...") + self.logger.statement("overwriting dsi.pst file...") + pst.observation_data.loc[decvar_names, "weight"] = dsi_args["decvar_weight"] + pst.control_data.noptmax = dsi_args["noptmax"] + pst.write(os.path.join(t_d,"dsi.pst"), version=2) + + + self.logger.statement("overwriting dsi.pickle file...") + self.decision_variable_names = decvar_names + # re-pickle dsi to track dsivc args + with open(os.path.join(t_d,"dsi.pickle"),"wb") as f: + pickle.dump(self,f) + + self.logger.statement("DSIVC control files created...the user still needs to specify objectives and constraints...") + return pst_dsivc \ No newline at end of file From 0337259e6e24351921e967824a324f44de2d429f Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 16 Jun 2025 20:07:00 +0100 Subject: [PATCH 08/58] refactor dsi helper functions --- pyemu/utils/helpers.py | 375 +++++++++++++++-------------------------- 1 file changed, 132 insertions(+), 243 deletions(-) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index e8162f076..24cf748dc 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -4043,7 +4043,7 @@ def get_current_prop(_cur_thresh): return thresh, prop -def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",t_d="template",gp_kernel=None,nverf=0, +def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",gp_kernel=None,nverf=0, plot_fits=False,apply_standard_scalar=False, include_emulated_std_obs=False): """helper function to setup a gaussian-process-regression (GPR) emulator for outputs of interest. This is primarily targeted at low-dimensional settings like those encountered in PESTPP-MOU @@ -4054,7 +4054,6 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",t_d output_fnames (str | list[str]): usually a list of observation population files that corresponds to the simulation results associated with `input_fnames` gpr_t_d (str): the template file dir to create that will hold the GPR emulators - t_d (str): the template dir containing the PESTPP-MOU outputs that the GPR emulators are trained on gp_kernel (sklearn GaussianProcess kernel): the kernel to use. 
if None, a standard RBF kernel is created and used nverf (int): the number of input-output pairs to hold back for a simple verification test @@ -4181,7 +4180,7 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",t_d import matplotlib.pyplot as plt from matplotlib.backends.backend_pdf import PdfPages pdf = PdfPages(os.path.join(gpr_t_d,"gpr_fits.pdf")) - for i,output_name in enumerate(output_names): + for output_name in output_names: y_verf = df.loc[:,output_name].values.copy()[cut:] y_train = df.loc[:, output_name].values.copy()[:cut] @@ -4221,8 +4220,8 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",t_d plt.close(fig) - objname = f'obj_{i}' - model_fname = os.path.split(pst_fname)[1]+"."+objname+".pkl" + + model_fname = os.path.split(pst_fname)[1]+"."+output_name+".pkl" if os.path.exists(os.path.join(gpr_t_d,model_fname)): print("WARNING: model_fname '{0}' exists, overwriting...".format(model_fname)) with open(os.path.join(gpr_t_d,model_fname),'wb') as f: @@ -4324,13 +4323,6 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",t_d gpst_fname = os.path.split(pst_fname)[1] gpst.write(os.path.join(gpr_t_d,gpst_fname),version=2) print("saved gpr pst:",gpst_fname,"in gpr_t_d",gpr_t_d) - - #if they exist, copy pestpp bins from t_d over to gpr_t_d. otherwise, we assume bin is in path - pp_bins = [f for f in os.listdir(t_d) if 'pestpp-' in f] - if len(pp_bins)>0: - for pp_bin in pp_bins: - shutil.copy2(os.path.join(t_d,pp_bin),os.path.join(gpr_t_d,pp_bin)) - try: pyemu.os_utils.run("pestpp-mou {0}".format(gpst_fname),cwd=gpr_t_d) except Exception as e: @@ -4425,82 +4417,123 @@ def gpr_forward_run(): return mdf -def dsi_forward_run(pmat=None,ovals=None,pvals=None, - write_csv=True - - ): +def dsi_forward_run(pvals,dsi,write_csv=False): + assert isinstance(dsi,pyemu.emulators.DSI), "dsi must be a pyemu DSI object" + if isinstance(pvals,pd.DataFrame): + pvals = pvals.parval1 + sim_vals = dsi.predict(pvals) + if write_csv: + sim_vals.to_csv("dsi_sim_vals.csv") + return sim_vals - if pvals is None: - pvals = pd.read_csv("dsi_pars.csv",index_col=0) - if pmat is None: - pmat = np.load("dsi_proj_mat.npy") - if ovals is None: - ovals = pd.read_csv("dsi_pr_mean.csv",index_col=0) +def dsivc_forward_run(md_ies="."): + import pandas as pd + import pyemu + import os + import pickle + from pyemu.utils.os_utils import PortManager + + # load the dsi pest control file + pst_dsi = pyemu.Pst(os.path.join(md_ies,"dsi.pst")) + noptmax = pst_dsi.control_data.noptmax try: - offset = np.load("dsi_obs_offset.npy") + os.remove("dsi.noise.jcb") except: - #print("no offset file found, assuming no offset") - offset = np.zeros(ovals.shape[0]) + print("dsi.noise.jcb not found, continuing...") try: - log_trans = np.load("dsi_obs_log.npy") + os.remove("dsi.stack.csv") except: - #print("no log-tansform file found, assuming no log-transform") - log_trans = np.zeros(ovals.shape[0]) - + print("dsi.stack.csv not found, continuing...") try: - backtransformvals = np.load("dsi_obs_backtransformvals.npy") - backtransformobsnmes = np.load("dsi_obs_backtransformobsnmes.npy",allow_pickle=True) - backtransform=True + os.remove("dsi.stack_stats.csv") except: - #print("no back-transform file found, assuming no back-transform") - backtransform=False - - - sim_vals = ovals + np.dot(pmat,pvals.values) - - if backtransform: - #print("applying back-transform") - obsnmes = np.unique(backtransformobsnmes) - back_vals = [ - inverse_normal_score_transform( - 
backtransformvals[np.where(backtransformobsnmes==o)][:,1], - backtransformvals[np.where(backtransformobsnmes==o)][:,0], - sim_vals.loc[o].mn, - extrap=None - )[0] - for o in obsnmes - ] - sim_vals.loc[obsnmes,'mn'] = back_vals - - #print("reversing offset and log-transform") - assert log_trans.shape[0] == sim_vals.mn.values.shape[0], f"log transform shape mismatch: {log_trans.shape[0]},{sim_vals.mn.values.shape[0]}" - assert offset.shape[0] == sim_vals.mn.values.shape[0], f"offset transform shape mismatch: {offset.shape[0]},{sim_vals.mn.values.shape[0]}" - vals = sim_vals.mn.values - vals[np.where(log_trans==1)] = 10**vals[np.where(log_trans==1)] - vals-= offset - sim_vals.loc[:,'mn'] = vals - #print(sim_vals) - if write_csv: - sim_vals.to_csv("dsi_sim_vals.csv") - return sim_vals - - -def dsi_pyworker(pst,host,port,pmat=None,ovals=None,pvals=None): + print("dsi.stack_stats.csv not found, continuing...") + try: + os.remove(f"dsi.{noptmax}.obs.jcb") + except: + print(f"dsi.{noptmax}.obs.jcb not found, continuing...") + + # load decvars + decvars = pd.read_csv(os.path.join(md_ies, "dsivc_pars.csv"),index_col=0) + assert decvars.shape[0]>0, "no decvars found in dsivc_pars.csv" + + + + # update the decavar obs values in the observation data + obs = pst_dsi.observation_data + assert obs.loc[decvars.index].shape[0] == decvars.shape[0], "not all decvars found in obs data" + assert all(obs.loc[decvars.index].weight > 0.0), "decvar weights should be > 0.0" + obs.loc[decvars.index,"obsval"] = decvars.values + + # update the obs+noise file with the decvar values to ensure NO NOISE on the decvars + noise = pyemu.ObservationEnsemble.from_binary(pst_dsi,os.path.join(md_ies,"dsi.obs+noise.jcb")) + # check that all of decvars.index are in noise.columns + assert len([i for i in decvars.index if i not in noise.columns.tolist()]) == 0, "some decvars not in noise columns" + # update columns in noise if column name in decvars.index + for col in decvars.index: + noise.loc[:,col] = noise.loc[:,col].astype(float) + noise.loc[:,col] = decvars.loc[col].values[0] + # record noise + noise.to_binary(os.path.join(md_ies,"dsi.noise.jcb")) + # make sure pestpp options + pst_dsi.pestpp_options["ies_observation_ensemble"] = "dsi.noise.jcb" + # rewrite the dsi.pst file + pst_dsi.write(os.path.join(md_ies,"dsi.pst"),version=2) + + # deploy dsi... + pvals = pd.read_csv(os.path.join(md_ies,"dsi_pars.csv"),index_col=0) + num_workers=1 + worker_root="." + dsi = pickle.load(open(os.path.join(md_ies,"dsi.pickle"),"rb")) + num_workers = dsi.dsivc_args.get("num_pyworkers",1) + pyemu.os_utils.start_workers(md_ies,"pestpp-ies","dsi.pst", + num_workers=num_workers, + worker_root=worker_root, + port = PortManager().get_available_port(), + master_dir=md_ies, + reuse_master =True, + ppw_function=pyemu.helpers.dsi_pyworker, + ppw_kwargs={"dsi":dsi,"pvals":pvals}) + assert os.path.exists(os.path.join(md_ies,f"dsi.{noptmax}.obs.jcb")), f"dsi.{noptmax}.obs.jcb not found...pst failed?" + + + #TODO: checks on PDC or Eulerian distance to training data? + + #postprocess stack + oe = pyemu.ObservationEnsemble.from_binary(pst_dsi,os.path.join(md_ies,f"dsi.{noptmax}.obs.jcb")) + assert oe.shape[0] == noise.shape[0], "stack and noise shapes do not match; failed runs?" 
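+    # the blocks below reshape the posterior stack into the long-form series that the
+    # instruction files written by DSI.prepare_dsivc() expect: per-realization values are
+    # named "<obsnme>_real:<realization>" and summary statistics are named "<obsnme>_stat:<stat>"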
+ if dsi.dsivc_args.get("track_stack",False): + # write long form oe + stack = oe._df.reset_index().melt(id_vars="real_name") + stack.rename(columns={"value":"obsval"},inplace=True) + stack['obsnme'] = stack.apply(lambda x: x.variable+"_real:"+x.real_name,axis=1) + stack.set_index("obsnme",inplace=True) + stack = stack.obsval + out_file = os.path.join(md_ies,"dsi.stack.csv") + stack.to_csv(out_file,float_format="%.6e") + #write stats + #get user-specified quantiles + percentiles = dsi.dsivc_args.get("percentiles",[0.25,0.75,0.5]) + stack_stats = oe._df.describe(percentiles=percentiles).reset_index().melt(id_vars="index") + stack_stats.rename(columns={"value":"obsval","index":"stat"},inplace=True) + stack_stats['obsnme'] = stack_stats.apply(lambda x: x.variable+"_stat:"+x.stat,axis=1) + stack_stats.set_index("obsnme",inplace=True) + stack_stats = stack_stats.obsval + out_file = os.path.join(md_ies,"dsi.stack_stats.csv") + stack_stats.to_csv(out_file,float_format="%.6e") + + return + +def dsi_pyworker(pst,host,port,dsi=None,pvals=None): - import os import pandas as pd - import numpy as np - - # if explicit args weren't passed, get the default ones... if pvals is None: pvals = pd.read_csv("dsi_pars.csv",index_col=0) - if pmat is None: - pmat = np.load("dsi_proj_mat.npy") - if ovals is None: - ovals = pd.read_csv("dsi_pr_mean.csv",index_col=0) - + if dsi is None: + import pickle + dsi = pickle.load(open("dsi.pickle","rb")) ppw = PyPestWorker(pst,host,port,verbose=False) @@ -4521,10 +4554,10 @@ def dsi_pyworker(pst,host,port,pmat=None,ovals=None,pvals=None): # df needed to run the emulator pvals.parval1 = parameters.loc[pvals.index] # do the emulation - simdf = dsi_forward_run(pmat=pmat,ovals=ovals,pvals=pvals,write_csv=False) + simdf = dsi_forward_run(dsi=dsi,pvals=pvals,write_csv=False) # replace the emulated quantities in the obs series - obs.loc[simdf.index] = simdf.mn.values + obs.loc[simdf.index] = simdf.values #send the obs series to the master ppw.send_observations(obs.values) @@ -4535,171 +4568,27 @@ def dsi_pyworker(pst,host,port,pmat=None,ovals=None,pvals=None): if parameters is None: break - -def randrealgen_optimized(nreal, tol=1e-7, max_samples=1000000): - """ - Generate a set of random realizations with a normal distribution. - - Parameters: - nreal : int - The number of realizations to generate. - tol : float - Tolerance for the stopping criterion. - max_samples : int - Maximum number of samples to use. - - Returns: - numpy.ndarray - An array of nreal random realizations. +def series_to_insfile(out_file,ins_file=None): """ - rval = np.zeros(nreal) - nsamp = 0 - # if nreal is even add 1 - if nreal % 2 == 0: - numsort = (nreal + 1) // 2 - else: - numsort = nreal // 2 - while nsamp < max_samples: - nsamp += 1 - work1 = np.random.normal(size=nreal) - work1.sort() - - if nsamp > 1: - previous_mean = rval[:numsort] / (nsamp - 1) - rval[:numsort] += work1[:numsort] - current_mean = rval[:numsort] / nsamp - max_diff = np.max(np.abs(current_mean - previous_mean)) - - if max_diff <= tol: - break - else: - rval[:numsort] = work1[:numsort] - - rval[:numsort] /= nsamp - if nreal % 2 == 0: - rval[numsort:] = -rval[:numsort][::-1] - else: - rval[numsort+1:] = -rval[:numsort][::-1] - - return rval - - -def normal_score_transform(nstval, val, value): + convert a Pandas Series to an ins file + Parameters + ---------- + out_file : str + name of the output file to convert to ins file + ins_file : str + name of the ins file to create. 
if None, then out_file+".ins" is used + Returns + ------- + None """ - Transform a value to its normal score using a normal score transform table. - - Parameters: - nstval : array-like - Normal score transform table values. - val : array-like - Original values corresponding to the normal score transform table. - value : float - The value to transform. - - Returns: - float - The normal score of the value. - int - The index of the value in the normal score transform table.""" - - # make sure the input is numpy arrays - val = np.asarray(val) - nstval = np.asarray(nstval) - - # if the value is outside the range of the table, return the first or last value - assert value >= val[0], "Value is below the minimum value in the table." - assert value <= val[-1], "Value is greater than the maximum value in the table." - # ensure that val is sorted - assert np.all(np.diff(val) > 0), f"Values in the table must be sorted in ascending order:{list(zip(np.diff(val)>0,val))}" - - # find the rank of the value in the table - rank = np.searchsorted(val, value, side='right') - 1 - if rank == len(val) - 1: - return nstval[-1], len(val) - # if the value coincides with a value in the table, return the corresponding normal score - nstdiff = nstval[rank + 1] - nstval[rank] - diff = val[rank + 1] - val[rank] - if nstdiff <= 0.0 or diff <= 0.0: - return nstval[rank], rank - - # otherwise, interpolate to get the normal score - dist = value - val[rank] - interpolated_value = nstval[rank] + (dist / diff) * nstdiff - return interpolated_value, rank - - -def inverse_normal_score_transform(nstval, val, value, extrap='quadratic'): - nreal = len(val) - # check that nstval is sorted - assert np.all(np.diff(nstval) > 0), "Values in the table must be sorted in ascending order" - # check that val is sorted - assert np.all(np.diff(val) > 0), "Values in the table must be sorted in ascending order" - - def linear_extrapolate(x0, y0, x1, y1, x): - if x1 != x0: - return y0 + (y1 - y0) / (x1 - x0) * (x - x0) - return y0 - - def quadratic_extrapolate(x1, y1, x2, y2, x3, y3, x4): - y12=y1-y2 - x23=x2-x3 - y23=y2-y3 - x12=x1-x2 - x13=x1-x3 - if x12==0 or x23==0 or x13==0: - raise ValueError("Input x values must be distinct") - a = (y12*x23-y23*x12) - den = x12*x23*x13 - a = a/den - b = y23/x23 - a*(x2+x3) - c=y1-x1*(a*x1+b) - y4 = a*x4**2 + b*x4 + c - return y4 - - ilim = 0 - if value in nstval: - rank = np.searchsorted(nstval, value) - value = val[rank] - - elif value < nstval[0]: - ilim = -1 - if extrap is None: - value = val[0] - elif extrap == 'linear': - value = linear_extrapolate(nstval[0], val[0], nstval[1], val[1], value) - #value = min(value, val[0]) - elif extrap == 'quadratic' and nreal >= 3: - y_vals = np.unique(val)[:3] - idxs = np.searchsorted(val,y_vals) - x_vals = nstval[idxs] - value = quadratic_extrapolate(x_vals[-3], y_vals[-3], x_vals[-2], y_vals[-2], x_vals[-1], y_vals[-1], value) - #value = min(value, val[0]) - else: - value = val[0] - - elif value > nstval[-1]: - ilim = 1 - if extrap is None: - value = val[-1] - elif extrap == 'linear': - value = linear_extrapolate(nstval[-2], val[-2], nstval[-1], val[-1], value) - #value = max(value, val[-1]) - elif extrap == 'quadratic' and nreal >= 3: - y_vals = np.unique(val)[-3:] - idxs = np.searchsorted(val,y_vals) - x_vals = nstval[idxs] - value = quadratic_extrapolate(x_vals[-3], y_vals[-3], x_vals[-2], y_vals[-2], x_vals[-1], y_vals[-1], value) - #value = max(value, val[-1]) - else: - value = val[-1] - - else: - rank = np.searchsorted(nstval, value) - 1 - # Get the bounding 
x and y values - x0, x1 = nstval[rank], nstval[rank + 1] - y0, y1 = val[rank], val[rank + 1] - # Perform linear interpolation - value = y0 + (y1 - y0) * (value - x0) / (x1 - x0) - - return value, ilim - + if ins_file is None: + ins_file = out_file+".ins" + sdf = pd.read_csv(out_file,index_col=0) + assert sdf.shape[1] == 1, "only one column allowed" + sdf = sdf.iloc[:,0] + with open(ins_file,'w') as f: + f.write("pif ~\n") + f.write("l1\n") + for oname in sdf.index.values: + f.write("l1 ~,~ !{0}!\n".format(oname)) + return From 484865564809aa51f1f9b129ead476152d8f3253 Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 16 Jun 2025 20:08:22 +0100 Subject: [PATCH 09/58] refactor dsi out of EnDS --- pyemu/eds.py | 413 --------------------------------------------------- 1 file changed, 413 deletions(-) diff --git a/pyemu/eds.py b/pyemu/eds.py index 5b68f2611..1bb647098 100644 --- a/pyemu/eds.py +++ b/pyemu/eds.py @@ -11,7 +11,6 @@ from pyemu.mat.mat_handler import Matrix, Jco, Cov from pyemu.pst.pst_handler import Pst from pyemu.utils.os_utils import _istextfile,run -from pyemu.utils.helpers import normal_score_transform,randrealgen_optimized from .logger import Logger @@ -494,415 +493,3 @@ def get_posterior_prediction_moments(self, obslist_dict=None,sim_ensemble=None,i dfper = dfper.loc[groups,self.predictions] return mean_dfs,dfstd,dfper - - - def prep_for_dsi(self,sim_ensemble=None,t_d="dsi_template", - apply_normal_score_transform=False,nst_extrap=None, - use_ztz=False,energy=1.0): - """Setup a new PEST interface for the data-space inversion process. - If the observation data in the Pst object has a "obstransform" column, then observations for which "log" is specified will be subject to log-transformation. - If the `apply_normal_score_transform` flag is set to `True`, then the observations and predictions will be subject to a normal score transform. - - Args: - - sim_ensemble (`pyemu.ObservationEnsemble`): observation ensemble to use for DSI latent space - variables. If `None`, use `self.sim_ensemble`. Default is `None` - t_d (`str`): template directory to setup the DSI model + pest files in. Default is `dsi_template` - apply_normal_score_transform (`bool`): flag to apply a normal score transform to the observations - and predictions. Default is `False` - nst_extrap (`str`): flag to apply extrapolation to the normal score transform. Can be None, 'linear' or 'quadratic'. Default is None. - use_ztz (`bool`): flag to use the condensed ZtZ matrix for SVD. The ZtZ matrix has dimensions nreal*nreal, instead of the nreal*nobs dimensions of Z. - This makes the SVD computation faster and more memory efficient when nobs >> nreal. - Default is `False` - energy (`float`): energy threshold for truncating the sqrt(C) matrix. Default is `1.0` which applies no truncation. 
- - Example:: - - #assumes "my.pst" exists - ends = pyemu.EnDS(ensemble="my.0.obs.jcb",forecasts=["fore1","fore2"]) - ends.prep_for_dsi() #setup a new pest interface() based on the DSI approach - pyemu.os_utils.start_workers("pestpp-ies","my.pst","dsi_template",num_workers=20, - master_dir="dsi_master") - - - - """ - if sim_ensemble is None: - sim_ensemble = self.sim_ensemble.copy() - - if nst_extrap is not None: - assert nst_extrap in ["linear","quadratic"], "nst_extrap must be None, 'linear' or 'quadratic'" - - if os.path.exists(t_d): - self.logger.warn("EnDS.prep_for_dsi(): t_d '{0}' exists, removing...".format(t_d)) - shutil.rmtree(t_d) - os.makedirs(t_d) - - - nz_names = self.pst.nnz_obs_names - snz_names = set(nz_names) - z_names = [n for n in self.pst.obs_names if n not in snz_names] - names = z_names.copy() - names.extend(nz_names) - names.sort() - - # make sure names are sorted - sim_ensemble = sim_ensemble.loc[:,names] - - self.logger.log("applying transformations") - # implement log-transform/offset and normal score transform - transf_names = nz_names.copy() - transf_names.extend(self.predictions) - - if "obstransform" in self.pst.observation_data.columns: - obs = self.pst.observation_data.copy() - #make sure names are ordered - obs = obs.loc[names,:] - #TODO: deal with "scale" and user-specified "offset" - obs["offset"] = 0.0 #TODO: more elegant? in case all 'none' are passed... - obsnmes = obs.loc[obs.obstransform=='log'].obsnme.values - if len(obsnmes) > 0: - for name in obsnmes: - #TODO: make more efficient - self.logger.log("applying obs log-transform to:"+name) - values = sim_ensemble.loc[:,name].astype(float).values - offset = abs(min(values))+1.0 #arbitrary; enforce positive values - values+=offset - assert min(values)>0, "values must be positive. min value is "+str(min(values)) - sim_ensemble.loc[:,name] = np.log10(values) - obs.loc[obs.obsnme==name,'offset'] = offset - obs[['obsnme','obsval','obstransform','offset']].to_csv(os.path.join(t_d,"dsi_obs_transform.csv"),index=False) - #numpy binary for i/o speed - np.save(os.path.join(t_d,"dsi_obs_offset.npy"), - obs.offset.values, - allow_pickle=False, fix_imports=True) - obs['flag'] = 0 - obs.loc[obs.obstransform=='log', "flag"] = 1 - np.save(os.path.join(t_d,"dsi_obs_log.npy"), - obs.flag.values, - allow_pickle=False, fix_imports=True) - - if apply_normal_score_transform: - # prepare for normal score transform - nstval = randrealgen_optimized(sim_ensemble.shape[0]) - back_transform_df = pd.DataFrame() - self.logger.log("applying normal score transform to non-zero obs and predictions") - #TODO: make more efficient - for name in transf_names: - print("transforming:",name) - values = sim_ensemble._df.loc[:,name].copy() - values.sort_values(inplace=True) - if values.iloc[0] != values.iloc[-1]: - # apply smoothing as per DSI2; window sizes are arbitrary... 
- window_size=3 - if values.shape[0]>40: - window_size=5 - if values.shape[0]>90: - window_size=7 - if values.shape[0]>200: - window_size=9 - #print("window size:",window_size,values.shape[0]) - values.loc[:] = moving_average_with_endpoints(values.values, window_size) - transformed_values = [normal_score_transform(nstval, values.values, v)[0] for v in values.values] - #transformed_values, sorted_values, sorted_idxs = normal_score_transform(values) #transformed data retains the same order as the original data - elif values.iloc[0] == values.iloc[-1]: - print("all values are the same, skipping nst") - transformed_values = values.values - sim_ensemble.loc[values.index,name] = transformed_values - df = pd.DataFrame() - df['real'] = values.index - df['sorted_values'] = values.values - df['transformed_values'] = transformed_values - df['nstval'] = nstval - df['obsnme'] = name - back_transform_df=pd.concat([back_transform_df,df],ignore_index=True) - #back_transform_df.to_csv(os.path.join(t_d,"dsi_obs_backtransform.csv"),index=False) - #numpy binary for speed - np.save(os.path.join(t_d,"dsi_obs_backtransformvals.npy"), - back_transform_df[['sorted_values',"nstval"]].values, - allow_pickle=False, fix_imports=True) - np.save(os.path.join(t_d,"dsi_obs_backtransformobsnmes.npy"), - back_transform_df['obsnme'].values, - allow_pickle=True, fix_imports=True) - - self.logger.log("applying transformations") - - self.logger.log("computing projection matrix") - if use_ztz: - self.logger.log("using ztz approach...") - pmat, s = compute_using_ztz(sim_ensemble) - self.logger.log("using ztz approach...") - else: - self.logger.log("using z approach...") - pmat, s = compute_using_z(sim_ensemble) - self.logger.log("using z approach...") - self.logger.log("computing projection matrix") - - self.logger.log("applying truncation...") - apply_energy_based_truncation(energy,s,pmat) - self.logger.log("applying truncation...") - - self.logger.log("creating tpl files") - dsi_in_file = os.path.join(t_d, "dsi_pars.csv") - dsi_tpl_file = dsi_in_file + ".tpl" - ftpl = open(dsi_tpl_file, 'w') - fin = open(dsi_in_file, 'w') - ftpl.write("ptf ~\n") - fin.write("parnme,parval1\n") - ftpl.write("parnme,parval1\n") - npar = s.shape[0] - dsi_pnames = [] - for i in range(npar): - pname = "dsi_par{0:04d}".format(i) - dsi_pnames.append(pname) - fin.write("{0},0.0\n".format(pname)) - ftpl.write("{0},~ {0} ~\n".format(pname, pname)) - fin.close() - ftpl.close() - - mn_vec = sim_ensemble.mean(axis=0) - # check that sim_ensemble has names ordered - assert (mn_vec.index.values == names).all(), "sim_ensemble names are not ordered" - mn_in_file = os.path.join(t_d, "dsi_pr_mean.csv") - mn_tpl_file = mn_in_file + ".tpl" - fin = open(mn_in_file, 'w') - ftpl = open(mn_tpl_file, 'w') - ftpl.write("ptf ~\n") - fin.write("obsnme,mn\n") - ftpl.write("obsnme,mn\n") - mn_dict = {} - for oname in names: - pname = "dsi_prmn_{0}".format(oname) - fin.write("{0},{1}\n".format(oname, mn_vec[oname])) - ftpl.write("{0},~ {1} ~\n".format(oname, pname)) - mn_dict[pname] = mn_vec[oname] - fin.close() - ftpl.close() - self.logger.log("creating tpl files") - - self.logger.log("saving proj mat") - #row_names = ["sing_vec_{0}".format(i) for i in range(pmat.shape[0])] - pmat = Matrix(x=pmat,col_names=dsi_pnames,row_names=names) - pmat.col_names = dsi_pnames - #proj_name = "dsi_proj_mat.jcb" # dont change this name!!! - proj_name = "dsi_proj_mat.npy" # dont change this name!!! 
- proj_path = os.path.join(t_d,proj_name) - #pmat.to_coo(proj_path) - # use numpy for speed - np.save(os.path.join(t_d,proj_name), pmat.x, allow_pickle=False, fix_imports=True) - - self.logger.statement("projection matrix dimensions:"+str(pmat.shape)) - self.logger.statement("projection matrix saved to "+proj_path) - self.logger.log("saving proj mat") - - - # this is the dsi forward run function - it is harded coded below! - def dsi_forward_run(): - import os - import numpy as np - import pandas as pd - from pyemu.utils.helpers import inverse_normal_score_transform - pmat = np.load("dsi_proj_mat.npy") - pvals = pd.read_csv("dsi_pars.csv",index_col=0) - ovals = pd.read_csv("dsi_pr_mean.csv",index_col=0) - sim_vals = ovals + np.dot(pmat,pvals.values) - filename = "dsi_obs_backtransformvals.npy" - if os.path.exists(filename): - print("applying back-transform") - backtransformvals = np.load("dsi_obs_backtransformvals.npy") - backtransformobsnmes = np.load("dsi_obs_backtransformobsnmes.npy",allow_pickle=True) - obsnmes = np.unique(backtransformobsnmes) - back_vals = [ - inverse_normal_score_transform( - backtransformvals[np.where(backtransformobsnmes==o)][:,1], - backtransformvals[np.where(backtransformobsnmes==o)][:,0], - sim_vals.loc[o].mn, - extrap=None - )[0] - for o in obsnmes - ] - sim_vals.loc[obsnmes,'mn'] = back_vals - if os.path.exists("dsi_obs_transform.csv"): - print("reversing log-transform") - offset = np.load("dsi_obs_offset.npy") - log_trans = np.load("dsi_obs_log.npy") - assert log_trans.shape[0] == sim_vals.mn.values.shape[0], f"log transform shape mismatch: {log_trans.shape[0]},{sim_vals.mn.values.shape[0]}" - assert offset.shape[0] == sim_vals.mn.values.shape[0], f"offset transform shape mismatch: {offset.shape[0]},{sim_vals.mn.values.shape[0]}" - vals = sim_vals.mn.values - vals[np.where(log_trans==1)] = 10**vals[np.where(log_trans==1)] - vals-= offset - sim_vals.loc[:,'mn'] = vals - #print(sim_vals) - sim_vals.to_csv("dsi_sim_vals.csv") - - self.logger.log("test run") - b_d = os.getcwd() - os.chdir(t_d) - dsi_forward_run() - os.chdir(b_d) - self.logger.log("test run") - - self.logger.log("creating ins file") - out_file = os.path.join(t_d,"dsi_sim_vals.csv") - ins_file = out_file + ".ins" - sdf = pd.read_csv(out_file,index_col=0) - with open(ins_file,'w') as f: - f.write("pif ~\n") - f.write("l1\n") - for oname in sdf.index.values: - f.write("l1 ~,~ !{0}!\n".format(oname)) - self.logger.log("creating ins file") - - self.logger.log("creating Pst") - pst = Pst.from_io_files([mn_tpl_file,dsi_tpl_file],[mn_in_file,dsi_in_file],[ins_file],[out_file],pst_path=".") - - par = pst.parameter_data - dsi_pars = par.loc[par.parnme.str.startswith("dsi_par"),"parnme"] - par.loc[dsi_pars,"parval1"] = 0 - par.loc[dsi_pars,"parubnd"] = 10.0 - par.loc[dsi_pars,"parlbnd"] = -10.0 - par.loc[dsi_pars,"partrans"] = "none" - with open(os.path.join(t_d,"dsi.unc"),'w') as f: - f.write("START STANDARD_DEVIATION\n") - for p in dsi_pars: - f.write("{0} 1.0\n".format(p)) - f.write("END STANDARD_DEVIATION") - pst.pestpp_options['parcov'] = "dsi.unc" - - mn_pars = par.loc[par.parnme.str.startswith("dsi_prmn"),"parnme"] - par.loc[mn_pars,"partrans"] = "fixed" - for pname,pval in mn_dict.items(): - par.loc[pname,"parval1"] = pval - par.loc[pname, "parubnd"] = pval + 1000 - par.loc[pname, "parlbnd"] = pval - 1000 - - obs = pst.observation_data - org_obs = self.pst.observation_data - for col in org_obs.columns: - obs.loc[org_obs.obsnme,col] = org_obs.loc[:,col] - pst.control_data.noptmax = 0 - 
pst.model_command = "python forward_run.py" - self.logger.log("creating Pst") - import inspect - #print([l for l in inspect.getsource(dsi_forward_run).split("\n")]) - lines = [line[12:] for line in inspect.getsource(dsi_forward_run).split("\n")][1:] - with open(os.path.join(t_d,"forward_run.py"),'w') as f: - for line in lines: - if nst_extrap is not None: - if "extrap=None" in line: - line = line.replace("None",f"'{nst_extrap}'") - f.write(line+"\n") - pst.write(os.path.join(t_d,"dsi.pst"),version=2) - self.logger.statement("saved pst to {0}".format(os.path.join(t_d,"dsi.pst"))) - try: - run("pestpp-ies dsi.pst",cwd=t_d) - except Exception as e: - self.logger.warn("error testing noptmax=0 run:{0}".format(str(e))) - - return pst - - -def compute_using_z(sim_ensemble): - z = sim_ensemble.get_deviations() / np.sqrt(float(sim_ensemble._df.shape[0] - 1)) - z = z.values - u, s, v = np.linalg.svd(z, full_matrices=False) - us = np.dot(v.T, np.diag(s)) - return us,s - -def compute_using_ztz(sim_ensemble): - # rval are the transformed obs values - rval = sim_ensemble._df.copy() - #mu2 is the mean of the transformed obs values - mu2 = rval.mean() - #adjust rval by subtracting mu2 - rval -= mu2 - #divide rval by the sqrt of nreal-1 - nreal = rval.shape[0] - rval = rval*np.sqrt(1/(nreal-1)) - # rval.T to match pest utils implementation - z = rval.T.values - # Compute the ZtZ matrix - ztz = np.dot(z.T,z) - assert ztz.shape[0] == z.shape[1], "ZtZ matrix is not square" - assert ztz.shape[0] == sim_ensemble.shape[0], "ZtZ matrix is not nreal*nreal" - - #We now do SVD on ZtZ. - print("doing SVD on ZtZ") - u, s2, v = np.linalg.svd(ztz, full_matrices=False) - s = np.sqrt(s2) - s[z.shape[0]:] = 0 #truncation to match compute_using_z() - - # formulate the sqrt of the covariance matrix - us = np.dot(z,u) - return us, s - -def apply_energy_based_truncation(energy,s,us): - if energy >= 1.0: - print("Warning: energy>=1.0, no truncation applied") - return us - # Determine where to truncate - # Determine nn - if us.shape[0]==us.shape[1]: - nn = us.shape[0] - 1 - else: - nobs = us.shape[0] - nreal = us.shape[1] - nn = min(nobs, nreal) - 1 - # Compute total_energy - total_energy = np.sum((np.sqrt(s))[:nn]) - # Find energy truncation point - ntrunc = np.where((np.sqrt(s)).cumsum()/total_energy<=energy)[0].shape[0] - # Initialize threshold - #s1 = s[0] - #thresh = 1.0e-7 * s1 #NOTE: JDoh's implementation uses an additional level of truncation - #ntrunc = min(np.where(s>=thresh)[0][0], ntrunc)+1 - ntrunc=ntrunc+1 - if ntrunc>=us.shape[1]: - print("ntrunc>=us.shape[1], no truncation applied") - else: - print("truncating to {0} singular values".format(ntrunc)) - # Apply threshold logic - us = us[:,:ntrunc] - return us - -def moving_average_with_endpoints(y_values, window_size): - # Ensure the window size is odd - if window_size % 2 == 0: - raise ValueError("window_size must be odd") - # Calculate half-window size - half_window = window_size // 2 - # Initialize the output array - smoothed_y = np.zeros_like(y_values) - # Handle the endpoints - for i in range(0,half_window): - # Start - smoothed_y[i] = np.mean(y_values[:i + half_window ]) - for i in range(1,half_window+1): - # End - smoothed_y[-i] = np.mean(y_values[::-1][:i + half_window +1]) - # Handle the middle part with full window - for i in range(half_window, len(y_values) - half_window): - smoothed_y[i] = np.mean(y_values[i - half_window:i + half_window]) - #Enforce endpoints - smoothed_y[0] = y_values[0] - smoothed_y[-1] = y_values[-1] - # Ensure uniqueness by adding 
small increments if values are duplicated - #NOTE: this is a hack to ensure uniqueness in the normal score transform - smoothed_y = make_unique(smoothed_y, delta=1e-10) - return smoothed_y - - -def make_unique(arr, delta=1e-10): - """ - Modifies a sorted numpy array in-place to ensure all elements are unique. - - Parameters: - arr (np.ndarray): The sorted numpy array. - delta (float): The minimum increment to apply to duplicate elements. - Default is a very small value (1e-10). - """ - for i in range(1, len(arr)): - if arr[i] <= arr[i - 1]: - arr[i] = arr[i - 1] + delta - - return arr From c776d8fb416f2a265b0bd54974aa878c87a4a254 Mon Sep 17 00:00:00 2001 From: rhugman Date: Tue, 17 Jun 2025 11:00:24 +0100 Subject: [PATCH 10/58] initial tests commit --- autotest/dsi_tests.py | 108 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 autotest/dsi_tests.py diff --git a/autotest/dsi_tests.py b/autotest/dsi_tests.py new file mode 100644 index 000000000..724973c09 --- /dev/null +++ b/autotest/dsi_tests.py @@ -0,0 +1,108 @@ +import os +import sys +import shutil +import pytest +import numpy as np +import pandas as pd +import platform +import pyemu +from pst_from_tests import setup_tmp, ies_exe_path, _get_port +from pyemu.emulators import DSI + + +#def test_dsi_feature_transforms(): +# """Test feature transforms in DSI emulator""" +# # Create test data simulating an ensemble +# np.random.seed(42) +# n_reals = 10 +# n_obs = 5 +# sim_names = [f"obs{i}" for i in range(n_obs)] +# sim_data = np.random.lognormal(mean=0, sigma=1, size=(n_reals, n_obs)) +# sim_ensemble = pd.DataFrame(sim_data, columns=sim_names) +# +# # Create DSI emulator +# pst = pyemu.Pst.from_par_obs_names(["p1"], sim_names) +# dsi = pyemu.emulators.DSI( +# pst=pst, +# sim_ensemble=sim_ensemble, +# transforms = [{"type": "log10", "columns": sim_names}, +# {"type": "normal_score", "columns": sim_names}], +# +# ) +# +# # Test feature transforms +# dsi.apply_feature_transforms() +# +# # Check that transformed data exists +# assert dsi.data_transformed is not None +# +# # Check log transform was applied (values should be smaller than original lognormal data) +# assert dsi.data_transformed.mean().mean() < sim_ensemble.mean().mean() +# +# # Check the feature transformer object exists +# assert hasattr(dsi, "feature_transformer") +# +# # Test with specific columns for log transform +# dsi2 = pyemu.emulators.DSI( +# pst=pst, +# sim_ensemble=sim_ensemble, +# transforms = [{"type": "log10", "columns": sim_names[:2]}] +# ) +# dsi2.apply_feature_transforms() +# +# # Check only specified columns were log transformed +# orig_means = sim_ensemble.mean() +# transformed_means = dsi2.data_transformed.mean() +# +# for i, col in enumerate(sim_names): +# if i < 2: # Should be log transformed +# assert transformed_means[col] < orig_means[col] +# else: # Should be unchanged +# assert np.isclose(transformed_means[col], orig_means[col]) + +def test_dsi_freyberg(tmp_d): + + test_d = "ends_master" + test_d = setup_tmp(test_d, tmp_d) + + case = "freyberg6_run_ies" + pst_name = os.path.join(test_d, case + ".pst") + pst = pyemu.Pst(pst_name) + predictions = ["headwater_20171130", "tailwater_20161130", "trgw_0_9_1_20161130"] + pst.pestpp_options["predictions"] = predictions + + oe_name = pst_name.replace(".pst", ".0.obs.csv") + oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=oe_name).iloc[:100, :] + data = oe._df.copy() + + dsi = DSI(sim_ensemble=data) + dsi.apply_feature_transforms() + dsi.fit() + + # history 
match + obsdata = pst.observation_data.copy() + td = "template_dsi" + pstdsi = dsi.prepare_pestpp(td,observation_data=obsdata) + pstdsi.control_data.noptmax = 3 + pstdsi.pestpp_options["ies_num_reals"] = 100 + pstdsi.write(os.path.join(td, "dsi.pst"),version=2) + + pvals = pd.read_csv(os.path.join(td, "dsi_pars.csv"), index_col=0) + md = "master_dsi" + num_workers= 3 + worker_root = "." + pyemu.os_utils.start_workers( + td,ies_exe_path,"dsi.pst", num_workers=num_workers, + worker_root=worker_root, master_dir=md, port=_get_port(), + ppw_function=pyemu.helpers.dsi_pyworker, + ppw_kwargs={ + "dsi": dsi, "pvals": pvals, + } + ) + + + return + + +if __name__ == "__main__": + test_dsi_freyberg("temp") \ No newline at end of file From 2b735750ed35803ef1089cda6d82bed752a5f339 Mon Sep 17 00:00:00 2001 From: rhugman Date: Tue, 17 Jun 2025 13:40:29 +0100 Subject: [PATCH 11/58] Portmanager class for dsivc --- pyemu/utils/os_utils.py | 184 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 180 insertions(+), 4 deletions(-) diff --git a/pyemu/utils/os_utils.py b/pyemu/utils/os_utils.py index 1247cf457..d16183662 100644 --- a/pyemu/utils/os_utils.py +++ b/pyemu/utils/os_utils.py @@ -13,6 +13,12 @@ import socket import time from datetime import datetime +import random +import logging +import tempfile +from contextlib import contextmanager +import json +import uuid import numpy as np import pandas as pd @@ -948,10 +954,180 @@ def send_killed_run(self,group=None,runid=None,desc="killed"): + +class PortManager(object): + """Cross-platform port manager for parallel processes.""" + def __init__(self, + port_range=(4004, 65535), + lock_dir=None, + max_retries=50, + lock_timeout=5, + log_level=logging.INFO): + """ + Initialize the port manager. + Args: + port_range: Tuple of (min_port, max_port) to search within + lock_dir: Directory to store lock files (default: system temp dir) + max_retries: Maximum attempts to find an available port + lock_timeout: Time in seconds after which a lock is considered stale + """ + # Set up instance-specific logger + self.logger = logging.getLogger(f"{__name__}.PortManager.{id(self)}") + self.logger.setLevel(log_level) + # Add a handler if none exists + if not self.logger.handlers: + handler = logging.StreamHandler() + formatter = logging.Formatter( + '%(asctime)s - %(processName)s - %(levelname)s - %(message)s' + ) + handler.setFormatter(formatter) + self.logger.addHandler(handler) + self.min_port, self.max_port = port_range + self.lock_dir = lock_dir or os.path.join(tempfile.gettempdir(), "port_locks") + self.max_retries = max_retries + self.lock_timeout = lock_timeout + # Ensure lock directory exists + os.makedirs(self.lock_dir, exist_ok=True) + # Generate a unique ID for this process instance + self.instance_id = str(uuid.uuid4()) + + def _is_port_available(self, port): + """Check if a port is available by attempting to bind to it.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + # Set socket to reuse address to handle TIME_WAIT state + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + s.bind(('localhost', port)) + return True + except (socket.error, OSError): + return False + + def _get_lock_file(self, port): + """Get the path to the lock file for a specific port.""" + return os.path.join(self.lock_dir, f"port_{port}.lock") + + def _clean_stale_locks(self): + """Remove stale lock files based on timeout.""" + now = time.time() + try: + for filename in os.listdir(self.lock_dir): + if filename.startswith("port_") and 
filename.endswith(".lock"): + lock_path = os.path.join(self.lock_dir, filename) + if os.path.exists(lock_path): + # Check if lock is stale + if now - os.path.getmtime(lock_path) > self.lock_timeout: + try: + os.remove(lock_path) + self.logger.debug(f"Removed stale lock file: {lock_path}") + except OSError: + # Another process might have removed it already + pass + except Exception as e: + self.logger.warning(f"Error cleaning stale locks: {e}") + + @contextmanager + def _try_lock_port(self, port): + """ + Try to create a lock file for a port using a cross-platform approach. + Uses atomic file creation to implement locking. + """ + lock_file = self._get_lock_file(port) + lock_acquired = False + try: + # Try to create the lock file - will only succeed if it doesn't exist + lock_data = { + "pid": os.getpid(), + "instance_id": self.instance_id, + "timestamp": time.time() + } + try: + # Try exclusive creation of the file (atomic operation) + with open(lock_file, 'x') as f: + json.dump(lock_data, f) + lock_acquired = True + yield True + except FileExistsError: + # Lock file already exists + try: + # Check if lock file is stale + if os.path.exists(lock_file): + if time.time() - os.path.getmtime(lock_file) > self.lock_timeout: + # Lock is stale, try to replace it + try: + os.remove(lock_file) + with open(lock_file, 'x') as f: + json.dump(lock_data, f) + lock_acquired = True + yield True + return + except (FileExistsError, OSError): + # Failed to acquire lock + pass + except OSError: + pass + yield False + finally: + # Clean up the lock file if we created it + if lock_acquired: + try: + if os.path.exists(lock_file): + os.remove(lock_file) + except OSError as e: + self.logger.warning(f"Error removing lock file for port {port}: {e}") + + def get_available_port(self): + """ + Find and reserve an available port. + Returns: + An available port number. + Raises: + RuntimeError: If no available port can be found after max_retries. 
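+        Example:
+            # minimal usage sketch: reserve a port directly, or use the
+            # reserved_port() context manager to release the lock file when done
+            pm = PortManager()
+            port = pm.get_available_port()
+            with pm.reserved_port() as port:
+                ...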
+ """ + # Clean up stale locks first + self._clean_stale_locks() + # Shuffle port range to distribute port selection + port_list = list(range(self.min_port, self.max_port + 1)) + random.shuffle(port_list) + attempts = 0 + while attempts < self.max_retries: + # Pick a random port from our shuffled list + if not port_list: + raise RuntimeError("Exhausted all ports in range") + port = port_list.pop(0) + attempts += 1 + # First check if port is available + if not self._is_port_available(port): + continue + # Try to acquire a lock + with self._try_lock_port(port) as locked: + if not locked: + # Another process got this port + continue + # Double-check port is still available after locking + if self._is_port_available(port): + self.logger.info(f"Reserved port {port} for process {os.getpid()}") + return port + raise RuntimeError(f"Could not find available port after {self.max_retries} attempts") + + @contextmanager + def reserved_port(self): + """Context manager that reserves a port and releases it after use.""" + port = self.get_available_port() + lock_file = self._get_lock_file(port) + try: + yield port + finally: + # Release the port by removing the lock file + if os.path.exists(lock_file): + try: + os.remove(lock_file) + self.logger.info(f"Released port {port}") + except OSError as e: + self.logger.warning(f"Error releasing port {port}: {e}") + + if __name__ == "__main__": host = "localhost" - port = 4004 + port = PortManager().get_available_port() ppw = PyPestWorker(None,host,port) - #ppw.initialize() - - + #ppw.initialize() \ No newline at end of file From 069fadc1ab1a03aee5707f0c48a38f12f23817fd Mon Sep 17 00:00:00 2001 From: rhugman Date: Tue, 17 Jun 2025 13:54:38 +0100 Subject: [PATCH 12/58] adding ies_exe path arg to dsivc_fwd run fnx to deal with pytest --- pyemu/utils/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index 24cf748dc..b109fb7e4 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -4426,7 +4426,7 @@ def dsi_forward_run(pvals,dsi,write_csv=False): sim_vals.to_csv("dsi_sim_vals.csv") return sim_vals -def dsivc_forward_run(md_ies="."): +def dsivc_forward_run(md_ies=".",ies_exe_path="pestpp-ies"): import pandas as pd import pyemu import os @@ -4487,7 +4487,7 @@ def dsivc_forward_run(md_ies="."): worker_root="." 
dsi = pickle.load(open(os.path.join(md_ies,"dsi.pickle"),"rb")) num_workers = dsi.dsivc_args.get("num_pyworkers",1) - pyemu.os_utils.start_workers(md_ies,"pestpp-ies","dsi.pst", + pyemu.os_utils.start_workers(md_ies,ies_exe_path,"dsi.pst", num_workers=num_workers, worker_root=worker_root, port = PortManager().get_available_port(), From eb26410c8996cb3b7fb41e69427c93ddf942952b Mon Sep 17 00:00:00 2001 From: rhugman Date: Tue, 17 Jun 2025 13:56:22 +0100 Subject: [PATCH 13/58] updates to dsivc for pytest' --- pyemu/emulators/dsi.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py index 6891f18e8..02f3e3277 100755 --- a/pyemu/emulators/dsi.py +++ b/pyemu/emulators/dsi.py @@ -80,7 +80,7 @@ def __init__(self, super().__init__(verbose=verbose) - self.__org_observation_data = pst.observation_data.copy() if pst is not None else None + self.observation_data = pst.observation_data.copy() if pst is not None else None #self.__org_parameter_data = pst.parameter_data.copy() if pst is not None else None #self.__org_control_data = pst.control_data.copy() #breaks pickling if isinstance(sim_ensemble, ObservationEnsemble): @@ -102,35 +102,31 @@ def __init__(self, # check for quadratic_extrapolation if 'quadratic_extrapolation' in t: assert isinstance(t['quadratic_extrapolation'], bool), "'quadratic_extrapolation' must be a boolean" - self.transforms = transforms + self.transforms = transforms self.fitted = False self.data_transformed = None self.decision_variable_names = None #used for DSIVC def prepare_training_data(self, data=None): """ - Prepare training data by applying transformations and computing the projection matrix. - - This method follows these steps: - 1. Apply feature transformations (log transform, normal score transform) - 2. Compute projection matrix using SVD + Prepare and transform training data for model fitting. Parameters ---------- data : pandas.DataFrame, optional - Data to prepare. If None, uses self.data. + Raw training data. If None, uses self.data. Returns ------- - pandas.DataFrame - The prepared data. + tuple + Processed data ready for model fitting. 
""" if data is None: data = self.data - if data is None: raise ValueError("No data provided and no data stored in the emulator") - + + self.logger.statement("applying feature transforms") # Always use the base class transformation method for consistency if self.transforms is not None: self.data_transformed = self.apply_feature_transforms(data, self.transforms) @@ -342,8 +338,10 @@ def prepare_pestpp(self, t_d=None, observation_data=None): obs = pst.observation_data - if observation_data is None: - observation_data = self.__org_observation_data + if observation_data is not None: + self.observation_data = observation_data + else: + observation_data = self.observation_data assert isinstance(observation_data, pd.DataFrame), "observation_data must be a pandas DataFrame" for col in observation_data.columns: obs.loc[sim_vals.index,col] = observation_data.loc[:,col] @@ -382,7 +380,7 @@ def prepare_pestpp(self, t_d=None, observation_data=None): pickle.dump(self,f) return pst - def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=False, dsi_args=None, percentiles=[0.25,0.75,0.5], mou_population_size=None): + def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=False, dsi_args=None, percentiles=[0.25,0.75,0.5], mou_population_size=None,ies_exe_path="pestpp-ies"): """ Prepare Data Space Inversion Variable Control (DSIVC) control files. @@ -532,6 +530,7 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F par.loc[decvar_names,"partrans"] = "none" par.loc[decvar_names,"parubnd"] = self.data.loc[:,decvar_names].max() par.loc[decvar_names,"parlbnd"] = self.data.loc[:,decvar_names].min() + par.loc[decvar_names,"parval1"] = self.data.loc[:,decvar_names].quantile(.5) self.logger.statement(f"zero-weighting observation data...") # prepemtpively set obs weights 0.0 @@ -545,6 +544,9 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F obs.loc[obs.obsnme.str.startswith(o), columns] = obsorg.loc[obsorg.obsnme==o, columns].values obs.loc[stack_stats.index,"obgnme"] = "stack_stats" + obs.loc[stack_stats.index,"org_obsnme"] = [i.split("_stat:")[0] for i in stack_stats.index.values] + pst_dsivc.try_parse_name_metadata() + #obs.loc[stack.index,"obgnme"] = "stack" self.logger.statement(f"building dsivc_forward_run.py...") @@ -555,7 +557,7 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F file.write(function_source) file.write("\n\n") file.write("if __name__ == \"__main__\":\n") - file.write(f" {function_source.split('(')[0].split('def ')[1]}()\n") + file.write(f" {function_source.split('(')[0].split('def ')[1]}(ies_exe_path='{ies_exe_path}')\n") self.logger.statement(f"preparing nominal initial population...") if mou_population_size is None: From 4dcbeb47e0667e73896336e4969218d46c25d9c8 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 09:41:34 +0100 Subject: [PATCH 14/58] checkin dsi --- autotest/dsi_tests.py | 102 ++++++++++++++++++++++++++++++++++++++--- pyemu/emulators/dsi.py | 3 +- 2 files changed, 98 insertions(+), 7 deletions(-) diff --git a/autotest/dsi_tests.py b/autotest/dsi_tests.py index 724973c09..7e267a214 100644 --- a/autotest/dsi_tests.py +++ b/autotest/dsi_tests.py @@ -60,7 +60,7 @@ # else: # Should be unchanged # assert np.isclose(transformed_means[col], orig_means[col]) -def test_dsi_freyberg(tmp_d): +def dsi_freyberg(tmp_d,transforms=None,tag=""): test_d = "ends_master" test_d = setup_tmp(test_d, tmp_d) @@ -75,21 +75,26 @@ def 
test_dsi_freyberg(tmp_d): oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=oe_name).iloc[:100, :] data = oe._df.copy() - dsi = DSI(sim_ensemble=data) + dsi = DSI(sim_ensemble=data,transforms=transforms) dsi.apply_feature_transforms() dsi.fit() # history match obsdata = pst.observation_data.copy() + if "quadratic_extrapolation" in transforms[0].keys(): + nzobs = obsdata.loc[obsdata.weight>0].obsnme.tolist() + ovals = oe.loc[:,nzobs].max(axis=0) * 1.1 + obsdata.loc[nzobs,"obsval"] = ovals.values + td = "template_dsi" pstdsi = dsi.prepare_pestpp(td,observation_data=obsdata) - pstdsi.control_data.noptmax = 3 + pstdsi.control_data.noptmax = 1 pstdsi.pestpp_options["ies_num_reals"] = 100 pstdsi.write(os.path.join(td, "dsi.pst"),version=2) pvals = pd.read_csv(os.path.join(td, "dsi_pars.csv"), index_col=0) - md = "master_dsi" - num_workers= 3 + md = f"master_dsi{tag}" + num_workers = 1 worker_root = "." pyemu.os_utils.start_workers( td,ies_exe_path,"dsi.pst", num_workers=num_workers, @@ -99,10 +104,95 @@ def test_dsi_freyberg(tmp_d): "dsi": dsi, "pvals": pvals, } ) + return + +def test_dsi_basic(tmp_d="temp"): + dsi_freyberg(tmp_d,transforms=None) + return +def test_dsi_nst(tmp_d="temp"): + transforms = [ + {"type": "normal_score", } + ] + dsi_freyberg(tmp_d,transforms=transforms) + return +def test_dsi_nst_extrap(tmp_d="temp"): + transforms = [ + {"type": "normal_score", "quadratic_extrapolation":True} + ] + dsi_freyberg(tmp_d,transforms=transforms) return +def test_dsi_mixed(tmp_d="temp"): + transforms = [ + {"type": "log10", "columns": ["headwater_20171130", "tailwater_20161130"]}, + {"type": "normal_score", } + ] + dsi_freyberg(tmp_d,transforms=transforms) + return + +def test_dsivc_freyberg(): + + md_hm = "master_dsi" + assert os.path.exists(md_hm), f"Master directory {md_hm} does not exist." + td = "template_dsivc" + if os.path.exists(td): + shutil.rmtree(td) + shutil.copytree(md_hm, td) + + dsi = DSI.load(os.path.join(td, "dsi.pickle")) + + pst = pyemu.Pst(os.path.join(td, "dsi.pst")) + oe = pyemu.ObservationEnsemble.from_binary(pst,os.path.join(td, "dsi.1.obs.jcb")) + + obsdata = dsi.observation_data + decvars = obsdata.loc[obsdata.obgnme=="out_wel"].obsnme.tolist() + pstdsivc = dsi.prepare_dsivc(t_d=td, + oe=oe, + decvar_names=decvars, + track_stack=False, + percentiles=[0.05, 0.25, 0.5, 0.75, 0.95], + dsi_args={ + "noptmax":3, + "decvar_weight":10.0, + "num_pyworkers":1, + } + ) + + obs = pstdsivc.observation_data + obs.org_obsnme.unique() + + obsnme = obsdata.loc[obsdata.obgnme=="tailwater"].obsnme.tolist()[-1] + mou_objectives = obs.loc[(obs.org_obsnme==obsnme) & (obs.stat=="50%")].obsnme.tolist() + + pstdsivc.pestpp_options["mou_objectives"] = mou_objectives + obs.loc[mou_objectives, "weight"] = 1.0 + obs.loc[mou_objectives, "obgnme"] = "less_than_obj" + + pstdsivc.control_data.noptmax = 1 #just for testing + pstdsivc.pestpp_options["mou_population_size"] = 10 #just for testing + + pstdsivc.write(os.path.join(td, "dsivc.pst"),version=2) + + md = "master_dsivc" + num_workers = 1 + worker_root = "." 
+ + pyemu.os_utils.start_workers(td, + "pestpp-mou", + "dsivc.pst", + num_workers=num_workers, + worker_root=worker_root, + master_dir=md, + port=_get_port(),) + + + if __name__ == "__main__": - test_dsi_freyberg("temp") \ No newline at end of file + #test_dsi_basic() + #test_dsi_nst() + #test_dsi_nst_extrap() + #test_dsi_mixed() + test_dsivc_freyberg() \ No newline at end of file diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py index 02f3e3277..eb3be1914 100755 --- a/pyemu/emulators/dsi.py +++ b/pyemu/emulators/dsi.py @@ -402,7 +402,8 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F Percentiles to calculate. Default is [0.25, 0.75, 0.5]. mou_population_size : int, optional Population size for multi-objective optimization. - + ies_exe_path : str, optional + Path to the PEST++ IES executable. Default is "pestpp-ies". Returns ------- Pst From 5b1ad836b20df4b56ce588843891945565966195 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 09:47:41 +0100 Subject: [PATCH 15/58] docstrings --- pyemu/emulators/dsi.py | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py index eb3be1914..ec4c4a599 100755 --- a/pyemu/emulators/dsi.py +++ b/pyemu/emulators/dsi.py @@ -15,33 +15,9 @@ class DSI(Emulator): """ - Data Space Inversion emulator class. - - #TODO: add more docstring details - - Parameters - ---------- - pst : Pst, optional - A Pst object. If provided, the emulator will be initialized with the - information from the Pst object. - sim_ensemble : ObservationEnsemble, optional - An ensemble of simulated observations. If provided, the emulator will - be initialized with the information from the ensemble. - transforms : list of dict, optional - List of transformation specifications. Each dict should have: - - 'type': str - Type of transformation (e.g.,'log10', 'normal_score'). - - 'columns': list of str,optional - Columns to apply the transformation to. If not supplied, transformation is applied to all columns. - - Additional kwargs for the transformation (e.g., 'quadratic_extrapolation' for normal score transform). - Example: - transforms = [ - {'type': 'log10', 'columns': ['obs1', 'obs2']}, - {'type': 'normal_score', 'quadratic_extrapolation': True} - ] - Default is None, which means no transformations will be applied. - energy_threshold : float, optional - The energy threshold for the SVD. Default is 1.0, no truncation. - verbose : bool, optional - If True, enable verbose logging. Default is False. + Data Space Inversion (DS) emulator class. Based on DSI as described in Sun & + Durlofsky (2017) and Sun et al (2017). 
+ """ def __init__(self, From 7d3fbff783f94654142ed71e99b88e383ca98e74 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 09:50:05 +0100 Subject: [PATCH 16/58] init --- pyemu/emulators/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyemu/emulators/__init__.py b/pyemu/emulators/__init__.py index 3bd39b1da..5b521dceb 100755 --- a/pyemu/emulators/__init__.py +++ b/pyemu/emulators/__init__.py @@ -8,9 +8,11 @@ AutobotsAssemble ) from .base import Emulator +from .dsi import DSI __all__ = [ 'Emulator', #base Emulator Class + 'DSI', # DSI Emulator Class 'BaseTransformer', 'Log10Transformer', 'RowWiseMinMaxScaler', From 8f57091edaa27b7e8c3171f0e01a79fb2d0a3495 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 09:51:10 +0100 Subject: [PATCH 17/58] init --- pyemu/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pyemu/__init__.py b/pyemu/__init__.py index 9b88113c7..85e9fe45d 100644 --- a/pyemu/__init__.py +++ b/pyemu/__init__.py @@ -20,7 +20,13 @@ from .sc import Schur from .utils import (geostats, gw_utils, helpers, metrics, optimization, os_utils, pp_utils, smp_utils) -from .emulators import (Emulator, BaseTransformer, Log10Transformer, +from .emulators import ( + #emulators + Emulator, DSI, + + + #transformers + BaseTransformer, Log10Transformer, RowWiseMinMaxScaler, StandardScalerTransformer, NormalScoreTransformer, TransformerPipeline, AutobotsAssemble) #from .prototypes import * From e83f18f8f3bfe4f833e529ab41dc2bf22e779cc5 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 10:43:02 +0100 Subject: [PATCH 18/58] fix to dsi tests --- autotest/dsi_tests.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/autotest/dsi_tests.py b/autotest/dsi_tests.py index 7e267a214..3f0fa04a8 100644 --- a/autotest/dsi_tests.py +++ b/autotest/dsi_tests.py @@ -81,10 +81,11 @@ def dsi_freyberg(tmp_d,transforms=None,tag=""): # history match obsdata = pst.observation_data.copy() - if "quadratic_extrapolation" in transforms[0].keys(): - nzobs = obsdata.loc[obsdata.weight>0].obsnme.tolist() - ovals = oe.loc[:,nzobs].max(axis=0) * 1.1 - obsdata.loc[nzobs,"obsval"] = ovals.values + if transforms is not None: + if "quadratic_extrapolation" in transforms[0].keys(): + nzobs = obsdata.loc[obsdata.weight>0].obsnme.tolist() + ovals = oe.loc[:,nzobs].max(axis=0) * 1.1 + obsdata.loc[nzobs,"obsval"] = ovals.values td = "template_dsi" pstdsi = dsi.prepare_pestpp(td,observation_data=obsdata) @@ -157,7 +158,8 @@ def test_dsivc_freyberg(): "noptmax":3, "decvar_weight":10.0, "num_pyworkers":1, - } + }, + ies_exe_path=ies_exe_path, ) obs = pstdsivc.observation_data From 61e8d9528542fcc7060981f0ced4dcc6dbff4804 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 10:45:54 +0100 Subject: [PATCH 19/58] moved dsi tests to dsi_tests.py --- autotest/la_tests.py | 157 ------------------------------------------- 1 file changed, 157 deletions(-) diff --git a/autotest/la_tests.py b/autotest/la_tests.py index 5ec2f8640..9b426c8f2 100644 --- a/autotest/la_tests.py +++ b/autotest/la_tests.py @@ -595,167 +595,10 @@ def ends_freyberg_test(tmp_path): -def ends_run_freyberg_dsi(tmp_d, nst=False, nst_extrap=None, ztz=False, energy=1.0): - import pyemu - import os - import pandas as pd - import numpy as np - test_d = "ends_master" - test_d = setup_tmp(test_d, tmp_d) - case = "freyberg6_run_ies" - pst_name = os.path.join(test_d, case + ".pst") - pst = pyemu.Pst(pst_name) - predictions = ["headwater_20171130", "tailwater_20161130", 
"trgw_0_9_1_20161130"] - pst.pestpp_options["predictions"] = predictions - - oe_name = pst_name.replace(".pst", ".0.obs.csv") - oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=oe_name).iloc[:100, :] - - ends = pyemu.EnDS(pst=pst, sim_ensemble=oe,verbose=True) - t_d = os.path.join(tmp_d, "dsi_template") - - ends.prep_for_dsi(t_d=t_d, - use_ztz=ztz, - apply_normal_score_transform=nst, - nst_extrap=nst_extrap, - energy=energy) - # copy exe to dsi_template - #shutil.copy2(os.path.join(test_d,"pestpp-ies.exe"),os.path.join(t_d,"pestpp-ies.exe")) - filename=os.path.join(t_d,"dsi.0.obs.csv") - if os.path.exists(filename): - os.remove(filename) - pst = pyemu.Pst(os.path.join(t_d,"dsi.pst")) - pst.control_data.noptmax = -1 - pst.pestpp_options["overdue_giveup_fac"] = 100000000 - pst.write(os.path.join(t_d,"dsi.pst"),version=2) - #pyemu.os_utils.run("pestpp-ies dsi.pst",cwd=t_d) - - pvals = pd.read_csv(os.path.join(t_d,"dsi_pars.csv"),index_col=0) - pmat = np.load(os.path.join(t_d,"dsi_proj_mat.npy")) - ovals = pd.read_csv(os.path.join(t_d,"dsi_pr_mean.csv"),index_col=0) - - - m_d = t_d.replace("template","master") - port = _get_port() - pyemu.os_utils.start_workers(t_d, ies_exe_path,"dsi.pst", - worker_root=tmp_d, - master_dir=m_d, num_workers=10, port=port, - ppw_function=pyemu.helpers.dsi_pyworker, - ppw_kwargs={"pmat":pmat,"ovals":ovals,"pvals":pvals}) - #read in the results - oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=os.path.join(m_d,"dsi.0.obs.csv")) - assert oe.shape[0]==50, f"{50-oe.shape[0]} failed runs" - phi_vector = oe.phi_vector.sort_values().values - assert phi_vector[0] != phi_vector[1],phi_vector - -def ends_freyberg_dsi_test(tmp_path): - ends_run_freyberg_dsi(tmp_path) - -def ends_freyberg_dsi_nst_test(tmp_path): - ends_run_freyberg_dsi(tmp_path,nst=True,nst_extrap=None) - -def ends_freyberg_dsi_extrap_test(tmp_path): - ends_run_freyberg_dsi(tmp_path,nst=True,nst_extrap='quadratic') - -def ends_freyberg_dsi_ztz_test(tmp_path): - ends_run_freyberg_dsi(tmp_path,ztz=True) - -def ends_freyberg_dsi_svd_test(tmp_path): - ends_run_freyberg_dsi(tmp_path,ztz=True,energy=0.999) - - -def plot_freyberg_dsi(): - import pandas as pd - import pyemu - import matplotlib.pyplot as plt - - test_d = "ends_master" - case = "freyberg6_run_ies" - pst_name = os.path.join(test_d, case + ".pst") - pst = pyemu.Pst(pst_name) - predictions = ["headwater_20171130", "tailwater_20161130", "trgw_0_9_1_20161130"] - oe_name = pst_name.replace(".pst", ".0.obs.csv") - pr_oe = pd.read_csv(os.path.join(test_d,"freyberg6_run_ies.0.obs.csv"),index_col=0) - pt_oe = pd.read_csv(os.path.join(test_d, "freyberg6_run_ies.3.obs.csv"), index_col=0) - - m_d = os.path.join("dsi", "master_dsi") - pst = pyemu.Pst(os.path.join(m_d,"dsi.pst")) - pr_oe_dsi = pd.read_csv(os.path.join(m_d,"dsi.0.obs.csv"),index_col=0) - pt_oe_dsi = pd.read_csv(os.path.join(m_d, "dsi.3.obs.csv"), index_col=0) - - pv = pyemu.ObservationEnsemble(pst=pst,df=pt_oe).phi_vector - pv_dsi = pyemu.ObservationEnsemble(pst=pst, df=pt_oe_dsi).phi_vector - #print(pt_oe.shape) - pt_oe = pt_oe.loc[pv<25, :] - pt_oe_dsi = pt_oe_dsi.loc[pv_dsi < 25, :] - - # print(pt_oe.shape) - # fig,ax = plt.subplots(1,1,figsize=(5,5)) - # ax.hist(pv,bins=10,facecolor="b",alpha=0.5,density=True) - # ax.hist(pv_dsi, bins=10, facecolor="m", alpha=0.5,density=True) - # ax.set_yticks([]) - # plt.tight_layout() - # plt.show() - - - - fig,axes = plt.subplots(len(predictions),1,figsize=(10,10)) - for p,ax in zip(predictions,axes): - 
ax.hist(pr_oe.loc[:,p].values,bins=10,alpha=0.5,facecolor="0.5",density=True,label="prior") - ax.hist(pt_oe.loc[:, p].values, bins=10, alpha=0.5, facecolor="b",density=True,label="posterior") - ax.hist(pr_oe_dsi.loc[:, p].values, bins=10, facecolor="none",hatch="/",edgecolor="0.5", - lw=2.5,density=True,label="dsi prior") - ax.hist(pt_oe_dsi.loc[:, p].values, bins=10, facecolor="none",density=True,hatch="/",edgecolor="b",lw=2.5, - label="dsi posterior") - ax.set_title(p,loc="left") - ax.legend(loc="upper right") - ax.set_yticks([]) - plt.tight_layout() - plt.savefig("dsi_pred.pdf") - - -def dsi_normscoretransform_test(): - import numpy as np - import pyemu - from pyemu.utils.helpers import randrealgen_optimized,normal_score_transform,inverse_normal_score_transform - test_d = "ends_master" - case = "freyberg6_run_ies" - pst_name = os.path.join(test_d, case + ".pst") - pst = pyemu.Pst(pst_name) - - oe_name = pst_name.replace(".pst", ".0.obs.csv") - oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=oe_name).iloc[:100, :] - - nstval = randrealgen_optimized(oe.shape[0], 1e-7, 1e4) - window_size=3 - if oe.shape[0]>40: - window_size=5 - if oe.shape[0]>90: - window_size=7 - if oe.shape[0]>200: - window_size=9 - for name in oe.columns: - print("transforming:",name) - sorted_values = oe._df.loc[:,name].sort_values().copy() - #if all values are the same, skip - if sorted_values.iloc[0] == sorted_values.iloc[-1]: - print("all values are the same, skipping") - continue - sorted_values.loc[:] = pyemu.eds.moving_average_with_endpoints(sorted_values.values, window_size) - transformed_values = np.asarray([normal_score_transform(nstval, sorted_values, value)[0] for value in sorted_values]) - backtransformed_values = np.asarray([inverse_normal_score_transform(nstval, sorted_values, value)[0] for value in transformed_values]) - - diff = backtransformed_values-sorted_values - assert max(abs(diff))<1e-7, backtransformed_values - if __name__ == "__main__": - #dsi_normscoretransform_test() #ends_freyberg_test("temp") - ends_freyberg_dsi_test("temp") #ends_freyberg_dev() - #ends_freyberg_dsi_test("temp") - #plot_freyberg_dsi() #obscomp_test() #alternative_dw() #freyberg_verf_test() From 042c96a8355e918bf77dd45c3947af62507a3d9a Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 10:46:16 +0100 Subject: [PATCH 20/58] moved dsi tests to dsi_tests.py --- autotest/la_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/la_tests.py b/autotest/la_tests.py index 9b426c8f2..6e79ec9c7 100644 --- a/autotest/la_tests.py +++ b/autotest/la_tests.py @@ -597,7 +597,7 @@ def ends_freyberg_test(tmp_path): if __name__ == "__main__": - #ends_freyberg_test("temp") + ends_freyberg_test("temp") #ends_freyberg_dev() #obscomp_test() #alternative_dw() From 8a817a59f20a41f3d3399c50b75f93eec5cecaac Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 10:57:50 +0100 Subject: [PATCH 21/58] docstrings --- pyemu/emulators/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyemu/emulators/base.py b/pyemu/emulators/base.py index f088d91ee..c00809dc1 100755 --- a/pyemu/emulators/base.py +++ b/pyemu/emulators/base.py @@ -99,7 +99,8 @@ def prepare_training_data(self, data=None): def apply_feature_transforms(self, data=None, transforms=None): """ Apply feature transformations to data with customizable transformer sequence. - + This function is not intended to be used directly by users. 
+ Parameters ---------- data : pandas.DataFrame, optional From 18ed8ed90886dfd19945e970e2b99fee97d70ab5 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 10:58:05 +0100 Subject: [PATCH 22/58] use class save instead of pickle --- pyemu/emulators/dsi.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py index ec4c4a599..a3b699d73 100755 --- a/pyemu/emulators/dsi.py +++ b/pyemu/emulators/dsi.py @@ -6,7 +6,6 @@ import pandas as pd import inspect from pyemu.utils.helpers import dsi_forward_run, series_to_insfile -import pickle import os import shutil from pyemu.pst.pst_handler import Pst @@ -351,9 +350,8 @@ def prepare_pestpp(self, t_d=None, observation_data=None): pst.write(os.path.join(t_d,"dsi.pst"),version=2) self.logger.statement("saved pst to {0}".format(os.path.join(t_d,"dsi.pst"))) - #self.pst_dsi = pst #breaks pickling #TODO: add save/load methods to Emulator class - with open(os.path.join(t_d,"dsi.pickle"),"wb") as f: - pickle.dump(self,f) + self.logger.statement("pickling dsi object to {0}".format(os.path.join(t_d,"dsi.pickle"))) + self.save(os.path.join(t_d,"dsi.pickle")) return pst def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=False, dsi_args=None, percentiles=[0.25,0.75,0.5], mou_population_size=None,ies_exe_path="pestpp-ies"): @@ -570,8 +568,7 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F self.logger.statement("overwriting dsi.pickle file...") self.decision_variable_names = decvar_names # re-pickle dsi to track dsivc args - with open(os.path.join(t_d,"dsi.pickle"),"wb") as f: - pickle.dump(self,f) + self.save(os.path.join(t_d,"dsi.pickle")) self.logger.statement("DSIVC control files created...the user still needs to specify objectives and constraints...") return pst_dsivc \ No newline at end of file From 19b3801affd5fd11ff69f3032cb50f461e01d248 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 14:31:21 +0100 Subject: [PATCH 23/58] checkin baseline ldfa with sklearn --- pyemu/emulators/ldfa.py | 505 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 505 insertions(+) create mode 100644 pyemu/emulators/ldfa.py diff --git a/pyemu/emulators/ldfa.py b/pyemu/emulators/ldfa.py new file mode 100644 index 000000000..707ba80f3 --- /dev/null +++ b/pyemu/emulators/ldfa.py @@ -0,0 +1,505 @@ +""" +Learning-based pattern-data-driven forecast approach (LDFA) emulator implementation. + +""" +from __future__ import print_function, division +import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.decomposition import PCA +from sklearn.neural_network import MLPRegressor + +from .base import Emulator +from .transformers import RowWiseMinMaxScaler + +# Define scikit-learn based model class +class LDFAModel: + """ + Scikit-learn MLPRegressor wrapper for LDFA neural network model. 
+ """ + def __init__(self, input_dim, output_dim, hidden_units=None, activation='relu', + dropout_rate=0.0, learning_rate=0.01, max_iter=200, early_stopping=True): + + if hidden_units is None: + hidden_units = (2 * input_dim,) + elif isinstance(hidden_units, list): + hidden_units = tuple(hidden_units) + + # Map activation functions from PyTorch to scikit-learn + activation_map = { + 'relu': 'relu', + 'tanh': 'tanh', + 'sigmoid': 'logistic' + } + + self.model = MLPRegressor( + hidden_layer_sizes=hidden_units, + activation=activation_map.get(activation, 'relu'), + learning_rate_init=learning_rate, + max_iter=max_iter, + early_stopping=early_stopping, + validation_fraction=0.2, + n_iter_no_change=20, # Patience for early stopping + random_state=42, + warm_start=False, + alpha=dropout_rate if dropout_rate > 0 else 0.0001 # Use L2 regularization instead of dropout + ) + + def fit(self, X, y): + """Fit the model""" + return self.model.fit(X, y) + + def predict(self, X): + """Make predictions""" + return self.model.predict(X) + + @property + def loss_curve_(self): + """Get training loss curve""" + return getattr(self.model, 'loss_curve_', []) + + +class LDFA(Emulator): + """ + Class for the Learning-based pattern-data-driven forecast approach from Kim et al (2025). + + This emulator uses neural networks to learn the relationships between inputs + and forecast outputs, with dimensionality reduction via PCA. + + Parameters + ---------- + data : pandas.DataFrame + The training data with input and forecast columns. + input_cols : list + List of column names to use as inputs. + groups : dict + Dictionary mapping group names to lists of column names. Used for row-wise min-max scaling. + fit_groups : dict + Dictionary mapping group names to lists of column names used to fit the scaling. + forecast_names : list, optional + List of column names to forecast. If None, all columns in data will be used. + energy_threshold : float, optional + Energy threshold for the PCA. Default is 1.0. + seed : int, optional + Random seed for reproducibility. Default is None. + early_stop : bool, optional + Whether to use early stopping during training. Default is True. + apply_std_scaler : bool, optional + Whether to apply standard scaling before min-max scaling. Default is False. + verbose : bool, optional + If True, enable verbose logging. Default is True. + """ + + def __init__(self, + data, + input_cols, + groups, + fit_groups, + forecast_names=None, + energy_threshold=1.0, + seed=None, + early_stop=True, + transforms=None, + verbose=True): + """ + Initialize the Learning-based pattern-data-driven NN emulator. + + Parameters + ---------- + data : pandas.DataFrame + The training data with input and forecast columns. + input_cols : list + List of column names to use as inputs. + groups : dict + Dictionary mapping group names to lists of column names. Used for row-wise min-max scaling. + fit_groups : dict + Dictionary mapping group names to lists of column names used to fit the scaling. + forecast_names : list, optional + List of column names to forecast. If None, all columns in data will be used. + energy_threshold : float, optional + Energy threshold for the PCA. Default is 1.0. + seed : int, optional + Random seed for reproducibility. Default is None. + early_stop : bool, optional + Whether to use early stopping during training. Default is True. + transforms : list of dict, optional + List of transformation specifications. Each dict should have: + - 'type': str - Type of transformation (e.g.,'log10', 'normal_score'). 
+ - 'columns': list of str,optional - Columns to apply the transformation to. If not supplied, transformation is applied to all columns. + - Additional kwargs for the transformation (e.g., 'quadratic_extrapolation' for normal score transform). + Example: + transforms = [ + {'type': 'log10', 'columns': ['obs1', 'obs2']}, + {'type': 'normal_score', 'quadratic_extrapolation': True} + ] + Default is None, which means no transformations will be applied. + verbose : bool, optional + If True, enable verbose logging. Default is True. + """ + + + super().__init__(verbose=verbose) + + self.seed = seed + self.data = data + self.input_cols = input_cols + self.groups = groups + self.fit_groups = fit_groups + + if forecast_names is None: + forecast_names = data.columns + self.forecast_names = forecast_names + + self.energy_threshold = energy_threshold + + # Store early stopping preference + self.use_early_stopping = early_stop + + self.transforms = transforms + self.noise_model = None + self.model = None + self.train_data = None + self.test_data = None + + def prepare_training_data(self, data=None, test_size=0.2): + """ + Prepare the training data for model fitting. + + This method: + 1. Splits the data into training and test sets + 2. Applies transform pipelines if specified + 3. Applies row-wise min-max scaling + 4. Performs PCA dimensionality reduction + + Parameters + ---------- + data : pandas.DataFrame, optional + Data to prepare. If None, uses self.data. Default is None. + test_size : float, optional + Fraction of data to use for testing. Default is 0.2. + + Returns + ------- + dict + Dictionary containing prepared data components: + - X_train: Input training data after transformation and PCA + - y_train: Target training data after transformation and PCA + - X_test: Input testing data after transformation and PCA + - y_test: Target testing data after transformation and PCA + """ + if data is None: + data = self.data + + if data is None: + raise ValueError("No data provided and no data stored in the emulator") + + # Split the data into training and test sets + train, test = train_test_split( + data, + test_size=test_size, + random_state=self.seed + ) + + self.logger.statement("preparing training data: data split complete") + + # Store for later use + self.train_data = train.copy() + self.test_data = test.copy() + + + # TODO: Apply feature transformations if specified + # Always use the base class transformation method for consistency + if self.transforms is None: + from .transformers import AutobotsAssemble + self.feature_transformer = AutobotsAssemble(train.copy()) + train_transformed = train + test_transformed = test + else: + train_transformed = self.apply_feature_transforms(train, self.transforms) + test_transformed = self.feature_transformer.transform(test) + + + # Apply row-wise min-max scaling directly (not through the pipeline) + # We need to keep train and test separate; there may be a more elegant solution to this....
+ # training data + self.logger.statement("applying row-wise min-max scaling") + self.rowwise_mm_scalers ={ + "train": RowWiseMinMaxScaler( + feature_range=(-1, 1), + groups=self.groups, + fit_groups=self.fit_groups ) + } + self.rowwise_mm_scalers["train"].fit(train_transformed) + train_scaled = self.rowwise_mm_scalers["train"].transform(train_transformed) + + # test data + # We need to fit a new scaler on the test data + self.rowwise_mm_scalers["test"] = RowWiseMinMaxScaler( + feature_range=(-1, 1), + groups=self.groups, + fit_groups=self.fit_groups ) + self.rowwise_mm_scalers["test"].fit(test_transformed) + test_scaled = self.rowwise_mm_scalers["test"].transform(test_transformed) + + self.logger.statement("row-wise min-max scaling complete") + + # Split datasets into input (X) and target (y) variables + X_train = train_scaled.loc[:, self.input_cols].copy() + y_train = train_scaled.loc[:, self.forecast_names].copy() + + X_test = test_scaled.loc[:, self.input_cols].copy() + y_test = test_scaled.loc[:, self.forecast_names].copy() + + # Apply PCA to reduce the dimensionality of the data + self.logger.statement("applying PCA dimensionality reduction") + self.pcaX = PCA()#n_components=X_test.shape[1]) + self.pcay = PCA()#n_components=y_test.shape[1]) + + self.X = self.pcaX.fit_transform(X_train) + self.y = self.pcay.fit_transform(y_train) + + self.X_test = self.pcaX.transform(X_test) + self.y_test = self.pcay.transform(y_test) + + self.logger.statement("PCA dimensionality reduction complete") + + return { + 'X_train': self.X, + 'y_train': self.y, + 'X_test': self.X_test, + 'y_test': self.y_test + } + + def _build_model(self, params=None, prob=False): + """ + Build a neural network model with the specified parameters. + + Parameters + ---------- + params : dict or pandas.Series, optional + Dictionary with model parameters including: + - activation: Activation function to use + - hidden_units: List of units in each hidden layer + - dropout_rate: Rate of dropout for regularization + - learning_rate: Learning rate for optimizer + If None, uses default parameters. Default is None. + prob : bool, optional + Whether to build a probabilistic model. Default is False. + + Returns + ------- + LDFAModel + The scikit-learn MLPRegressor wrapper instance. + """ + if params is None: + params = { + 'activation': 'relu', + 'hidden_units': None, + 'dropout_rate': 0.0, + 'learning_rate': 0.01 + } + + if isinstance(params, pd.Series): + params = params.to_dict() + + input_dim = self.X.shape[1] + output_dim = self.y.shape[1] + + # Create the model architecture + model = LDFAModel( + input_dim=input_dim, + output_dim=output_dim, + hidden_units=params['hidden_units'], + activation=params['activation'], + dropout_rate=params['dropout_rate'], + learning_rate=params['learning_rate'], + early_stopping=self.use_early_stopping + ) + + return model + + def create_model(self, params=None): + """ + Create and store the main model. + + Parameters
 + ---------- + params : dict, optional + Dictionary of model parameters. Default is None. + + Returns + ------- + self : LDFA + The emulator instance with model created. + """ + self.model = self._build_model(params) + return self + + def add_noise_model(self, params=None): + """ + Add a noise model to capture residuals. + + Parameters + ---------- + params : dict, optional + Dictionary of model parameters for the noise model. Default is None. + + Returns + ------- + self : LDFA + The emulator instance with noise model added.
+ """ + # Create noise model + self.noise_model = self._build_model(params) + + # Get residuals from main model + self.logger.statement("calculating residuals for noise model") + + # Get predictions from main model + pred_train = self.model.predict(self.X) + residuals_train = self.y - pred_train + + # Train noise model on residuals + self.logger.statement("training noise model on residuals") + self.noise_model.fit(self.X, residuals_train) + + return self + + def fit(self, epochs=200, batch_size=32, X=None, y=None, prepare_data=True): + """ + Fit the model to the training data. + + Parameters + ---------- + epochs : int, optional + Number of training epochs. Default is 200. + batch_size : int, optional + Batch size for training. Default is 32. + X : pandas.DataFrame, optional + Input data for training. If None and prepare_data is True, + will run prepare_training_data(). Default is None. + y : pandas.DataFrame, optional + Not used directly but included for API consistency. Default is None. + prepare_data : bool, optional + Whether to prepare training data if not already done. Default is True. + + Returns + ------- + self : LDFA + The fitted emulator. + """ + if prepare_data and (X is None or self.X is None): + self.prepare_training_data() + + if self.model is None: + self.create_model() + + # Update max_iter for the model + self.model.model.max_iter = epochs + + # Simple fit - scikit-learn handles batching, early stopping, etc. + self.logger.statement(f"fitting model with MLPRegressor: {epochs} epochs") + + X_train = self.X if X is None else X + y_train = self.y + + # Fit the model + self.model.fit(X_train, y_train) + + # Store training history + self.history = { + 'loss': self.model.loss_curve_, + 'val_loss': [] # MLPRegressor doesn't provide separate validation loss + } + + # Log final training info + n_iter = getattr(self.model.model, 'n_iter_', epochs) + final_loss = self.model.loss_curve_[-1] if self.model.loss_curve_ else "N/A" + self.logger.statement(f"Training completed in {n_iter} iterations, final loss: {final_loss}") + + self.fitted = True + return self + + def predict(self, data): + """ + Generate predictions for new data. + + Parameters + ---------- + data : pandas.DataFrame + New data to generate predictions for. + + Returns + ------- + pandas.DataFrame + Predictions for the input data. + """ + if not self.fitted: + raise ValueError("Emulator must be fitted before prediction") + + if self.model is None: + raise ValueError("No model has been created. 
Call create_model() first") + + self.logger.statement("generating predictions from fitted model") + + # Make a copy of the input data to avoid modifying the original + truth = data.copy() + predictions = truth.copy() + predictions[:] = np.nan + + # STEP 1: Apply the same sequence of transformations used during training + self.logger.statement("applying transformations to input data") + + # Apply transfrom pipeline if it was used during training + truth_transformed = self.feature_transformer.transform(truth) + + + # Apply row-wise min-max scaling + # We need to fit a new scaler on the truth data + forecast_rowwise_mm_scaler = RowWiseMinMaxScaler( + feature_range=(-1, 1), + groups=self.groups, + fit_groups=self.fit_groups + ) + forecast_rowwise_mm_scaler.fit(truth_transformed) + truth_scaled = forecast_rowwise_mm_scaler.transform(truth_transformed) + + # Extract input columns and apply PCA transformation + X_truth = truth_scaled.loc[:, self.input_cols].copy() + y_truth = truth_scaled.loc[:, self.forecast_names].copy() + + # Apply PCA transform + truth_pca = self.pcaX.transform(X_truth.values) + + # Run model prediction + self.logger.statement("running model prediction") + + # Get model prediction + pred_pca = self.model.predict(truth_pca) + + # Add noise prediction if available + if self.noise_model is not None: + self.logger.statement("adding noise model prediction") + noise_pred = self.noise_model.predict(truth_pca) + pred_pca = pred_pca + noise_pred + + # Apply inverse transformations in REVERSE order of the original transformations + self.logger.statement("performing inverse transformations") + + # First inverse the PCA transform (was the last transform applied) + pred_scaled = pd.DataFrame( + self.pcay.inverse_transform(pred_pca), + columns=y_truth.columns, + index=y_truth.index + ) + + # Then inverse the row-wise min-max scaling (applied before PCA) + pred_transformed = forecast_rowwise_mm_scaler.inverse_transform(pred_scaled) + + # Assign predictions to output + predictions.loc[:, self.forecast_names] = pred_transformed.loc[:, self.forecast_names] + + # Finally, inverse the transform pipeline if it was applied (was the first transform) + predictions = self.feature_transformer.inverse_transform(predictions) + + return predictions \ No newline at end of file From 133e0dea84c0218cd332b87c2d5d38e20365b79c Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 19 Jun 2025 09:30:40 +0100 Subject: [PATCH 24/58] rename test file --- autotest/{dsi_tests.py => emulator_tests.py} | 103 ++++++++++--------- 1 file changed, 53 insertions(+), 50 deletions(-) rename autotest/{dsi_tests.py => emulator_tests.py} (68%) diff --git a/autotest/dsi_tests.py b/autotest/emulator_tests.py similarity index 68% rename from autotest/dsi_tests.py rename to autotest/emulator_tests.py index 3f0fa04a8..459e3d0c7 100644 --- a/autotest/dsi_tests.py +++ b/autotest/emulator_tests.py @@ -10,55 +10,6 @@ from pyemu.emulators import DSI -#def test_dsi_feature_transforms(): -# """Test feature transforms in DSI emulator""" -# # Create test data simulating an ensemble -# np.random.seed(42) -# n_reals = 10 -# n_obs = 5 -# sim_names = [f"obs{i}" for i in range(n_obs)] -# sim_data = np.random.lognormal(mean=0, sigma=1, size=(n_reals, n_obs)) -# sim_ensemble = pd.DataFrame(sim_data, columns=sim_names) -# -# # Create DSI emulator -# pst = pyemu.Pst.from_par_obs_names(["p1"], sim_names) -# dsi = pyemu.emulators.DSI( -# pst=pst, -# sim_ensemble=sim_ensemble, -# transforms = [{"type": "log10", "columns": sim_names}, -# {"type": 
"normal_score", "columns": sim_names}], -# -# ) -# -# # Test feature transforms -# dsi.apply_feature_transforms() -# -# # Check that transformed data exists -# assert dsi.data_transformed is not None -# -# # Check log transform was applied (values should be smaller than original lognormal data) -# assert dsi.data_transformed.mean().mean() < sim_ensemble.mean().mean() -# -# # Check the feature transformer object exists -# assert hasattr(dsi, "feature_transformer") -# -# # Test with specific columns for log transform -# dsi2 = pyemu.emulators.DSI( -# pst=pst, -# sim_ensemble=sim_ensemble, -# transforms = [{"type": "log10", "columns": sim_names[:2]}] -# ) -# dsi2.apply_feature_transforms() -# -# # Check only specified columns were log transformed -# orig_means = sim_ensemble.mean() -# transformed_means = dsi2.data_transformed.mean() -# -# for i, col in enumerate(sim_names): -# if i < 2: # Should be log transformed -# assert transformed_means[col] < orig_means[col] -# else: # Should be unchanged -# assert np.isclose(transformed_means[col], orig_means[col]) def dsi_freyberg(tmp_d,transforms=None,tag=""): @@ -190,6 +141,57 @@ def test_dsivc_freyberg(): port=_get_port(),) +def plot_freyberg_dsi(): + import pandas as pd + import pyemu + import matplotlib.pyplot as plt + + test_d = "ends_master" + case = "freyberg6_run_ies" + pst_name = os.path.join(test_d, case + ".pst") + pst = pyemu.Pst(pst_name) + predictions = ["headwater_20171130", "tailwater_20161130", "trgw_0_9_1_20161130"] + oe_name = pst_name.replace(".pst", ".0.obs.csv") + pr_oe = pd.read_csv(os.path.join(test_d,"freyberg6_run_ies.0.obs.csv"),index_col=0) + #pt_oe = pd.read_csv(os.path.join(test_d, "freyberg6_run_ies.3.obs.csv"), index_col=0) + pt_oe = pr_oe.copy() + + + m_d = os.path.join( "master_dsi") + pst = pyemu.Pst(os.path.join(m_d,"dsi.pst")) + pr_oe_dsi = pyemu.ObservationEnsemble.from_binary(pst=pst, filename=os.path.join(m_d,"dsi.0.obs.jcb"))._df + pt_oe_dsi = pyemu.ObservationEnsemble.from_binary(pst=pst, filename=os.path.join(m_d,"dsi.1.obs.jcb"))._df + + pv = pyemu.ObservationEnsemble(pst=pst,df=pt_oe).phi_vector + pv_dsi = pyemu.ObservationEnsemble(pst=pst, df=pt_oe_dsi).phi_vector + #print(pt_oe.shape) + pt_oe = pt_oe.loc[pv<25, :] + pt_oe_dsi = pt_oe_dsi.loc[pv_dsi < 25, :] + + # print(pt_oe.shape) + # fig,ax = plt.subplots(1,1,figsize=(5,5)) + # ax.hist(pv,bins=10,facecolor="b",alpha=0.5,density=True) + # ax.hist(pv_dsi, bins=10, facecolor="m", alpha=0.5,density=True) + # ax.set_yticks([]) + # plt.tight_layout() + # plt.show() + + + + fig,axes = plt.subplots(len(predictions),1,figsize=(10,10)) + for p,ax in zip(predictions,axes): + ax.hist(pr_oe.loc[:,p].values,bins=10,alpha=0.5,facecolor="0.5",density=True,label="prior") + ax.hist(pt_oe.loc[:, p].values, bins=10, alpha=0.5, facecolor="b",density=True,label="posterior") + ax.hist(pr_oe_dsi.loc[:, p].values, bins=10, facecolor="none",hatch="/",edgecolor="0.5", + lw=2.5,density=True,label="dsi prior") + ax.hist(pt_oe_dsi.loc[:, p].values, bins=10, facecolor="none",density=True,hatch="/",edgecolor="b",lw=2.5, + label="dsi posterior") + ax.set_title(p,loc="left") + ax.legend(loc="upper right") + ax.set_yticks([]) + plt.tight_layout() + plt.savefig("dsi_pred.pdf") + if __name__ == "__main__": @@ -197,4 +199,5 @@ def test_dsivc_freyberg(): #test_dsi_nst() #test_dsi_nst_extrap() #test_dsi_mixed() - test_dsivc_freyberg() \ No newline at end of file + #test_dsivc_freyberg() + plot_freyberg_dsi() \ No newline at end of file From c7cfadc7e9f6536ebe6681a339df112ffd28ae90 Mon Sep 17 
00:00:00 2001 From: rhugman Date: Thu, 19 Jun 2025 09:38:49 +0100 Subject: [PATCH 25/58] rename ldfa to lpfa --- pyemu/__init__.py | 2 +- pyemu/emulators/__init__.py | 3 +- pyemu/emulators/{ldfa.py => lpfa.py} | 49 ++++++++++++++++++++-------- 3 files changed, 38 insertions(+), 16 deletions(-) rename pyemu/emulators/{ldfa.py => lpfa.py} (89%) diff --git a/pyemu/__init__.py b/pyemu/__init__.py index 85e9fe45d..a53c116ac 100644 --- a/pyemu/__init__.py +++ b/pyemu/__init__.py @@ -22,7 +22,7 @@ os_utils, pp_utils, smp_utils) from .emulators import ( #emulators - Emulator, DSI, + Emulator, DSI, LPFA, #transformers diff --git a/pyemu/emulators/__init__.py b/pyemu/emulators/__init__.py index 5b521dceb..5bb861e71 100755 --- a/pyemu/emulators/__init__.py +++ b/pyemu/emulators/__init__.py @@ -9,10 +9,11 @@ ) from .base import Emulator from .dsi import DSI - +from .lpfa import LPFA __all__ = [ 'Emulator', #base Emulator Class 'DSI', # DSI Emulator Class + 'LPFA', 'BaseTransformer', 'Log10Transformer', 'RowWiseMinMaxScaler', diff --git a/pyemu/emulators/ldfa.py b/pyemu/emulators/lpfa.py similarity index 89% rename from pyemu/emulators/ldfa.py rename to pyemu/emulators/lpfa.py index 707ba80f3..104bb24bd 100644 --- a/pyemu/emulators/ldfa.py +++ b/pyemu/emulators/lpfa.py @@ -1,5 +1,5 @@ """ -Learning-based pattern-data-driven forecast approach (LDFA) emulator implementation. +Learning-based pattern-data-driven forecast approach (LPFA) emulator implementation. """ from __future__ import print_function, division @@ -13,9 +13,9 @@ from .transformers import RowWiseMinMaxScaler # Define scikit-learn based model class -class LDFAModel: +class LPFAModel: """ - Scikit-learn MLPRegressor wrapper for LDFA neural network model. + Scikit-learn MLPRegressor wrapper for LPFA neural network model. """ def __init__(self, input_dim, output_dim, hidden_units=None, activation='relu', dropout_rate=0.0, learning_rate=0.01, max_iter=200, early_stopping=True): @@ -59,7 +59,7 @@ def loss_curve_(self): return getattr(self.model, 'loss_curve_', []) -class LDFA(Emulator): +class LPFA(Emulator): """ Class for the Learning-based pattern-data-driven forecast approach from Kim et al (2025). 
@@ -252,11 +252,32 @@ def prepare_training_data(self, data=None, test_size=0.2): # Apply PCA to reduce the dimensionality of the data self.logger.statement("applying PCA dimensionality reduction") - self.pcaX = PCA()#n_components=X_test.shape[1]) - self.pcay = PCA()#n_components=y_test.shape[1]) + self.pcaX = PCA() + self.pcay = PCA() - self.X = self.pcaX.fit_transform(X_train) - self.y = self.pcay.fit_transform(y_train) + X_transformed = self.pcaX.fit_transform(X_train) + y_transformed = self.pcay.fit_transform(y_train) + + # Apply energy-based truncation + if self.energy_threshold < 1.0: + self.logger.statement("applying energy-based PCA truncation") + # For input PCA + explained_var_ratio_X = np.cumsum(self.pcaX.explained_variance_ratio_) + n_components_X = np.argmax(explained_var_ratio_X >= self.energy_threshold) + 1 + self.pcaX = PCA(n_components=n_components_X) + X_transformed = self.pcaX.fit_transform(X_train) + + # For output PCA + explained_var_ratio_y = np.cumsum(self.pcay.explained_variance_ratio_) + n_components_y = np.argmax(explained_var_ratio_y >= self.energy_threshold) + 1 + self.pcay = PCA(n_components=n_components_y) + y_transformed = self.pcay.fit_transform(y_train) + + self.logger.statement(f"Reduced X from {X_train.shape[1]} to {n_components_X} components") + self.logger.statement(f"Reduced y from {y_train.shape[1]} to {n_components_y} components") + + self.X = X_transformed + self.y = y_transformed self.X_test = self.pcaX.transform(X_test) self.y_test = self.pcay.transform(y_test) @@ -288,7 +309,7 @@ def _build_model(self, params=None, prob=False): Returns ------- - LDFAModel + LPFAModel The scikit-learn MLPRegressor wrapper instance. """ if params is None: @@ -306,7 +327,7 @@ def _build_model(self, params=None, prob=False): output_dim = self.y.shape[1] # Create the model architecture - model = LDFAModel( + model = LPFAModel( input_dim=input_dim, output_dim=output_dim, hidden_units=params['hidden_units'], @@ -329,7 +350,7 @@ def create_model(self, params=None): Returns ------- - self : LDFA + self : LPFA The emulator instance with model created. """ self.model = self._build_model(params) @@ -346,7 +367,7 @@ def add_noise_model(self, params=None): Returns ------- - self : LDFA + self : LPFA The emulator instance with noise model added. """ # Create noise model @@ -385,7 +406,7 @@ def fit(self, epochs=200, batch_size=32, X=None, y=None, prepare_data=True): Returns ------- - self : LDFA + self : LPFA The fitted emulator. 
""" if prepare_data and (X is None or self.X is None): @@ -500,6 +521,6 @@ def predict(self, data): predictions.loc[:, self.forecast_names] = pred_transformed.loc[:, self.forecast_names] # Finally, inverse the transform pipeline if it was applied (was the first transform) - predictions = self.feature_transformer.inverse_transform(predictions) + predictions = self.feature_transformer.inverse(predictions) return predictions \ No newline at end of file From d4ae84eac0efb3e74b5840e255e009631222e467 Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 19 Jun 2025 10:11:09 +0100 Subject: [PATCH 26/58] lpfa test --- autotest/emulator_tests.py | 119 ++++++++++++++++++++++++++++++++++++- 1 file changed, 117 insertions(+), 2 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index 459e3d0c7..879f03987 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -7,7 +7,7 @@ import platform import pyemu from pst_from_tests import setup_tmp, ies_exe_path, _get_port -from pyemu.emulators import DSI +from pyemu.emulators import DSI, LPFA @@ -193,6 +193,120 @@ def plot_freyberg_dsi(): plt.savefig("dsi_pred.pdf") +def test_lpfa(tmp_d,transforms=None): + + test_d = "ends_master" + test_d = setup_tmp(test_d, tmp_d) + + case = "freyberg6_run_ies" + pst_name = os.path.join(test_d, case + ".pst") + pst = pyemu.Pst(pst_name) + predictions = ["headwater_20171130", "tailwater_20161130", "trgw_0_9_1_20161130"] + pst.pestpp_options["predictions"] = predictions + + oe_name = pst_name.replace(".pst", ".0.obs.csv") + oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=oe_name).iloc[:100, :] + data = oe._df.copy() + + obs = pst.observation_data.copy() + #obs["date"] = pd.to_datetime(obs.obsnme.str.split("_")[-1]) + #obs.sort_values(by=["obgnme", "date"], inplace=True) + + fit_groups = { + o: obs.loc[obs.obgnme == o, "obsnme"].tolist()[:12] for o in obs.obgnme.unique() + } + groups ={ + o: obs.loc[obs.obgnme == o, "obsnme"].tolist() for o in obs.obgnme.unique() + } + + input_cols = obs.loc[obs.weight>0, "obsnme"].tolist() + forecast_names = obs.obsnme.tolist() + + # Create LPFA emulator + lpfa = LPFA( + data=data, + input_cols=input_cols, + groups=groups, + fit_groups=fit_groups, + forecast_names=forecast_names, + energy_threshold=0.9999, # Keep most variance in PCA + seed=42, + early_stop=True, + #transforms=None, # No additional transforms for this demo + transforms = transforms, + verbose=True + ) + + training_data = lpfa.prepare_training_data(test_size=0.2) + + # Define model parameters + model_params = { + 'activation': 'relu', + 'hidden_units': [128, 64], # Two hidden layers + 'dropout_rate': 0.1, + 'learning_rate': 0.01 + } + + # Create the model + lpfa.create_model(model_params) + + # Train the model + lpfa.fit(epochs=200) + + # Add noise model to capture residuals + noise_params = { + 'activation': 'relu', + 'hidden_units': [64, 32], # Smaller network for residuals + 'dropout_rate': 0.05, + 'learning_rate': 0.005 + } + + lpfa.add_noise_model(noise_params) + + # Generate predictions + predictions = lpfa.predict(obs[["obsval"]].T) + + + # Create scatter plot comparing predictions vs truth + import matplotlib.pyplot as plt + fig, ax = plt.subplots(1, 1, figsize=(8, 6)) + + # Get non-zero weight observations for comparison + comparison_obs = obs.loc[obs.weight > 0].obsnme.values + + # Extract values for plotting + nzobsnmes = obs.loc[obs.weight>0].obsnme.tolist() + truth_values = obs.loc[nzobsnmes].obsval.values.flatten() + pred_values = 
predictions.loc[:,nzobsnmes].values.flatten() + + # Create scatter plot + ax.scatter(truth_values, pred_values, alpha=0.6, s=20) + ax.set_xlabel('Truth Values') + ax.set_ylabel('Predicted Values') + ax.set_title('lpfa Emulator: Predicted vs Truth') + + # Add 1:1 line + min_val = min(ax.get_xlim()[0], ax.get_ylim()[0]) + max_val = max(ax.get_xlim()[1], ax.get_ylim()[1]) + ax.plot([min_val, max_val], [min_val, max_val], 'k-', lw=1, alpha=0.7) + ax.set_xlim(min_val, max_val) + ax.set_ylim(min_val, max_val) + + # Calculate R² + correlation = np.corrcoef(truth_values, pred_values)[0, 1] + r_squared = correlation ** 2 + assert r_squared >= 0.9, "R-squared should deccent" + ax.text(0.05, 0.95, f'R² = {r_squared:.3f}', transform=ax.transAxes, + bbox=dict(boxstyle='round', facecolor='white', alpha=0.8)) + + plt.tight_layout() + #plt.show() + + print(f"Correlation coefficient: {correlation:.3f}") + print(f"R-squared: {r_squared:.3f}") + + return + if __name__ == "__main__": #test_dsi_basic() @@ -200,4 +314,5 @@ def plot_freyberg_dsi(): #test_dsi_nst_extrap() #test_dsi_mixed() #test_dsivc_freyberg() - plot_freyberg_dsi() \ No newline at end of file + #plot_freyberg_dsi() + test_lpfa(tmp_d="temp",) \ No newline at end of file From 6a8900ba44d486320e550a73294abfc6c50e9c72 Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 19 Jun 2025 10:16:18 +0100 Subject: [PATCH 27/58] added transform pipeline test for ldfa --- autotest/emulator_tests.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index 879f03987..3645cf341 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -193,7 +193,7 @@ def plot_freyberg_dsi(): plt.savefig("dsi_pred.pdf") -def test_lpfa(tmp_d,transforms=None): +def lpfa_freyberg(tmp_d="temp",transforms=None): test_d = "ends_master" test_d = setup_tmp(test_d, tmp_d) @@ -307,6 +307,15 @@ def test_lpfa(tmp_d,transforms=None): return +def test_lpfa_basic(): + lpfa_freyberg(tmp_d="temp",transforms=None) + return + +def test_lpfa_std(): + lpfa_freyberg(tmp_d="temp",transforms=[ + {"type": "standard_scaler"} + ]) + return if __name__ == "__main__": #test_dsi_basic() @@ -315,4 +324,4 @@ def test_lpfa(tmp_d,transforms=None): #test_dsi_mixed() #test_dsivc_freyberg() #plot_freyberg_dsi() - test_lpfa(tmp_d="temp",) \ No newline at end of file + test_lpfa_std() \ No newline at end of file From 54985370010dc72f2838da9123af3d0123797a73 Mon Sep 17 00:00:00 2001 From: jwhite Date: Fri, 20 Jun 2025 08:43:46 -0600 Subject: [PATCH 28/58] chasing CI issue on linux --- .github/workflows/ci.yml | 2 +- etc/environment.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 03ab32948..62749abb1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -128,7 +128,7 @@ jobs: working-directory: ./examples run: | micromamba install --name pyemu jupyter jupytext - pytest -v -rP -rx --capture=no -n=auto --nbmake --cov=pyemu --cov-report=lcov:../autotest/coverage.lcov \ + pytest -v -s --nbmake --cov=pyemu --cov-report=lcov:../autotest/coverage.lcov \ --cov-config=../autotest/.coveragerc *.ipynb env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/etc/environment.yml b/etc/environment.yml index d4ba654e2..5df38d2a5 100644 --- a/etc/environment.yml +++ b/etc/environment.yml @@ -1,6 +1,7 @@ name: pyemu channels: - conda-forge + - nodefaults dependencies: # required - python>=3.8 From 
ad8893d70225c8d0cfc63a855e88d80ec34e78d4 Mon Sep 17 00:00:00 2001 From: jwhite Date: Fri, 20 Jun 2025 12:37:14 -0600 Subject: [PATCH 29/58] more chasing --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 62749abb1..1a5a598dc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -118,7 +118,7 @@ jobs: shell: bash -l {0} working-directory: ./autotest run: | - pytest -rP -rx --capture=no -v -n=auto --tb=native --cov=pyemu --cov-report=lcov ${{ matrix.test-path }} + pytest -v -s --cov=pyemu --cov-report=lcov ${{ matrix.test-path }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From b2761dc72ce88a5646dc41e6905c9a9420bb9a1f Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 30 Jun 2025 15:39:44 +0100 Subject: [PATCH 30/58] refactor StandardSclaer to use sklearn --- pyemu/emulators/transformers.py | 84 +++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 30 deletions(-) diff --git a/pyemu/emulators/transformers.py b/pyemu/emulators/transformers.py index 22c1bbb02..f786f252b 100755 --- a/pyemu/emulators/transformers.py +++ b/pyemu/emulators/transformers.py @@ -4,6 +4,8 @@ from __future__ import print_function, division import numpy as np import pandas as pd +from sklearn.preprocessing import StandardScaler + class BaseTransformer: """Base class for all transformers providing a consistent interface.""" @@ -56,6 +58,7 @@ class RowWiseMinMaxScaler(BaseTransformer): groups : dict or None, default=None Dict mapping group names to lists of column names to be scaled together (entire timeseries for that group). If None, all columns will be treated as a single group. + Example: {'group1': ['col1', 'col2'], 'group2': ['col3', 'col4']} fit_groups : dict or None, default=None Dict mapping group names to lists of column names (subset of groups) used to compute row-wise min and max. If None, defaults to using the same columns as in groups. 
@@ -315,40 +318,61 @@ def inverse_transform(self, X): return result class StandardScalerTransformer(BaseTransformer): - """Apply standard scaling (zero mean, unit variance) to data.""" - - def __init__(self): - self.means = {} - self.stds = {} - + def __init__(self, with_mean=True, with_std=True, copy=True): + self.with_mean = with_mean + self.with_std = with_std + self.copy = copy + self._sklearn_scaler = None + self._columns = None + def fit(self, X): - """Compute mean and standard deviation for each feature.""" - for col in X.columns: - self.means[col] = X[col].mean() - self.stds[col] = X[col].std() - if self.stds[col] == 0: - self.stds[col] = 1.0 # Avoid division by zero + # Store column names for DataFrame reconstruction + self._columns = X.columns.tolist() + + # Create sklearn StandardScaler + self._sklearn_scaler = StandardScaler( + with_mean=self.with_mean, + with_std=self.with_std, + copy=self.copy + ) + + # Fit on numpy array (sklearn expects this) + self._sklearn_scaler.fit(X.values) return self - + def transform(self, X): - """Transform the data using mean and std from fit.""" - result = X.copy() - for col in X.columns: - if col in self.means: - mean = self.means[col] - std = self.stds[col] - result[col] = (X[col] - mean) / std - return result - + if self._sklearn_scaler is None: + raise ValueError("Transformer must be fitted before transform") + + # Transform using sklearn + transformed_values = self._sklearn_scaler.transform(X.values) + + # Reconstruct DataFrame with original structure + if isinstance(X, pd.DataFrame): + return pd.DataFrame( + transformed_values, + index=X.index, + columns=X.columns + ) + else: + return transformed_values + def inverse_transform(self, X): - """Inverse transform data back to original scale.""" - result = X.copy() - for col in X.columns: - if col in self.means: - mean = self.means[col] - std = self.stds[col] - result[col] = (X[col] * std) + mean - return result + if self._sklearn_scaler is None: + raise ValueError("Transformer must be fitted before inverse_transform") + + # Inverse transform using sklearn + inverse_values = self._sklearn_scaler.inverse_transform(X.values) + + # Reconstruct DataFrame + if isinstance(X, pd.DataFrame): + return pd.DataFrame( + inverse_values, + index=X.index, + columns=X.columns + ) + else: + return inverse_values class NormalScoreTransformer(BaseTransformer): """A transformer for normal score transformation.""" From 9e62644abe714639a1cd2fe585971fe690a2edc0 Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 30 Jun 2025 15:40:09 +0100 Subject: [PATCH 31/58] fix imports --- pyemu/emulators/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyemu/emulators/transformers.py b/pyemu/emulators/transformers.py index f786f252b..39345159e 100755 --- a/pyemu/emulators/transformers.py +++ b/pyemu/emulators/transformers.py @@ -198,7 +198,7 @@ class MinMaxScaler(BaseTransformer): Parameters ---------- - feature_range : tuple (min, max), default=(0, 1) + feature_range : tuple (min, max), default=(-1, 1) The range to scale features into. columns : list, optional List of column names to be scaled. If None, all columns will be scaled. 
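A minimal round-trip sketch of the sklearn-backed scaler introduced above (assuming pyemu.emulators.transformers is importable; the DataFrame is made up):

    import pandas as pd
    from pyemu.emulators.transformers import StandardScalerTransformer

    df = pd.DataFrame({"obs1": [1.0, 2.0, 3.0], "obs2": [10.0, 20.0, 30.0]})
    scaler = StandardScalerTransformer().fit(df)   # wraps sklearn's StandardScaler
    scaled = scaler.transform(df)                  # zero mean, unit variance per column
    restored = scaler.inverse_transform(scaled)    # recovers the original values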
From 444591bb0e548db120052c1175f8c1f97311d3b8 Mon Sep 17 00:00:00 2001 From: rhugman Date: Tue, 1 Jul 2025 16:03:54 +0100 Subject: [PATCH 32/58] refactor naming and streamline emulator building workflow --- autotest/emulator_tests.py | 15 +++--- pyemu/emulators/base.py | 90 +++++++++++++++++++++++--------- pyemu/emulators/dsi.py | 52 +++++++++---------- pyemu/emulators/lpfa.py | 102 ++++++++++++++++++++----------------- 4 files changed, 151 insertions(+), 108 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index 3645cf341..b2aa7fcb9 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -7,7 +7,7 @@ import platform import pyemu from pst_from_tests import setup_tmp, ies_exe_path, _get_port -from pyemu.emulators import DSI, LPFA +from pyemu.emulators import DSI, LPFA, GPR @@ -26,8 +26,8 @@ def dsi_freyberg(tmp_d,transforms=None,tag=""): oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=oe_name).iloc[:100, :] data = oe._df.copy() - dsi = DSI(sim_ensemble=data,transforms=transforms) - dsi.apply_feature_transforms() + dsi = DSI(data=data,transforms=transforms) + #dsi._fit_transformer_pipeline() dsi.fit() # history match @@ -225,10 +225,10 @@ def lpfa_freyberg(tmp_d="temp",transforms=None): # Create LPFA emulator lpfa = LPFA( data=data, - input_cols=input_cols, + input_names=input_cols, groups=groups, fit_groups=fit_groups, - forecast_names=forecast_names, + output_names=forecast_names, energy_threshold=0.9999, # Keep most variance in PCA seed=42, early_stop=True, @@ -237,7 +237,7 @@ def lpfa_freyberg(tmp_d="temp",transforms=None): verbose=True ) - training_data = lpfa.prepare_training_data(test_size=0.2) + #training_data = lpfa.prepare_training_data(test_size=0.2) # Define model parameters model_params = { @@ -312,6 +312,7 @@ def test_lpfa_basic(): return def test_lpfa_std(): + #NOTE: fit with standard scaler transform are worse than without lpfa_freyberg(tmp_d="temp",transforms=[ {"type": "standard_scaler"} ]) @@ -324,4 +325,4 @@ def test_lpfa_std(): #test_dsi_mixed() #test_dsivc_freyberg() #plot_freyberg_dsi() - test_lpfa_std() \ No newline at end of file + test_lpfa_std() diff --git a/pyemu/emulators/base.py b/pyemu/emulators/base.py index c00809dc1..f1dab6951 100755 --- a/pyemu/emulators/base.py +++ b/pyemu/emulators/base.py @@ -14,18 +14,25 @@ class Emulator: This class defines the common interface for all emulator implementations and provides shared functionality used by multiple emulator types. - Parameters - ---------- - verbose : bool, optional - If True, enable verbose logging. Default is True. """ - def __init__(self, verbose=True): + def __init__(self,transforms=None, verbose=True): """ Initialize the Emulator base class. Parameters ---------- + transforms : list of dict, optional + List of transformation specifications. Each dict should have: + - 'type': str - Type of transformation (e.g.,'log10', 'normal_score'). + - 'columns': list of str,optional - Columns to apply the transformation to. If not supplied, transformation is applied to all columns. + - Additional kwargs for the transformation (e.g., 'quadratic_extrapolation' for normal score transform). + Example: + transforms = [ + {'type': 'log10', 'columns': ['obs1', 'obs2']}, + {'type': 'normal_score', 'quadratic_extrapolation': True} + ] + Default is None, which means no transformations will be applied. verbose : bool, optional If True, enable verbose logging. Default is True. 
""" @@ -34,9 +41,8 @@ def __init__(self, verbose=True): self.fitted = False self.data = None self.data_transformed = None - self.feature_scaler = None - self.energy_threshold = 1.0 - self.feature_transformer = None + self.transforms = transforms + self.transformer_pipeline = None def fit(self, X, y=None): """ @@ -74,32 +80,46 @@ def predict(self, X): raise ValueError("Emulator must be fitted before prediction") raise NotImplementedError("Subclasses must implement predict method") - def prepare_training_data(self, data=None): + def _prepare_training_data(self): """ Prepare and transform training data for model fitting. Parameters ---------- - data : pandas.DataFrame, optional - Raw training data. If None, uses self.data. - + self : Emulator + The emulator instance. Returns ------- tuple Processed data ready for model fitting. """ + data = self.data if data is None: - if self.data is None: - raise ValueError("No data provided and no data stored in the emulator") - data = self.data + raise ValueError("No data provided and no data stored in the emulator") + + # Common preprocessing logic could go here + self.logger.statement("preparing training data") - # Common preprocessing logic could go here - return data + # apply feature transformations if they exist, etc.. + # Always use the base class transformation method for consistency + if self.transforms is not None: + self.logger.statement("applying feature transforms") + self.data_transformed = self._fit_transformer_pipeline(data, self.transforms) + else: + # Still need to set up a dummy transformer for inverse operations + from .transformers import AutobotsAssemble + self.feature_transformer = AutobotsAssemble(data.copy()) + self.data_transformed = data.copy() + + return self.data_transformed + + return - def apply_feature_transforms(self, data=None, transforms=None): + def _fit_transformer_pipeline(self, data=None, transforms=None): """ Apply feature transformations to data with customizable transformer sequence. This function is not intended to be used directly by users. + External data must be accepted to handle train/test spliting for certain emulators (e.g., LPFA). Parameters ---------- @@ -137,10 +157,13 @@ def apply_feature_transforms(self, data=None, transforms=None): # Import AutobotsAssemble here to avoid circular import from .transformers import AutobotsAssemble - ft = AutobotsAssemble(data.copy()) + transformer_pipeline = AutobotsAssemble(data.copy()) # Process the transforms parameter if provided + if transforms is None: + transforms = self.transforms if transforms: + self._validate_transforms(transforms) for transform in transforms: transform_type = transform.get('type') columns = transform.get('columns') @@ -149,13 +172,12 @@ def apply_feature_transforms(self, data=None, transforms=None): if k not in ('type', 'columns')} self.logger.statement(f"applying {transform_type} transform") - ft.apply(transform_type, columns=columns, **kwargs) + transformer_pipeline.apply(transform_type, columns=columns, **kwargs) - transformed_data = ft.df.copy() - self.feature_transformer = ft - self.data_transformed = transformed_data + self.transformer_pipeline = transformer_pipeline + self.data_transformed = transformer_pipeline.df.copy() - return transformed_data + return self.data_transformed def save(self, filename): """ @@ -185,4 +207,22 @@ def load(cls, filename): The loaded emulator instance. 
""" with open(filename, "rb") as f: - return pickle.load(f) \ No newline at end of file + return pickle.load(f) + + + def _validate_transforms(self, transforms): + """Validate the transforms parameter.""" + if not isinstance(transforms, list): + raise ValueError("transforms must be a list of dicts or None") + + for t in transforms: + if not isinstance(t, dict): + raise ValueError("each transform must be a dict") + if 'type' not in t: + raise ValueError("each transform dict must have a 'type' key") + if 'columns' in t and not isinstance(t['columns'], list): + raise ValueError("'columns' must be a list of column names") + + + + #TODO: implment helper function that scrapes directory and collates training data from Pst ensemble files + control file information. \ No newline at end of file diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py index a3b699d73..940868a57 100755 --- a/pyemu/emulators/dsi.py +++ b/pyemu/emulators/dsi.py @@ -14,14 +14,14 @@ class DSI(Emulator): """ - Data Space Inversion (DS) emulator class. Based on DSI as described in Sun & + Data Space Inversion (DSI) emulator class. Based on DSI as described in Sun & Durlofsky (2017) and Sun et al (2017). """ def __init__(self, pst=None, - sim_ensemble=None, + data=None, transforms=None, energy_threshold=1.0, verbose=False): @@ -33,7 +33,7 @@ def __init__(self, pst : Pst, optional A Pst object. If provided, the emulator will be initialized with the information from the Pst object. - sim_ensemble : ObservationEnsemble, optional + data : DataFrame or ObservationEnsemble, optional An ensemble of simulated observations. If provided, the emulator will be initialized with the information from the ensemble. transforms : list of dict, optional @@ -58,11 +58,12 @@ def __init__(self, self.observation_data = pst.observation_data.copy() if pst is not None else None #self.__org_parameter_data = pst.parameter_data.copy() if pst is not None else None #self.__org_control_data = pst.control_data.copy() #breaks pickling - if isinstance(sim_ensemble, ObservationEnsemble): - sim_ensemble = sim_ensemble._df.copy() - #self.__org_sim_ensemble = sim_ensemble.copy() if sim_ensemble is not None else None - self.data = sim_ensemble.copy() if sim_ensemble is not None else None - #self.feature_scaler = None + if isinstance(data, ObservationEnsemble): + data = data._df.copy() + # set all data to be floats + data = data.astype(float) if data is not None else None + #self.__org_data = data.copy() if data is not None else None + self.data = data.copy() if data is not None else None self.energy_threshold = energy_threshold assert isinstance(transforms, list) or transforms is None, "transforms must be a list of dicts or None" if transforms is not None: @@ -79,32 +80,31 @@ def __init__(self, assert isinstance(t['quadratic_extrapolation'], bool), "'quadratic_extrapolation' must be a boolean" self.transforms = transforms self.fitted = False - self.data_transformed = None + self.data_transformed = self._prepare_training_data() self.decision_variable_names = None #used for DSIVC - def prepare_training_data(self, data=None): + def _prepare_training_data(self): """ Prepare and transform training data for model fitting. Parameters ---------- - data : pandas.DataFrame, optional - Raw training data. If None, uses self.data. + self : DSI + The DSI emulator instance. Returns ------- tuple Processed data ready for model fitting. 
""" + data = self.data if data is None: - data = self.data - if data is None: - raise ValueError("No data provided and no data stored in the emulator") + raise ValueError("No data stored in the emulator") self.logger.statement("applying feature transforms") # Always use the base class transformation method for consistency if self.transforms is not None: - self.data_transformed = self.apply_feature_transforms(data, self.transforms) + self.data_transformed = self._fit_transformer_pipeline(data, self.transforms) else: # Still need to set up a dummy transformer for inverse operations from .transformers import AutobotsAssemble @@ -164,30 +164,24 @@ def compute_projection_matrix(self, energy_threshold=None): self.s = s return - def fit(self, X=None, y=None): + def fit(self): """ Fit the emulator to training data. Parameters ---------- - X : pandas.DataFrame - Input data to fit the emulator on. - y : None - Not used, present for API consistency. + self : DSI + The DSI emulator instance. Returns ------- self : DSI The fitted emulator. """ - if X is not None: - self.data = X - self.logger.statement("transforming new training data") - self.data_transformed = self.prepare_training_data() if self.data_transformed is None: self.logger.statement("transforming training data") - self.data_transformed = self.prepare_training_data() + self.data_transformed = self._prepare_training_data() # Compute projection matrix self.compute_projection_matrix() @@ -211,7 +205,7 @@ def predict(self, pvals): if not self.fitted: raise ValueError("Emulator must be fitted before prediction") - if not hasattr(self, 'feature_transformer') or self.feature_transformer is None: + if not hasattr(self, 'transformer_pipeline') or self.transformer_pipeline is None: raise ValueError("Emulator must be fitted and have valid transformations before prediction") if isinstance(pvals, pd.Series): @@ -221,8 +215,8 @@ def predict(self, pvals): pmat = self.pmat ovals = self.ovals sim_vals = ovals + np.dot(pmat,pvals) - ft = self.feature_transformer - sim_vals = ft.inverse(sim_vals) + pipeline = self.transformer_pipeline + sim_vals = pipeline.inverse(sim_vals) sim_vals.index.name = 'obsnme' sim_vals.name = "obsval" self.sim_vals = sim_vals diff --git a/pyemu/emulators/lpfa.py b/pyemu/emulators/lpfa.py index 104bb24bd..4252a61d7 100644 --- a/pyemu/emulators/lpfa.py +++ b/pyemu/emulators/lpfa.py @@ -70,36 +70,46 @@ class LPFA(Emulator): ---------- data : pandas.DataFrame The training data with input and forecast columns. - input_cols : list + input_names : list List of column names to use as inputs. groups : dict Dictionary mapping group names to lists of column names. Used for row-wise min-max scaling. fit_groups : dict Dictionary mapping group names to lists of column names used to fit the scaling. - forecast_names : list, optional - List of column names to forecast. If None, all columns in data will be used. + output_names : list, optional + List of column names to forecast. If None, all columns not in input_names are used. energy_threshold : float, optional Energy threshold for the PCA. Default is 1.0. seed : int, optional Random seed for reproducibility. Default is None. early_stop : bool, optional Whether to use early stopping during training. Default is True. - apply_std_scaler : bool, optional - Whether to apply standard scaling before min-max scaling. Default is False. + transforms : list of dict, optional + List of transformation specifications. 
Each dict should have: + - 'type': str - Type of transformation (e.g., 'log10', 'normal_score'). + - 'columns': list of str, optional - Columns to apply the transformation to. If not supplied, transformation is applied to all columns. + - Additional kwargs for the transformation (e.g., 'quadratic_extrapolation' for normal score transform). + Example: + transforms = [ + {'type': 'log10', 'columns': ['obs1', 'obs2']}, + {'type': 'normal_score', 'quadratic_extrapolation': True} + ] + Default is None, which means no transformations will be applied. verbose : bool, optional If True, enable verbose logging. Default is True. """ def __init__(self, data, - input_cols, + input_names, groups, fit_groups, - forecast_names=None, + output_names=None, energy_threshold=1.0, seed=None, early_stop=True, transforms=None, + test_size=0.2, verbose=True): """ Initialize the Learning-based pattern-data-driven NN emulator. @@ -108,13 +118,13 @@ def __init__(self, ---------- data : pandas.DataFrame The training data with input and forecast columns. - input_cols : list + input_names : list List of column names to use as inputs. groups : dict Dictionary mapping group names to lists of column names. Used for row-wise min-max scaling. fit_groups : dict Dictionary mapping group names to lists of column names used to fit the scaling. - forecast_names : list, optional + output_names : list, optional List of column names to forecast. If None, all columns in data will be used. energy_threshold : float, optional Energy threshold for the PCA. Default is 1.0. @@ -133,6 +143,8 @@ def __init__(self, {'type': 'normal_score', 'quadratic_extrapolation': True} ] Default is None, which means no transformations will be applied. + test_size : float, optional + Fraction of data to use for testing. Default is 0.2. verbose : bool, optional If True, enable verbose logging. Default is True. """ @@ -142,13 +154,13 @@ def __init__(self, self.seed = seed self.data = data - self.input_cols = input_cols + self.input_names = input_names self.groups = groups self.fit_groups = fit_groups - if forecast_names is None: - forecast_names = data.columns - self.forecast_names = forecast_names + if output_names is None: + output_names = data.columns + self.output_names = output_names self.energy_threshold = energy_threshold @@ -160,8 +172,13 @@ def __init__(self, self.model = None self.train_data = None self.test_data = None + self.test_size = test_size + + # Prepare the training data - def prepare_training_data(self, data=None, test_size=0.2): + self._prepare_training_data() + + def _prepare_training_data(self): """ Prepare the training data for model fitting. 
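A minimal usage sketch of the renamed constructor arguments, mirroring the lpfa_freyberg test (the DataFrame, column lists, and group dicts are placeholders built by that test, not fixed API values):

    lpfa = LPFA(data=data,                  # DataFrame of ensemble inputs and outputs
                input_names=input_cols,
                groups=groups,
                fit_groups=fit_groups,
                output_names=forecast_names,
                energy_threshold=0.9999,
                seed=42,
                early_stop=True,
                transforms=[{"type": "standard_scaler"}],
                test_size=0.2)              # train/test split and transforms run in __init__
    lpfa.fit(epochs=200)
    predictions = lpfa.predict(data)        # back-transformed to the original space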
@@ -187,36 +204,36 @@ def prepare_training_data(self, data=None, test_size=0.2): - X_test: Input testing data after transformation and PCA - y_test: Target testing data after transformation and PCA """ - if data is None: - data = self.data - + + self.logger.statement("preparing training data") + data = self.data if data is None: raise ValueError("No data provided and no data stored in the emulator") # Split the data into training and test sets train, test = train_test_split( data, - test_size=test_size, + test_size=self.test_size, random_state=self.seed ) - self.logger.statement("preparing training data: data split complete") + self.logger.statement("train/test data split complete") # Store for later use self.train_data = train.copy() self.test_data = test.copy() - - # TODO: Apply feature transformations if specified + self.logger.statement("applying feature transformation pipeline") + # Apply feature transformations if specified # Always use the base class transformation method for consistency if self.transforms is None: from .transformers import AutobotsAssemble - self.feature_transformer = AutobotsAssemble(train.copy()) + self.transformer_pipeline = AutobotsAssemble(train.copy()) train_transformed = train test_transformed = test else: - train_transformed = self.apply_feature_transforms(train, self.transforms) - test_transformed = self.feature_transformer.transform(test) + train_transformed = self._fit_transformer_pipeline(train, self.transforms) + test_transformed = self.transformer_pipeline.transform(test) # Apply row-wise min-max scaling directly (not through the pipeline) @@ -244,11 +261,11 @@ def prepare_training_data(self, data=None, test_size=0.2): self.logger.statement("row-wise min-max scaling complete") # Split datasets into input (X) and target (y) variables - X_train = train_scaled.loc[:, self.input_cols].copy() - y_train = train_scaled.loc[:, self.forecast_names].copy() + X_train = train_scaled.loc[:, self.input_names].copy() + y_train = train_scaled.loc[:, self.output_names].copy() - X_test = test_scaled.loc[:, self.input_cols].copy() - y_test = test_scaled.loc[:, self.forecast_names].copy() + X_test = test_scaled.loc[:, self.input_names].copy() + y_test = test_scaled.loc[:, self.output_names].copy() # Apply PCA to reduce the dimensionality of the data self.logger.statement("applying PCA dimensionality reduction") @@ -386,7 +403,7 @@ def add_noise_model(self, params=None): return self - def fit(self, epochs=200, batch_size=32, X=None, y=None, prepare_data=True): + def fit(self, epochs=200): """ Fit the model to the training data. @@ -394,23 +411,14 @@ def fit(self, epochs=200, batch_size=32, X=None, y=None, prepare_data=True): ---------- epochs : int, optional Number of training epochs. Default is 200. - batch_size : int, optional - Batch size for training. Default is 32. - X : pandas.DataFrame, optional - Input data for training. If None and prepare_data is True, - will run prepare_training_data(). Default is None. - y : pandas.DataFrame, optional - Not used directly but included for API consistency. Default is None. - prepare_data : bool, optional - Whether to prepare training data if not already done. Default is True. - Returns ------- self : LPFA The fitted emulator. 
""" - if prepare_data and (X is None or self.X is None): - self.prepare_training_data() + if self.data_transformed is None: + self.logger.statement("transforming training data") + self.data_transformed = self._prepare_training_data() if self.model is None: self.create_model() @@ -421,7 +429,7 @@ def fit(self, epochs=200, batch_size=32, X=None, y=None, prepare_data=True): # Simple fit - scikit-learn handles batching, early stopping, etc. self.logger.statement(f"fitting model with MLPRegressor: {epochs} epochs") - X_train = self.X if X is None else X + X_train = self.X y_train = self.y # Fit the model @@ -472,7 +480,7 @@ def predict(self, data): self.logger.statement("applying transformations to input data") # Apply transfrom pipeline if it was used during training - truth_transformed = self.feature_transformer.transform(truth) + truth_transformed = self.transformer_pipeline.transform(truth) # Apply row-wise min-max scaling @@ -486,8 +494,8 @@ def predict(self, data): truth_scaled = forecast_rowwise_mm_scaler.transform(truth_transformed) # Extract input columns and apply PCA transformation - X_truth = truth_scaled.loc[:, self.input_cols].copy() - y_truth = truth_scaled.loc[:, self.forecast_names].copy() + X_truth = truth_scaled.loc[:, self.input_names].copy() + y_truth = truth_scaled.loc[:, self.output_names].copy() # Apply PCA transform truth_pca = self.pcaX.transform(X_truth.values) @@ -518,9 +526,9 @@ def predict(self, data): pred_transformed = forecast_rowwise_mm_scaler.inverse_transform(pred_scaled) # Assign predictions to output - predictions.loc[:, self.forecast_names] = pred_transformed.loc[:, self.forecast_names] + predictions.loc[:, self.output_names] = pred_transformed.loc[:, self.output_names] # Finally, inverse the transform pipeline if it was applied (was the first transform) - predictions = self.feature_transformer.inverse(predictions) + predictions = self.transformer_pipeline.inverse(predictions) return predictions \ No newline at end of file From 23f57acfd049c5b20db95cdf6a6feb78f79c5c78 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 2 Jul 2025 15:20:17 +0100 Subject: [PATCH 33/58] functional gpr class + pestpp setup --- pyemu/emulators/gpr.py | 495 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 495 insertions(+) create mode 100644 pyemu/emulators/gpr.py diff --git a/pyemu/emulators/gpr.py b/pyemu/emulators/gpr.py new file mode 100644 index 000000000..ac43b567a --- /dev/null +++ b/pyemu/emulators/gpr.py @@ -0,0 +1,495 @@ +""" +Gaussian Process Regression (GPR) emulator implementation. +""" +from __future__ import print_function, division +import numpy as np +import pandas as pd +import os +import shutil +import inspect +from .base import Emulator +from .transformers import AutobotsAssemble +from sklearn.gaussian_process import GaussianProcessRegressor +from pyemu.utils import run + +from pyemu.pst import Pst + + +class GPR(Emulator): + """ + Gaussian Process Regression (GPR) emulator class. + + This class implements a GPR-based emulator that trains separate Gaussian Process + models for each output variable. It supports various kernel types, feature + transformations, and provides uncertainty quantification. + + Parameters + ---------- + data : pandas.DataFrame, optional + Input and output features for training. + input_names : list of str, optional + Names of input features to use. If None, all columns in input_data are used. + output_names : list of str, optional + Names of output variables to emulate. If None, all columns in output_data are used. 
+ kernel : sklearn kernel object, optional + Kernel to use for GP regression. If None, defaults to Matern kernel. + transforms : list of dict, optional. Defaults to [{'type': 'standard_scaler'}] + n_restarts_optimizer : int, optional + Number of restarts for kernel hyperparameter optimization. Default is 10. + return_std : bool, optional + Whether to return prediction uncertainties. Default is True. + verbose : bool, optional + Enable verbose logging. Default is True. + """ + + def __init__(self, + data, + input_names=None, + output_names=None, + kernel=None, + transforms=[{'type': 'standard_scaler'}], + n_restarts_optimizer=10, + return_std=True, + verbose=True): + """Initialize the GPR emulator.""" + + super().__init__(verbose=verbose) + + # Store initialization parameters + # check data is a DataFrame + if not isinstance(data, pd.DataFrame): + raise ValueError("data must be a pandas DataFrame") + self.data = data.copy() + + # Check input and output names + # check input_names and output_names are lists or None + if input_names is not None and not isinstance(input_names, list): + raise ValueError("input_names must be a list or None") + if output_names is not None and not isinstance(output_names, list): + raise ValueError("output_names must be a list or None") + self.input_names = input_names + self.output_names = output_names + + self.kernel = kernel + self.transforms = transforms + self.n_restarts_optimizer = n_restarts_optimizer + self.return_std = return_std + + # Initialize data + self.data = data + + # Model storage + self.models = {} + self.model_info = None + self.verification_results = {} + + # PEST++ integration + self.template_dir = None + + # Validate transforms parameter + if transforms is not None: + self._validate_transforms(transforms) + self._validate_transforms_for_gpr() + + def _validate_transforms_for_gpr(self): + """Validate transforms parameter for GPR. 
Make sure transforms are only applied to input data.""" + # Validate transforms parameter + transforms = self.transforms + if transforms is not None: + # For the speicif case of GPR, we only transform input data + for t in transforms: + if 'columns' in t: + # check if any columns are in output_names + if self.output_names is not None: + common_cols = set(t['columns']).intersection(self.output_names) + if common_cols: + self.logger.statement(f"Transform {t['type']} will not be applied to output columns: {common_cols}") + # remove these columns from transforms + t['columns'] = [col for col in t['columns'] if col not in common_cols] + if not t['columns']: + self.logger.statement(f"Transform {t['type']} has no columns left after removing output columns: {common_cols}") + # remove this transform + self.logger.statement(f"Removing transform {t['type']} as it has no columns left") + self.transforms.remove(t) + else: + self.logger.statement(f"Transform {t['type']} has no specified columns, applying to all input columns") + t['columns'] = self.input_names if self.input_names is not None else [] + return transforms + +# def _combine_input_output_data(self, input_data, output_data): +# """Combine input and output data into a single DataFrame.""" +# if input_data.shape[0] != output_data.shape[0]: +# raise ValueError("Input and output data must have the same number of rows") +# +# combined = input_data.copy() +# for col in output_data.columns: +# if col not in combined.columns: +# combined[col] = output_data[col] +# else: +# self.logger.statement(f"Warning: column '{col}' exists in both input and output data, using output data") +# combined[col] = output_data[col] +# +# return combined + + def _setup_kernel(self): + """Set up the GP kernel if not provided.""" + if self.kernel is None: + try: + from sklearn.gaussian_process.kernels import Matern,ConstantKernel,RBF + self.kernel = ConstantKernel(1.0, (1e-3, 1e3)) * Matern( + length_scale=np.ones(len(self.input_names)) * 2.0, + length_scale_bounds=(1e-4, 1e4), + nu=1.5) + self.logger.statement("Using default Matern kernel") + except ImportError: + raise ImportError("scikit-learn is required for GPR emulator") + + # Log kernel hyperparameters + self.logger.statement(f"Using kernel: {self.kernel}") + + + def _prepare_training_data(self): + """ + Prepare and transform training data for model fitting. + + Parameters + ---------- + self : GPR + The GPR emulator instance containing the data and configuration. + + Returns + ------- + pandas.DataFrame + Processed data ready for model fitting. + """ + + if self.data is None: + raise ValueError("No data provided and no data stored in the emulator") + data = self.data + + # Apply feature transformations if specified + if self.transforms is not None: + self._validate_transforms_for_gpr() + self.logger.statement("applying feature transforms") + self.data_transformed = self._fit_transformer_pipeline(data, self.transforms) + else: + # Still need to set up a dummy transformer for consistency + from .transformers import AutobotsAssemble + self.transformer_pipeline = AutobotsAssemble(data.copy()) + self.data_transformed = data.copy() + + return self.data_transformed + + + def fit(self): + """ + Fit the emulator to training data. + + Parameters + ---------- + self: GPR + The GPR emulator instance containing the data and configuration. + + Returns + ------- + self : GPR + Fitted GPR emulator instance. 
+ """ + + if self.data_transformed is None: + self.logger.statement("transforming training data") + self.data_transformed = self._prepare_training_data() + if self.kernel is None: + self._setup_kernel() + # transformed input data + X_transformed = self.data_transformed.loc[:,self.input_names].copy() + y_transformed = self.data_transformed.loc[:,self.output_names].copy() #Note that these are actualy not transformed + + assert X_transformed.shape[0] == y_transformed.shape[0], \ + "Input and output data must have the same number of rows" + assert X_transformed.shape[1] > 0, "Input data must have at least one feature" + assert y_transformed.shape[1] > 0, "Output data must have at least one variable" + + # Create and fit separate GPR model for each output + self.gpr_models = {} + for output_name in self.output_names: + gpr = GaussianProcessRegressor( + kernel=self.kernel, + #alpha=self.alpha, + n_restarts_optimizer=self.n_restarts_optimizer, + #random_state=self.random_state + ) + + # Fit the GPR model for this output + gpr.fit(X_transformed.loc[:,self.input_names].values, y_transformed.loc[:,output_name].values) + self.gpr_models[output_name] = gpr + + self.fitted = True + return self + + def predict(self, X, return_std=False): + """ + Make predictions using the fitted GPR emulators. + + Parameters + ---------- + X : pandas.DataFrame + Input features for prediction + return_std : bool, default False + Whether to return prediction standard deviation + + Returns + ------- + predictions : pandas.DataFrame + Predicted values for each output + std : pandas.DataFrame, optional + Prediction standard deviations (if return_std=True) + """ + if not self.fitted: + raise ValueError("Emulator must be fitted before making predictions") + + if not hasattr(self, 'transformer_pipeline') or self.transformer_pipeline is None: + raise ValueError("Emulator must be fitted and have valid transformations before prediction") + + # Apply same transforms as training data + X_transformed = self.transformer_pipeline.transform(X.copy()) + + + # Make predictions for each output + predictions_dict = {} + std_dict = {} + + for output_name in self.output_names: + gpr = self.gpr_models[output_name] + + if return_std: + pred, std = gpr.predict(X_transformed.values, return_std=True) + predictions_dict[output_name] = pred + std_dict[output_name] = std + else: + pred = gpr.predict(X_transformed.values) + predictions_dict[output_name] = pred + + # Convert to DataFrame + predictions_df = pd.DataFrame(predictions_dict, index=X.index) + + if return_std: + std_df = pd.DataFrame(std_dict, index=X.index) + return predictions_df, std_df + else: + return predictions_df + + + def scrape_pst_dir(self,pst_dir,casename): + + if not os.path.exists(pst_dir): + raise FileNotFoundError(f"PEST control file {pst_dir} does not exist") + + pst = Pst(os.path.join(pst_dir,casename + ".pst")) + + # work out input variable names + input_groups = pst.pestpp_options.get("opt_dec_var_groups",None) + par = pst.parameter_data + if input_groups is None: + print("using all adjustable parameters as inputs") + input_names = pst.adj_par_names + else: + input_groups = set([i.strip() for i in input_groups.lower().strip().split(",")]) + print("input groups:",input_groups) + adj_par = par.loc[pst.adj_par_names,:].copy() + adj_par = adj_par.loc[adj_par.pargp.apply(lambda x: x in input_groups),:] + input_names = adj_par.parnme.tolist() + print("input names:",input_names) + + #work out constraints and objectives + ineq_names = pst.less_than_obs_constraints.tolist() + 
+        ineq_names.extend(pst.greater_than_obs_constraints.tolist())
+        obs = pst.observation_data
+        objs = pst.pestpp_options.get("mou_objectives",None)
+        constraints = []
+
+        if objs is None:
+            print("'mou_objectives' not found in ++ options, using all ineq tagged non-zero weighted obs as objectives")
+            objs = ineq_names
+        else:
+            objs = objs.lower().strip().split(',')
+            constraints = [n for n in ineq_names if n not in objs]
+
+        print("objectives:",objs)
+        print("constraints:",constraints)
+        output_names = objs
+        output_names.extend(constraints)
+
+        return pst, input_names, output_names, objs, constraints
+
+
+    def prepare_pestpp(self,pst_dir,casename,gpr_t_d="gpr_template"):
+        """
+        Prepare a PEST++ template directory for the GPR emulator.
+
+        Parameters
+        ----------
+        pst_dir : str
+            Path to the directory containing the existing PEST control file. The
+            assumption is that an existing PEST setup exists for the process-based model.
+        casename : str
+            Case name of the existing control file (without the ".pst" extension).
+        gpr_t_d : str
+            Path to the PEST++ template directory to create.
+
+        Returns
+        -------
+        None
+        """
+
+        #TODO: it may be more logical to pass in a Pst object instead of a file path; assume the user loads Pst and training data beforehand?
+        # Give Emulators a "harvest" function that returns a Pst object with the necessary information?
+
+        # what are the things we need to get from Pst?
+        # 1. decision variable names (parameters) a.k.a. input_names
+        # 2. observation names (outputs) aka output_names
+        # 3. which obs are objectives; subset of output_names
+        # 4. which obs are constraints; subset of output_names
+
+        pst, input_names, output_names, objs, constraints = self.scrape_pst_dir(pst_dir,casename)
+
+        # check that all input_names are in par data
+        if self.input_names is None:
+            raise ValueError("input_names must be provided")
+        missing_inputs = set(self.input_names) - set(pst.parameter_data.index)
+        if missing_inputs:
+            raise ValueError(f"Input names {missing_inputs} not found in parameter data")
+        # check that all input names are adjustable
+        fixed_inputs = pst.parameter_data.loc[self.input_names, "partrans"].str.contains("fixed|tied", case=False, na=False)
+        if fixed_inputs.any():
+            raise ValueError(f"Input names {self.input_names[fixed_inputs]} cannot be fixed or tied")
+        self.logger.statement(f"Decision variable parameter names: {self.input_names}")
+
+        # check that all self.output_names are in observation_data
+        if self.output_names is None:
+            raise ValueError("output_names must be provided")
+        missing_outputs = set(self.output_names) - set(pst.observation_data.index)
+        if missing_outputs:
+            raise ValueError(f"Output names {missing_outputs} not found in observation data")
+        self.logger.statement(f"Observation names: {self.output_names}")
+
+        # prepare the GPR template directory
+        if os.path.exists(gpr_t_d):
+            self.logger.statement(f"Removing existing template directory {gpr_t_d}")
+            shutil.rmtree(gpr_t_d)
+        self.logger.statement(f"Creating template directory {gpr_t_d}")
+        os.makedirs(gpr_t_d)
+
+        # pickle
+        self.save(os.path.join(gpr_t_d, "gpr_emulator.pkl"))
+        self.logger.statement(f"Saved GPR emulator to {os.path.join(gpr_t_d, 'gpr_emulator.pkl')}")
+
+        # prepare template files
+        self.logger.statement("Preparing PEST++ template files")
+
+        #write a template file
+        tpl_fname = os.path.join(gpr_t_d,"gpr_input.csv.tpl")
+        with open(tpl_fname,'w') as f:
+            f.write("ptf ~\nparnme,parval1\n")
+            for input_name in self.input_names:
+                f.write("{0},~ {0} ~\n".format(input_name))
+        # keep track of other non-decvar parameters
+        other_pars =
list(set(pst.parameter_data.parnme.tolist())-set(self.input_names)) + aux_tpl_fname = None + if len(other_pars) > 0: + aux_tpl_fname = os.path.join(gpr_t_d,"aux_par.csv.tpl") + print("writing aux par tpl file: ",aux_tpl_fname) + with open(aux_tpl_fname,'w') as f: + f.write("ptf ~\n") + for input_name in other_pars: + f.write("{0},~ {0} ~\n".format(input_name)) + + #write an ins file + ins_fname = os.path.join(gpr_t_d,"gpr_output.csv.ins") + with open(ins_fname,'w') as f: + f.write("pif ~\nl1\n") + for output_name in self.output_names: + if self.return_std: + f.write("l1 ~,~ !{0}! ~,~ !{0}_gprstd!\n".format(output_name)) + else: + f.write("l1 ~,~ !{0}!\n".format(output_name)) + + # build the GPR Pst object + self.logger.statement("Building PEST++ control file") + tpl_list = [tpl_fname] + if aux_tpl_fname is not None: + tpl_list.append(aux_tpl_fname) + input_list = [f.replace(".tpl","") for f in tpl_list] + gpst = Pst.from_io_files(tpl_list,input_list, + [ins_fname],[ins_fname.replace(".ins","")],pst_path=".") + + + def fix_df_col_type(orgdf,fixdf): + for col in orgdf.columns: + # this gross thing is to avoid a future error warning in pandas - + # why is it getting so strict?! isn't python duck-typed? + if col in fixdf.columns and\ + fixdf.dtypes[col] != orgdf.dtypes[col]: + fixdf[col] = fixdf[col].astype(orgdf.dtypes[col]) + fixdf.loc[orgdf.index,col] = orgdf.loc[orgdf.index,col].values + return + + fix_df_col_type(orgdf=pst.parameter_data,fixdf=gpst.parameter_data) + fix_df_col_type(orgdf=pst.observation_data,fixdf=gpst.observation_data) + + if self.return_std: + stdobs = [o for o in gpst.obs_names if o.endswith("_gprstd")] + assert len(stdobs) > 0 + gpst.observation_data.loc[stdobs,"weight"] = 0.0 + + gpst.pestpp_options = pst.pestpp_options + gpst.prior_information = pst.prior_information.copy() + + gpst.model_command = "python forward_run.py" + frun_lines = inspect.getsource(gpr_forward_run) + with open(os.path.join(gpr_t_d, "forward_run.py"), 'w') as f: + f.write("\n") + for import_name in ["pandas as pd","os","numpy as np"]: + f.write("import {0}\n".format(import_name)) + for line in frun_lines: + f.write(line) + f.write("if __name__ == '__main__':\n") + f.write(" gpr_forward_run()\n") + + + + gpst.control_data.noptmax = 0 + + gpst_fname = f"{casename}_gpr.pst" + gpst.write(os.path.join(gpr_t_d,gpst_fname),version=2) + print("saved gpr pst:",gpst_fname,"in gpr_t_d",gpr_t_d) + try: + run("pestpp-mou {0}".format(gpst_fname),cwd=gpr_t_d) + except Exception as e: + print("WARNING: pestpp-mou test run failed: {0}".format(str(e))) + gpst.control_data.noptmax = pst.control_data.noptmax + gpst.write(os.path.join(gpr_t_d, gpst_fname), version=2) + + return + +def gpr_forward_run(): + """the function to evaluate a set of inputs thru the GPR emulators.\ + This function gets added programmatically to the forward run process""" + import pandas as pd + from pyemu.emulators import GPR + input_df = pd.read_csv("gpr_input.csv",index_col=0).T + + gpr = GPR.load("gpr_emulator.pkl") + df = pd.DataFrame(index=gpr.output_names, + columns=["sim","sim_std"]) + df.index.name = "output_name" + if gpr.return_std: + predmean,predstdv = gpr.predict(input_df.loc[:,gpr.input_names], return_std=True) + df.loc[:,"sim"] = predmean[df.index].values + df.loc[:,"sim_std"] = predstdv[df.index].values + else: + predmean = gpr.predict(input_df.loc[:,gpr.input_names]) + df.loc[:,"sim"] = predmean[df.index].values + df.to_csv("gpr_output.csv",index=True) + return df \ No newline at end of file From 
a50fe5147b49131414b06e81c79f5ddf9322d208 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 2 Jul 2025 16:46:58 +0100 Subject: [PATCH 34/58] gpr tests --- autotest/emulator_tests.py | 600 ++++++++++++++++++++++++++++++++++++- 1 file changed, 596 insertions(+), 4 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index b2aa7fcb9..72370d827 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -6,10 +6,11 @@ import pandas as pd import platform import pyemu -from pst_from_tests import setup_tmp, ies_exe_path, _get_port +from pst_from_tests import setup_tmp, bin_path, _get_port from pyemu.emulators import DSI, LPFA, GPR - +ies_exe_path = os.path.join(bin_path, "pestpp-ies") +mou_exe_path = os.path.join(bin_path, "pestpp-mou") def dsi_freyberg(tmp_d,transforms=None,tag=""): @@ -133,7 +134,7 @@ def test_dsivc_freyberg(): worker_root = "." pyemu.os_utils.start_workers(td, - "pestpp-mou", + mou_exe_path, "dsivc.pst", num_workers=num_workers, worker_root=worker_root, @@ -318,6 +319,595 @@ def test_lpfa_std(): ]) return + +def gpr_compare_invest(): + import numpy as np + from sklearn.gaussian_process import GaussianProcessRegressor + case = "zdt1" + use_chances = False + m_d = os.path.join(case+"_gpr_baseline") + org_d = os.path.join("utils",case+"_template") + t_d = case+"_template" + if os.path.exists(t_d): + shutil.rmtree(t_d) + shutil.copytree(org_d,t_d) + if os.path.exists(m_d): + shutil.rmtree(m_d) + + pst = pyemu.Pst(os.path.join(t_d, case+".pst")) + pst.pestpp_options["mou_generator"] = "pso" + if use_chances: + pst.pestpp_options["opt_risk"] = 0.95 + pst.pestpp_options["opt_stack_size"] = 50 + pst.pestpp_options["opt_recalc_chance_every"] = 10000 + pst.pestpp_options["opt_chance_points"] = "single" + else: + pst.pestpp_options["opt_risk"] = 0.5 + + pop_size = 60 + num_workers = 60 + noptmax_full = 30 + noptmax_inner = 10 + noptmax_outer = 5 + port = 4554 + pst.control_data.noptmax = noptmax_full + pst.pestpp_options["mou_population_size"] = pop_size + pst.pestpp_options["mou_save_population_every"] = 1 + pst.write(os.path.join(t_d, case+".pst")) + if not os.path.exists(m_d): + pyemu.os_utils.start_workers(t_d, mou_exe_path, case+".pst", num_workers, worker_root=".", + master_dir=m_d, verbose=True, port=port) + #shutil.copytree(t_d,m_d) + #pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=m_d) + # use the initial population files for training + dv_pops = [os.path.join(m_d,"{0}.0.dv_pop.csv".format(case))] + obs_pops = [f.replace("dv_","obs_") for f in dv_pops] + + pst_fname = os.path.join(m_d,case+".pst") + gpr_t_d = os.path.join(case+"_gpr_template") + pyemu.helpers.prep_for_gpr(pst_fname,dv_pops,obs_pops,t_d=m_d,gpr_t_d=gpr_t_d,nverf=int(pop_size*.1),\ + plot_fits=True,apply_standard_scalar=False,include_emulated_std_obs=True) + gpst = pyemu.Pst(os.path.join(gpr_t_d,case+".pst")) + shutil.copy2(os.path.join(m_d,case+".0.dv_pop.csv"),os.path.join(gpr_t_d,"initial_dv_pop.csv")) + gpst.pestpp_options["mou_dv_population_file"] = "initial_dv_pop.csv" + gpst.control_data.noptmax = noptmax_full + gpst.write(os.path.join(gpr_t_d,case+".pst"),version=2) + gpr_m_d = gpr_t_d.replace("template","master") + if os.path.exists(gpr_m_d): + shutil.rmtree(gpr_m_d) + pyemu.os_utils.start_workers(gpr_t_d, mou_exe_path, case+".pst", num_workers, worker_root=".", + master_dir=gpr_m_d, verbose=True, port=port) + + #o1 = pd.read_csv(os.path.join(m_d,case+".{0}.obs_pop.csv".format(max(0,pst.control_data.noptmax)))) + o1 = 
pd.read_csv(os.path.join(m_d,case+".pareto.archive.summary.csv")) + o1 = o1.loc[o1.generation == o1.generation.max(), :] + o1 = o1.loc[o1.is_feasible == True, :] + o1 = o1.loc[o1.nsga2_front == 1, :] + + + import matplotlib.pyplot as plt + o2 = pd.read_csv(os.path.join(gpr_m_d, case + ".{0}.obs_pop.csv".format(max(0, gpst.control_data.noptmax)))) + fig,ax = plt.subplots(1,1,figsize=(5,5)) + ax.scatter(o1.obj_1,o1.obj_2,c="r",s=10) + ax.scatter(o2.obj_1,o2.obj_2,c="0.5",s=10,alpha=0.5) + plt.tight_layout() + plt.savefig("gpr_{0}_compare_noiter.pdf".format(case)) + plt.close(fig) + + # now lets try an inner-outer scheme... + + gpst.control_data.noptmax = noptmax_inner + gpst.write(os.path.join(gpr_t_d,case+".pst"),version=2) + gpr_t_d_iter = gpr_t_d+"_outeriter{0}".format(0) + if os.path.exists(gpr_t_d_iter): + shutil.rmtree(gpr_t_d_iter) + shutil.copytree(gpr_t_d,gpr_t_d_iter) + for iouter in range(1,noptmax_outer+1): + #run the gpr emulator + gpr_m_d_iter = gpr_t_d_iter.replace("template","master") + complex_m_d_iter = t_d.replace("template", "master_complex_retrain_outeriter{0}".format(iouter)) + if os.path.exists(gpr_m_d_iter): + shutil.rmtree(gpr_m_d_iter) + pyemu.os_utils.start_workers(gpr_t_d_iter, mou_exe_path, case+".pst", num_workers, worker_root=".", + master_dir=gpr_m_d_iter, verbose=True, port=port) + o2 = pd.read_csv(os.path.join(gpr_m_d_iter,case+".{0}.obs_pop.csv".format(gpst.control_data.noptmax))) + + # now run the final dv pop thru the "complex" model + final_gpr_dvpop_fname = os.path.join(gpr_m_d_iter,case+".archive.dv_pop.csv") + assert os.path.exists(final_gpr_dvpop_fname) + complex_model_dvpop_fname = os.path.join(t_d,"gpr_outeriter{0}_dvpop.csv".format(iouter)) + if os.path.exists(complex_model_dvpop_fname): + os.remove(complex_model_dvpop_fname) + # load the gpr archive and do something clever to pick new points to eval + # with the complex model + dvpop = pd.read_csv(final_gpr_dvpop_fname,index_col=0) + if dvpop.shape[0] > pop_size: + arc_sum = pd.read_csv(os.path.join(gpr_m_d_iter,case+".pareto.archive.summary.csv")) + as_front_map = {member:front for member,front in zip(arc_sum.member,arc_sum.nsga2_front)} + as_crowd_map = {member: crowd for member, crowd in zip(arc_sum.member, arc_sum.nsga2_crowding_distance)} + as_feas_map = {member: feas for member, feas in zip(arc_sum.member, arc_sum.feasible_distance)} + as_gen_map = {member: gen for member, gen in zip(arc_sum.member, arc_sum.generation)} + + dvpop.loc[:,"front"] = dvpop.index.map(lambda x: as_front_map.get(x,np.nan)) + dvpop.loc[:, "crowd"] = dvpop.index.map(lambda x: as_crowd_map.get(x, np.nan)) + dvpop.loc[:,"feas"] = dvpop.index.map(lambda x: as_feas_map.get(x,np.nan)) + dvpop.loc[:, "gen"] = dvpop.index.map(lambda x: as_gen_map.get(x, np.nan)) + #drop members that have missing archive info + dvpop = dvpop.dropna() + if dvpop.shape[0] > pop_size: + dvpop.sort_values(by=["gen","feas","front","crowd"],ascending=[False,True,True,False],inplace=True) + dvpop = dvpop.iloc[:pop_size,:] + dvpop.drop(["gen","feas","front","crowd"],axis=1,inplace=True) + + #shutil.copy2(final_gpr_dvpop_fname,complex_model_dvpop_fname) + dvpop.to_csv(complex_model_dvpop_fname) + pst.pestpp_options["mou_dv_population_file"] = os.path.split(complex_model_dvpop_fname)[1] + pst.control_data.noptmax = -1 + pst.write(os.path.join(t_d,case+".pst"),version=2) + + pyemu.os_utils.start_workers(t_d, mou_exe_path, case+".pst", num_workers, worker_root=".", + master_dir=complex_m_d_iter, verbose=True, port=port) + + # plot the complex model 
results... + o2 = pd.read_csv(os.path.join(complex_m_d_iter, case + ".pareto.archive.summary.csv")) + o2 = o2.loc[o2.generation == o2.generation.max(), :] + #o2 = o2.loc[o2.is_feasible==True,:] + o2 = o2.loc[o2.nsga2_front == 1, :] + fig, ax = plt.subplots(1, 1, figsize=(5, 5)) + ax.scatter(o1.obj_1, o1.obj_2,c="r",s=10,label="full complex") + ax.scatter(o2.obj_1, o2.obj_2,c="0.5",s=10,alpha=0.5,label="mixed emulated-complex") + ax.legend(loc="upper right") + ax.set_xlim(0,10) + ax.set_ylim(0,20) + plt.tight_layout() + plt.savefig("gpr_{0}_compare_iterscheme_{1}.pdf".format(case,iouter)) + plt.close(fig) + + # now add those complex model input-output pop files to the list and retrain + # the gpr + dv_pops.append(os.path.join(complex_m_d_iter,case+".0.dv_pop.csv")) + obs_pops.append(os.path.join(complex_m_d_iter,case+".0.obs_pop.csv")) + gpr_t_d_iter = gpr_t_d+"_outeriter{0}".format(iouter) + pyemu.helpers.prep_for_gpr(pst_fname,dv_pops,obs_pops,t_d=gpr_t_d,gpr_t_d=gpr_t_d_iter,nverf=int(pop_size*.1), + plot_fits=True,apply_standard_scalar=False,include_emulated_std_obs=True) + gpst_iter = pyemu.Pst(os.path.join(gpr_t_d_iter,case+".pst")) + #aggdf = pd.read_csv(os.path.join(gpr_t_d,"gpr_aggregate_training_data.csv"),index_col=0) + #aggdf.index = ["outeriter{0}_member{1}".format(iouter,i) for i in range(aggdf.shape[0])] + restart_gpr_dvpop_fname = "gpr_restart_dvpop_outeriter{0}.csv".format(iouter) + #aggdf.to_csv(os.path.join(gpr_t_d_iter,restart_gpr_dvpop_fname)) + shutil.copy2(os.path.join(complex_m_d_iter,case+".0.dv_pop.csv"),os.path.join(gpr_t_d_iter,restart_gpr_dvpop_fname)) + gpst_iter.pestpp_options["mou_dv_population_file"] = restart_gpr_dvpop_fname + gpst_iter.control_data.noptmax = gpst.control_data.noptmax + gpst_iter.write(os.path.join(gpr_t_d_iter,case+".pst"),version=2) + + +def gpr_constr_invest(): + import numpy as np + from sklearn.gaussian_process import GaussianProcessRegressor + case = "constr" + use_chances = False + m_d = os.path.join(case + "_gpr_baseline") + org_d = os.path.join("utils", case + "_template") + t_d = case + "_template" + if os.path.exists(t_d): + shutil.rmtree(t_d) + shutil.copytree(org_d, t_d) + if os.path.exists(m_d): + shutil.rmtree(m_d) + + pst = pyemu.Pst(os.path.join(t_d, case + ".pst")) + pst.pestpp_options["mou_generator"] = "pso" + if use_chances: + pst.pestpp_options["opt_risk"] = 0.95 + pst.pestpp_options["opt_stack_size"] = 50 + pst.pestpp_options["opt_recalc_chance_every"] = 10000 + pst.pestpp_options["opt_chance_points"] = "single" + else: + pst.pestpp_options["opt_risk"] = 0.5 + + pop_size = 15 + num_workers = 5 + noptmax_full = 3 + noptmax_inner = 2 + noptmax_outer = 2 + port = 4554 + pst.control_data.noptmax = -1 + pst.pestpp_options["mou_population_size"] = pop_size + pst.pestpp_options["mou_save_population_every"] = 1 + pst.write(os.path.join(t_d, case + ".pst")) + #if not os.path.exists(m_d): + # pyemu.os_utils.start_workers(t_d, mou_exe_path, case + ".pst", num_workers, worker_root=".", + # master_dir=m_d, verbose=True, port=port) + if os.path.exists(m_d): + shutil.rmtree(m_d) + shutil.copytree(t_d,m_d) + pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=m_d) + # use the initial population files for training + dv_pops = [os.path.join(m_d, "{0}.0.dv_pop.csv".format(case))] + obs_pops = [f.replace("dv_", "obs_") for f in dv_pops] + + pst_fname = os.path.join(m_d, case + ".pst") + gpr_t_d = os.path.join(case + "_gpr_template") + pyemu.helpers.prep_for_gpr(pst_fname, dv_pops, obs_pops,t_d=m_d, gpr_t_d=gpr_t_d, 
nverf=int(pop_size * .1), \ + plot_fits=True, apply_standard_scalar=False, include_emulated_std_obs=True) + gpst = pyemu.Pst(os.path.join(gpr_t_d, case + ".pst")) + #shutil.copy2(os.path.join(m_d, case + ".0.dv_pop.csv"), os.path.join(gpr_t_d, "initial_dv_pop.csv")) + #gpst.pestpp_options["mou_dv_population_file"] = "initial_dv_pop.csv" + gpst.pestpp_options.pop("mou_dv_population_file",None) #= "initial_dv_pop.csv" + + gpst.control_data.noptmax = noptmax_full + gpst.write(os.path.join(gpr_t_d, case + ".pst"), version=2) + gpr_m_d = gpr_t_d.replace("template", "master") + if os.path.exists(gpr_m_d): + shutil.rmtree(gpr_m_d) + #pyemu.os_utils.start_workers(gpr_t_d, mou_exe_path, case + ".pst", num_workers, worker_root=".", + # master_dir=gpr_m_d, verbose=True, port=port) + shutil.copytree(gpr_t_d,gpr_m_d) + pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=gpr_m_d) + + # o1 = pd.read_csv(os.path.join(m_d,case+".{0}.obs_pop.csv".format(max(0,pst.control_data.noptmax)))) + o1 = pd.read_csv(os.path.join(m_d, case + ".pareto.archive.summary.csv")) + o1 = o1.loc[o1.generation == o1.generation.max(), :] + o1 = o1.loc[o1.is_feasible == True, :] + o1 = o1.loc[o1.nsga2_front == 1, :] + + # import matplotlib.pyplot as plt + # o2 = pd.read_csv(os.path.join(gpr_m_d, case + ".{0}.obs_pop.csv".format(max(0, gpst.control_data.noptmax)))) + # fig, ax = plt.subplots(1, 1, figsize=(5, 5)) + # ax.scatter(o1.obj_1, o1.obj_2, c="r", s=10) + # ax.scatter(o2.obj_1, o2.obj_2, c="0.5", s=10, alpha=0.5) + # plt.tight_layout() + # plt.savefig("gpr_{0}_compare_noiter.pdf".format(case)) + # plt.close(fig) + + # now lets try an inner-outer scheme... + + gpst.control_data.noptmax = noptmax_inner + gpst.write(os.path.join(gpr_t_d, case + ".pst"), version=2) + gpr_t_d_iter = gpr_t_d + "_outeriter{0}".format(0) + if os.path.exists(gpr_t_d_iter): + shutil.rmtree(gpr_t_d_iter) + shutil.copytree(gpr_t_d, gpr_t_d_iter) + for iouter in range(1, noptmax_outer + 1): + # run the gpr emulator + gpr_m_d_iter = gpr_t_d_iter.replace("template", "master") + complex_m_d_iter = t_d.replace("template", "master_complex_retrain_outeriter{0}".format(iouter)) + if os.path.exists(gpr_m_d_iter): + shutil.rmtree(gpr_m_d_iter) + shutil.copytree(gpr_t_d_iter,gpr_m_d_iter) + + pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=gpr_m_d_iter) + + #pyemu.os_utils.start_workers(gpr_t_d_iter, mou_exe_path, case + ".pst", num_workers, worker_root=".", + # master_dir=gpr_m_d_iter, verbose=True, port=port) + + o2 = pd.read_csv(os.path.join(gpr_m_d_iter, case + ".{0}.obs_pop.csv".format(gpst.control_data.noptmax))) + + # now run the final dv pop thru the "complex" model + final_gpr_dvpop_fname = os.path.join(gpr_m_d_iter, case + ".archive.dv_pop.csv") + assert os.path.exists(final_gpr_dvpop_fname) + complex_model_dvpop_fname = os.path.join(t_d, "gpr_outeriter{0}_dvpop.csv".format(iouter)) + if os.path.exists(complex_model_dvpop_fname): + os.remove(complex_model_dvpop_fname) + # load the gpr archive and do something clever to pick new points to eval + # with the complex model + dvpop = pd.read_csv(final_gpr_dvpop_fname, index_col=0) + if dvpop.shape[0] > pop_size: + arc_sum = pd.read_csv(os.path.join(gpr_m_d_iter, case + ".pareto.archive.summary.csv")) + as_front_map = {member: front for member, front in zip(arc_sum.member, arc_sum.nsga2_front)} + as_crowd_map = {member: crowd for member, crowd in zip(arc_sum.member, arc_sum.nsga2_crowding_distance)} + as_feas_map = {member: feas for member, feas in zip(arc_sum.member, 
arc_sum.feasible_distance)} + as_gen_map = {member: gen for member, gen in zip(arc_sum.member, arc_sum.generation)} + + dvpop.loc[:, "front"] = dvpop.index.map(lambda x: as_front_map.get(x, np.nan)) + dvpop.loc[:, "crowd"] = dvpop.index.map(lambda x: as_crowd_map.get(x, np.nan)) + dvpop.loc[:, "feas"] = dvpop.index.map(lambda x: as_feas_map.get(x, np.nan)) + dvpop.loc[:, "gen"] = dvpop.index.map(lambda x: as_gen_map.get(x, np.nan)) + # drop members that have missing archive info + dvpop = dvpop.dropna() + if dvpop.shape[0] > pop_size: + dvpop.sort_values(by=["gen", "feas", "front", "crowd"], ascending=[False, True, True, False], + inplace=True) + dvpop = dvpop.iloc[:pop_size, :] + dvpop.drop(["gen", "feas", "front", "crowd"], axis=1, inplace=True) + + # shutil.copy2(final_gpr_dvpop_fname,complex_model_dvpop_fname) + dvpop.to_csv(complex_model_dvpop_fname) + pst.pestpp_options["mou_dv_population_file"] = os.path.split(complex_model_dvpop_fname)[1] + pst.control_data.noptmax = -1 + pst.write(os.path.join(t_d, case + ".pst"), version=2) + if os.path.exists(complex_m_d_iter): + shutil.rmtree(complex_m_d_iter) + shutil.copytree(t_d,complex_m_d_iter) + #pyemu.os_utils.start_workers(t_d, mou_exe_path, case + ".pst", num_workers, worker_root=".", + # master_dir=complex_m_d_iter, verbose=True, port=port) + pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=complex_m_d_iter) + + # plot the complex model results... + o2 = pd.read_csv(os.path.join(complex_m_d_iter, case + ".pareto.archive.summary.csv")) + o2 = o2.loc[o2.generation == o2.generation.max(), :] + # o2 = o2.loc[o2.is_feasible==True,:] + o2 = o2.loc[o2.nsga2_front == 1, :] + # fig, ax = plt.subplots(1, 1, figsize=(5, 5)) + # ax.scatter(o1.obj_1, o1.obj_2, c="r", s=10, label="full complex") + # ax.scatter(o2.obj_1, o2.obj_2, c="0.5", s=10, alpha=0.5, label="mixed emulated-complex") + # ax.legend(loc="upper right") + # ax.set_xlim(0, 10) + # ax.set_ylim(0, 20) + # plt.tight_layout() + # plt.savefig("gpr_{0}_compare_iterscheme_{1}.pdf".format(case, iouter)) + # plt.close(fig) + + # now add those complex model input-output pop files to the list and retrain + # the gpr + dv_pops.append(os.path.join(complex_m_d_iter, case + ".0.dv_pop.csv")) + obs_pops.append(os.path.join(complex_m_d_iter, case + ".0.obs_pop.csv")) + gpr_t_d_iter = gpr_t_d + "_outeriter{0}".format(iouter) + pyemu.helpers.prep_for_gpr(pst_fname, dv_pops, obs_pops, t_d=gpr_t_d,gpr_t_d=gpr_t_d_iter, nverf=int(pop_size * .1), + plot_fits=True, apply_standard_scalar=False, include_emulated_std_obs=True) + gpst_iter = pyemu.Pst(os.path.join(gpr_t_d_iter, case + ".pst")) + # aggdf = pd.read_csv(os.path.join(gpr_t_d,"gpr_aggregate_training_data.csv"),index_col=0) + # aggdf.index = ["outeriter{0}_member{1}".format(iouter,i) for i in range(aggdf.shape[0])] + #restart_gpr_dvpop_fname = "gpr_restart_dvpop_outeriter{0}.csv".format(iouter) + # aggdf.to_csv(os.path.join(gpr_t_d_iter,restart_gpr_dvpop_fname)) + #shutil.copy2(os.path.join(complex_m_d_iter, case + ".0.dv_pop.csv"), + # os.path.join(gpr_t_d_iter, restart_gpr_dvpop_fname)) + gpst_iter.pestpp_options.pop("mou_dv_population_file",None)# = restart_gpr_dvpop_fname + gpst_iter.control_data.noptmax = gpst.control_data.noptmax + gpst_iter.write(os.path.join(gpr_t_d_iter, case + ".pst"), version=2) + + psum_fname = os.path.join(complex_m_d_iter,case+".pareto.archive.summary.csv") + assert os.path.exists(psum_fname) + psum = pd.read_csv(psum_fname) + #assert 1.0 in psum.obj_1.values + #assert 1.0 in psum.obj_2.values + + +def 
collate_training_data(pst,m_d,case): + + input_fnames = [os.path.join(m_d,"{0}.0.dv_pop.csv".format(case))] + output_fnames = [f.replace("dv_","obs_") for f in input_fnames] + + # work out input variable names + input_groups = pst.pestpp_options.get("opt_dec_var_groups",None) + par = pst.parameter_data + if input_groups is None: + print("using all adjustable parameters as inputs") + input_names = pst.adj_par_names + else: + input_groups = set([i.strip() for i in input_groups.lower().strip().split(",")]) + print("input groups:",input_groups) + adj_par = par.loc[pst.adj_par_names,:].copy() + adj_par = adj_par.loc[adj_par.pargp.apply(lambda x: x in input_groups),:] + input_names = adj_par.parnme.tolist() + print("input names:",input_names) + + #work out constraints and objectives + ineq_names = pst.less_than_obs_constraints.tolist() + ineq_names.extend(pst.greater_than_obs_constraints.tolist()) + obs = pst.observation_data + objs = pst.pestpp_options.get("mou_objectives",None) + constraints = [] + + if objs is None: + print("'mou_objectives' not found in ++ options, using all ineq tagged non-zero weighted obs as objectives") + objs = ineq_names + else: + objs = objs.lower().strip().split(',') + constraints = [n for n in ineq_names if n not in objs] + + print("objectives:",objs) + print("constraints:",constraints) + output_names = objs + output_names.extend(constraints) + + print("loading input and output files") + if isinstance(input_fnames,str): + input_fnames = [input_fnames] + if isinstance(output_fnames,str): + output_fnames = [output_fnames] + if len(output_fnames) != len(input_fnames): + raise Exception("len(input_fnames) != len(output_fnames)") + + + dfs = [] + for input_fname,output_fname in zip(input_fnames,output_fnames): + if input_fname.lower().endswith(".csv"): + input_df = pd.read_csv(os.path.join(input_fname),index_col=0) + elif input_fname.lower().endswith(".jcb"): + input_df = pyemu.ParameterEnsemble.from_binary(pst=pst,filename=input_fname)._df + else: + raise Exception("unrecognized input_fname extension:'{0}', looking for csv or jcb".\ + format(input_fname.lower())) + + if output_fname.lower().endswith(".csv"): + output_df = pd.read_csv(os.path.join(output_fname),index_col=0) + elif output_fname.lower().endswith(".jcb"): + output_df = pyemu.ObservationEnsemble.from_binary(pst=pst,filename=output_fname)._df + else: + raise Exception("unrecognized output_fname extension:'{0}', looking for csv or jcb".\ + format(output_fname.lower())) + + if input_df.shape[0] != output_df.shape[0]: + raise Exception("input rows != output rows for {0} and {1}".\ + format(input_fname,output_fname)) + input_df = input_df.loc[:,input_names] + assert input_df.shape == input_df.dropna().shape + + output_df = output_df.loc[:, output_names] + assert output_df.shape == output_df.dropna().shape + + input_df.loc[:,output_names] = output_df.values + dfs.append(input_df) + print("...loaded",input_fname,output_fname) + + data = pd.concat(dfs) + assert data.shape == data.dropna().shape + #df.to_csv(os.path.join(gpr_t_d,"gpr_aggregate_training_data.csv")) + #print("aggregated training dataset shape",df.shape,"saved to",pst_fname + ".aggresults.csv") + return data, input_names, output_names + +def gpr_zdt1_test(): + import numpy as np + import subprocess as sp + import multiprocessing as mp + from datetime import datetime + from sklearn.gaussian_process import GaussianProcessRegressor + case = "zdt1" + use_chances = False + m_d = os.path.join(case + "_gpr_baseline") + org_d = os.path.join("utils", case + 
"_template") + t_d = case + "_template" + if os.path.exists(t_d): + shutil.rmtree(t_d) + shutil.copytree(org_d, t_d) + if os.path.exists(m_d): + shutil.rmtree(m_d) + + pst = pyemu.Pst(os.path.join(t_d, case + ".pst")) + pst.pestpp_options["mou_generator"] = "pso" + pst.pestpp_options["overdue_giveup_fac"] = 1e10 + pst.pestpp_options["overdue_resched_fac"] = 1e10 + if use_chances: + pst.pestpp_options["opt_risk"] = 0.95 + pst.pestpp_options["opt_stack_size"] = 50 + pst.pestpp_options["opt_recalc_chance_every"] = 10000 + pst.pestpp_options["opt_chance_points"] = "single" + else: + pst.pestpp_options["opt_risk"] = 0.5 + + pop_size = 20 + num_workers = 10 + noptmax_full = 1 + + port = 4569 + pst.control_data.noptmax = -1 + pst.pestpp_options["mou_population_size"] = pop_size + pst.pestpp_options["mou_save_population_every"] = 1 + pst.write(os.path.join(t_d, case + ".pst")) + #if not os.path.exists(m_d): + # pyemu.os_utils.start_workers(t_d, mou_exe_path, case + ".pst", num_workers, worker_root=".", + # master_dir=m_d, verbose=True, port=port) + + pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=t_d) + + + m_d = t_d + dv_pops = [os.path.join(m_d, "{0}.0.dv_pop.csv".format(case))] + obs_pops = [f.replace("dv_", "obs_") for f in dv_pops] + + pst_fname = os.path.join(m_d, case + ".pst") + gpr_t_d = os.path.join(case + "_gpr_template") + + data, input_names, output_names = collate_training_data(pst,m_d,case) + from pyemu.emulators.gpr import GPR + gpr = GPR(data=data.copy(), + input_names=input_names, + output_names=output_names, + #transforms=transforms, + #kernel=gp_kernel, + n_restarts_optimizer=20, + ); + gpr.fit() + gpr.prepare_pestpp(m_d,case,gpr_t_d=gpr_t_d) + + #pyemu.helpers.prep_for_gpr(pst_fname, dv_pops, obs_pops, t_d=m_d,gpr_t_d=gpr_t_d, nverf=int(pop_size * .1), \ + # plot_fits=True, apply_standard_scalar=False, include_emulated_std_obs=True) + gpst = pyemu.Pst(os.path.join(gpr_t_d, case + "_gpr.pst")) + shutil.copy2(os.path.join(m_d, case + ".0.dv_pop.csv"), os.path.join(gpr_t_d, "initial_dv_pop.csv")) + gpst.pestpp_options["mou_dv_population_file"] = "initial_dv_pop.csv" + gpst.control_data.noptmax = noptmax_full + gpst.write(os.path.join(gpr_t_d, case + ".pst"), version=2) + gpr_m_d = gpr_t_d.replace("template", "master") + if os.path.exists(gpr_m_d): + shutil.rmtree(gpr_m_d) + start = datetime.now() + #pyemu.os_utils.start_workers(gpr_t_d, mou_exe_path, case + ".pst", num_workers, worker_root=".", + # master_dir=gpr_m_d, verbose=True, port=port) + pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=gpr_t_d) + + gpr_m_d = gpr_t_d + + finish = datetime.now() + duration1 = (finish - start).total_seconds() + arcorg = pd.read_csv(os.path.join(gpr_m_d,"zdt1.archive.obs_pop.csv"),index_col=0) + + + psum_fname = os.path.join(gpr_m_d,case+".pareto.archive.summary.csv") + assert os.path.exists(psum_fname) + psum = pd.read_csv(psum_fname) + print(psum.obj_1.min()) + print(psum.obj_2.min()) + assert psum.obj_1.min() < 0.05 + + gpr_t_d2 = gpr_t_d + "_ppw" + if os.path.exists(gpr_t_d2): + shutil.rmtree(gpr_t_d2) + shutil.copytree(gpr_t_d,gpr_t_d2) + + gpr_m_d2 = gpr_t_d2.replace("template","master") + gpr_d2 = GPR.load(os.path.join(gpr_m_d2,"gpr_emulator.pkl")) + input_df = pd.read_csv(os.path.join(gpr_t_d2,"gpr_input.csv"),index_col=0) + #mdf = pd.read_csv(os.path.join(gpr_t_d2,"gprmodel_info.csv"),index_col=0) + #mdf["model_fname"] = mdf.model_fname.apply(lambda x: os.path.join(gpr_t_d2,x)) + pyemu.os_utils.start_workers(gpr_t_d2, mou_exe_path, case + ".pst", 
num_workers, worker_root=".", + master_dir=gpr_m_d2, verbose=True, port=port, + ppw_function=pyemu.helpers.gpr_pyworker, + ppw_kwargs={"input_df":input_df, + #"mdf":mdf, + "gpr":gpr_d2}) + + + arcppw = pd.read_csv(os.path.join(gpr_m_d2,"zdt1.archive.obs_pop.csv"),index_col=0) + diff = np.abs(arcppw.values - arcorg.values) + print(diff.max()) + assert diff.max() < 1e-6 + + + start = datetime.now() + b_d = os.getcwd() + os.chdir(gpr_t_d2) + p = sp.Popen([mou_exe_path,"{0}.pst".format(case),"/h",":{0}".format(port)]) + os.chdir(b_d) + #p.wait() + #return + + # looper over and start the workers - in this + # case they dont need unique dirs since they aren't writing + # anything + procs = [] + # try this test with 1 worker as an edge case + num_workers = 1 + for i in range(num_workers): + pp = mp.Process(target=gpr_zdt1_ppw) + pp.start() + procs.append(pp) + # if everything worked, the the workers should receive the + # shutdown signal from the master and exit gracefully... + for pp in procs: + pp.join() + + # wait for the master to finish...but should already be finished + p.wait() + finish = datetime.now() + print("ppw` took",(finish-start).total_seconds()) + print("org took",duration1) + + arcppw = pd.read_csv(os.path.join(gpr_t_d2,"zdt1.archive.obs_pop.csv"),index_col=0) + diff = np.abs(arcppw.values - arcorg.values) + print(diff.max()) + assert diff.max() < 1e-6 + + + +def gpr_zdt1_ppw(): + t_d = "zdt1_gpr_template" + os.chdir(t_d) + pst_name = "zdt1.pst" + ppw = pyemu.helpers.gpr_pyworker(pst_name,"localhost",4569,gpr=True) + os.chdir("..") + + if __name__ == "__main__": #test_dsi_basic() #test_dsi_nst() @@ -325,4 +915,6 @@ def test_lpfa_std(): #test_dsi_mixed() #test_dsivc_freyberg() #plot_freyberg_dsi() - test_lpfa_std() + #test_lpfa_std() + gpr_zdt1_test() + From a7f3a6fc21c092b8bdd9cf03e981c33be852d07d Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 2 Jul 2025 16:48:32 +0100 Subject: [PATCH 35/58] general fixes to ppw --- pyemu/emulators/gpr.py | 115 +++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 57 deletions(-) diff --git a/pyemu/emulators/gpr.py b/pyemu/emulators/gpr.py index ac43b567a..6473090fe 100644 --- a/pyemu/emulators/gpr.py +++ b/pyemu/emulators/gpr.py @@ -280,47 +280,7 @@ def predict(self, X, return_std=False): return predictions_df - def scrape_pst_dir(self,pst_dir,casename): - if not os.path.exists(pst_dir): - raise FileNotFoundError(f"PEST control file {pst_dir} does not exist") - - pst = Pst(os.path.join(pst_dir,casename + ".pst")) - - # work out input variable names - input_groups = pst.pestpp_options.get("opt_dec_var_groups",None) - par = pst.parameter_data - if input_groups is None: - print("using all adjustable parameters as inputs") - input_names = pst.adj_par_names - else: - input_groups = set([i.strip() for i in input_groups.lower().strip().split(",")]) - print("input groups:",input_groups) - adj_par = par.loc[pst.adj_par_names,:].copy() - adj_par = adj_par.loc[adj_par.pargp.apply(lambda x: x in input_groups),:] - input_names = adj_par.parnme.tolist() - print("input names:",input_names) - - #work out constraints and objectives - ineq_names = pst.less_than_obs_constraints.tolist() - ineq_names.extend(pst.greater_than_obs_constraints.tolist()) - obs = pst.observation_data - objs = pst.pestpp_options.get("mou_objectives",None) - constraints = [] - - if objs is None: - print("'mou_objectives' not found in ++ options, using all ineq tagged non-zero weighted obs as objectives") - objs = ineq_names - else: - objs = 
objs.lower().strip().split(',') - constraints = [n for n in ineq_names if n not in objs] - - print("objectives:",objs) - print("constraints:",constraints) - output_names = objs - output_names.extend(constraints) - - return pst, input_names, output_names, objs, constraints def prepare_pestpp(self,pst_dir,casename,gpr_t_d="gpr_template"): @@ -350,7 +310,7 @@ def prepare_pestpp(self,pst_dir,casename,gpr_t_d="gpr_template"): # 3. which obs are objectives; subset of output_names # 4. which obs are constraints; subset of output_names - pst, input_names, output_names, objs, constraints = self.scrape_pst_dir(pst_dir,casename) + pst, input_names, output_names, objs, constraints = scrape_pst_dir(pst_dir,casename) # check that all input_names ar ein par data @@ -381,9 +341,6 @@ def prepare_pestpp(self,pst_dir,casename,gpr_t_d="gpr_template"): self.logger.statement(f"Creating template directory {gpr_t_d}") os.makedirs(gpr_t_d) - # pickle - self.save(os.path.join(gpr_t_d, "gpr_emulator.pkl")) - self.logger.statement(f"Saved GPR emulator to {os.path.join(gpr_t_d, 'gpr_emulator.pkl')}") # preapre template files self.logger.statement("Preparing PEST++ template files") @@ -457,7 +414,9 @@ def fix_df_col_type(orgdf,fixdf): f.write("if __name__ == '__main__':\n") f.write(" gpr_forward_run()\n") - + # pickle + self.save(os.path.join(gpr_t_d, "gpr_emulator.pkl")) + self.logger.statement(f"Saved GPR emulator to {os.path.join(gpr_t_d, 'gpr_emulator.pkl')}") gpst.control_data.noptmax = 0 @@ -471,6 +430,8 @@ def fix_df_col_type(orgdf,fixdf): gpst.control_data.noptmax = pst.control_data.noptmax gpst.write(os.path.join(gpr_t_d, gpst_fname), version=2) + + return def gpr_forward_run(): @@ -478,18 +439,58 @@ def gpr_forward_run(): This function gets added programmatically to the forward run process""" import pandas as pd from pyemu.emulators import GPR - input_df = pd.read_csv("gpr_input.csv",index_col=0).T - + input_df = pd.read_csv("gpr_input.csv",index_col=0) gpr = GPR.load("gpr_emulator.pkl") - df = pd.DataFrame(index=gpr.output_names, - columns=["sim","sim_std"]) - df.index.name = "output_name" + simdf = pd.DataFrame(index=gpr.output_names,columns=["sim","sim_std"]) + simdf.index.name = "output_name" if gpr.return_std: - predmean,predstdv = gpr.predict(input_df.loc[:,gpr.input_names], return_std=True) - df.loc[:,"sim"] = predmean[df.index].values - df.loc[:,"sim_std"] = predstdv[df.index].values + predmean,predstdv = gpr.predict(input_df.loc[gpr.input_names].T, return_std=True) + simdf.loc[:,"sim"] = predmean[simdf.index].values + simdf.loc[:,"sim_std"] = predstdv[simdf.index].values else: - predmean = gpr.predict(input_df.loc[:,gpr.input_names]) - df.loc[:,"sim"] = predmean[df.index].values - df.to_csv("gpr_output.csv",index=True) - return df \ No newline at end of file + predmean = gpr.predict(input_df.loc[gpr.input_names].T) + simdf.loc[:,"sim"] = predmean[simdf.index].values + simdf.to_csv("gpr_output.csv",index=True) + return simdf + +def scrape_pst_dir(self,pst_dir,casename): + + if not os.path.exists(pst_dir): + raise FileNotFoundError(f"PEST control file {pst_dir} does not exist") + + pst = Pst(os.path.join(pst_dir,casename + ".pst")) + + # work out input variable names + input_groups = pst.pestpp_options.get("opt_dec_var_groups",None) + par = pst.parameter_data + if input_groups is None: + print("using all adjustable parameters as inputs") + input_names = pst.adj_par_names + else: + input_groups = set([i.strip() for i in input_groups.lower().strip().split(",")]) + print("input groups:",input_groups) + 
adj_par = par.loc[pst.adj_par_names,:].copy() + adj_par = adj_par.loc[adj_par.pargp.apply(lambda x: x in input_groups),:] + input_names = adj_par.parnme.tolist() + print("input names:",input_names) + + #work out constraints and objectives + ineq_names = pst.less_than_obs_constraints.tolist() + ineq_names.extend(pst.greater_than_obs_constraints.tolist()) + obs = pst.observation_data + objs = pst.pestpp_options.get("mou_objectives",None) + constraints = [] + + if objs is None: + print("'mou_objectives' not found in ++ options, using all ineq tagged non-zero weighted obs as objectives") + objs = ineq_names + else: + objs = objs.lower().strip().split(',') + constraints = [n for n in ineq_names if n not in objs] + + print("objectives:",objs) + print("constraints:",constraints) + output_names = objs + output_names.extend(constraints) + + return pst, input_names, output_names, objs, constraints \ No newline at end of file From 993742012f29410864584b65190bfd8f4965a7de Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 2 Jul 2025 16:49:07 +0100 Subject: [PATCH 36/58] refactored gpr helper fnxs to maintain legacy, but also use new GPR class --- pyemu/utils/helpers.py | 73 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index b109fb7e4..d993d642a 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -4349,13 +4349,13 @@ def emulate_with_gpr(input_df,mdf,gpr_model_dict): mdf.loc[output_name,"sim_std"] = sim[1] return mdf - -def gpr_pyworker(pst,host,port,input_df=None,mdf=None): +def gpr_pyworker_legacy(pst,host,port,input_df=None,mdf=None): import os import pandas as pd import numpy as np import pickle + # if explicit args weren't passed, get the default ones... if input_df is None: input_df = pd.read_csv("gpr_input.csv",index_col=0) @@ -4402,6 +4402,75 @@ def gpr_pyworker(pst,host,port,input_df=None,mdf=None): # if None, we are done if parameters is None: break + + +def gpr_pyworker(pst,host,port,input_df=None,mdf=None,gpr=False): + + if gpr is False: + print("WARNING: using legacy gpr_pyworker function, which is deprecated") + gpr_pyworker_legacy(pst,host,port,input_df=input_df,mdf=mdf) + elif gpr is True: + gpr = None + + import pandas as pd + from pyemu.emulators import GPR + + # if explicit args weren't passed, get the default ones... + if input_df is None: + input_df = pd.read_csv("gpr_input.csv",index_col=0) + if gpr is None: + gpr = GPR.load("gpr_emulator.pkl") + simdf = pd.DataFrame(index=gpr.output_names,columns=["sim","sim_std"],dtype=float) + simdf.index.name = "output_name" + + ppw = PyPestWorker(pst,host,port,verbose=False) + + # we can only get parameters once the worker has initialize and + # is ready to run, so getting the first of pars here + # essentially blocks until the worker is ready + parameters = ppw.get_parameters() + # if its None, the master already quit... 
+ if parameters is None: + return + + obs = ppw._pst.observation_data.copy() + # align the obsval series with the order sent from the master + obs = obs.loc[ppw.obs_names,"obsval"] + + # work out which par values sent from the master we need to run the emulator + par = ppw._pst.parameter_data.copy() + usepar_idx = [] + ppw_par_names = list(ppw.par_names) + for i,pname in enumerate(input_df.index.values): + usepar_idx.append(ppw_par_names.index(pname)) + + + while True: + # map the current dv values in parameters into the + # df needed to run the emulator + input_df["parval1"] = parameters.values[usepar_idx] + # do the emulation + if gpr.return_std: + predmean,predstdv = gpr.predict(input_df.loc[gpr.input_names].T, return_std=True) + simdf.loc[:,"sim"] = predmean[simdf.index].values + simdf.loc[:,"sim_std"] = predstdv[simdf.index].values + else: + predmean = gpr.predict(input_df.loc[gpr.input_names].T) + simdf.loc[:,"sim"] = predmean[simdf.index].values + + + # replace the emulated quantities in the obs series + obs.loc[simdf.index] = simdf.sim.values + obs.loc[simdf.index.map(lambda x: x+"_gprstd")] = simdf.sim_std.values + + #send the obs series to the master + ppw.send_observations(obs.values) + + #try to get more pars + parameters = ppw.get_parameters() + # if None, we are done + if parameters is None: + break From 3c942a0af33a826832562f4a1f9c58ee578869c6 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 2 Jul 2025 17:25:16 +0100 Subject: [PATCH 37/58] init updates --- pyemu/__init__.py | 2 +- pyemu/emulators/__init__.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pyemu/__init__.py b/pyemu/__init__.py index a53c116ac..15dd1e1dc 100644 --- a/pyemu/__init__.py +++ b/pyemu/__init__.py @@ -22,7 +22,7 @@ os_utils, pp_utils, smp_utils) from .emulators import ( #emulators - Emulator, DSI, LPFA, + Emulator, DSI, LPFA, GPR, #transformers diff --git a/pyemu/emulators/__init__.py b/pyemu/emulators/__init__.py index 5bb861e71..4833fd494 100755 --- a/pyemu/emulators/__init__.py +++ b/pyemu/emulators/__init__.py @@ -10,10 +10,12 @@ from .base import Emulator from .dsi import DSI from .lpfa import LPFA +from .gpr import GPR __all__ = [ 'Emulator', #base Emulator Class 'DSI', # DSI Emulator Class 'LPFA', + 'GPR', # GPR Emulator Class 'BaseTransformer', 'Log10Transformer', 'RowWiseMinMaxScaler', From 7bd18073b6000af8441ce48f2ccb25b48c869fcf Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 2 Jul 2025 17:27:32 +0100 Subject: [PATCH 38/58] fix to utils gpr test --- autotest/utils_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/utils_tests.py b/autotest/utils_tests.py index c492848bc..c1ab978ff 100644 --- a/autotest/utils_tests.py +++ b/autotest/utils_tests.py @@ -3161,7 +3161,7 @@ def gpr_zdt1_test(): pst_fname = os.path.join(m_d, case + ".pst") gpr_t_d = os.path.join(case + "_gpr_template") - pyemu.helpers.prep_for_gpr(pst_fname, dv_pops, obs_pops, t_d=m_d,gpr_t_d=gpr_t_d, nverf=int(pop_size * .1), \ + pyemu.helpers.prep_for_gpr(pst_fname, dv_pops, obs_pops, gpr_t_d=gpr_t_d, nverf=int(pop_size * .1), \ plot_fits=True, apply_standard_scalar=False, include_emulated_std_obs=True) gpst = pyemu.Pst(os.path.join(gpr_t_d, case + ".pst")) shutil.copy2(os.path.join(m_d, case + ".0.dv_pop.csv"), os.path.join(gpr_t_d, "initial_dv_pop.csv")) From 3133d09ad844c3fa13243177148f1fd5ba2f5aa6 Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 3 Jul 2025 12:00:12 +0100 Subject: [PATCH 39/58] fi to grp_pyworker --- pyemu/utils/helpers.py | 11 ++++++----- 1 file 
changed, 6 insertions(+), 5 deletions(-) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index d993d642a..a862f2497 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -4406,11 +4406,13 @@ def gpr_pyworker_legacy(pst,host,port,input_df=None,mdf=None): def gpr_pyworker(pst,host,port,input_df=None,mdf=None,gpr=False): - if gpr is False: + if gpr == False: print("WARNING: using legacy gpr_pyworker function, which is deprecated") gpr_pyworker_legacy(pst,host,port,input_df=input_df,mdf=mdf) - elif gpr is True: - gpr = None + elif gpr == True: + gpr = GPR.load("gpr_emulator.pkl") + else: + assert isinstance(gpr, GPR), "gpr must be a GPR object or True to load from 'gpr_emulator.pkl'" import pandas as pd from pyemu.emulators import GPR @@ -4418,8 +4420,7 @@ def gpr_pyworker(pst,host,port,input_df=None,mdf=None,gpr=False): # if explicit args weren't passed, get the default ones... if input_df is None: input_df = pd.read_csv("gpr_input.csv",index_col=0) - if gpr is None: - gpr = GPR.load("gpr_emulator.pkl") + simdf = pd.DataFrame(index=gpr.output_names,columns=["sim","sim_std"],dtype=float) simdf.index.name = "output_name" From c6b1f0c37c83db76f6be7652e0a7530f029313fe Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 3 Jul 2025 12:08:48 +0100 Subject: [PATCH 40/58] fix legacy gpr oyworker handling --- pyemu/utils/helpers.py | 105 +++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 52 deletions(-) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index a862f2497..c6ab64772 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -4406,72 +4406,73 @@ def gpr_pyworker_legacy(pst,host,port,input_df=None,mdf=None): def gpr_pyworker(pst,host,port,input_df=None,mdf=None,gpr=False): - if gpr == False: + if gpr is False: print("WARNING: using legacy gpr_pyworker function, which is deprecated") gpr_pyworker_legacy(pst,host,port,input_df=input_df,mdf=mdf) - elif gpr == True: - gpr = GPR.load("gpr_emulator.pkl") else: + if gpr is True: + gpr = GPR.load("gpr_emulator.pkl") + assert isinstance(gpr, GPR), "gpr must be a GPR object or True to load from 'gpr_emulator.pkl'" - import pandas as pd - from pyemu.emulators import GPR - - # if explicit args weren't passed, get the default ones... - if input_df is None: - input_df = pd.read_csv("gpr_input.csv",index_col=0) + import pandas as pd + from pyemu.emulators import GPR + + # if explicit args weren't passed, get the default ones... + if input_df is None: + input_df = pd.read_csv("gpr_input.csv",index_col=0) - simdf = pd.DataFrame(index=gpr.output_names,columns=["sim","sim_std"],dtype=float) - simdf.index.name = "output_name" + simdf = pd.DataFrame(index=gpr.output_names,columns=["sim","sim_std"],dtype=float) + simdf.index.name = "output_name" - ppw = PyPestWorker(pst,host,port,verbose=False) + ppw = PyPestWorker(pst,host,port,verbose=False) - # we can only get parameters once the worker has initialize and - # is ready to run, so getting the first of pars here - # essentially blocks until the worker is ready - parameters = ppw.get_parameters() - # if its None, the master already quit... - if parameters is None: - return + # we can only get parameters once the worker has initialize and + # is ready to run, so getting the first of pars here + # essentially blocks until the worker is ready + parameters = ppw.get_parameters() + # if its None, the master already quit... 
+ if parameters is None: + return - obs = ppw._pst.observation_data.copy() - # align the obsval series with the order sent from the master - obs = obs.loc[ppw.obs_names,"obsval"] - - # work out which par values sent from the master we need to run the emulator - par = ppw._pst.parameter_data.copy() - usepar_idx = [] - ppw_par_names = list(ppw.par_names) - for i,pname in enumerate(input_df.index.values): - usepar_idx.append(ppw_par_names.index(pname)) - + obs = ppw._pst.observation_data.copy() + # align the obsval series with the order sent from the master + obs = obs.loc[ppw.obs_names,"obsval"] + + # work out which par values sent from the master we need to run the emulator + par = ppw._pst.parameter_data.copy() + usepar_idx = [] + ppw_par_names = list(ppw.par_names) + for i,pname in enumerate(input_df.index.values): + usepar_idx.append(ppw_par_names.index(pname)) + - while True: - # map the current dv values in parameters into the - # df needed to run the emulator - input_df["parval1"] = parameters.values[usepar_idx] - # do the emulation - if gpr.return_std: - predmean,predstdv = gpr.predict(input_df.loc[gpr.input_names].T, return_std=True) - simdf.loc[:,"sim"] = predmean[simdf.index].values - simdf.loc[:,"sim_std"] = predstdv[simdf.index].values - else: - predmean = gpr.predict(input_df.loc[gpr.input_names].T) - simdf.loc[:,"sim"] = predmean[simdf.index].values + while True: + # map the current dv values in parameters into the + # df needed to run the emulator + input_df["parval1"] = parameters.values[usepar_idx] + # do the emulation + if gpr.return_std: + predmean,predstdv = gpr.predict(input_df.loc[gpr.input_names].T, return_std=True) + simdf.loc[:,"sim"] = predmean[simdf.index].values + simdf.loc[:,"sim_std"] = predstdv[simdf.index].values + else: + predmean = gpr.predict(input_df.loc[gpr.input_names].T) + simdf.loc[:,"sim"] = predmean[simdf.index].values - # replace the emulated quantities in the obs series - obs.loc[simdf.index] = simdf.sim.values - obs.loc[simdf.index.map(lambda x: x+"_gprstd")] = simdf.sim_std.values + # replace the emulated quantities in the obs series + obs.loc[simdf.index] = simdf.sim.values + obs.loc[simdf.index.map(lambda x: x+"_gprstd")] = simdf.sim_std.values - #send the obs series to the master - ppw.send_observations(obs.values) + #send the obs series to the master + ppw.send_observations(obs.values) - #try to get more pars - parameters = ppw.get_parameters() - # if None, we are done - if parameters is None: - break + #try to get more pars + parameters = ppw.get_parameters() + # if None, we are done + if parameters is None: + break From 11569036dcf730ddcd5bb3be94b2481841fc6860 Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 3 Jul 2025 13:59:01 +0100 Subject: [PATCH 41/58] mystery of the disapearing t_d argument --- pyemu/utils/helpers.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index c6ab64772..9e921ffe8 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -4043,7 +4043,7 @@ def get_current_prop(_cur_thresh): return thresh, prop -def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",gp_kernel=None,nverf=0, +def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",t_d="template",gp_kernel=None,nverf=0, plot_fits=False,apply_standard_scalar=False, include_emulated_std_obs=False): """helper function to setup a gaussian-process-regression (GPR) emulator for outputs of interest. 
This is primarily targeted at low-dimensional settings like those encountered in PESTPP-MOU @@ -4054,6 +4054,7 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",gp_ output_fnames (str | list[str]): usually a list of observation population files that corresponds to the simulation results associated with `input_fnames` gpr_t_d (str): the template file dir to create that will hold the GPR emulators + t_d (str): the template dir containing the PESTPP-MOU outputs that the GPR emulators are trained on gp_kernel (sklearn GaussianProcess kernel): the kernel to use. if None, a standard RBF kernel is created and used nverf (int): the number of input-output pairs to hold back for a simple verification test @@ -4180,7 +4181,7 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",gp_ import matplotlib.pyplot as plt from matplotlib.backends.backend_pdf import PdfPages pdf = PdfPages(os.path.join(gpr_t_d,"gpr_fits.pdf")) - for output_name in output_names: + for i,output_name in enumerate(output_names): y_verf = df.loc[:,output_name].values.copy()[cut:] y_train = df.loc[:, output_name].values.copy()[:cut] @@ -4220,8 +4221,8 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",gp_ plt.close(fig) - - model_fname = os.path.split(pst_fname)[1]+"."+output_name+".pkl" + objname = f'obj_{i}' + model_fname = os.path.split(pst_fname)[1]+"."+objname+".pkl" if os.path.exists(os.path.join(gpr_t_d,model_fname)): print("WARNING: model_fname '{0}' exists, overwriting...".format(model_fname)) with open(os.path.join(gpr_t_d,model_fname),'wb') as f: @@ -4323,6 +4324,13 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",gp_ gpst_fname = os.path.split(pst_fname)[1] gpst.write(os.path.join(gpr_t_d,gpst_fname),version=2) print("saved gpr pst:",gpst_fname,"in gpr_t_d",gpr_t_d) + + #if they exist, copy pestpp bins from t_d over to gpr_t_d. 
otherwise, we assume bin is in path + pp_bins = [f for f in os.listdir(t_d) if 'pestpp-' in f] + if len(pp_bins)>0: + for pp_bin in pp_bins: + shutil.copy2(os.path.join(t_d,pp_bin),os.path.join(gpr_t_d,pp_bin)) + try: pyemu.os_utils.run("pestpp-mou {0}".format(gpst_fname),cwd=gpr_t_d) except Exception as e: From b27d95f13acc7a7bff522eba910d010798c48ee2 Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 3 Jul 2025 14:02:43 +0100 Subject: [PATCH 42/58] checkin tests --- autotest/utils_tests.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/autotest/utils_tests.py b/autotest/utils_tests.py index c1ab978ff..7bd45c2dd 100644 --- a/autotest/utils_tests.py +++ b/autotest/utils_tests.py @@ -3158,10 +3158,10 @@ def gpr_zdt1_test(): m_d = t_d dv_pops = [os.path.join(m_d, "{0}.0.dv_pop.csv".format(case))] obs_pops = [f.replace("dv_", "obs_") for f in dv_pops] - + pst_fname = os.path.join(m_d, case + ".pst") gpr_t_d = os.path.join(case + "_gpr_template") - pyemu.helpers.prep_for_gpr(pst_fname, dv_pops, obs_pops, gpr_t_d=gpr_t_d, nverf=int(pop_size * .1), \ + pyemu.helpers.prep_for_gpr(pst_fname, dv_pops, obs_pops,t_d=m_d,gpr_t_d=gpr_t_d, nverf=int(pop_size * .1), \ plot_fits=True, apply_standard_scalar=False, include_emulated_std_obs=True) gpst = pyemu.Pst(os.path.join(gpr_t_d, case + ".pst")) shutil.copy2(os.path.join(m_d, case + ".0.dv_pop.csv"), os.path.join(gpr_t_d, "initial_dv_pop.csv")) @@ -3176,26 +3176,25 @@ def gpr_zdt1_test(): # master_dir=gpr_m_d, verbose=True, port=port) pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=gpr_t_d) gpr_m_d = gpr_t_d - + finish = datetime.now() duration1 = (finish - start).total_seconds() arcorg = pd.read_csv(os.path.join(gpr_m_d,"zdt1.archive.obs_pop.csv"),index_col=0) - + psum_fname = os.path.join(gpr_m_d,case+".pareto.archive.summary.csv") assert os.path.exists(psum_fname) psum = pd.read_csv(psum_fname) print(psum.obj_1.min()) print(psum.obj_2.min()) - assert psum.obj_1.min() < 0.05 - + assert psum.obj_1.min() < 0.05 gpr_t_d2 = gpr_t_d + "_ppw" if os.path.exists(gpr_t_d2): shutil.rmtree(gpr_t_d2) shutil.copytree(gpr_t_d,gpr_t_d2) - + gpr_m_d2 = gpr_t_d2.replace("template","master") - + input_df = pd.read_csv(os.path.join(gpr_t_d2,"gpr_input.csv"),index_col=0) mdf = pd.read_csv(os.path.join(gpr_t_d2,"gprmodel_info.csv"),index_col=0) mdf["model_fname"] = mdf.model_fname.apply(lambda x: os.path.join(gpr_t_d2,x)) @@ -3210,8 +3209,7 @@ def gpr_zdt1_test(): diff = np.abs(arcppw.values - arcorg.values) print(diff.max()) assert diff.max() < 1e-6 - - + start = datetime.now() b_d = os.getcwd() os.chdir(gpr_t_d2) @@ -3234,13 +3232,13 @@ def gpr_zdt1_test(): # shutdown signal from the master and exit gracefully... 
for pp in procs: pp.join() - + # wait for the master to finish...but should already be finished p.wait() finish = datetime.now() print("ppw` took",(finish-start).total_seconds()) print("org took",duration1) - + arcppw = pd.read_csv(os.path.join(gpr_t_d2,"zdt1.archive.obs_pop.csv"),index_col=0) diff = np.abs(arcppw.values - arcorg.values) print(diff.max()) @@ -3258,7 +3256,8 @@ def gpr_zdt1_ppw(): if __name__ == "__main__": #ppu_geostats_test(".") - gpr_compare_invest() + gpr_zdt1_test() + #gpr_compare_invest() #gpr_constr_test() # import sys # t_d = "constr_ppw_template" From 0817aa3ddb252a2a7e682d099c6631a625b418b7 Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 3 Jul 2025 15:32:53 +0100 Subject: [PATCH 43/58] fix to scrape dir fnx --- pyemu/emulators/gpr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyemu/emulators/gpr.py b/pyemu/emulators/gpr.py index 6473090fe..a79035ed9 100644 --- a/pyemu/emulators/gpr.py +++ b/pyemu/emulators/gpr.py @@ -453,7 +453,7 @@ def gpr_forward_run(): simdf.to_csv("gpr_output.csv",index=True) return simdf -def scrape_pst_dir(self,pst_dir,casename): +def scrape_pst_dir(pst_dir,casename): if not os.path.exists(pst_dir): raise FileNotFoundError(f"PEST control file {pst_dir} does not exist") From a3adfa006c2393f41d5e1e81a0e5225ea3e4317b Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 07:21:52 -0600 Subject: [PATCH 44/58] trying a lower max port number --- pyemu/utils/os_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyemu/utils/os_utils.py b/pyemu/utils/os_utils.py index d16183662..680f323ed 100644 --- a/pyemu/utils/os_utils.py +++ b/pyemu/utils/os_utils.py @@ -958,7 +958,7 @@ def send_killed_run(self,group=None,runid=None,desc="killed"): class PortManager(object): """Cross-platform port manager for parallel processes.""" def __init__(self, - port_range=(4004, 65535), + port_range=(4004, 4999), lock_dir=None, max_retries=50, lock_timeout=5, From 324a88cacb6485ddbe099fb4d616cddddb1613c2 Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 08:14:06 -0600 Subject: [PATCH 45/58] turned off zdt1 test in utils_test - this functionality has moved to emulator tests --- autotest/utils_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/utils_tests.py b/autotest/utils_tests.py index 7bd45c2dd..cd962dad4 100644 --- a/autotest/utils_tests.py +++ b/autotest/utils_tests.py @@ -3110,7 +3110,7 @@ def gpr_constr_invest(): #assert 1.0 in psum.obj_2.values -def gpr_zdt1_test(): +def gpr_zdt1_invest(): import numpy as np import subprocess as sp import multiprocessing as mp From 2adf7f1654211555363642b7f86ccc628cd83063 Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 7 Jul 2025 17:08:09 +0100 Subject: [PATCH 46/58] refactro fixes for dsivc --- pyemu/emulators/dsi.py | 29 ++++++++++++++++++++--------- pyemu/emulators/lpfa.py | 6 ++---- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py index 940868a57..dc773fa45 100755 --- a/pyemu/emulators/dsi.py +++ b/pyemu/emulators/dsi.py @@ -108,7 +108,7 @@ def _prepare_training_data(self): else: # Still need to set up a dummy transformer for inverse operations from .transformers import AutobotsAssemble - self.feature_transformer = AutobotsAssemble(data.copy()) + self.transformer_pipeline = AutobotsAssemble(data.copy()) self.data_transformed = data.copy() return self.data_transformed @@ -387,7 +387,7 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F 
#track dsivc args for forward run self.dsivc_args = {"percentiles":percentiles, - "decvar_names":decvar_names, + "decvar_names":decvar_names, "track_stack":track_stack, } @@ -403,8 +403,7 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F assert os.path.exists(os.path.join(t_d,"dsi.pst")), f"dsi.pst not found in {t_d}" pst = Pst(os.path.join(t_d,"dsi.pst")) if oe is None: - self.logger.statement("no posterior DSI observation ensemble provided, using dsi.3.obs.jcb in DSI template dir...") - self.logger.statement(f"using dsi.{dsi_args['noptmax']}.obs.jcb in DSI template dir...") + self.logger.statement(f"no posterior DSI observation ensemble provided, using dsi.{dsi_args['noptmax']}.obs.jcb in DSI template dir...") assert os.path.exists(os.path.join(t_d,f"dsi.{dsi_args['noptmax']}.obs.jcb")), f"dsi.{dsi_args['noptmax']}.obs.jcb not found in {t_d}" oe = ObservationEnsemble.from_binary(pst,os.path.join(t_d,f"dsi.{dsi_args['noptmax']}.obs.jcb")) else: @@ -429,13 +428,22 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F } # ensure it's a dict if dsi_args is None: - dsi_args = {} + dsi_args = default_dsi_args elif not isinstance(dsi_args, dict): raise TypeError("Expected a dictionary for 'options'") # merge with defaults (user values override defaults) - dsi_args = {**default_dsi_args, **dsi_args} - - + #dsi_args = {**default_dsi_args, **dsi_args} + else: + for key, value in default_dsi_args.items(): + if key not in dsi_args: + dsi_args[key] = value + + # check that dsi_args has the required keys + required_keys = ["noptmax", "decvar_weight", "num_pyworkers"] + for key in required_keys: + if key not in dsi_args: + raise KeyError(f"Missing required key '{key}' in 'dsi_args'") + self.dsi_args = dsi_args out_files = [] self.logger.statement(f"preparing stack stats observations...") @@ -556,6 +564,9 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F self.logger.statement("overwriting dsi.pst file...") pst.observation_data.loc[decvar_names, "weight"] = dsi_args["decvar_weight"] pst.control_data.noptmax = dsi_args["noptmax"] + + #TODO: ensure no noise for dvars obs + pst.write(os.path.join(t_d,"dsi.pst"), version=2) @@ -563,6 +574,6 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F self.decision_variable_names = decvar_names # re-pickle dsi to track dsivc args self.save(os.path.join(t_d,"dsi.pickle")) - + self.logger.statement("DSIVC control files created...the user still needs to specify objectives and constraints...") return pst_dsivc \ No newline at end of file diff --git a/pyemu/emulators/lpfa.py b/pyemu/emulators/lpfa.py index 4252a61d7..a1a2da5f8 100644 --- a/pyemu/emulators/lpfa.py +++ b/pyemu/emulators/lpfa.py @@ -190,10 +190,8 @@ def _prepare_training_data(self): Parameters ---------- - data : pandas.DataFrame, optional - Data to prepare. If None, uses self.data. Default is None. - test_size : float, optional - Fraction of data to use for testing. Default is 0.2. + self: LPFA + The emulator instance containing the data and configuration. 
Returns ------- From 323fb46eded85dd79d60a56d5c478bef184c66d3 Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 7 Jul 2025 17:08:48 +0100 Subject: [PATCH 47/58] dsivc fix --- pyemu/utils/helpers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index 9e921ffe8..2eb966ee8 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -4505,7 +4505,7 @@ def dsi_forward_run(pvals,dsi,write_csv=False): sim_vals.to_csv("dsi_sim_vals.csv") return sim_vals -def dsivc_forward_run(md_ies=".",ies_exe_path="pestpp-ies"): +def dsivc_forward_run(md_ies=".",ies_exe_path="pestpp-ies",num_workers=1): import pandas as pd import pyemu import os @@ -4562,10 +4562,11 @@ def dsivc_forward_run(md_ies=".",ies_exe_path="pestpp-ies"): # deploy dsi... pvals = pd.read_csv(os.path.join(md_ies,"dsi_pars.csv"),index_col=0) - num_workers=1 + worker_root="." dsi = pickle.load(open(os.path.join(md_ies,"dsi.pickle"),"rb")) - num_workers = dsi.dsivc_args.get("num_pyworkers",1) + num_workers = dsi.dsi_args.get("num_pyworkers",1) + print(num_workers,"workers requested for dsi") pyemu.os_utils.start_workers(md_ies,ies_exe_path,"dsi.pst", num_workers=num_workers, worker_root=worker_root, From 1bf5432001a42136c67b76b8fbed02e8102cec1d Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 11:59:44 -0600 Subject: [PATCH 48/58] changed port on pyworkertest... --- autotest/utils_tests.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/autotest/utils_tests.py b/autotest/utils_tests.py index cd962dad4..9c8f6b6c2 100644 --- a/autotest/utils_tests.py +++ b/autotest/utils_tests.py @@ -2692,7 +2692,7 @@ def pypestworker_test(): import subprocess as sp import multiprocessing as mp host = "localhost" - port = 4004 + port = 4111 case = "constr" org_d = os.path.join("utils","{0}_template".format(case)) t_d = "{0}_ppw_template".format(case) @@ -3256,7 +3256,8 @@ def gpr_zdt1_ppw(): if __name__ == "__main__": #ppu_geostats_test(".") - gpr_zdt1_test() + pypestworker_test() + #gpr_zdt1_test() #gpr_compare_invest() #gpr_constr_test() # import sys From 14f8c77c2b851973fdf5060c4d486cf7b26caea3 Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 12:33:55 -0600 Subject: [PATCH 49/58] trying to speedup tests --- autotest/emulator_tests.py | 2 +- autotest/utils_tests.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index 72370d827..92bd82882 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -777,7 +777,7 @@ def gpr_zdt1_test(): pst.pestpp_options["opt_risk"] = 0.5 pop_size = 20 - num_workers = 10 + num_workers = 3 noptmax_full = 1 port = 4569 diff --git a/autotest/utils_tests.py b/autotest/utils_tests.py index 9c8f6b6c2..4391c0e7c 100644 --- a/autotest/utils_tests.py +++ b/autotest/utils_tests.py @@ -2706,7 +2706,7 @@ def pypestworker_test(): pst.pestpp_options["overdue_giveup_fac"] = 1e10 pst.pestpp_options["overdue_resched_fac"] = 1e10 - pst.control_data.noptmax = 5 + pst.control_data.noptmax = 2 pst.write(os.path.join(t_d,"{0}.pst".format(case)),version=2) import sys sys.path.insert(0,t_d) @@ -3255,6 +3255,7 @@ def gpr_zdt1_ppw(): if __name__ == "__main__": + maha_pdc_test('.') #ppu_geostats_test(".") pypestworker_test() #gpr_zdt1_test() From dd1f5c634a74f9694d469c19c924765ae8617c92 Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 12:35:19 -0600 Subject: [PATCH 50/58] serial pytest --- .github/workflows/ci.yml | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3e2efecad..60ad36e99 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -118,7 +118,7 @@ jobs: shell: bash -l {0} working-directory: ./autotest run: | - pytest -rP -rx --capture=no -v -n=auto --tb=native --durations=20 --cov=pyemu --cov-report=lcov ${{ matrix.test-path }} + pytest -rP -rx --capture=no -v -n=1 --tb=native --durations=20 --cov=pyemu --cov-report=lcov ${{ matrix.test-path }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From cba560457548629e262d4a943238beacd7fc7df4 Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 12:35:29 -0600 Subject: [PATCH 51/58] serial pytest --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 60ad36e99..fc01f2dca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -118,7 +118,7 @@ jobs: shell: bash -l {0} working-directory: ./autotest run: | - pytest -rP -rx --capture=no -v -n=1 --tb=native --durations=20 --cov=pyemu --cov-report=lcov ${{ matrix.test-path }} + pytest -rP -rx --capture=no -v --tb=native --durations=20 --cov=pyemu --cov-report=lcov ${{ matrix.test-path }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 0cbdd6ca5f49da752f3ac87618fb9c4a2021f030 Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 13:22:28 -0600 Subject: [PATCH 52/58] trying to speed up dsivc test --- autotest/emulator_tests.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index 92bd82882..8e17498a1 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -107,7 +107,7 @@ def test_dsivc_freyberg(): track_stack=False, percentiles=[0.05, 0.25, 0.5, 0.75, 0.95], dsi_args={ - "noptmax":3, + "noptmax":1, "decvar_weight":10.0, "num_pyworkers":1, }, @@ -124,13 +124,13 @@ def test_dsivc_freyberg(): obs.loc[mou_objectives, "weight"] = 1.0 obs.loc[mou_objectives, "obgnme"] = "less_than_obj" - pstdsivc.control_data.noptmax = 1 #just for testing - pstdsivc.pestpp_options["mou_population_size"] = 10 #just for testing + pstdsivc.control_data.noptmax = -1 #just for testing + pstdsivc.pestpp_options["mou_population_size"] = 3 #just for testing pstdsivc.write(os.path.join(td, "dsivc.pst"),version=2) md = "master_dsivc" - num_workers = 1 + num_workers = 3 worker_root = "." pyemu.os_utils.start_workers(td, From 5a9af48af876f422c60366197049440cc7cf4f13 Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 14:10:48 -0600 Subject: [PATCH 53/58] more speed up --- autotest/emulator_tests.py | 4 ++-- autotest/pst_from_tests.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index 8e17498a1..cb267ac67 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -909,12 +909,12 @@ def gpr_zdt1_ppw(): if __name__ == "__main__": - #test_dsi_basic() + test_dsi_basic() #test_dsi_nst() #test_dsi_nst_extrap() #test_dsi_mixed() #test_dsivc_freyberg() #plot_freyberg_dsi() #test_lpfa_std() - gpr_zdt1_test() + #gpr_zdt1_test() diff --git a/autotest/pst_from_tests.py b/autotest/pst_from_tests.py index e7ab7f2e0..3bbc8c090 100644 --- a/autotest/pst_from_tests.py +++ b/autotest/pst_from_tests.py @@ -5088,9 +5088,9 @@ def mf6_freyberg_thresh_test(tmp_path): # reset away from the truth... 
pst.parameter_data.loc[:,"parval1"] = org_par.parval1.values.copy() - pst.control_data.noptmax = 2 + pst.control_data.noptmax = 1 pst.pestpp_options["ies_par_en"] = "prior.jcb" - pst.pestpp_options["ies_num_reals"] = 30 + pst.pestpp_options["ies_num_reals"] = 10 pst.pestpp_options["ies_subset_size"] = -10 pst.pestpp_options["ies_no_noise"] = True #pst.pestpp_options["ies_bad_phi_sigma"] = 2.0 @@ -5110,7 +5110,7 @@ def mf6_freyberg_thresh_test(tmp_path): m_d = "master_thresh" port = _get_port() pyemu.os_utils.start_workers(pf.new_d, ies_exe_path, "freyberg.pst", - worker_root=".", master_dir=m_d, num_workers=10, + worker_root=".", master_dir=m_d, num_workers=5, port=port) phidf = pd.read_csv(os.path.join(m_d,"freyberg.phi.actual.csv")) # print(phidf["mean"]) From 31db71040c57cf7abd25952449eec36d4a9d6304 Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 15:31:30 -0600 Subject: [PATCH 54/58] skipping zdt1 test for now - something is up. Tried to fix dsi predict if case where transforms is None --- autotest/emulator_tests.py | 4 ++++ pyemu/emulators/dsi.py | 8 +++++--- pyemu/emulators/gpr.py | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index cb267ac67..94c35977b 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -31,6 +31,7 @@ def dsi_freyberg(tmp_d,transforms=None,tag=""): #dsi._fit_transformer_pipeline() dsi.fit() + # history match obsdata = pst.observation_data.copy() if transforms is not None: @@ -747,6 +748,8 @@ def collate_training_data(pst,m_d,case): #print("aggregated training dataset shape",df.shape,"saved to",pst_fname + ".aggresults.csv") return data, input_names, output_names + +@pytest.mark.skip(reason="seems like it still in dev") def gpr_zdt1_test(): import numpy as np import subprocess as sp @@ -909,6 +912,7 @@ def gpr_zdt1_ppw(): if __name__ == "__main__": + test_dsi_basic() #test_dsi_nst() #test_dsi_nst_extrap() diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py index 940868a57..755462a2a 100755 --- a/pyemu/emulators/dsi.py +++ b/pyemu/emulators/dsi.py @@ -205,7 +205,7 @@ def predict(self, pvals): if not self.fitted: raise ValueError("Emulator must be fitted before prediction") - if not hasattr(self, 'transformer_pipeline') or self.transformer_pipeline is None: + if self.transforms is not None and (not hasattr(self, 'transformer_pipeline') or self.transformer_pipeline is None): raise ValueError("Emulator must be fitted and have valid transformations before prediction") if isinstance(pvals, pd.Series): @@ -215,8 +215,9 @@ def predict(self, pvals): pmat = self.pmat ovals = self.ovals sim_vals = ovals + np.dot(pmat,pvals) - pipeline = self.transformer_pipeline - sim_vals = pipeline.inverse(sim_vals) + if self.transforms is not None: + pipeline = self.transformer_pipeline + sim_vals = pipeline.inverse(sim_vals) sim_vals.index.name = 'obsnme' sim_vals.name = "obsval" self.sim_vals = sim_vals @@ -274,6 +275,7 @@ def prepare_pestpp(self, t_d=None, observation_data=None): # run once to get the dsi_pars.csv file pvals = np.zeros_like(self.s) + sim_vals = self.predict(pvals) self.logger.log("creating ins file") diff --git a/pyemu/emulators/gpr.py b/pyemu/emulators/gpr.py index 6473090fe..7e2e6ea5c 100644 --- a/pyemu/emulators/gpr.py +++ b/pyemu/emulators/gpr.py @@ -310,7 +310,7 @@ def prepare_pestpp(self,pst_dir,casename,gpr_t_d="gpr_template"): # 3. which obs are objectives; subset of output_names # 4. 
which obs are constraints; subset of output_names
 
-        pst, input_names, output_names, objs, constraints = scrape_pst_dir(pst_dir,casename)
+        pst, input_names, output_names, objs, constraints = scrape_pst_dir(self,pst_dir,casename)
 
 
         # check that all input_names ar ein par data
 

From e2f6f4ea8b33052fd07042f6a334c25869dfdc6e Mon Sep 17 00:00:00 2001
From: rhugman
Date: Tue, 8 Jul 2025 11:29:12 +0100
Subject: [PATCH 55/58] checkin before merge

---
 autotest/emulator_tests.py | 6 +++---
 autotest/pst_from_tests.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py
index 72370d827..a3e088444 100644
--- a/autotest/emulator_tests.py
+++ b/autotest/emulator_tests.py
@@ -97,7 +97,7 @@ def test_dsivc_freyberg():
     dsi = DSI.load(os.path.join(td, "dsi.pickle"))
     pst = pyemu.Pst(os.path.join(td, "dsi.pst"))
-    oe = pyemu.ObservationEnsemble.from_binary(pst,os.path.join(td, "dsi.1.obs.jcb"))
+    oe = pyemu.ObservationEnsemble.from_binary(pst,os.path.join(td, "dsi.0.obs.jcb"))
 
     obsdata = dsi.observation_data
     decvars = obsdata.loc[obsdata.obgnme=="out_wel"].obsnme.tolist()
@@ -107,7 +107,7 @@ def test_dsivc_freyberg():
         track_stack=False,
         percentiles=[0.05, 0.25, 0.5, 0.75, 0.95],
         dsi_args={
-            "noptmax":3,
+            "noptmax":-1, #just for testing
             "decvar_weight":10.0,
             "num_pyworkers":1,
         },
@@ -125,7 +125,7 @@ def test_dsivc_freyberg():
     obs.loc[mou_objectives, "obgnme"] = "less_than_obj"
 
     pstdsivc.control_data.noptmax = 1 #just for testing
-    pstdsivc.pestpp_options["mou_population_size"] = 10 #just for testing
+    pstdsivc.pestpp_options["mou_population_size"] = 1 #just for testing
 
     pstdsivc.write(os.path.join(td, "dsivc.pst"),version=2)
 
diff --git a/autotest/pst_from_tests.py b/autotest/pst_from_tests.py
index e7ab7f2e0..05772529b 100644
--- a/autotest/pst_from_tests.py
+++ b/autotest/pst_from_tests.py
@@ -11,7 +11,7 @@
 import pytest
 
 ext = ''
-local_bins = False # change if wanting to test with local binary exes
+local_bins = True # change if wanting to test with local binary exes
 if local_bins:
     bin_path = os.path.join("..", "..", "bin")
     if "linux" in platform.system().lower():

From 87a1fb8ef6087179e14c0c32de5a404382590c74 Mon Sep 17 00:00:00 2001
From: rhugman
Date: Tue, 8 Jul 2025 12:54:56 +0100
Subject: [PATCH 56/58] dangerous boolean flag...
--- autotest/pst_from_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/pst_from_tests.py b/autotest/pst_from_tests.py index 72e17d60c..3bbc8c090 100644 --- a/autotest/pst_from_tests.py +++ b/autotest/pst_from_tests.py @@ -11,7 +11,7 @@ import pytest ext = '' -local_bins = True # change if wanting to test with local binary exes +local_bins = False # change if wanting to test with local binary exes if local_bins: bin_path = os.path.join("..", "..", "bin") if "linux" in platform.system().lower(): From 2d4c7789b5a844b74a365b4ef8f386ae69e7cd93 Mon Sep 17 00:00:00 2001 From: rhugman Date: Sat, 26 Jul 2025 18:02:28 -0500 Subject: [PATCH 57/58] fix handling of columns for transformer pipeline --- pyemu/emulators/transformers.py | 142 ++++++++++++++++++++++---------- 1 file changed, 99 insertions(+), 43 deletions(-) diff --git a/pyemu/emulators/transformers.py b/pyemu/emulators/transformers.py index 39345159e..2c1ac7b29 100755 --- a/pyemu/emulators/transformers.py +++ b/pyemu/emulators/transformers.py @@ -4,7 +4,16 @@ from __future__ import print_function, division import numpy as np import pandas as pd -from sklearn.preprocessing import StandardScaler + + +# Check sklearn availability at module level +try: + from sklearn.preprocessing import StandardScaler + HAS_SKLEARN = True +except ImportError: + HAS_SKLEARN = False + # Create dummy classes or set to None + StandardScaler = None class BaseTransformer: @@ -27,14 +36,24 @@ def inverse_transform(self, X): raise NotImplementedError class Log10Transformer(BaseTransformer): - """Apply log10 transformation.""" + """Apply log10 transformation. + + Parameters + ---------- + columns : list, optional + List of column names to be transformed. If None, all columns will be transformed. + """ - def __init__(self): + def __init__(self, columns=None): + self.columns = columns self.shifts = {} def transform(self, X): result = X.copy() - for col in X.columns: + columns = self.columns if self.columns is not None else X.columns + columns = [col for col in columns if col in X.columns] + + for col in columns: min_val = X[col].min() shift = -min_val + 1e-6 if min_val <= 0 else 0 self.shifts[col] = shift @@ -43,9 +62,10 @@ def transform(self, X): def inverse_transform(self, X): result = X.copy() - for col in X.columns: - shift = self.shifts.get(col, 0) - result[col] = (10 ** X[col]) - shift + for col in self.shifts.keys(): + if col in X.columns: + shift = self.shifts.get(col, 0) + result[col] = (10 ** X[col]) - shift return result class RowWiseMinMaxScaler(BaseTransformer): @@ -318,16 +338,33 @@ def inverse_transform(self, X): return result class StandardScalerTransformer(BaseTransformer): - def __init__(self, with_mean=True, with_std=True, copy=True): + """Wrapper around sklearn's StandardScaler for DataFrame compatibility. + + Parameters + ---------- + with_mean : bool, default=True + If True, center the data before scaling. + with_std : bool, default=True + If True, scale the data to unit variance. + copy : bool, default=True + If True, a copy of X will be created. If False, centering and scaling happen in-place. + columns : list, optional + List of column names to be transformed. If None, all columns will be transformed. 
+ """ + + def __init__(self, with_mean=True, with_std=True, copy=True, columns=None): self.with_mean = with_mean self.with_std = with_std self.copy = copy + self.columns = columns self._sklearn_scaler = None - self._columns = None + self._fitted_columns = None def fit(self, X): - # Store column names for DataFrame reconstruction - self._columns = X.columns.tolist() + # Determine which columns to fit + columns = self.columns if self.columns is not None else X.columns + columns = [col for col in columns if col in X.columns] + self._fitted_columns = columns # Create sklearn StandardScaler self._sklearn_scaler = StandardScaler( @@ -337,56 +374,69 @@ def fit(self, X): ) # Fit on numpy array (sklearn expects this) - self._sklearn_scaler.fit(X.values) + if columns: + self._sklearn_scaler.fit(X[columns].values) return self def transform(self, X): if self._sklearn_scaler is None: raise ValueError("Transformer must be fitted before transform") - - # Transform using sklearn - transformed_values = self._sklearn_scaler.transform(X.values) - # Reconstruct DataFrame with original structure - if isinstance(X, pd.DataFrame): - return pd.DataFrame( - transformed_values, - index=X.index, - columns=X.columns - ) - else: - return transformed_values + result = X.copy() + + if self._fitted_columns: + # Transform using sklearn + transformed_values = self._sklearn_scaler.transform(X[self._fitted_columns].values) + + # Update only the fitted columns in the result + result[self._fitted_columns] = transformed_values + + return result def inverse_transform(self, X): if self._sklearn_scaler is None: raise ValueError("Transformer must be fitted before inverse_transform") - - # Inverse transform using sklearn - inverse_values = self._sklearn_scaler.inverse_transform(X.values) - # Reconstruct DataFrame - if isinstance(X, pd.DataFrame): - return pd.DataFrame( - inverse_values, - index=X.index, - columns=X.columns - ) - else: - return inverse_values + result = X.copy() + + if self._fitted_columns: + # Inverse transform using sklearn + inverse_values = self._sklearn_scaler.inverse_transform(X[self._fitted_columns].values) + + # Update only the fitted columns in the result + result[self._fitted_columns] = inverse_values + + return result class NormalScoreTransformer(BaseTransformer): - """A transformer for normal score transformation.""" + """A transformer for normal score transformation. + + Parameters + ---------- + tol : float, default=1e-7 + Tolerance for convergence in random generation. + max_samples : int, default=1000000 + Maximum number of samples for random generation. + quadratic_extrapolation : bool, default=False + Whether to use quadratic extrapolation for values outside the fitted range. + columns : list, optional + List of column names to be transformed. If None, all columns will be transformed. 
+    """
 
-    def __init__(self, tol=1e-7, max_samples=1000000, quadratic_extrapolation=False):
+    def __init__(self, tol=1e-7, max_samples=1000000, quadratic_extrapolation=False, columns=None):
         self.tol = tol
         self.max_samples = max_samples
         self.quadratic_extrapolation = quadratic_extrapolation
+        self.columns = columns
         self.column_parameters = {}
         self.shared_z_scores = {}
 
     def fit(self, X):
         """Fit the transformer to the data."""
-        for col in X.columns:
+        columns = self.columns if self.columns is not None else X.columns
+        columns = [col for col in columns if col in X.columns]
+
+        for col in columns:
             values = X[col].values
             sorted_vals = np.sort(values)
             smoothed_vals = self._moving_average_with_endpoints(sorted_vals)
@@ -417,7 +467,10 @@ def transform(self, X):
         The transformed DataFrame with normal scores.
         """
         result = X.copy()
-        for col in X.columns:
+        for col in self.column_parameters.keys():
+            if col not in X.columns:
+                continue
+
             params = self.column_parameters.get(col, {})
             z_scores = params.get('z_scores', [])
             originals = params.get('originals', [])
@@ -476,7 +529,10 @@ def inverse_transform(self, X):
         The inverse-transformed DataFrame.
         """
         result = X.copy()
-        for col in X.columns:
+        for col in self.column_parameters.keys():
+            if col not in X.columns:
+                continue
+
             params = self.column_parameters.get(col, {})
             z_scores = params.get('z_scores', [])
             originals = params.get('originals', [])
@@ -747,13 +803,13 @@ def inverse_on_external_df(self, df, columns=None):
     def _create_transformer(self, transform_type, **kwargs):
         """Factory method to create appropriate transformer."""
         if transform_type == "log10":
-            return Log10Transformer()
+            return Log10Transformer(**kwargs)
         elif transform_type == "normal_score":
             return NormalScoreTransformer(**kwargs)
         elif transform_type == "row_wise_minmax":
             return RowWiseMinMaxScaler(**kwargs)
         elif transform_type == "standard_scaler":
-            return StandardScalerTransformer()
+            return StandardScalerTransformer(**kwargs)
         elif transform_type == "minmax_scaler":
             return MinMaxScaler(**kwargs)
         else:

From 2180ebd993b1ab066e3b8e9df6f7011bbcebe054 Mon Sep 17 00:00:00 2001
From: rhugman
Date: Sat, 26 Jul 2025 18:02:54 -0500
Subject: [PATCH 58/58] make sklearn an optional dependency

---
 pyemu/emulators/__init__.py | 39 ++++++++++++++++++++++++++++++-----
 pyemu/emulators/gpr.py      |  2 +-
 pyemu/emulators/lpfa.py     | 17 +++++++++++++---
 3 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/pyemu/emulators/__init__.py b/pyemu/emulators/__init__.py
index 4833fd494..3fc5f847e 100755
--- a/pyemu/emulators/__init__.py
+++ b/pyemu/emulators/__init__.py
@@ -2,25 +2,54 @@
     BaseTransformer,
     Log10Transformer,
     RowWiseMinMaxScaler,
-    StandardScalerTransformer,
+    #StandardScalerTransformer,
     NormalScoreTransformer,
     TransformerPipeline,
     AutobotsAssemble
 )
 from .base import Emulator
 from .dsi import DSI
-from .lpfa import LPFA
-from .gpr import GPR
+#from .lpfa import LPFA
+#from .gpr import GPR
+
+
 __all__ = [
     'Emulator', #base Emulator Class
     'DSI', # DSI Emulator Class
     'LPFA',
-    'GPR', # GPR Emulator Class
+## 'GPR', # GPR Emulator Class
     'BaseTransformer',
     'Log10Transformer',
     'RowWiseMinMaxScaler',
-    'StandardScalerTransformer',
+# 'StandardScalerTransformer',
     'NormalScoreTransformer',
     'TransformerPipeline',
    'AutobotsAssemble'
 ]
+
+# Check sklearn availability
+try:
+    import sklearn
+    HAS_SKLEARN = True
+except ImportError:
+    HAS_SKLEARN = False
+
+# Conditional imports
+if HAS_SKLEARN:
+    from .lpfa import LPFA
+    from .gpr import GPR
+    from .transformers import StandardScalerTransformer
+    __all__.extend(['GPR', 'StandardScalerTransformer'])
+else:
+    # Create placeholder classes that raise informative errors
+    class LPFA:
+        def __init__(self, *args, **kwargs):
+            raise ImportError("LPFA emulator requires scikit-learn. Install with: pip install scikit-learn")
+
+    class GPR:
+        def __init__(self, *args, **kwargs):
+            raise ImportError("GPR emulator requires scikit-learn. Install with: pip install scikit-learn")
+
+    class StandardScalerTransformer:
+        def __init__(self, *args, **kwargs):
+            raise ImportError("StandardScalerTransformer requires scikit-learn. Install with: pip install scikit-learn")
diff --git a/pyemu/emulators/gpr.py b/pyemu/emulators/gpr.py
index a07e2797c..a79035ed9 100644
--- a/pyemu/emulators/gpr.py
+++ b/pyemu/emulators/gpr.py
@@ -310,7 +310,7 @@ def prepare_pestpp(self,pst_dir,casename,gpr_t_d="gpr_template"):
     # 3. which obs are objectives; subset of output_names
     # 4. which obs are constraints; subset of output_names
 
-    pst, input_names, output_names, objs, constraints = scrape_pst_dir(self,pst_dir,casename)
+    pst, input_names, output_names, objs, constraints = scrape_pst_dir(pst_dir,casename)
 
     # check that all input_names ar ein par data
 
diff --git a/pyemu/emulators/lpfa.py b/pyemu/emulators/lpfa.py
index a1a2da5f8..2de198858 100644
--- a/pyemu/emulators/lpfa.py
+++ b/pyemu/emulators/lpfa.py
@@ -3,11 +3,22 @@
 """
 
 from __future__ import print_function, division
 
+
+# Check sklearn availability at module level
+try:
+    from sklearn.model_selection import train_test_split
+    from sklearn.decomposition import PCA
+    from sklearn.neural_network import MLPRegressor
+    HAS_SKLEARN = True
+except ImportError:
+    HAS_SKLEARN = False
+    # Create dummy classes or set to None
+    train_test_split = None
+    PCA = None
+    MLPRegressor = None
+
 import numpy as np
 import pandas as pd
-from sklearn.model_selection import train_test_split
-from sklearn.decomposition import PCA
-from sklearn.neural_network import MLPRegressor
 from .base import Emulator
 from .transformers import RowWiseMinMaxScaler
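
A minimal usage sketch for the column-aware transformers introduced in [PATCH 57/58]. This is editor-added illustration, not part of the patch itself: the DataFrame, the column names ("hk", "rch") and the values are invented, and it assumes a pyemu build that already contains these changes. Note that neither transformer below needs scikit-learn, which is what [PATCH 58/58] relies on.

    # illustrative only: a made-up parameter ensemble with one log-friendly column
    import numpy as np
    import pandas as pd
    from pyemu.emulators.transformers import Log10Transformer, NormalScoreTransformer

    df = pd.DataFrame({
        "hk": np.random.uniform(1e-4, 1e-1, 100),   # strictly positive, so log10 is natural
        "rch": np.random.normal(1e-3, 2e-4, 100),   # left untouched by the transformers below
    })

    # only "hk" is transformed; "rch" passes through unchanged
    log_t = Log10Transformer(columns=["hk"])
    logged = log_t.transform(df)
    recovered = log_t.inverse_transform(logged)
    assert np.allclose(recovered["hk"].values, df["hk"].values)

    # the normal score transformer follows the same fit/transform pattern
    ns_t = NormalScoreTransformer(columns=["hk"])
    ns_t.fit(df)
    scores = ns_t.transform(df)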
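
And a sketch of how the optional scikit-learn dependency from [PATCH 58/58] behaves at run time, again editor-added and assuming this branch of pyemu: with scikit-learn installed the conditional imports succeed, otherwise the placeholder classes raise an informative ImportError as soon as they are constructed.

    import pyemu.emulators as emulators

    if emulators.HAS_SKLEARN:
        # real implementations are available
        from pyemu.emulators import StandardScalerTransformer
        scaler = StandardScalerTransformer(columns=["hk"])  # "hk" as above, illustrative
    else:
        # the package still imports; only construction of sklearn-backed classes fails
        try:
            emulators.GPR()
        except ImportError as err:
            print(err)  # points the user at "pip install scikit-learn"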