From 0202ce494f320b1cc6096bc323479ff3a35b3f37 Mon Sep 17 00:00:00 2001 From: jwhite Date: Thu, 12 Dec 2024 17:58:44 -0500 Subject: [PATCH 01/58] added option to skip metadata parsing as this can be slow for large npar/nobs --- pyemu/pst/pst_handler.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pyemu/pst/pst_handler.py b/pyemu/pst/pst_handler.py index c0d50fa9b..c1d9e256a 100644 --- a/pyemu/pst/pst_handler.py +++ b/pyemu/pst/pst_handler.py @@ -61,7 +61,7 @@ class Pst(object): """ - def __init__(self, filename, load=True, resfile=None): + def __init__(self, filename, load=True, resfile=None, parse_metadata=True): self.parameter_data = None """pandas.DataFrame: '* parameter data' information. Columns are @@ -136,7 +136,7 @@ def __init__(self, filename, load=True, resfile=None): if not os.path.exists(filename): raise Exception("pst file not found:{0}".format(filename)) - self.load(filename) + self.load(filename, parse_metadata=parse_metadata) def __setattr__(self, key, value): if key == "model_command": @@ -1238,7 +1238,7 @@ def _load_version2(self, filename): "'* model input/output cant be used with '* model input' or '* model output'" ) - def load(self, filename): + def load(self, filename, parse_metadata=True): """entry point load the pest control file. Args: @@ -1271,7 +1271,8 @@ def load(self, filename): self._load_version2(filename) self._try_load_longnames() - self.try_parse_name_metadata() + if parse_metadata: + self.try_parse_name_metadata() self._reset_file_paths_os() def _reset_file_paths_os(self): From ae00d9a13597c6505cd8e36616230cea57b7ac70 Mon Sep 17 00:00:00 2001 From: jwhite Date: Thu, 24 Apr 2025 15:56:30 -0600 Subject: [PATCH 02/58] more tune ups in pypestworker --- pyemu/utils/os_utils.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pyemu/utils/os_utils.py b/pyemu/utils/os_utils.py index 94990cab8..50801208c 100644 --- a/pyemu/utils/os_utils.py +++ b/pyemu/utils/os_utils.py @@ -651,7 +651,7 @@ def send(self,s,mtype,group,runid,desc,data): full_desc = desc + fill_desc buf += full_desc.encode() buf += sdata - s.send(buf) + s.sendall(buf) def _check_sec_message(self,recv_sec_message): @@ -662,7 +662,7 @@ def _check_sec_message(self,recv_sec_message): class PyPestWorker(object): - def __init__(self, pst, host, port, timeout=0.1,verbose=True): + def __init__(self, pst, host, port, timeout=0.25,verbose=True): self.host = host self.port = port self._pst_arg = pst @@ -695,23 +695,19 @@ def _process_pst(self): def connect(self,is_reconnect=False): - self.message("trying to connect to {0}:{1}...".format(self.host,self.port)) + self.message("trying to connect to {0}:{1}...".format(self.host,self.port),echo=True) self.s = None c = 0 while True: try: time.sleep(self.timeout) - print(".", end='') c += 1 - if c % 75 == 0: - print('') - print(c) if is_reconnect and c > self.max_reconnect_attempts: print("max reconnect attempts reached...") return False self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self.s.connect((self.host, self.port)) - self.message("connected to {0}:{1}".format(self.host,self.port)) + self.message("connected to {0}:{1}".format(self.host,self.port),echo=True) break except ConnectionRefusedError: @@ -723,8 +719,8 @@ def connect(self,is_reconnect=False): return True - def message(self,msg): - if self.verbose: + def message(self,msg,echo=False): + if self.verbose or echo: print(str(datetime.now())+" : "+msg) @@ -757,9 +753,13 @@ def listen(self,lock=None,send_lock=None): if not 
success:
                    print("...exiting")
                    time.sleep(self.timeout)
+                    # set the terminate flag so that the get_pars() loop will exit
+                    self._lock.acquire()
+                    self.net_pack.mtype = 14
+                    self._lock.release()
                    return
                else:
-                    print("...reconnect successfully...")
+                    print("...reconnected successfully...")
                    continue
 
            if n > 0:

From 33e368f9bed12650c7ec2d5a6541ce0f717c6a59 Mon Sep 17 00:00:00 2001
From: jwhite
Date: Thu, 24 Apr 2025 16:16:48 -0600
Subject: [PATCH 03/58] more worker stuff

---
 pyemu/utils/os_utils.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/pyemu/utils/os_utils.py b/pyemu/utils/os_utils.py
index 50801208c..1247cf457 100644
--- a/pyemu/utils/os_utils.py
+++ b/pyemu/utils/os_utils.py
@@ -660,9 +660,21 @@ def _check_sec_message(self,recv_sec_message):
                             format(recv_sec_message,self.sec_message))


 class PyPestWorker(object):
+    """a pure python worker for pest++. the pest++ master doesn't even know...
+    Args:
+        pst (str or pyemu.Pst): the pest control file (or a file name) used to get parameter and observation information
+        host (str): master hostname or IPv4 address
+        port (int): port number that the master is listening on
+        timeout (float): number of seconds to sleep at different points in the process.
+            if you have lots of pars and/or obs, a longer sleep can be helpful, but if you make this smaller,
+            the worker responds faster...'it depends'
+        verbose (bool): flag to echo what's going on to stdout
+        socket_timeout (float): number of seconds that the socket should wait before giving up.
+            generally, this can be a big number...
+    """
-    def __init__(self, pst, host, port, timeout=0.25,verbose=True):
+    def __init__(self, pst, host, port, timeout=0.25,verbose=True, socket_timeout=None):
         self.host = host
         self.port = port
         self._pst_arg = pst
@@ -673,7 +685,9 @@ def __init__(self, pst, host, port, timeout=0.25,verbose=True):
         self.verbose = bool(verbose)
         self.par_names = None
         self.obs_names = None
-
+        if socket_timeout is None:
+            socket_timeout = timeout * 100
+        self.socket_timeout = socket_timeout
         self.par_values = None
         self.max_reconnect_attempts = 10
         self._process_pst()
@@ -741,7 +755,7 @@ def send(self,mtype,group,runid,desc="",data=0):
         return True
 
     def listen(self,lock=None,send_lock=None):
-        self.s.settimeout(self.timeout)
+        self.s.settimeout(self.socket_timeout)
         failed_reconnect = False
         while True:
             time.sleep(self.timeout)

From 858ee5d43ed8b4d0aa94a68b96ac26023be20a8f Mon Sep 17 00:00:00 2001
From: jwhite
Date: Thu, 22 May 2025 16:51:41 -0600
Subject: [PATCH 04/58] added some sugar to results handler to help with programmatic access to a sequence of ensemble/population files

---
 autotest/pst_tests_2.py     |  7 ++++---
 pyemu/pst/result_handler.py | 14 ++++++++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/autotest/pst_tests_2.py b/autotest/pst_tests_2.py
index eef36c0b0..c82e2db5b 100644
--- a/autotest/pst_tests_2.py
+++ b/autotest/pst_tests_2.py
@@ -915,6 +915,7 @@ def results_ies_1_test():
 
     pst = pyemu.Pst(os.path.join(m_d, "pest.pst"),result_dir=m_d)
 
+    df = pst.ies.get("paren",0)
     df = r.ies.rmr
     print(df)
     assert df is not None
@@ -1096,10 +1097,10 @@ def results_mou_1_test():
     assert df is not None
 
 if __name__ == "__main__":
-    results_ies_3_test()
+    #results_ies_3_test()
     results_ies_1_test()
-    results_ies_2_test()
-    results_mou_1_test()
+    #results_ies_2_test()
+    #results_mou_1_test()
 
     #at_bounds_test()
     #pst_from_flopy_geo_draw_test()
diff --git a/pyemu/pst/result_handler.py b/pyemu/pst/result_handler.py
index 033341969..7d868afb3 100644
--- a/pyemu/pst/result_handler.py
+++ b/pyemu/pst/result_handler.py
@@ -181,6 +181,20 @@ def get_files(self,tag):
             files.append(f)
         return files
 
+    def get(self,tag,*args):
+        """helper to call __getattr__() with programmatic args
+
+        Args:
+            tag (str): string for the item of interest (e.g. "paren", "dvpop", etc)
+            *args (list): optional args to str concatenate with tag when passed to
+                __getattr__(). for example tag could be "paren" and args could be 0,
+                so that what is passed to __getattr__() is "paren0".
+        Returns:
+            "it depends"
+
+        """
+        ttag = tag + "".join([str(a) for a in args])
+        return self.__getattr__(ttag)
 
     def __getattr__(self,tag):
         """overload of the get-attribute class method to make things super

From ce5dff34a2f2704e8e7c0f816a8224eb6b7cc538 Mon Sep 17 00:00:00 2001
From: Rui Hugman
Date: Mon, 16 Jun 2025 15:13:10 +0100
Subject: [PATCH 05/58] introducing transformer classes and pipeline

---
 pyemu/__init__.py               |  12 +-
 pyemu/emulators/__init__.py     |  21 +
 pyemu/emulators/base.py         | 187 ++++
 pyemu/emulators/transformers.py | 736 ++++++++++++++++++++++++++++++++
 4 files changed, 955 insertions(+), 1 deletion(-)
 create mode 100755 pyemu/emulators/__init__.py
 create mode 100755 pyemu/emulators/base.py
 create mode 100755 pyemu/emulators/transformers.py

diff --git a/pyemu/__init__.py b/pyemu/__init__.py
index db0e00960..9b88113c7 100644
--- a/pyemu/__init__.py
+++ b/pyemu/__init__.py
@@ -20,7 +20,9 @@
 from .sc import Schur
 from .utils import (geostats, gw_utils, helpers, metrics, optimization,
                     os_utils, pp_utils, smp_utils)
-
+from .emulators import (Emulator, BaseTransformer, Log10Transformer,
+                        RowWiseMinMaxScaler, StandardScalerTransformer, NormalScoreTransformer,
+                        TransformerPipeline, AutobotsAssemble)
 #from .prototypes import *
 try:
     from .legacy import *
@@ -53,5 +55,13 @@
     "smp_utils",
     "plot_utils",
     "metrics",
+    "Emulator",
+    "BaseTransformer",
+    "Log10Transformer",
+    "RowWiseMinMaxScaler",
+    "StandardScalerTransformer",
+    "NormalScoreTransformer",
+    "TransformerPipeline",
+    "AutobotsAssemble",
 ]
 # del get_versions
diff --git a/pyemu/emulators/__init__.py b/pyemu/emulators/__init__.py
new file mode 100755
index 000000000..3bd39b1da
--- /dev/null
+++ b/pyemu/emulators/__init__.py
@@ -0,0 +1,21 @@
+from .transformers import (
+    BaseTransformer,
+    Log10Transformer,
+    RowWiseMinMaxScaler,
+    StandardScalerTransformer,
+    NormalScoreTransformer,
+    TransformerPipeline,
+    AutobotsAssemble
+)
+from .base import Emulator
+
+__all__ = [
+    'Emulator', #base Emulator Class
+    'BaseTransformer',
+    'Log10Transformer',
+    'RowWiseMinMaxScaler',
+    'StandardScalerTransformer',
+    'NormalScoreTransformer',
+    'TransformerPipeline',
+    'AutobotsAssemble'
+]
diff --git a/pyemu/emulators/base.py b/pyemu/emulators/base.py
new file mode 100755
index 000000000..f088d91ee
--- /dev/null
+++ b/pyemu/emulators/base.py
@@ -0,0 +1,187 @@
+"""
+Base class for emulators.
+"""
+from __future__ import print_function, division
+import pickle
+import numpy as np
+import pandas as pd
+from ..logger import Logger
+
+class Emulator:
+    """
+    Base class for emulators.
+
+    This class defines the common interface for all emulator implementations
+    and provides shared functionality used by multiple emulator types.
+
+    Parameters
+    ----------
+    verbose : bool, optional
+        If True, enable verbose logging. Default is True.
+    """
+
+    def __init__(self, verbose=True):
+        """
+        Initialize the Emulator base class.
+
+        Parameters
+        ----------
+        verbose : bool, optional
+            If True, enable verbose logging. Default is True.
+ """ + self.logger = Logger(verbose) + self.log = self.logger.log + self.fitted = False + self.data = None + self.data_transformed = None + self.feature_scaler = None + self.energy_threshold = 1.0 + self.feature_transformer = None + + def fit(self, X, y=None): + """ + Fit the emulator to training data. + + Parameters + ---------- + X : pandas.DataFrame + Input features for training. + y : pandas.DataFrame or None, optional + Target values for training if separate from X. + + Returns + ------- + self : Emulator + Returns self for method chaining. + """ + raise NotImplementedError("Subclasses must implement fit method") + + def predict(self, X): + """ + Generate predictions using the fitted emulator. + + Parameters + ---------- + X : pandas.DataFrame + Input data to generate predictions for. + + Returns + ------- + pandas.DataFrame or pandas.Series + Predictions for the input data. + """ + if not self.fitted: + raise ValueError("Emulator must be fitted before prediction") + raise NotImplementedError("Subclasses must implement predict method") + + def prepare_training_data(self, data=None): + """ + Prepare and transform training data for model fitting. + + Parameters + ---------- + data : pandas.DataFrame, optional + Raw training data. If None, uses self.data. + + Returns + ------- + tuple + Processed data ready for model fitting. + """ + if data is None: + if self.data is None: + raise ValueError("No data provided and no data stored in the emulator") + data = self.data + + # Common preprocessing logic could go here + return data + + def apply_feature_transforms(self, data=None, transforms=None): + """ + Apply feature transformations to data with customizable transformer sequence. + + Parameters + ---------- + data : pandas.DataFrame, optional + Data to transform. If None, uses self.data. + transforms : list of dict, optional + List of transformation specifications. Each dict should have: + - 'type': str - Type of transformation (e.g., 'log10', 'normal_score') + - 'columns': list - Columns to apply the transformation to (optional) + - Additional kwargs specific to the transformer + If None, no transformations are applied. + + Returns + ------- + pandas.DataFrame + Transformed data. + + Examples + -------- + # Using the transforms parameter: + emulator.apply_feature_transforms( + transforms=[ + {'type': 'log10', 'columns': ['flow', 'heads']}, + {'type': 'normal_score', 'columns': None, 'quadratic_extrapolation': True} + ] + ) + """ + if data is None: + data = self.data + + if data is None: + raise ValueError("No data provided and no data stored in the emulator") + + self.logger.statement("applying feature transforms") + # Import AutobotsAssemble here to avoid circular import + from .transformers import AutobotsAssemble + + ft = AutobotsAssemble(data.copy()) + + # Process the transforms parameter if provided + if transforms: + for transform in transforms: + transform_type = transform.get('type') + columns = transform.get('columns') + # Extract transformer-specific kwargs + kwargs = {k: v for k, v in transform.items() + if k not in ('type', 'columns')} + + self.logger.statement(f"applying {transform_type} transform") + ft.apply(transform_type, columns=columns, **kwargs) + + transformed_data = ft.df.copy() + self.feature_transformer = ft + self.data_transformed = transformed_data + + return transformed_data + + def save(self, filename): + """ + Save the fitted emulator to a file. + + Parameters + ---------- + filename : str + Path to save the emulator. 
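+
+        Examples
+        --------
+        # a minimal sketch; `emu` is assumed to be an already-fitted Emulator
+        # subclass instance and the file name is illustrative:
+        emu.save("emulator.pickle")
+        restored = Emulator.load("emulator.pickle")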
+ """ + with open(filename, "wb") as f: + pickle.dump(self, f) + + @classmethod + def load(cls, filename): + """ + Load a fitted emulator from a file. + + Parameters + ---------- + filename : str + Path to the saved emulator file. + + Returns + ------- + Emulator + The loaded emulator instance. + """ + with open(filename, "rb") as f: + return pickle.load(f) \ No newline at end of file diff --git a/pyemu/emulators/transformers.py b/pyemu/emulators/transformers.py new file mode 100755 index 000000000..22c1bbb02 --- /dev/null +++ b/pyemu/emulators/transformers.py @@ -0,0 +1,736 @@ +""" +Transformer classes for data transformations in emulators. +""" +from __future__ import print_function, division +import numpy as np +import pandas as pd + +class BaseTransformer: + """Base class for all transformers providing a consistent interface.""" + + def fit(self, X): + """Learn parameters from data if needed.""" + return self + + def transform(self, X): + """Apply transformation to X.""" + raise NotImplementedError + + def fit_transform(self, X): + """Fit and transform in one step.""" + return self.fit(X).transform(X) + + def inverse_transform(self, X): + """Inverse transform X back to original space.""" + raise NotImplementedError + +class Log10Transformer(BaseTransformer): + """Apply log10 transformation.""" + + def __init__(self): + self.shifts = {} + + def transform(self, X): + result = X.copy() + for col in X.columns: + min_val = X[col].min() + shift = -min_val + 1e-6 if min_val <= 0 else 0 + self.shifts[col] = shift + result[col] = np.log10(X[col] + shift) + return result + + def inverse_transform(self, X): + result = X.copy() + for col in X.columns: + shift = self.shifts.get(col, 0) + result[col] = (10 ** X[col]) - shift + return result + +class RowWiseMinMaxScaler(BaseTransformer): + """Scale each row of a DataFrame to a specified range. + + Parameters + ---------- + feature_range : tuple (min, max), default=(-1, 1) + The range to scale features into. + groups : dict or None, default=None + Dict mapping group names to lists of column names to be scaled together (entire timeseries for that group). + If None, all columns will be treated as a single group. + fit_groups : dict or None, default=None + Dict mapping group names to lists of column names (subset of groups) used to compute row-wise min and max. + If None, defaults to using the same columns as in groups. + """ + + def __init__(self, feature_range=(-1, 1), groups=None, fit_groups=None): + self.feature_range = feature_range + self.groups = groups + self.fit_groups = fit_groups if fit_groups is not None else groups + self.row_params = {} # Will store per-row (min, max) for each group + + def fit(self, X): + """Compute row-wise min and max for each group. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to fit the scaler on. + + Returns + ------- + self : object + Returns self. 
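+
+        Examples
+        --------
+        # a small sketch; `df` and the group/column names are illustrative:
+        scaler = RowWiseMinMaxScaler(feature_range=(-1, 1),
+                                     groups={"flow": ["q1", "q2"], "head": ["h1", "h2"]})
+        scaled = scaler.fit_transform(df)
+        restored = scaler.inverse_transform(scaled)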
+ """ + # If groups not specified, treat all columns as one group + if self.groups is None: + self.groups = {"all": X.columns.tolist()} + + if self.fit_groups is None: + self.fit_groups = self.groups.copy() + + # Calculate and store row-wise min and max for each group + self.row_params = {} + for group_name, group_cols in self.groups.items(): + # Determine which columns to use for computing min/max for each row + fit_cols = self.fit_groups.get(group_name, group_cols) + # Keep only columns that exist in the DataFrame + fit_cols = [col for col in fit_cols if col in X.columns] + if not fit_cols: + continue + + # Compute row-wise min and max using the fit columns + row_min = X[fit_cols].min(axis=1) + row_max = X[fit_cols].max(axis=1) + self.row_params[group_name] = (row_min, row_max) + + return self + + def transform(self, X): + """Scale each row of data to the specified range. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to transform. + + Returns + ------- + pandas.DataFrame + The transformed DataFrame. + """ + result = X.copy() + f_min, f_max = self.feature_range + + # Auto-fit if not already fitted or if groups weren't specified + if not self.row_params or self.groups is None: + self.fit(X) + + # Transform each group + for group_name, group_cols in self.groups.items(): + # Keep only columns that exist in the DataFrame + valid_cols = [col for col in group_cols if col in X.columns] + if not valid_cols: + continue + + # Get the min and max for each row in this group + row_min, row_max = self.row_params[group_name] + + # Calculate the row range, avoiding division by zero + row_range = row_max - row_min + row_range[row_range == 0] = 1.0 # Set to 1 where range is 0 + + # For all columns in the group, scale using the row-wise parameters + group_data = X[valid_cols] + # First scale to [0, 1] + group_std = group_data.sub(row_min, axis=0).div(row_range, axis=0) + # Then scale to the desired feature range + result[valid_cols] = group_std * (f_max - f_min) + f_min + + return result + + def inverse_transform(self, X): + """Inverse transform data back to the original scale. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to inverse transform. + + Returns + ------- + pandas.DataFrame + The inverse-transformed DataFrame. + """ + if not self.row_params: + raise ValueError("This RowWiseMinMaxScaler instance is not fitted yet. " + "Call 'fit' before using this method.") + + result = X.copy() + f_min, f_max = self.feature_range + + # Inverse transform each group + for group_name, group_cols in self.groups.items(): + # Keep only columns that exist in the DataFrame + valid_cols = [col for col in group_cols if col in X.columns] + if not valid_cols: + continue + + # Get the min and max for each row in this group + row_min, row_max = self.row_params[group_name] + row_range = row_max - row_min + row_range[row_range == 0] = 1.0 # Avoid division by zero + + # Get the scaled data for this group + group_data = X[valid_cols] + + # First convert from feature_range to [0, 1] + group_std = (group_data - f_min) / (f_max - f_min) + + # Then recover original values + result[valid_cols] = group_std.mul(row_range, axis=0).add(row_min, axis=0) + + return result + +class MinMaxScaler(BaseTransformer): + """Scale each column of a DataFrame to a specified range. + + Parameters + ---------- + feature_range : tuple (min, max), default=(0, 1) + The range to scale features into. + columns : list, optional + List of column names to be scaled. If None, all columns will be scaled. 
+ skip_constant : bool, optional + If True, columns with constant values will be skipped. Default is True. + """ + + def __init__(self, feature_range=(-1, 1), columns=None, skip_constant=True): + self.feature_range = feature_range + self.columns = columns + self.skip_constant = skip_constant + self.min_ = {} + self.scale_ = {} + + def fit(self, X): + """Learn min and max values for scaling. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to fit the scaler on. + + Returns + ------- + self : object + Returns self. + """ + columns = self.columns if self.columns is not None else X.columns + + # Ensure we only work with columns that exist in the DataFrame + columns = [col for col in columns if col in X.columns] + + for col in columns: + col_min = X[col].min() + col_max = X[col].max() + + # If the column has constant values and skip_constant is True, store the values but don't transform + if self.skip_constant and col_min == col_max: + self.min_[col] = col_min + self.scale_[col] = 0 # Flag for constant column + else: + # Store min and calculate scale factor for non-constant columns + self.min_[col] = col_min + # Avoid division by zero for nearly constant columns + if col_max - col_min > 1e-10: + self.scale_[col] = (self.feature_range[1] - self.feature_range[0]) / (col_max - col_min) + else: + # For nearly constant columns, set scale to 0 to keep original value + self.scale_[col] = 0 + + return self + + def transform(self, X): + """Scale features according to feature_range. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to transform. + + Returns + ------- + pandas.DataFrame + The transformed DataFrame. + """ + if not self.min_: + self.fit(X) + + result = X.copy() + + f_min, f_max = self.feature_range + + for col in self.min_.keys(): + if col not in X.columns: + continue + + # Skip columns marked as constant + if self.scale_[col] == 0: + continue + + # Apply scaling: X_std = (X - X.min) / (X.max - X.min) -> X_scaled = X_std * (max - min) + min + result[col] = (X[col] - self.min_[col]) * self.scale_[col] + f_min + + return result + + def inverse_transform(self, X): + """Undo the scaling of X according to feature_range. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to inverse transform. + + Returns + ------- + pandas.DataFrame + The inverse-transformed DataFrame. + """ + if not self.min_: + raise ValueError("This MinMaxScaler instance is not fitted yet. 
Call 'fit' before using this method.") + + result = X.copy() + + f_min, f_max = self.feature_range + + for col in self.min_.keys(): + if col not in X.columns: + continue + + # Skip columns marked as constant + if self.scale_[col] == 0: + continue + + # Apply inverse scaling: X_original = (X_scaled - min) / (max - min) * (X.max - X.min) + X.min + result[col] = (X[col] - f_min) / self.scale_[col] + self.min_[col] + + return result + +class StandardScalerTransformer(BaseTransformer): + """Apply standard scaling (zero mean, unit variance) to data.""" + + def __init__(self): + self.means = {} + self.stds = {} + + def fit(self, X): + """Compute mean and standard deviation for each feature.""" + for col in X.columns: + self.means[col] = X[col].mean() + self.stds[col] = X[col].std() + if self.stds[col] == 0: + self.stds[col] = 1.0 # Avoid division by zero + return self + + def transform(self, X): + """Transform the data using mean and std from fit.""" + result = X.copy() + for col in X.columns: + if col in self.means: + mean = self.means[col] + std = self.stds[col] + result[col] = (X[col] - mean) / std + return result + + def inverse_transform(self, X): + """Inverse transform data back to original scale.""" + result = X.copy() + for col in X.columns: + if col in self.means: + mean = self.means[col] + std = self.stds[col] + result[col] = (X[col] * std) + mean + return result + +class NormalScoreTransformer(BaseTransformer): + """A transformer for normal score transformation.""" + + def __init__(self, tol=1e-7, max_samples=1000000, quadratic_extrapolation=False): + self.tol = tol + self.max_samples = max_samples + self.quadratic_extrapolation = quadratic_extrapolation + self.column_parameters = {} + self.shared_z_scores = {} + + def fit(self, X): + """Fit the transformer to the data.""" + for col in X.columns: + values = X[col].values + sorted_vals = np.sort(values) + smoothed_vals = self._moving_average_with_endpoints(sorted_vals) + + n_points = len(smoothed_vals) + if n_points not in self.shared_z_scores: + self.shared_z_scores[n_points] = self._randrealgen_optimized(n_points) + + z_scores = self.shared_z_scores[n_points] + + self.column_parameters[col] = { + 'z_scores': z_scores, + 'originals': smoothed_vals, + } + return self + + def transform(self, X): + """Transform the data using normal score transformation. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to transform. + + Returns + ------- + pandas.DataFrame + The transformed DataFrame with normal scores. 
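+
+        Examples
+        --------
+        # a round-trip sketch; `df` is an illustrative DataFrame of (possibly skewed) values:
+        nst = NormalScoreTransformer()
+        scores = nst.fit(df).transform(df)
+        back = nst.inverse_transform(scores)  # approximately recovers df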
+ """ + result = X.copy() + for col in X.columns: + params = self.column_parameters.get(col, {}) + z_scores = params.get('z_scores', []) + originals = params.get('originals', []) + + if len(z_scores) == 0 or len(originals) == 0: + continue + + values = X[col].values + + # Handle values outside the original range + min_orig, max_orig = np.min(originals), np.max(originals) + min_z, max_z = np.min(z_scores), np.max(z_scores) + + # For values within range, use interpolation + within_range = (values >= min_orig) & (values <= max_orig) + if within_range.any(): + result.loc[within_range, col] = np.interp( + values[within_range], originals, z_scores + ) + + # For values outside range, use extrapolation if enabled or clamp to bounds + below_min = values < min_orig + above_max = values > max_orig + + if below_min.any(): + if self.quadratic_extrapolation: + # Use linear extrapolation below minimum + slope = (z_scores[1] - z_scores[0]) / (originals[1] - originals[0]) + result.loc[below_min, col] = min_z + slope * (values[below_min] - min_orig) + else: + # Otherwise clamp to minimum z-score + result.loc[below_min, col] = min_z + + if above_max.any(): + if self.quadratic_extrapolation: + # Use linear extrapolation above maximum + slope = (z_scores[-1] - z_scores[-2]) / (originals[-1] - originals[-2]) + result.loc[above_max, col] = max_z + slope * (values[above_max] - max_orig) + else: + # Otherwise clamp to maximum z-score + result.loc[above_max, col] = max_z + + return result + + def inverse_transform(self, X): + """Inverse transform data back to original space. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame with transformed data to inverse transform. + + Returns + ------- + pandas.DataFrame + The inverse-transformed DataFrame. + """ + result = X.copy() + for col in X.columns: + params = self.column_parameters.get(col, {}) + z_scores = params.get('z_scores', []) + originals = params.get('originals', []) + if len(z_scores) == 0 or len(originals) == 0: + continue + + # Get values to inverse transform + values = X[col].values + min_z, max_z = np.min(z_scores), np.max(z_scores) + min_orig, max_orig = np.min(originals), np.max(originals) + + # For values within the z-score range, use interpolation + within_range = (values >= min_z) & (values <= max_z) + if within_range.any(): + result.loc[within_range, col] = np.interp(values[within_range], z_scores, originals) + + # For values outside the z-score range, use extrapolation if enabled + below_min = values < min_z + above_max = values > max_z + + if below_min.any(): + if self.quadratic_extrapolation: + # Use linear extrapolation below minimum z-score + slope = (originals[1] - originals[0]) / (z_scores[1] - z_scores[0]) + intercept = originals[0] - slope * z_scores[0] + result.loc[below_min, col] = slope * values[below_min] + intercept + else: + # Otherwise clamp to minimum original value + result.loc[below_min, col] = min_orig + + if above_max.any(): + if self.quadratic_extrapolation: + # Use linear extrapolation above maximum z-score + slope = (originals[-1] - originals[-2]) / (z_scores[-1] - z_scores[-2]) + intercept = originals[-1] - slope * z_scores[-1] + result.loc[above_max, col] = slope * values[above_max] + intercept + else: + # Otherwise clamp to maximum original value + result.loc[above_max, col] = max_orig + + return result + + def _randrealgen_optimized(self, nreal): + rval = np.zeros(nreal) + nsamp = 0 + numsort = (nreal + 1) // 2 if nreal % 2 == 0 else nreal // 2 + + while nsamp < self.max_samples: + nsamp += 1 + work1 = 
np.random.normal(size=nreal) + work1.sort() + + if nsamp > 1: + previous_mean = rval[:numsort] / (nsamp - 1) + rval[:numsort] += work1[:numsort] + current_mean = rval[:numsort] / nsamp + max_diff = np.max(np.abs(current_mean - previous_mean)) + + if max_diff <= self.tol: + break + else: + rval[:numsort] = work1[:numsort] + + rval[:numsort] /= nsamp + rval[numsort:] = -rval[:numsort][::-1] if nreal % 2 == 0 else np.concatenate(([-rval[numsort]], -rval[:numsort][::-1])) + return rval + + def _moving_average_with_endpoints(self, y_values): + """Apply a moving average smoothing to an array while preserving endpoints.""" + window_size = 3 + if y_values.shape[0] > 40: + window_size = 5 + if y_values.shape[0] > 90: + window_size = 7 + if y_values.shape[0] > 200: + window_size = 9 + + if window_size % 2 == 0: + raise ValueError("window_size must be odd") + half_window = window_size // 2 + smoothed_y = np.zeros_like(y_values) + + # Handle start points correctly + for i in range(0, half_window): + smoothed_y[i] = np.mean(y_values[:i + half_window + 1]) + + # Handle end points correctly + for i in range(1, half_window + 1): + smoothed_y[-i] = np.mean(y_values[-(i + half_window):]) + + # Middle points + for i in range(half_window, len(y_values) - half_window): + smoothed_y[i] = np.mean(y_values[i - half_window:i + half_window + 1]) + + # Preserve original endpoints exactly + smoothed_y[0] = y_values[0] + smoothed_y[-1] = y_values[-1] + + # Ensure monotonicity + for i in range(1, len(smoothed_y)): + if smoothed_y[i] <= smoothed_y[i - 1]: + smoothed_y[i] = smoothed_y[i - 1] + 1e-16 + + return smoothed_y + +class TransformerPipeline: + """Apply a sequence of transformers in order.""" + + def __init__(self): + self.transformers = [] + self.fitted = False + + def add(self, transformer, columns=None): + """Add a transformer to the pipeline, optionally for specific columns.""" + self.transformers.append((transformer, columns)) + return self + + def fit(self, X): + """Fit all transformers in the pipeline.""" + for transformer, columns in self.transformers: + cols_to_transform = columns if columns is not None else X.columns + sub_X = X[cols_to_transform] + transformer.fit(sub_X) + self.fitted = True + return self + + def transform(self, X): + """Transform data using all transformers in the pipeline. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to transform. + + Returns + ------- + pandas.DataFrame + The transformed DataFrame. + """ + result = X.copy() + for transformer, columns in self.transformers: + cols_to_transform = columns if columns is not None else X.columns + # Only use columns that exist in the input data + valid_cols = [col for col in cols_to_transform if col in X.columns] + if not valid_cols: + continue + sub_X = result[valid_cols] + result[valid_cols] = transformer.transform(sub_X) + return result + + def fit_transform(self, X): + """Fit all transformers and transform data in one operation.""" + self.fit(X) + return self.transform(X) + + def inverse_transform(self, X): + """Apply inverse transformations in reverse order. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to inverse transform. + + Returns + ------- + pandas.DataFrame + The inverse-transformed DataFrame. 
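+
+        Examples
+        --------
+        # a sketch; `df` and the column name are illustrative. the inverse is
+        # applied in reverse order: the scaler is undone first, then the log10:
+        pipe = TransformerPipeline()
+        pipe.add(Log10Transformer(), columns=["flux"])
+        pipe.add(StandardScalerTransformer())
+        z = pipe.fit_transform(df)
+        original = pipe.inverse_transform(z)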
+ """ + + if isinstance(X, pd.Series): + result = X.copy().to_frame().T + else: + result = X.copy() + # Need to reverse the order of transformers for inverse + for transformer, columns in reversed(self.transformers): + cols_to_transform = columns if columns is not None else result.columns + # Only use columns that exist in the input data + valid_cols = [col for col in cols_to_transform if col in result.columns] + if not valid_cols: + continue + sub_X = result[valid_cols].copy() # Create a copy to avoid reference issues + inverted = transformer.inverse_transform(sub_X) + result.loc[:, valid_cols] = inverted # Use loc for proper assignment + if isinstance(X, pd.Series): + result = result.iloc[0] + return result + +class AutobotsAssemble: + """Class for transforming features in a DataFrame using a pipeline approach.""" + + def __init__(self, df=None): + self.df = df.copy() if df is not None else None + self.pipeline = TransformerPipeline() + + def apply(self, transform_type, columns=None, **kwargs): + """Apply a transformation to specified columns.""" + transformer = self._create_transformer(transform_type, **kwargs) + if columns is None: + columns = list(self.df.columns) # Convert to list to avoid pandas index issues + + # Fit transformer to data if needed + if hasattr(transformer, 'fit') and callable(transformer.fit): + if self.df is not None: + df_subset = self.df[columns] + transformer.fit(df_subset) + + # Add to pipeline + self.pipeline.add(transformer, columns) + + # Apply transformation to current df if available + if self.df is not None: + # Use transform directly to ensure correct application + df_subset = self.df[columns].copy() + transformed = transformer.transform(df_subset) + self.df[columns] = transformed + + return self + + def transform(self, df): + """Transform an external DataFrame using the pipeline. + + Parameters + ---------- + df : pandas.DataFrame + The DataFrame to transform. + + Returns + ------- + pandas.DataFrame + The transformed DataFrame. + """ + if self.pipeline.transformers: + return self.pipeline.transform(df) + return df.copy() + + def inverse(self, df=None): + """Apply inverse transformations in reverse order.""" + to_transform = df if df is not None else self.df + result = self.pipeline.inverse_transform(to_transform) + if df is None: + self.df = result + return result + + def inverse_on_external_df(self, df, columns=None): + """Apply inverse transformations to an external DataFrame. + + Parameters + ---------- + df : pandas.DataFrame + The DataFrame to inverse transform. + columns : list, optional + Specific columns to inverse transform. If None, all columns are processed. + + Returns + ------- + pandas.DataFrame + The inverse-transformed DataFrame. 
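+
+        Examples
+        --------
+        # a sketch; `training_df` and `external_df` are illustrative DataFrames
+        # that share the same column names:
+        aa = AutobotsAssemble(training_df)
+        aa.apply("log10", columns=["flux"])
+        aa.apply("normal_score")
+        back = aa.inverse_on_external_df(external_df)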
+ """ + to_transform = df.copy() + if columns is not None: + # Ensure we only process specified columns + missing_cols = [col for col in columns if col not in df.columns] + if missing_cols: + raise ValueError(f"Columns not found in DataFrame: {missing_cols}") + + return self.pipeline.inverse_transform(to_transform) + + def _create_transformer(self, transform_type, **kwargs): + """Factory method to create appropriate transformer.""" + if transform_type == "log10": + return Log10Transformer() + elif transform_type == "normal_score": + return NormalScoreTransformer(**kwargs) + elif transform_type == "row_wise_minmax": + return RowWiseMinMaxScaler(**kwargs) + elif transform_type == "standard_scaler": + return StandardScalerTransformer() + elif transform_type == "minmax_scaler": + return MinMaxScaler(**kwargs) + else: + raise ValueError(f"Unknown transform type: {transform_type}") \ No newline at end of file From f3c45acb59868e142b86b3a3f5ce33c78cb6a24f Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 16 Jun 2025 15:14:54 +0100 Subject: [PATCH 06/58] transformer tests --- autotest/transformer_tests.py | 462 ++++++++++++++++++++++++++++++++++ 1 file changed, 462 insertions(+) create mode 100755 autotest/transformer_tests.py diff --git a/autotest/transformer_tests.py b/autotest/transformer_tests.py new file mode 100755 index 000000000..493be1179 --- /dev/null +++ b/autotest/transformer_tests.py @@ -0,0 +1,462 @@ +import os +import sys +import shutil +import pytest +import numpy as np +import pandas as pd +import platform +sys.path.append("..") +import pyemu + +def test_base_transformer(): + """Test the BaseTransformer abstract class functionality""" + bt = pyemu.emulators.BaseTransformer() + + # fit should return self + assert bt.fit(None) is bt + + # fit_transform should call fit and transform + with pytest.raises(NotImplementedError): + bt.fit_transform(None) + + # transform should raise NotImplementedError + with pytest.raises(NotImplementedError): + bt.transform(None) + + # inverse_transform should raise NotImplementedError + with pytest.raises(NotImplementedError): + bt.inverse_transform(None) + +def test_log10_transformer(): + """Test the Log10Transformer functionality""" + # Create test dataframe with positive and negative values + df = pd.DataFrame({ + 'pos': [1, 10, 100, 1000], + 'zero': [0, 0.1, 0.01, 0.001], + 'neg': [-1, -10, -100, -1000] + }) + + # Initialize and test transformer + lt = pyemu.emulators.Log10Transformer() + + # Transform data + transformed = lt.transform(df) + + # Check that positive values are properly transformed + np.testing.assert_allclose( + transformed['pos'].values, + np.log10(df['pos'].values) + ) + + # Check that zeros/small values are handled correctly + assert not np.any(np.isinf(transformed['zero'].values)) + + # Check that negative values are handled correctly + assert not np.any(np.isnan(transformed['neg'].values)) + + # Test inverse transform + back_transformed = lt.inverse_transform(transformed) + + # Check that we get back very close to original values + np.testing.assert_allclose( + back_transformed['pos'].values, + df['pos'].values + ) + + # For zero/very small values + np.testing.assert_allclose( + back_transformed['zero'].values, + df['zero'].values , + rtol=1e-6 + ) + + # For negative values + np.testing.assert_allclose( + back_transformed['neg'].values, + df['neg'].values , + rtol=1e-6 + ) + +def test_row_wise_minmax_scaler(): + """Test the RowWiseMinMaxScaler functionality""" + # Test data + df = pd.DataFrame({ + 'a': [1, 2, 3, 4], + 'b': [10, 
20, 30, 40], + 'c': [100, 200, 300, 400] + }) + + # Initialize scaler + scaler = pyemu.emulators.RowWiseMinMaxScaler() + + # Fit and transform + transformed = scaler.fit_transform(df) + + # Check each row is scaled to [0, 1] + for i in range(len(df)): + row_min = transformed.iloc[i].min() + row_max = transformed.iloc[i].max() + assert np.isclose(row_min, -1.0) + assert np.isclose(row_max, 1.0) + + # Test inverse transform + back_transformed = scaler.inverse_transform(transformed) + + # Check we get back original values + np.testing.assert_allclose(back_transformed.values, df.values) + +def test_normal_score_transformer(): + """Test the NormalScoreTransformer functionality""" + # Create test data with various distributions + np.random.seed(42) + n = 200 + + # Uniform data + uniform_data = np.random.uniform(0, 10, n) + + # Log-normal data + lognormal_data = np.exp(np.random.normal(0, 1, n)) + + # Bimodal data + bimodal_data = np.concatenate([ + np.random.normal(-3, 1, n//2), + np.random.normal(3, 1, n//2) + ]) + + df = pd.DataFrame({ + 'uniform': uniform_data, + 'lognormal': lognormal_data, + 'bimodal': bimodal_data + }) + + # Initialize transformer + nst = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=False) + + # Transform data + transformed = nst.fit_transform(df) + + # Check transformed distributions are more normal + # For each column, check skewness and kurtosis are closer to normal + for col in df.columns: + # Calculate statistics of original and transformed data + orig_skew = skewness(df[col].values) + trans_skew = skewness(transformed[col].values) + + orig_kurt = kurtosis(df[col].values) + trans_kurt = kurtosis(transformed[col].values) + + # Transformed data should have skewness closer to 0 + assert abs(trans_skew) < abs(orig_skew) or np.isclose(abs(trans_skew), 0, atol=0.5) + + # Transformed data should have kurtosis closer to 3 (normal distribution) + assert abs(trans_kurt - 3) < abs(orig_kurt - 3) or np.isclose(trans_kurt, 3, atol=1.0) + + # Test inverse transform + back_transformed = nst.inverse_transform(transformed) + + # Check we get back close to original values + # (not exact due to binning and smoothing) + np.testing.assert_allclose( + back_transformed.values, + df.values, + rtol=0.1, + atol=0.1 + ) + + # Test with quadratic extrapolation + nst_quad = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=True) + transformed_quad = nst_quad.fit_transform(df) + + # Create data outside the original range for extrapolation test + # Transform should not fail for out-of-range values when using quadratic extrapolation + extreme_transformed = transformed_quad.copy() + extreme_transformed.loc[0] = transformed_quad.min() - 1 + extreme_transformed.loc[1] = transformed_quad.max() + 1 + + back_extreme = nst_quad.inverse_transform(extreme_transformed) + assert not np.any(np.isnan(back_extreme.values)) + assert not np.any(np.isinf(back_extreme.values)) + +def test_transformer_pipeline(): + """Test the TransformerPipeline functionality""" + # Create test data + df = pd.DataFrame({ + 'a': [1, 2, 3, 4], + 'b': [10, 20, 30, 40], + 'c': [100, 200, 300, 400] + }) + + # Create pipeline with multiple transformers + pipeline = pyemu.emulators.TransformerPipeline() + + # Add log transformer for all columns + log_trans = pyemu.emulators.Log10Transformer() + pipeline.add(log_trans) + + # Add row-wise min-max scaler for specific columns + minmax_trans = pyemu.emulators.RowWiseMinMaxScaler() + pipeline.add(minmax_trans, columns=['a', 'b']) + + # Transform data + transformed = 
pipeline.transform(df) + + # Check log was applied to all columns + np.testing.assert_allclose( + transformed['c'].values, + np.log10(df['c'].values) + ) + + # Check minmax was applied only to a and b + for i in range(len(df)): + row_subset = transformed.iloc[i][['a', 'b']] + assert np.isclose(row_subset.min(), 0.0) or np.isclose(row_subset.max(), 1.0) + + # Test inverse transform + back_transformed = pipeline.inverse_transform(transformed) + + # Check we get back close to original values + np.testing.assert_allclose(back_transformed.values, df.values, rtol=1e-5) + +def test_autobots_assemble(): + """Test the AutobotsAssemble class functionality""" + # Create test data + df = pd.DataFrame({ + 'a': [1, 2, 3, 4], + 'b': [10, 20, 30, 40], + 'c': [-10, -20, -30, -40] + }) + + # Save original data for comparison + original_df = df.copy() + + # Initialize with data + aa = pyemu.emulators.AutobotsAssemble(df) + + # Apply log transform to positive columns + aa.apply('log10', columns=['a', 'b']) + + # Check the transform was applied correctly + np.testing.assert_allclose( + aa.df[['a', 'b']].values, + np.log10(original_df[['a', 'b']].values) + ) + + # Check that column c is unchanged + np.testing.assert_array_equal(aa.df['c'].values, original_df['c'].values) + + # Save intermediate state after log transform + log_transformed = aa.df.copy() + + # Apply normal score transform to all columns + aa.apply('normal_score') + + # Save state after normal score transform + normal_transformed = aa.df.copy() + + # Verify both transforms were applied (data should be different from log transform) + assert not np.allclose(normal_transformed.values, log_transformed.values) + + # Apply the inverse transformation + back_transformed = aa.inverse() + + # Check we get back close to original values + np.testing.assert_allclose(back_transformed.values, original_df.values, rtol=0.1) + + # Test with external already-transformed data + external_transformed = pd.DataFrame({ + 'a': [-0.5, 0.0, 0.5], # Already transformed data in normal score space + 'b': [0.5, 0.0, -0.5], # (approximately in the normal distribution range) + 'c': [1.0, 0.0, -1.0] + }) + + # Test inverse transform on external transformed data + back_external = aa.inverse(external_transformed) + + # Check that shape is preserved + assert back_external.shape == external_transformed.shape + + # Verify output has reasonable values (should be in the range of original data) + for col in ['a', 'b']: + # These columns had log transform applied, so should be positive + assert np.all(back_external[col] > 0) + + # Column c should have values in the range of the original data + assert np.min(back_external['c']) >= -40 + assert np.max(back_external['c']) <= -10 + + # Apply transform again to verify roundtrip accuracy + roundtrip = aa.transform(back_external) + + # Check roundtrip accuracy for values within standard normal range (-2 to 2) + for col in external_transformed.columns: + # Find values within the normal range + mask = (external_transformed[col] >= -2) & (external_transformed[col] <= 2) + if mask.any(): + # Get the values to compare + expected = external_transformed.loc[mask, col].values + actual = roundtrip.loc[mask, col].values + + # Handle zeros and near-zeros with absolute tolerance instead of relative + zero_mask = np.isclose(expected, 0, atol=1e-10) + if zero_mask.any(): + # For zeros, use absolute tolerance + np.testing.assert_allclose( + actual[zero_mask], + expected[zero_mask], + atol=0.1 # Absolute tolerance for zeros + ) + + # For non-zeros, use relative 
tolerance + if (~zero_mask).any(): + np.testing.assert_allclose( + actual[~zero_mask], + expected[~zero_mask], + rtol=0.1 # Relative tolerance for non-zeros + ) + else: + # No zeros, use normal comparison + np.testing.assert_allclose( + actual, + expected, + rtol=0.1 + ) + + # Additional test to verify pipeline order is maintained + # Create a new pipeline with transforms in different order + bb = pyemu.emulators.AutobotsAssemble(original_df.copy()) + + # First normal score, then log10 + bb.apply('normal_score') + bb.apply('log10', columns=['a', 'b']) + + # Apply inverse - should revert log10 first, then normal_score + back_bb = bb.inverse() + + # Check we get back close to original values + np.testing.assert_allclose(back_bb.values, original_df.values, rtol=0.1) + + + +def skewness(x): + """Calculate skewness of a distribution""" + n = len(x) + x_mean = np.mean(x) + return (np.sum((x - x_mean) ** 3) / n) / ((np.sum((x - x_mean) ** 2) / n) ** 1.5) + +def kurtosis(x): + """Calculate kurtosis of a distribution""" + n = len(x) + x_mean = np.mean(x) + return (np.sum((x - x_mean) ** 4) / n) / ((np.sum((x - x_mean) ** 2) / n) ** 2) + + + + +def test_normal_score_with_external_data(): + """Test NormalScoreTransformer with external already-transformed data""" + # Create training data with a specific distribution + np.random.seed(42) + n = 100 + training_data = pd.DataFrame({ + 'normal': np.random.normal(5, 2, n), + 'lognormal': np.exp(np.random.normal(1, 0.5, n)), + 'uniform': np.random.uniform(0, 10, n) + }) + + # Create "external" data that we'll pretend is already transformed + # For this test, we'll generate values in the typical normal score range (-3 to 3) + external_transformed = pd.DataFrame({ + 'normal': np.random.normal(0, 1, 1), # Already in normal score space + 'lognormal': np.random.normal(0, 1, 1), + 'uniform': np.random.normal(0, 1, 1) + }) + + # Initialize and fit transformer on training data + nst = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=True) + nst.fit(training_data) + + # Transform training data to verify transformation works + transformed_training = nst.transform(training_data) + + # Check that transformed data has properties of normal distribution + for col in training_data.columns: + # Mean should be close to 0 + assert abs(transformed_training[col].mean()) < 0.3 + # Standard deviation should be close to 1 + assert abs(transformed_training[col].std() - 1.0) < 0.3 + + # Store column parameters for inspection + z_scores = {} + originals = {} + for col in training_data.columns: + params = nst.column_parameters.get(col, {}) + z_scores[col] = params.get('z_scores', []) + originals[col] = params.get('originals', []) + + # Verify column parameters were created + assert len(z_scores[col]) > 0 + assert len(originals[col]) > 0 + + # Apply inverse transform to external transformed data directly + back_external = nst.inverse_transform(external_transformed) + + # Verify the shape matches + assert back_external.shape == external_transformed.shape + + # Apply the transform to back_external to check if it recovers external_transformed + re_transformed = nst.transform(back_external) + + # Check that re-transforming recovers values close to the external_transformed + # Note: exact recovery isn't expected due to interpolation/extrapolation + for col in external_transformed.columns: + # Values inside the normal range (-2 to 2) should be very close + inside_range = (external_transformed[col] >= -2) & (external_transformed[col] <= 2) + if inside_range.any(): + 
np.testing.assert_allclose( + re_transformed.loc[inside_range, col].values, + external_transformed.loc[inside_range, col].values, + rtol=0.2 + ) + + # Test external values that are far outside the z-score range + extreme_transformed = pd.DataFrame({ + 'normal': np.array([-5, 0, 5],dtype=float), # Includes extreme values + 'lognormal': np.array([-5, 0, 5],dtype=float), + 'uniform': np.array([-5, 0, 5],dtype=float) + }) + + # Test with extrapolation first + nst_extrap = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=True) + nst_extrap.fit(training_data) + back_extreme_extrap = nst_extrap.inverse_transform(extreme_transformed) + + # Test without extrapolation + nst_no_extrap = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=False) + nst_no_extrap.fit(training_data) + back_extreme_no_extrap = nst_no_extrap.inverse_transform(extreme_transformed) + + # With extrapolation, extreme values should be outside the original data range + for col in training_data.columns: + min_orig = training_data[col].min() + max_orig = training_data[col].max() + + # Check extrapolation is working (values outside original range) + assert back_extreme_extrap[col].min() < min_orig or back_extreme_extrap[col].max() > max_orig + + # Without extrapolation, values should be clamped to original range + assert back_extreme_no_extrap[col].min() >= min_orig - 1e-10 # Allow for floating point error + assert back_extreme_no_extrap[col].max() <= max_orig + 1e-10 + + # Test with AutobotsAssemble to ensure the pipeline works with external transformed data + aa = pyemu.emulators.AutobotsAssemble(training_data.copy()) + aa.apply('normal_score') + + # Test applying inverse transform to external data + back_from_aa = aa.inverse(external_transformed.copy()) + + # Verify results with direct inverse transform + np.testing.assert_allclose( + back_from_aa.values, + nst.inverse_transform(external_transformed).values, + rtol=1e-3 + ) \ No newline at end of file From d1d684e3a0dfb2692fa08b90d53e42dc4663af5a Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 16 Jun 2025 16:17:05 +0100 Subject: [PATCH 07/58] dsi initial commit --- pyemu/emulators/dsi.py | 598 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 598 insertions(+) create mode 100755 pyemu/emulators/dsi.py diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py new file mode 100755 index 000000000..6891f18e8 --- /dev/null +++ b/pyemu/emulators/dsi.py @@ -0,0 +1,598 @@ +""" +Data Space Inversion (DSI) emulator implementation. +""" +from __future__ import print_function, division +import numpy as np +import pandas as pd +import inspect +from pyemu.utils.helpers import dsi_forward_run, series_to_insfile +import pickle +import os +import shutil +from pyemu.pst.pst_handler import Pst +from pyemu.en import ObservationEnsemble,ParameterEnsemble +from .base import Emulator + +class DSI(Emulator): + """ + Data Space Inversion emulator class. + + #TODO: add more docstring details + + Parameters + ---------- + pst : Pst, optional + A Pst object. If provided, the emulator will be initialized with the + information from the Pst object. + sim_ensemble : ObservationEnsemble, optional + An ensemble of simulated observations. If provided, the emulator will + be initialized with the information from the ensemble. + transforms : list of dict, optional + List of transformation specifications. Each dict should have: + - 'type': str - Type of transformation (e.g.,'log10', 'normal_score'). + - 'columns': list of str,optional - Columns to apply the transformation to. 
If not supplied, transformation is applied to all columns. + - Additional kwargs for the transformation (e.g., 'quadratic_extrapolation' for normal score transform). + Example: + transforms = [ + {'type': 'log10', 'columns': ['obs1', 'obs2']}, + {'type': 'normal_score', 'quadratic_extrapolation': True} + ] + Default is None, which means no transformations will be applied. + energy_threshold : float, optional + The energy threshold for the SVD. Default is 1.0, no truncation. + verbose : bool, optional + If True, enable verbose logging. Default is False. + """ + + def __init__(self, + pst=None, + sim_ensemble=None, + transforms=None, + energy_threshold=1.0, + verbose=False): + """ + Initialize the DSI emulator. + + Parameters + ---------- + pst : Pst, optional + A Pst object. If provided, the emulator will be initialized with the + information from the Pst object. + sim_ensemble : ObservationEnsemble, optional + An ensemble of simulated observations. If provided, the emulator will + be initialized with the information from the ensemble. + transforms : list of dict, optional + List of transformation specifications. Each dict should have: + - 'type': str - Type of transformation (e.g.,'log10', 'normal_score'). + - 'columns': list of str,optional - Columns to apply the transformation to. If not supplied, transformation is applied to all columns. + - Additional kwargs for the transformation (e.g., 'quadratic_extrapolation' for normal score transform). + Example: + transforms = [ + {'type': 'log10', 'columns': ['obs1', 'obs2']}, + {'type': 'normal_score', 'quadratic_extrapolation': True} + ] + Default is None, which means no transformations will be applied. + energy_threshold : float, optional + The energy threshold for the SVD. Default is 1.0, no truncation. + verbose : bool, optional + If True, enable verbose logging. Default is False. + """ + + super().__init__(verbose=verbose) + + self.__org_observation_data = pst.observation_data.copy() if pst is not None else None + #self.__org_parameter_data = pst.parameter_data.copy() if pst is not None else None + #self.__org_control_data = pst.control_data.copy() #breaks pickling + if isinstance(sim_ensemble, ObservationEnsemble): + sim_ensemble = sim_ensemble._df.copy() + #self.__org_sim_ensemble = sim_ensemble.copy() if sim_ensemble is not None else None + self.data = sim_ensemble.copy() if sim_ensemble is not None else None + #self.feature_scaler = None + self.energy_threshold = energy_threshold + assert isinstance(transforms, list) or transforms is None, "transforms must be a list of dicts or None" + if transforms is not None: + for t in transforms: + assert isinstance(t, dict), "each transform must be a dict" + assert 'type' in t, "each transform dict must have a 'type' key" + if 'columns' in t: + assert isinstance(t['columns'], list), "'columns' must be a list of column names" + #all columns must be in the data + assert all([col in self.data.columns for col in t['columns']]), "some columns in 'columns' are not in the data" + if t['type'] == 'normal_score': + # check for quadratic_extrapolation + if 'quadratic_extrapolation' in t: + assert isinstance(t['quadratic_extrapolation'], bool), "'quadratic_extrapolation' must be a boolean" + self.transforms = transforms + self.fitted = False + self.data_transformed = None + self.decision_variable_names = None #used for DSIVC + + def prepare_training_data(self, data=None): + """ + Prepare training data by applying transformations and computing the projection matrix. + + This method follows these steps: + 1. 
Apply feature transformations (log transform, normal score transform) + 2. Compute projection matrix using SVD + + Parameters + ---------- + data : pandas.DataFrame, optional + Data to prepare. If None, uses self.data. + + Returns + ------- + pandas.DataFrame + The prepared data. + """ + if data is None: + data = self.data + + if data is None: + raise ValueError("No data provided and no data stored in the emulator") + + # Always use the base class transformation method for consistency + if self.transforms is not None: + self.data_transformed = self.apply_feature_transforms(data, self.transforms) + else: + # Still need to set up a dummy transformer for inverse operations + from .transformers import AutobotsAssemble + self.feature_transformer = AutobotsAssemble(data.copy()) + self.data_transformed = data.copy() + + return self.data_transformed + + def compute_projection_matrix(self, energy_threshold=None): + """ + Compute the projection matrix using SVD. + + Parameters + ---------- + energy_threshold : float, optional + Energy threshold for truncation. Default is None, which uses the threshold from initialization. + + Returns + ------- + None + """ + self.logger.statement("normalizing data") + # normalize the data by subtracting the mean and dividing by the standard deviation + X = self.data_transformed.copy() + deviations = X - X.mean() + z = deviations / np.sqrt(float(X.shape[0] - 1)) + if isinstance(z, pd.DataFrame): + z = z.values + + self.logger.statement("undertaking SVD") + u, s, v = np.linalg.svd(z, full_matrices=False) + us = np.dot(v.T, np.diag(s)) + if energy_threshold is None: + energy_threshold = self.energy_threshold + if energy_threshold<1.0: + self.logger.statement("applying energy truncation") + # compute the cumulative energy of the singular values + cumulative_energy = np.cumsum(s**2) / np.sum(s**2) + print(cumulative_energy) + # find the number of components needed to reach the energy threshold + num_components = np.argmax(cumulative_energy >= energy_threshold) + 1 + # keep only the first num_components singular values and vectors + us = us[:, :num_components] + s = s[:num_components] + u = u[:, :num_components] + print(f"Truncated from {len(s)} to {num_components} components while retaining {energy_threshold*100:.1f}% of variance") + if num_components<=1: + print(f"Warning: only {num_components} component retained, you may need to check the data") + + self.logger.statement("calculating us matrix") + + # store components needed for forward run + # store mean vector + self.ovals = self.data_transformed.mean(axis=0) + # store proj matrix and singular values + self.pmat = us + self.s = s + return + + def fit(self, X=None, y=None): + """ + Fit the emulator to training data. + + Parameters + ---------- + X : pandas.DataFrame + Input data to fit the emulator on. + y : None + Not used, present for API consistency. + + Returns + ------- + self : DSI + The fitted emulator. + """ + if X is not None: + self.data = X + self.logger.statement("transforming new training data") + self.data_transformed = self.prepare_training_data() + + if self.data_transformed is None: + self.logger.statement("transforming training data") + self.data_transformed = self.prepare_training_data() + + # Compute projection matrix + self.compute_projection_matrix() + self.fitted = True + return self + + def predict(self, pvals): + """ + Generate predictions from the emulator. + + Parameters + ---------- + pvals : numpy.ndarray or pandas.Series + Parameter values for prediction. 
+ + Returns + ------- + pandas.Series + Predicted observation values. + """ + if not self.fitted: + raise ValueError("Emulator must be fitted before prediction") + + if not hasattr(self, 'feature_transformer') or self.feature_transformer is None: + raise ValueError("Emulator must be fitted and have valid transformations before prediction") + + if isinstance(pvals, pd.Series): + pvals = pvals.values.flatten() + assert pvals.shape[0] == self.s.shape[0], "pvals must be the same length as the number of singular values" + assert pvals.shape[0] == self.pmat.shape[1], "pvals must be the same length as the number of singular values" + pmat = self.pmat + ovals = self.ovals + sim_vals = ovals + np.dot(pmat,pvals) + ft = self.feature_transformer + sim_vals = ft.inverse(sim_vals) + sim_vals.index.name = 'obsnme' + sim_vals.name = "obsval" + self.sim_vals = sim_vals + return sim_vals + + def check_for_pdc(self): + """Check for Prior data conflict.""" + #TODO + return + + def prepare_pestpp(self, t_d=None, observation_data=None): + """ + Prepare PEST++ control files for the emulator. + + Parameters + ---------- + t_d : str, optional + Template directory path. Must be provided. + observation_data : pandas.DataFrame, optional + Observation data to use. If None, uses the data from initialization. + + Returns + ------- + Pst + PEST++ control file object. + """ + + assert t_d is not None, "template directory must be provided" + self.template_dir = t_d + + if os.path.exists(t_d): + shutil.rmtree(t_d) + os.makedirs(t_d) + self.logger.statement("creating template directory {0}".format(t_d)) + + self.logger.log("creating tpl files") + dsi_in_file = os.path.join(t_d, "dsi_pars.csv") + dsi_tpl_file = dsi_in_file + ".tpl" + ftpl = open(dsi_tpl_file, 'w') + fin = open(dsi_in_file, 'w') + ftpl.write("ptf ~\n") + fin.write("parnme,parval1\n") + ftpl.write("parnme,parval1\n") + npar = self.s.shape[0] + assert npar>0, "no parameters found in the DSI emulator" + dsi_pnames = [] + for i in range(npar): + pname = "dsi_par{0:04d}".format(i) + dsi_pnames.append(pname) + fin.write("{0},0.0\n".format(pname)) + ftpl.write("{0},~ {0} ~\n".format(pname, pname)) + fin.close() + ftpl.close() + self.logger.log("creating tpl files") + + # run once to get the dsi_pars.csv file + pvals = np.zeros_like(self.s) + sim_vals = self.predict(pvals) + + self.logger.log("creating ins file") + out_file = os.path.join(t_d,"dsi_sim_vals.csv") + sim_vals.to_csv(out_file,index=True) + + ins_file = out_file + ".ins" + sdf = pd.read_csv(out_file,index_col=0) + with open(ins_file,'w') as f: + f.write("pif ~\n") + f.write("l1\n") + for oname in sdf.index.values: + f.write("l1 ~,~ !{0}!\n".format(oname)) + self.logger.log("creating ins file") + + self.logger.log("creating Pst") + pst = Pst.from_io_files([dsi_tpl_file],[dsi_in_file],[ins_file],[out_file],pst_path=".") + + par = pst.parameter_data + dsi_pars = par.loc[par.parnme.str.startswith("dsi_par"),"parnme"] + par.loc[dsi_pars,"parval1"] = 0 + par.loc[dsi_pars,"parubnd"] = 10.0 + par.loc[dsi_pars,"parlbnd"] = -10.0 + par.loc[dsi_pars,"partrans"] = "none" + with open(os.path.join(t_d,"dsi.unc"),'w') as f: + f.write("START STANDARD_DEVIATION\n") + for p in dsi_pars: + f.write("{0} 1.0\n".format(p)) + f.write("END STANDARD_DEVIATION") + pst.pestpp_options['parcov'] = "dsi.unc" + + obs = pst.observation_data + + if observation_data is None: + observation_data = self.__org_observation_data + assert isinstance(observation_data, pd.DataFrame), "observation_data must be a pandas DataFrame" + for col in 
observation_data.columns: + obs.loc[sim_vals.index,col] = observation_data.loc[:,col] + + # check if any observations are missing + missing_obs = list(set(obs.index) - set(observation_data.index)) + assert len(missing_obs) == 0, "missing observations: {0}".format(missing_obs) + + pst.control_data.noptmax = 0 + pst.model_command = "python forward_run.py" + self.logger.log("creating Pst") + + + function_source = inspect.getsource(dsi_forward_run) + with open(os.path.join(t_d,"forward_run.py"),'w') as file: + file.write(function_source) + file.write("\n\n") + file.write("if __name__ == \"__main__\":\n") + file.write(f" {function_source.split('(')[0].split('def ')[1]}()\n") + self.logger.log("creating Pst") + + pst.pestpp_options["save_binary"] = True + pst.pestpp_options["overdue_giveup_fac"] = 1e30 + pst.pestpp_options["overdue_giveup_minutes"] = 1e30 + pst.pestpp_options["panther_agent_freeze_on_fail"] = True + pst.pestpp_options["ies_no_noise"] = False + pst.pestpp_options["ies_subset_size"] = -10 # the more the merrier + #pst.pestpp_options["ies_bad_phi_sigma"] = 2.0 + #pst.pestpp_options["save_binary"] = True + + pst.write(os.path.join(t_d,"dsi.pst"),version=2) + self.logger.statement("saved pst to {0}".format(os.path.join(t_d,"dsi.pst"))) + + #self.pst_dsi = pst #breaks pickling #TODO: add save/load methods to Emulator class + with open(os.path.join(t_d,"dsi.pickle"),"wb") as f: + pickle.dump(self,f) + return pst + + def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=False, dsi_args=None, percentiles=[0.25,0.75,0.5], mou_population_size=None): + """ + Prepare Data Space Inversion Variable Control (DSIVC) control files. + + Parameters + ---------- + decvar_names : list or str + Names of decision variables. + t_d : str, optional + Template directory path. + pst : Pst, optional + PST control file object. + oe : ObservationEnsemble, optional + Observation ensemble. + track_stack : bool, optional + Whether to track the stack. Default is False. + dsi_args : dict, optional + Arguments for DSI. + percentiles : list, optional + Percentiles to calculate. Default is [0.25, 0.75, 0.5]. + mou_population_size : int, optional + Population size for multi-objective optimization. + + Returns + ------- + Pst + PEST++ control file object for DSIVC. + """ + # check that percentiles is a list or array of floats between 0 and 1. 
+        assert isinstance(percentiles, (list, np.ndarray)), "percentiles must be a list or array of floats"
+        assert all([isinstance(i, (float, int)) for i in percentiles]), "percentiles must be a list or array of floats"
+        assert all([0 <= i <= 1 for i in percentiles]), "percentiles must be between 0 and 1"
+        # ensure that percentiles are unique
+        percentiles = np.unique(percentiles)
+
+
+        #track dsivc args for forward run
+        self.dsivc_args = {"percentiles":percentiles,
+                           "decvar_names":decvar_names,
+                           "track_stack":track_stack,
+                           }
+
+        if t_d is None:
+            self.logger.statement("using existing DSI template dir...")
+            t_d = self.template_dir
+        self.logger.statement(f"using {t_d} as template directory...")
+        assert os.path.exists(t_d), f"template directory {t_d} does not exist"
+
+        if pst is None:
+            self.logger.statement("no pst provided...")
+            self.logger.statement("using dsi.pst in DSI template dir...")
+            assert os.path.exists(os.path.join(t_d,"dsi.pst")), f"dsi.pst not found in {t_d}"
+            pst = Pst(os.path.join(t_d,"dsi.pst"))
+        if oe is None:
+            self.logger.statement("no posterior DSI observation ensemble provided...")
+            self.logger.statement(f"using dsi.{dsi_args['noptmax']}.obs.jcb in DSI template dir...")
+            assert os.path.exists(os.path.join(t_d,f"dsi.{dsi_args['noptmax']}.obs.jcb")), f"dsi.{dsi_args['noptmax']}.obs.jcb not found in {t_d}"
+            oe = ObservationEnsemble.from_binary(pst,os.path.join(t_d,f"dsi.{dsi_args['noptmax']}.obs.jcb"))
+        else:
+            assert isinstance(oe, ObservationEnsemble), "oe must be an ObservationEnsemble"
+
+        #check if decvar_names is a str
+        if isinstance(decvar_names, str):
+            decvar_names = [decvar_names]
+        # check that decvars are in the oe columns
+        missing = [col for col in decvar_names if col not in oe.columns]
+        assert len(missing) == 0, f"The following decvars are missing from the DSI obs ensemble: {missing}"
+        # check that decvars are in the pst observation data
+        missing = [col for col in decvar_names if col not in pst.obs_names]
+        assert len(missing) == 0, f"The following decvars are missing from the DSI pst control file: {missing}"
+
+
+        # handle DSI args
+        default_dsi_args = {"noptmax":pst.control_data.noptmax,
+                            "decvar_weight":1.0,
+                            #"decvar_phi_factor":0.5,
+                            "num_pyworkers":1,
+                            }
+        # ensure it's a dict
+        if dsi_args is None:
+            dsi_args = {}
+        elif not isinstance(dsi_args, dict):
+            raise TypeError("Expected a dictionary for 'dsi_args'")
+        # merge with defaults (user values override defaults)
+        dsi_args = {**default_dsi_args, **dsi_args}
+
+
+        out_files = []
+
+        self.logger.statement(f"preparing stack stats observations...")
+        assert isinstance(oe, ObservationEnsemble), "oe must be an ObservationEnsemble"
+        if oe.index.name is None:
+            id_vars="index"
+        else:
+            id_vars=oe.index.name
+        stack_stats = oe._df.describe(percentiles=percentiles).reset_index().melt(id_vars=id_vars)
+        stack_stats.rename(columns={"value":"obsval","index":"stat"},inplace=True)
+        stack_stats['obsnme'] = stack_stats.apply(lambda x: x.variable+"_stat:"+x.stat,axis=1)
+        stack_stats.set_index("obsnme",inplace=True)
+        stack_stats = stack_stats.obsval
+        self.logger.statement(f"stack obs recorded to dsi.stack_stats.csv...")
+        out_file = os.path.join(t_d,"dsi.stack_stats.csv")
+        out_files.append(out_file)
+        stack_stats.to_csv(out_file,float_format="%.6e")
+        series_to_insfile(out_file,ins_file=None)
+
+
+        if track_stack:
+            self.logger.statement(f"including {oe.values.flatten().shape[0]} stack observations...")
+
+            stack = oe._df.reset_index().melt(id_vars=id_vars)
+            stack.rename(columns={"value":"obsval"},inplace=True)
+            stack['obsnme'] = stack.apply(lambda x: x.variable+"_real:"+str(x[id_vars]),axis=1)
+            stack.set_index("obsnme",inplace=True)
+            stack = stack.obsval
+            out_file = os.path.join(t_d,"dsi.stack.csv")
+            out_files.append(out_file)
+            stack.to_csv(out_file,float_format="%.6e")
+            series_to_insfile(out_file,ins_file=None)
+
+
+
+        self.logger.statement(f"prepare DSIVC template files...")
+        dsi_in_file = os.path.join(t_d, "dsivc_pars.csv")
+        dsi_tpl_file = dsi_in_file + ".tpl"
+        ftpl = open(dsi_tpl_file, 'w')
+        fin = open(dsi_in_file, 'w')
+        ftpl.write("ptf ~\n")
+        fin.write("parnme,parval1\n")
+        ftpl.write("parnme,parval1\n")
+        for pname in decvar_names:
+            val = oe._df.loc[:,pname].mean()
+            fin.write(f"{pname},{val:.6e}\n")
+            ftpl.write(f"{pname},~ {pname} ~\n")
+        fin.close()
+        ftpl.close()
+
+
+        self.logger.statement(f"building DSIVC control file...")
+        pst_dsivc = Pst.from_io_files([dsi_tpl_file],[dsi_in_file],[i+".ins" for i in out_files],out_files,pst_path=".")
+
+        self.logger.statement(f"setting dec var bounds...")
+        par = pst_dsivc.parameter_data
+        # set all parameters fixed
+        par.loc[:,"partrans"] = "fixed"
+        # constrain decvar pars to training data bounds
+        par.loc[decvar_names,"pargp"] = "decvars"
+        par.loc[decvar_names,"partrans"] = "none"
+        par.loc[decvar_names,"parubnd"] = self.data.loc[:,decvar_names].max()
+        par.loc[decvar_names,"parlbnd"] = self.data.loc[:,decvar_names].min()
+
+        self.logger.statement(f"zero-weighting observation data...")
+        # preemptively set obs weights to 0.0
+        obs = pst_dsivc.observation_data
+        obs.loc[:,"weight"] = 0.0
+
+        self.logger.statement(f"getting obs metadata from DSI observation_data...")
+        obsorg = pst.observation_data.copy()
+        columns = [i for i in obsorg.columns if i !='obsnme']
+        for o in obsorg.obsnme.values:
+            obs.loc[obs.obsnme.str.startswith(o), columns] = obsorg.loc[obsorg.obsnme==o, columns].values
+
+        obs.loc[stack_stats.index,"obgnme"] = "stack_stats"
+        #obs.loc[stack.index,"obgnme"] = "stack"
+
+        self.logger.statement(f"building dsivc_forward_run.py...")
+        pst_dsivc.model_command = "python dsivc_forward_run.py"
+        from pyemu.utils.helpers import dsivc_forward_run
+        function_source = inspect.getsource(dsivc_forward_run)
+        with open(os.path.join(t_d,"dsivc_forward_run.py"),'w') as file:
+            file.write(function_source)
+            file.write("\n\n")
+            file.write("if __name__ == \"__main__\":\n")
+            file.write(f"    {function_source.split('(')[0].split('def ')[1]}()\n")
+
+        self.logger.statement(f"preparing nominal initial population...")
+        if mou_population_size is None:
+            # set the population size to 2 * number of decision variables
+            # this is a good rule of thumb for MOU
+            mou_population_size = 2 * len(decvar_names)
+        # these should generally be at least twice the number of decision variables
+        if mou_population_size < 2 * len(decvar_names):
+            self.logger.statement(f"mou population is less than 2x number of decision variables, this may be too small...")
+        # sample mou_population_size sets of decision variables from a uniform distribution
+        dvpop = ParameterEnsemble.from_uniform_draw(pst_dsivc,num_reals=mou_population_size)
+        # record to external file for PESTPP-MOU
+        dvpop.to_binary(os.path.join(t_d,"initial_dvpop.jcb"))
+        # tell PESTPP-MOU about the new file
+        pst_dsivc.pestpp_options["mou_dv_population_file"] = 'initial_dvpop.jcb'
+
+
+        # some additional PESTPP-MOU options:
+        pst_dsivc.pestpp_options["mou_population_size"] = mou_population_size #twice the number of decision variables
+        
pst_dsivc.pestpp_options["mou_save_population_every"] = 1 # save lots of files! + + pst_dsivc.control_data.noptmax = 0 #just for a test run + pst_dsivc.write(os.path.join(t_d,"dsivc.pst"),version=2) + + # updating the DSI pst control file + self.logger.statement(f"updating DSI pst control file...") + self.logger.statement("overwriting dsi.pst file...") + pst.observation_data.loc[decvar_names, "weight"] = dsi_args["decvar_weight"] + pst.control_data.noptmax = dsi_args["noptmax"] + pst.write(os.path.join(t_d,"dsi.pst"), version=2) + + + self.logger.statement("overwriting dsi.pickle file...") + self.decision_variable_names = decvar_names + # re-pickle dsi to track dsivc args + with open(os.path.join(t_d,"dsi.pickle"),"wb") as f: + pickle.dump(self,f) + + self.logger.statement("DSIVC control files created...the user still needs to specify objectives and constraints...") + return pst_dsivc \ No newline at end of file From 0337259e6e24351921e967824a324f44de2d429f Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 16 Jun 2025 20:07:00 +0100 Subject: [PATCH 08/58] refactor dsi helper functions --- pyemu/utils/helpers.py | 375 +++++++++++++++-------------------------- 1 file changed, 132 insertions(+), 243 deletions(-) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index e8162f076..24cf748dc 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -4043,7 +4043,7 @@ def get_current_prop(_cur_thresh): return thresh, prop -def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",t_d="template",gp_kernel=None,nverf=0, +def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",gp_kernel=None,nverf=0, plot_fits=False,apply_standard_scalar=False, include_emulated_std_obs=False): """helper function to setup a gaussian-process-regression (GPR) emulator for outputs of interest. This is primarily targeted at low-dimensional settings like those encountered in PESTPP-MOU @@ -4054,7 +4054,6 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",t_d output_fnames (str | list[str]): usually a list of observation population files that corresponds to the simulation results associated with `input_fnames` gpr_t_d (str): the template file dir to create that will hold the GPR emulators - t_d (str): the template dir containing the PESTPP-MOU outputs that the GPR emulators are trained on gp_kernel (sklearn GaussianProcess kernel): the kernel to use. 
if None, a standard RBF kernel is created and used nverf (int): the number of input-output pairs to hold back for a simple verification test @@ -4181,7 +4180,7 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",t_d import matplotlib.pyplot as plt from matplotlib.backends.backend_pdf import PdfPages pdf = PdfPages(os.path.join(gpr_t_d,"gpr_fits.pdf")) - for i,output_name in enumerate(output_names): + for output_name in output_names: y_verf = df.loc[:,output_name].values.copy()[cut:] y_train = df.loc[:, output_name].values.copy()[:cut] @@ -4221,8 +4220,8 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",t_d plt.close(fig) - objname = f'obj_{i}' - model_fname = os.path.split(pst_fname)[1]+"."+objname+".pkl" + + model_fname = os.path.split(pst_fname)[1]+"."+output_name+".pkl" if os.path.exists(os.path.join(gpr_t_d,model_fname)): print("WARNING: model_fname '{0}' exists, overwriting...".format(model_fname)) with open(os.path.join(gpr_t_d,model_fname),'wb') as f: @@ -4324,13 +4323,6 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",t_d gpst_fname = os.path.split(pst_fname)[1] gpst.write(os.path.join(gpr_t_d,gpst_fname),version=2) print("saved gpr pst:",gpst_fname,"in gpr_t_d",gpr_t_d) - - #if they exist, copy pestpp bins from t_d over to gpr_t_d. otherwise, we assume bin is in path - pp_bins = [f for f in os.listdir(t_d) if 'pestpp-' in f] - if len(pp_bins)>0: - for pp_bin in pp_bins: - shutil.copy2(os.path.join(t_d,pp_bin),os.path.join(gpr_t_d,pp_bin)) - try: pyemu.os_utils.run("pestpp-mou {0}".format(gpst_fname),cwd=gpr_t_d) except Exception as e: @@ -4425,82 +4417,123 @@ def gpr_forward_run(): return mdf -def dsi_forward_run(pmat=None,ovals=None,pvals=None, - write_csv=True - - ): +def dsi_forward_run(pvals,dsi,write_csv=False): + assert isinstance(dsi,pyemu.emulators.DSI), "dsi must be a pyemu DSI object" + if isinstance(pvals,pd.DataFrame): + pvals = pvals.parval1 + sim_vals = dsi.predict(pvals) + if write_csv: + sim_vals.to_csv("dsi_sim_vals.csv") + return sim_vals - if pvals is None: - pvals = pd.read_csv("dsi_pars.csv",index_col=0) - if pmat is None: - pmat = np.load("dsi_proj_mat.npy") - if ovals is None: - ovals = pd.read_csv("dsi_pr_mean.csv",index_col=0) +def dsivc_forward_run(md_ies="."): + import pandas as pd + import pyemu + import os + import pickle + from pyemu.utils.os_utils import PortManager + + # load the dsi pest control file + pst_dsi = pyemu.Pst(os.path.join(md_ies,"dsi.pst")) + noptmax = pst_dsi.control_data.noptmax try: - offset = np.load("dsi_obs_offset.npy") + os.remove("dsi.noise.jcb") except: - #print("no offset file found, assuming no offset") - offset = np.zeros(ovals.shape[0]) + print("dsi.noise.jcb not found, continuing...") try: - log_trans = np.load("dsi_obs_log.npy") + os.remove("dsi.stack.csv") except: - #print("no log-tansform file found, assuming no log-transform") - log_trans = np.zeros(ovals.shape[0]) - + print("dsi.stack.csv not found, continuing...") try: - backtransformvals = np.load("dsi_obs_backtransformvals.npy") - backtransformobsnmes = np.load("dsi_obs_backtransformobsnmes.npy",allow_pickle=True) - backtransform=True + os.remove("dsi.stack_stats.csv") except: - #print("no back-transform file found, assuming no back-transform") - backtransform=False - - - sim_vals = ovals + np.dot(pmat,pvals.values) - - if backtransform: - #print("applying back-transform") - obsnmes = np.unique(backtransformobsnmes) - back_vals = [ - inverse_normal_score_transform( - 
backtransformvals[np.where(backtransformobsnmes==o)][:,1], - backtransformvals[np.where(backtransformobsnmes==o)][:,0], - sim_vals.loc[o].mn, - extrap=None - )[0] - for o in obsnmes - ] - sim_vals.loc[obsnmes,'mn'] = back_vals - - #print("reversing offset and log-transform") - assert log_trans.shape[0] == sim_vals.mn.values.shape[0], f"log transform shape mismatch: {log_trans.shape[0]},{sim_vals.mn.values.shape[0]}" - assert offset.shape[0] == sim_vals.mn.values.shape[0], f"offset transform shape mismatch: {offset.shape[0]},{sim_vals.mn.values.shape[0]}" - vals = sim_vals.mn.values - vals[np.where(log_trans==1)] = 10**vals[np.where(log_trans==1)] - vals-= offset - sim_vals.loc[:,'mn'] = vals - #print(sim_vals) - if write_csv: - sim_vals.to_csv("dsi_sim_vals.csv") - return sim_vals - - -def dsi_pyworker(pst,host,port,pmat=None,ovals=None,pvals=None): + print("dsi.stack_stats.csv not found, continuing...") + try: + os.remove(f"dsi.{noptmax}.obs.jcb") + except: + print(f"dsi.{noptmax}.obs.jcb not found, continuing...") + + # load decvars + decvars = pd.read_csv(os.path.join(md_ies, "dsivc_pars.csv"),index_col=0) + assert decvars.shape[0]>0, "no decvars found in dsivc_pars.csv" + + + + # update the decavar obs values in the observation data + obs = pst_dsi.observation_data + assert obs.loc[decvars.index].shape[0] == decvars.shape[0], "not all decvars found in obs data" + assert all(obs.loc[decvars.index].weight > 0.0), "decvar weights should be > 0.0" + obs.loc[decvars.index,"obsval"] = decvars.values + + # update the obs+noise file with the decvar values to ensure NO NOISE on the decvars + noise = pyemu.ObservationEnsemble.from_binary(pst_dsi,os.path.join(md_ies,"dsi.obs+noise.jcb")) + # check that all of decvars.index are in noise.columns + assert len([i for i in decvars.index if i not in noise.columns.tolist()]) == 0, "some decvars not in noise columns" + # update columns in noise if column name in decvars.index + for col in decvars.index: + noise.loc[:,col] = noise.loc[:,col].astype(float) + noise.loc[:,col] = decvars.loc[col].values[0] + # record noise + noise.to_binary(os.path.join(md_ies,"dsi.noise.jcb")) + # make sure pestpp options + pst_dsi.pestpp_options["ies_observation_ensemble"] = "dsi.noise.jcb" + # rewrite the dsi.pst file + pst_dsi.write(os.path.join(md_ies,"dsi.pst"),version=2) + + # deploy dsi... + pvals = pd.read_csv(os.path.join(md_ies,"dsi_pars.csv"),index_col=0) + num_workers=1 + worker_root="." + dsi = pickle.load(open(os.path.join(md_ies,"dsi.pickle"),"rb")) + num_workers = dsi.dsivc_args.get("num_pyworkers",1) + pyemu.os_utils.start_workers(md_ies,"pestpp-ies","dsi.pst", + num_workers=num_workers, + worker_root=worker_root, + port = PortManager().get_available_port(), + master_dir=md_ies, + reuse_master =True, + ppw_function=pyemu.helpers.dsi_pyworker, + ppw_kwargs={"dsi":dsi,"pvals":pvals}) + assert os.path.exists(os.path.join(md_ies,f"dsi.{noptmax}.obs.jcb")), f"dsi.{noptmax}.obs.jcb not found...pst failed?" + + + #TODO: checks on PDC or Eulerian distance to training data? + + #postprocess stack + oe = pyemu.ObservationEnsemble.from_binary(pst_dsi,os.path.join(md_ies,f"dsi.{noptmax}.obs.jcb")) + assert oe.shape[0] == noise.shape[0], "stack and noise shapes do not match; failed runs?" 
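+    # the blocks below reshape the posterior stack into the long-form series that the
+    # instruction files written by DSI.prepare_dsivc() expect: per-realization values are
+    # named "<obsnme>_real:<realization>" and summary statistics are named "<obsnme>_stat:<stat>"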
+ if dsi.dsivc_args.get("track_stack",False): + # write long form oe + stack = oe._df.reset_index().melt(id_vars="real_name") + stack.rename(columns={"value":"obsval"},inplace=True) + stack['obsnme'] = stack.apply(lambda x: x.variable+"_real:"+x.real_name,axis=1) + stack.set_index("obsnme",inplace=True) + stack = stack.obsval + out_file = os.path.join(md_ies,"dsi.stack.csv") + stack.to_csv(out_file,float_format="%.6e") + #write stats + #get user-specified quantiles + percentiles = dsi.dsivc_args.get("percentiles",[0.25,0.75,0.5]) + stack_stats = oe._df.describe(percentiles=percentiles).reset_index().melt(id_vars="index") + stack_stats.rename(columns={"value":"obsval","index":"stat"},inplace=True) + stack_stats['obsnme'] = stack_stats.apply(lambda x: x.variable+"_stat:"+x.stat,axis=1) + stack_stats.set_index("obsnme",inplace=True) + stack_stats = stack_stats.obsval + out_file = os.path.join(md_ies,"dsi.stack_stats.csv") + stack_stats.to_csv(out_file,float_format="%.6e") + + return + +def dsi_pyworker(pst,host,port,dsi=None,pvals=None): - import os import pandas as pd - import numpy as np - - # if explicit args weren't passed, get the default ones... if pvals is None: pvals = pd.read_csv("dsi_pars.csv",index_col=0) - if pmat is None: - pmat = np.load("dsi_proj_mat.npy") - if ovals is None: - ovals = pd.read_csv("dsi_pr_mean.csv",index_col=0) - + if dsi is None: + import pickle + dsi = pickle.load(open("dsi.pickle","rb")) ppw = PyPestWorker(pst,host,port,verbose=False) @@ -4521,10 +4554,10 @@ def dsi_pyworker(pst,host,port,pmat=None,ovals=None,pvals=None): # df needed to run the emulator pvals.parval1 = parameters.loc[pvals.index] # do the emulation - simdf = dsi_forward_run(pmat=pmat,ovals=ovals,pvals=pvals,write_csv=False) + simdf = dsi_forward_run(dsi=dsi,pvals=pvals,write_csv=False) # replace the emulated quantities in the obs series - obs.loc[simdf.index] = simdf.mn.values + obs.loc[simdf.index] = simdf.values #send the obs series to the master ppw.send_observations(obs.values) @@ -4535,171 +4568,27 @@ def dsi_pyworker(pst,host,port,pmat=None,ovals=None,pvals=None): if parameters is None: break - -def randrealgen_optimized(nreal, tol=1e-7, max_samples=1000000): - """ - Generate a set of random realizations with a normal distribution. - - Parameters: - nreal : int - The number of realizations to generate. - tol : float - Tolerance for the stopping criterion. - max_samples : int - Maximum number of samples to use. - - Returns: - numpy.ndarray - An array of nreal random realizations. +def series_to_insfile(out_file,ins_file=None): """ - rval = np.zeros(nreal) - nsamp = 0 - # if nreal is even add 1 - if nreal % 2 == 0: - numsort = (nreal + 1) // 2 - else: - numsort = nreal // 2 - while nsamp < max_samples: - nsamp += 1 - work1 = np.random.normal(size=nreal) - work1.sort() - - if nsamp > 1: - previous_mean = rval[:numsort] / (nsamp - 1) - rval[:numsort] += work1[:numsort] - current_mean = rval[:numsort] / nsamp - max_diff = np.max(np.abs(current_mean - previous_mean)) - - if max_diff <= tol: - break - else: - rval[:numsort] = work1[:numsort] - - rval[:numsort] /= nsamp - if nreal % 2 == 0: - rval[numsort:] = -rval[:numsort][::-1] - else: - rval[numsort+1:] = -rval[:numsort][::-1] - - return rval - - -def normal_score_transform(nstval, val, value): + convert a Pandas Series to an ins file + Parameters + ---------- + out_file : str + name of the output file to convert to ins file + ins_file : str + name of the ins file to create. 
if None, then out_file+".ins" is used + Returns + ------- + None """ - Transform a value to its normal score using a normal score transform table. - - Parameters: - nstval : array-like - Normal score transform table values. - val : array-like - Original values corresponding to the normal score transform table. - value : float - The value to transform. - - Returns: - float - The normal score of the value. - int - The index of the value in the normal score transform table.""" - - # make sure the input is numpy arrays - val = np.asarray(val) - nstval = np.asarray(nstval) - - # if the value is outside the range of the table, return the first or last value - assert value >= val[0], "Value is below the minimum value in the table." - assert value <= val[-1], "Value is greater than the maximum value in the table." - # ensure that val is sorted - assert np.all(np.diff(val) > 0), f"Values in the table must be sorted in ascending order:{list(zip(np.diff(val)>0,val))}" - - # find the rank of the value in the table - rank = np.searchsorted(val, value, side='right') - 1 - if rank == len(val) - 1: - return nstval[-1], len(val) - # if the value coincides with a value in the table, return the corresponding normal score - nstdiff = nstval[rank + 1] - nstval[rank] - diff = val[rank + 1] - val[rank] - if nstdiff <= 0.0 or diff <= 0.0: - return nstval[rank], rank - - # otherwise, interpolate to get the normal score - dist = value - val[rank] - interpolated_value = nstval[rank] + (dist / diff) * nstdiff - return interpolated_value, rank - - -def inverse_normal_score_transform(nstval, val, value, extrap='quadratic'): - nreal = len(val) - # check that nstval is sorted - assert np.all(np.diff(nstval) > 0), "Values in the table must be sorted in ascending order" - # check that val is sorted - assert np.all(np.diff(val) > 0), "Values in the table must be sorted in ascending order" - - def linear_extrapolate(x0, y0, x1, y1, x): - if x1 != x0: - return y0 + (y1 - y0) / (x1 - x0) * (x - x0) - return y0 - - def quadratic_extrapolate(x1, y1, x2, y2, x3, y3, x4): - y12=y1-y2 - x23=x2-x3 - y23=y2-y3 - x12=x1-x2 - x13=x1-x3 - if x12==0 or x23==0 or x13==0: - raise ValueError("Input x values must be distinct") - a = (y12*x23-y23*x12) - den = x12*x23*x13 - a = a/den - b = y23/x23 - a*(x2+x3) - c=y1-x1*(a*x1+b) - y4 = a*x4**2 + b*x4 + c - return y4 - - ilim = 0 - if value in nstval: - rank = np.searchsorted(nstval, value) - value = val[rank] - - elif value < nstval[0]: - ilim = -1 - if extrap is None: - value = val[0] - elif extrap == 'linear': - value = linear_extrapolate(nstval[0], val[0], nstval[1], val[1], value) - #value = min(value, val[0]) - elif extrap == 'quadratic' and nreal >= 3: - y_vals = np.unique(val)[:3] - idxs = np.searchsorted(val,y_vals) - x_vals = nstval[idxs] - value = quadratic_extrapolate(x_vals[-3], y_vals[-3], x_vals[-2], y_vals[-2], x_vals[-1], y_vals[-1], value) - #value = min(value, val[0]) - else: - value = val[0] - - elif value > nstval[-1]: - ilim = 1 - if extrap is None: - value = val[-1] - elif extrap == 'linear': - value = linear_extrapolate(nstval[-2], val[-2], nstval[-1], val[-1], value) - #value = max(value, val[-1]) - elif extrap == 'quadratic' and nreal >= 3: - y_vals = np.unique(val)[-3:] - idxs = np.searchsorted(val,y_vals) - x_vals = nstval[idxs] - value = quadratic_extrapolate(x_vals[-3], y_vals[-3], x_vals[-2], y_vals[-2], x_vals[-1], y_vals[-1], value) - #value = max(value, val[-1]) - else: - value = val[-1] - - else: - rank = np.searchsorted(nstval, value) - 1 - # Get the bounding 
x and y values - x0, x1 = nstval[rank], nstval[rank + 1] - y0, y1 = val[rank], val[rank + 1] - # Perform linear interpolation - value = y0 + (y1 - y0) * (value - x0) / (x1 - x0) - - return value, ilim - + if ins_file is None: + ins_file = out_file+".ins" + sdf = pd.read_csv(out_file,index_col=0) + assert sdf.shape[1] == 1, "only one column allowed" + sdf = sdf.iloc[:,0] + with open(ins_file,'w') as f: + f.write("pif ~\n") + f.write("l1\n") + for oname in sdf.index.values: + f.write("l1 ~,~ !{0}!\n".format(oname)) + return From 484865564809aa51f1f9b129ead476152d8f3253 Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 16 Jun 2025 20:08:22 +0100 Subject: [PATCH 09/58] refactor dsi out of EnDS --- pyemu/eds.py | 413 --------------------------------------------------- 1 file changed, 413 deletions(-) diff --git a/pyemu/eds.py b/pyemu/eds.py index 5b68f2611..1bb647098 100644 --- a/pyemu/eds.py +++ b/pyemu/eds.py @@ -11,7 +11,6 @@ from pyemu.mat.mat_handler import Matrix, Jco, Cov from pyemu.pst.pst_handler import Pst from pyemu.utils.os_utils import _istextfile,run -from pyemu.utils.helpers import normal_score_transform,randrealgen_optimized from .logger import Logger @@ -494,415 +493,3 @@ def get_posterior_prediction_moments(self, obslist_dict=None,sim_ensemble=None,i dfper = dfper.loc[groups,self.predictions] return mean_dfs,dfstd,dfper - - - def prep_for_dsi(self,sim_ensemble=None,t_d="dsi_template", - apply_normal_score_transform=False,nst_extrap=None, - use_ztz=False,energy=1.0): - """Setup a new PEST interface for the data-space inversion process. - If the observation data in the Pst object has a "obstransform" column, then observations for which "log" is specified will be subject to log-transformation. - If the `apply_normal_score_transform` flag is set to `True`, then the observations and predictions will be subject to a normal score transform. - - Args: - - sim_ensemble (`pyemu.ObservationEnsemble`): observation ensemble to use for DSI latent space - variables. If `None`, use `self.sim_ensemble`. Default is `None` - t_d (`str`): template directory to setup the DSI model + pest files in. Default is `dsi_template` - apply_normal_score_transform (`bool`): flag to apply a normal score transform to the observations - and predictions. Default is `False` - nst_extrap (`str`): flag to apply extrapolation to the normal score transform. Can be None, 'linear' or 'quadratic'. Default is None. - use_ztz (`bool`): flag to use the condensed ZtZ matrix for SVD. The ZtZ matrix has dimensions nreal*nreal, instead of the nreal*nobs dimensions of Z. - This makes the SVD computation faster and more memory efficient when nobs >> nreal. - Default is `False` - energy (`float`): energy threshold for truncating the sqrt(C) matrix. Default is `1.0` which applies no truncation. 
- - Example:: - - #assumes "my.pst" exists - ends = pyemu.EnDS(ensemble="my.0.obs.jcb",forecasts=["fore1","fore2"]) - ends.prep_for_dsi() #setup a new pest interface() based on the DSI approach - pyemu.os_utils.start_workers("pestpp-ies","my.pst","dsi_template",num_workers=20, - master_dir="dsi_master") - - - - """ - if sim_ensemble is None: - sim_ensemble = self.sim_ensemble.copy() - - if nst_extrap is not None: - assert nst_extrap in ["linear","quadratic"], "nst_extrap must be None, 'linear' or 'quadratic'" - - if os.path.exists(t_d): - self.logger.warn("EnDS.prep_for_dsi(): t_d '{0}' exists, removing...".format(t_d)) - shutil.rmtree(t_d) - os.makedirs(t_d) - - - nz_names = self.pst.nnz_obs_names - snz_names = set(nz_names) - z_names = [n for n in self.pst.obs_names if n not in snz_names] - names = z_names.copy() - names.extend(nz_names) - names.sort() - - # make sure names are sorted - sim_ensemble = sim_ensemble.loc[:,names] - - self.logger.log("applying transformations") - # implement log-transform/offset and normal score transform - transf_names = nz_names.copy() - transf_names.extend(self.predictions) - - if "obstransform" in self.pst.observation_data.columns: - obs = self.pst.observation_data.copy() - #make sure names are ordered - obs = obs.loc[names,:] - #TODO: deal with "scale" and user-specified "offset" - obs["offset"] = 0.0 #TODO: more elegant? in case all 'none' are passed... - obsnmes = obs.loc[obs.obstransform=='log'].obsnme.values - if len(obsnmes) > 0: - for name in obsnmes: - #TODO: make more efficient - self.logger.log("applying obs log-transform to:"+name) - values = sim_ensemble.loc[:,name].astype(float).values - offset = abs(min(values))+1.0 #arbitrary; enforce positive values - values+=offset - assert min(values)>0, "values must be positive. min value is "+str(min(values)) - sim_ensemble.loc[:,name] = np.log10(values) - obs.loc[obs.obsnme==name,'offset'] = offset - obs[['obsnme','obsval','obstransform','offset']].to_csv(os.path.join(t_d,"dsi_obs_transform.csv"),index=False) - #numpy binary for i/o speed - np.save(os.path.join(t_d,"dsi_obs_offset.npy"), - obs.offset.values, - allow_pickle=False, fix_imports=True) - obs['flag'] = 0 - obs.loc[obs.obstransform=='log', "flag"] = 1 - np.save(os.path.join(t_d,"dsi_obs_log.npy"), - obs.flag.values, - allow_pickle=False, fix_imports=True) - - if apply_normal_score_transform: - # prepare for normal score transform - nstval = randrealgen_optimized(sim_ensemble.shape[0]) - back_transform_df = pd.DataFrame() - self.logger.log("applying normal score transform to non-zero obs and predictions") - #TODO: make more efficient - for name in transf_names: - print("transforming:",name) - values = sim_ensemble._df.loc[:,name].copy() - values.sort_values(inplace=True) - if values.iloc[0] != values.iloc[-1]: - # apply smoothing as per DSI2; window sizes are arbitrary... 
- window_size=3 - if values.shape[0]>40: - window_size=5 - if values.shape[0]>90: - window_size=7 - if values.shape[0]>200: - window_size=9 - #print("window size:",window_size,values.shape[0]) - values.loc[:] = moving_average_with_endpoints(values.values, window_size) - transformed_values = [normal_score_transform(nstval, values.values, v)[0] for v in values.values] - #transformed_values, sorted_values, sorted_idxs = normal_score_transform(values) #transformed data retains the same order as the original data - elif values.iloc[0] == values.iloc[-1]: - print("all values are the same, skipping nst") - transformed_values = values.values - sim_ensemble.loc[values.index,name] = transformed_values - df = pd.DataFrame() - df['real'] = values.index - df['sorted_values'] = values.values - df['transformed_values'] = transformed_values - df['nstval'] = nstval - df['obsnme'] = name - back_transform_df=pd.concat([back_transform_df,df],ignore_index=True) - #back_transform_df.to_csv(os.path.join(t_d,"dsi_obs_backtransform.csv"),index=False) - #numpy binary for speed - np.save(os.path.join(t_d,"dsi_obs_backtransformvals.npy"), - back_transform_df[['sorted_values',"nstval"]].values, - allow_pickle=False, fix_imports=True) - np.save(os.path.join(t_d,"dsi_obs_backtransformobsnmes.npy"), - back_transform_df['obsnme'].values, - allow_pickle=True, fix_imports=True) - - self.logger.log("applying transformations") - - self.logger.log("computing projection matrix") - if use_ztz: - self.logger.log("using ztz approach...") - pmat, s = compute_using_ztz(sim_ensemble) - self.logger.log("using ztz approach...") - else: - self.logger.log("using z approach...") - pmat, s = compute_using_z(sim_ensemble) - self.logger.log("using z approach...") - self.logger.log("computing projection matrix") - - self.logger.log("applying truncation...") - apply_energy_based_truncation(energy,s,pmat) - self.logger.log("applying truncation...") - - self.logger.log("creating tpl files") - dsi_in_file = os.path.join(t_d, "dsi_pars.csv") - dsi_tpl_file = dsi_in_file + ".tpl" - ftpl = open(dsi_tpl_file, 'w') - fin = open(dsi_in_file, 'w') - ftpl.write("ptf ~\n") - fin.write("parnme,parval1\n") - ftpl.write("parnme,parval1\n") - npar = s.shape[0] - dsi_pnames = [] - for i in range(npar): - pname = "dsi_par{0:04d}".format(i) - dsi_pnames.append(pname) - fin.write("{0},0.0\n".format(pname)) - ftpl.write("{0},~ {0} ~\n".format(pname, pname)) - fin.close() - ftpl.close() - - mn_vec = sim_ensemble.mean(axis=0) - # check that sim_ensemble has names ordered - assert (mn_vec.index.values == names).all(), "sim_ensemble names are not ordered" - mn_in_file = os.path.join(t_d, "dsi_pr_mean.csv") - mn_tpl_file = mn_in_file + ".tpl" - fin = open(mn_in_file, 'w') - ftpl = open(mn_tpl_file, 'w') - ftpl.write("ptf ~\n") - fin.write("obsnme,mn\n") - ftpl.write("obsnme,mn\n") - mn_dict = {} - for oname in names: - pname = "dsi_prmn_{0}".format(oname) - fin.write("{0},{1}\n".format(oname, mn_vec[oname])) - ftpl.write("{0},~ {1} ~\n".format(oname, pname)) - mn_dict[pname] = mn_vec[oname] - fin.close() - ftpl.close() - self.logger.log("creating tpl files") - - self.logger.log("saving proj mat") - #row_names = ["sing_vec_{0}".format(i) for i in range(pmat.shape[0])] - pmat = Matrix(x=pmat,col_names=dsi_pnames,row_names=names) - pmat.col_names = dsi_pnames - #proj_name = "dsi_proj_mat.jcb" # dont change this name!!! - proj_name = "dsi_proj_mat.npy" # dont change this name!!! 
- proj_path = os.path.join(t_d,proj_name) - #pmat.to_coo(proj_path) - # use numpy for speed - np.save(os.path.join(t_d,proj_name), pmat.x, allow_pickle=False, fix_imports=True) - - self.logger.statement("projection matrix dimensions:"+str(pmat.shape)) - self.logger.statement("projection matrix saved to "+proj_path) - self.logger.log("saving proj mat") - - - # this is the dsi forward run function - it is harded coded below! - def dsi_forward_run(): - import os - import numpy as np - import pandas as pd - from pyemu.utils.helpers import inverse_normal_score_transform - pmat = np.load("dsi_proj_mat.npy") - pvals = pd.read_csv("dsi_pars.csv",index_col=0) - ovals = pd.read_csv("dsi_pr_mean.csv",index_col=0) - sim_vals = ovals + np.dot(pmat,pvals.values) - filename = "dsi_obs_backtransformvals.npy" - if os.path.exists(filename): - print("applying back-transform") - backtransformvals = np.load("dsi_obs_backtransformvals.npy") - backtransformobsnmes = np.load("dsi_obs_backtransformobsnmes.npy",allow_pickle=True) - obsnmes = np.unique(backtransformobsnmes) - back_vals = [ - inverse_normal_score_transform( - backtransformvals[np.where(backtransformobsnmes==o)][:,1], - backtransformvals[np.where(backtransformobsnmes==o)][:,0], - sim_vals.loc[o].mn, - extrap=None - )[0] - for o in obsnmes - ] - sim_vals.loc[obsnmes,'mn'] = back_vals - if os.path.exists("dsi_obs_transform.csv"): - print("reversing log-transform") - offset = np.load("dsi_obs_offset.npy") - log_trans = np.load("dsi_obs_log.npy") - assert log_trans.shape[0] == sim_vals.mn.values.shape[0], f"log transform shape mismatch: {log_trans.shape[0]},{sim_vals.mn.values.shape[0]}" - assert offset.shape[0] == sim_vals.mn.values.shape[0], f"offset transform shape mismatch: {offset.shape[0]},{sim_vals.mn.values.shape[0]}" - vals = sim_vals.mn.values - vals[np.where(log_trans==1)] = 10**vals[np.where(log_trans==1)] - vals-= offset - sim_vals.loc[:,'mn'] = vals - #print(sim_vals) - sim_vals.to_csv("dsi_sim_vals.csv") - - self.logger.log("test run") - b_d = os.getcwd() - os.chdir(t_d) - dsi_forward_run() - os.chdir(b_d) - self.logger.log("test run") - - self.logger.log("creating ins file") - out_file = os.path.join(t_d,"dsi_sim_vals.csv") - ins_file = out_file + ".ins" - sdf = pd.read_csv(out_file,index_col=0) - with open(ins_file,'w') as f: - f.write("pif ~\n") - f.write("l1\n") - for oname in sdf.index.values: - f.write("l1 ~,~ !{0}!\n".format(oname)) - self.logger.log("creating ins file") - - self.logger.log("creating Pst") - pst = Pst.from_io_files([mn_tpl_file,dsi_tpl_file],[mn_in_file,dsi_in_file],[ins_file],[out_file],pst_path=".") - - par = pst.parameter_data - dsi_pars = par.loc[par.parnme.str.startswith("dsi_par"),"parnme"] - par.loc[dsi_pars,"parval1"] = 0 - par.loc[dsi_pars,"parubnd"] = 10.0 - par.loc[dsi_pars,"parlbnd"] = -10.0 - par.loc[dsi_pars,"partrans"] = "none" - with open(os.path.join(t_d,"dsi.unc"),'w') as f: - f.write("START STANDARD_DEVIATION\n") - for p in dsi_pars: - f.write("{0} 1.0\n".format(p)) - f.write("END STANDARD_DEVIATION") - pst.pestpp_options['parcov'] = "dsi.unc" - - mn_pars = par.loc[par.parnme.str.startswith("dsi_prmn"),"parnme"] - par.loc[mn_pars,"partrans"] = "fixed" - for pname,pval in mn_dict.items(): - par.loc[pname,"parval1"] = pval - par.loc[pname, "parubnd"] = pval + 1000 - par.loc[pname, "parlbnd"] = pval - 1000 - - obs = pst.observation_data - org_obs = self.pst.observation_data - for col in org_obs.columns: - obs.loc[org_obs.obsnme,col] = org_obs.loc[:,col] - pst.control_data.noptmax = 0 - 
pst.model_command = "python forward_run.py" - self.logger.log("creating Pst") - import inspect - #print([l for l in inspect.getsource(dsi_forward_run).split("\n")]) - lines = [line[12:] for line in inspect.getsource(dsi_forward_run).split("\n")][1:] - with open(os.path.join(t_d,"forward_run.py"),'w') as f: - for line in lines: - if nst_extrap is not None: - if "extrap=None" in line: - line = line.replace("None",f"'{nst_extrap}'") - f.write(line+"\n") - pst.write(os.path.join(t_d,"dsi.pst"),version=2) - self.logger.statement("saved pst to {0}".format(os.path.join(t_d,"dsi.pst"))) - try: - run("pestpp-ies dsi.pst",cwd=t_d) - except Exception as e: - self.logger.warn("error testing noptmax=0 run:{0}".format(str(e))) - - return pst - - -def compute_using_z(sim_ensemble): - z = sim_ensemble.get_deviations() / np.sqrt(float(sim_ensemble._df.shape[0] - 1)) - z = z.values - u, s, v = np.linalg.svd(z, full_matrices=False) - us = np.dot(v.T, np.diag(s)) - return us,s - -def compute_using_ztz(sim_ensemble): - # rval are the transformed obs values - rval = sim_ensemble._df.copy() - #mu2 is the mean of the transformed obs values - mu2 = rval.mean() - #adjust rval by subtracting mu2 - rval -= mu2 - #divide rval by the sqrt of nreal-1 - nreal = rval.shape[0] - rval = rval*np.sqrt(1/(nreal-1)) - # rval.T to match pest utils implementation - z = rval.T.values - # Compute the ZtZ matrix - ztz = np.dot(z.T,z) - assert ztz.shape[0] == z.shape[1], "ZtZ matrix is not square" - assert ztz.shape[0] == sim_ensemble.shape[0], "ZtZ matrix is not nreal*nreal" - - #We now do SVD on ZtZ. - print("doing SVD on ZtZ") - u, s2, v = np.linalg.svd(ztz, full_matrices=False) - s = np.sqrt(s2) - s[z.shape[0]:] = 0 #truncation to match compute_using_z() - - # formulate the sqrt of the covariance matrix - us = np.dot(z,u) - return us, s - -def apply_energy_based_truncation(energy,s,us): - if energy >= 1.0: - print("Warning: energy>=1.0, no truncation applied") - return us - # Determine where to truncate - # Determine nn - if us.shape[0]==us.shape[1]: - nn = us.shape[0] - 1 - else: - nobs = us.shape[0] - nreal = us.shape[1] - nn = min(nobs, nreal) - 1 - # Compute total_energy - total_energy = np.sum((np.sqrt(s))[:nn]) - # Find energy truncation point - ntrunc = np.where((np.sqrt(s)).cumsum()/total_energy<=energy)[0].shape[0] - # Initialize threshold - #s1 = s[0] - #thresh = 1.0e-7 * s1 #NOTE: JDoh's implementation uses an additional level of truncation - #ntrunc = min(np.where(s>=thresh)[0][0], ntrunc)+1 - ntrunc=ntrunc+1 - if ntrunc>=us.shape[1]: - print("ntrunc>=us.shape[1], no truncation applied") - else: - print("truncating to {0} singular values".format(ntrunc)) - # Apply threshold logic - us = us[:,:ntrunc] - return us - -def moving_average_with_endpoints(y_values, window_size): - # Ensure the window size is odd - if window_size % 2 == 0: - raise ValueError("window_size must be odd") - # Calculate half-window size - half_window = window_size // 2 - # Initialize the output array - smoothed_y = np.zeros_like(y_values) - # Handle the endpoints - for i in range(0,half_window): - # Start - smoothed_y[i] = np.mean(y_values[:i + half_window ]) - for i in range(1,half_window+1): - # End - smoothed_y[-i] = np.mean(y_values[::-1][:i + half_window +1]) - # Handle the middle part with full window - for i in range(half_window, len(y_values) - half_window): - smoothed_y[i] = np.mean(y_values[i - half_window:i + half_window]) - #Enforce endpoints - smoothed_y[0] = y_values[0] - smoothed_y[-1] = y_values[-1] - # Ensure uniqueness by adding 
small increments if values are duplicated - #NOTE: this is a hack to ensure uniqueness in the normal score transform - smoothed_y = make_unique(smoothed_y, delta=1e-10) - return smoothed_y - - -def make_unique(arr, delta=1e-10): - """ - Modifies a sorted numpy array in-place to ensure all elements are unique. - - Parameters: - arr (np.ndarray): The sorted numpy array. - delta (float): The minimum increment to apply to duplicate elements. - Default is a very small value (1e-10). - """ - for i in range(1, len(arr)): - if arr[i] <= arr[i - 1]: - arr[i] = arr[i - 1] + delta - - return arr From c776d8fb416f2a265b0bd54974aa878c87a4a254 Mon Sep 17 00:00:00 2001 From: rhugman Date: Tue, 17 Jun 2025 11:00:24 +0100 Subject: [PATCH 10/58] initial tests commit --- autotest/dsi_tests.py | 108 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 autotest/dsi_tests.py diff --git a/autotest/dsi_tests.py b/autotest/dsi_tests.py new file mode 100644 index 000000000..724973c09 --- /dev/null +++ b/autotest/dsi_tests.py @@ -0,0 +1,108 @@ +import os +import sys +import shutil +import pytest +import numpy as np +import pandas as pd +import platform +import pyemu +from pst_from_tests import setup_tmp, ies_exe_path, _get_port +from pyemu.emulators import DSI + + +#def test_dsi_feature_transforms(): +# """Test feature transforms in DSI emulator""" +# # Create test data simulating an ensemble +# np.random.seed(42) +# n_reals = 10 +# n_obs = 5 +# sim_names = [f"obs{i}" for i in range(n_obs)] +# sim_data = np.random.lognormal(mean=0, sigma=1, size=(n_reals, n_obs)) +# sim_ensemble = pd.DataFrame(sim_data, columns=sim_names) +# +# # Create DSI emulator +# pst = pyemu.Pst.from_par_obs_names(["p1"], sim_names) +# dsi = pyemu.emulators.DSI( +# pst=pst, +# sim_ensemble=sim_ensemble, +# transforms = [{"type": "log10", "columns": sim_names}, +# {"type": "normal_score", "columns": sim_names}], +# +# ) +# +# # Test feature transforms +# dsi.apply_feature_transforms() +# +# # Check that transformed data exists +# assert dsi.data_transformed is not None +# +# # Check log transform was applied (values should be smaller than original lognormal data) +# assert dsi.data_transformed.mean().mean() < sim_ensemble.mean().mean() +# +# # Check the feature transformer object exists +# assert hasattr(dsi, "feature_transformer") +# +# # Test with specific columns for log transform +# dsi2 = pyemu.emulators.DSI( +# pst=pst, +# sim_ensemble=sim_ensemble, +# transforms = [{"type": "log10", "columns": sim_names[:2]}] +# ) +# dsi2.apply_feature_transforms() +# +# # Check only specified columns were log transformed +# orig_means = sim_ensemble.mean() +# transformed_means = dsi2.data_transformed.mean() +# +# for i, col in enumerate(sim_names): +# if i < 2: # Should be log transformed +# assert transformed_means[col] < orig_means[col] +# else: # Should be unchanged +# assert np.isclose(transformed_means[col], orig_means[col]) + +def test_dsi_freyberg(tmp_d): + + test_d = "ends_master" + test_d = setup_tmp(test_d, tmp_d) + + case = "freyberg6_run_ies" + pst_name = os.path.join(test_d, case + ".pst") + pst = pyemu.Pst(pst_name) + predictions = ["headwater_20171130", "tailwater_20161130", "trgw_0_9_1_20161130"] + pst.pestpp_options["predictions"] = predictions + + oe_name = pst_name.replace(".pst", ".0.obs.csv") + oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=oe_name).iloc[:100, :] + data = oe._df.copy() + + dsi = DSI(sim_ensemble=data) + dsi.apply_feature_transforms() + dsi.fit() + + # history 
match + obsdata = pst.observation_data.copy() + td = "template_dsi" + pstdsi = dsi.prepare_pestpp(td,observation_data=obsdata) + pstdsi.control_data.noptmax = 3 + pstdsi.pestpp_options["ies_num_reals"] = 100 + pstdsi.write(os.path.join(td, "dsi.pst"),version=2) + + pvals = pd.read_csv(os.path.join(td, "dsi_pars.csv"), index_col=0) + md = "master_dsi" + num_workers= 3 + worker_root = "." + pyemu.os_utils.start_workers( + td,ies_exe_path,"dsi.pst", num_workers=num_workers, + worker_root=worker_root, master_dir=md, port=_get_port(), + ppw_function=pyemu.helpers.dsi_pyworker, + ppw_kwargs={ + "dsi": dsi, "pvals": pvals, + } + ) + + + return + + +if __name__ == "__main__": + test_dsi_freyberg("temp") \ No newline at end of file From 2b735750ed35803ef1089cda6d82bed752a5f339 Mon Sep 17 00:00:00 2001 From: rhugman Date: Tue, 17 Jun 2025 13:40:29 +0100 Subject: [PATCH 11/58] Portmanager class for dsivc --- pyemu/utils/os_utils.py | 184 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 180 insertions(+), 4 deletions(-) diff --git a/pyemu/utils/os_utils.py b/pyemu/utils/os_utils.py index 1247cf457..d16183662 100644 --- a/pyemu/utils/os_utils.py +++ b/pyemu/utils/os_utils.py @@ -13,6 +13,12 @@ import socket import time from datetime import datetime +import random +import logging +import tempfile +from contextlib import contextmanager +import json +import uuid import numpy as np import pandas as pd @@ -948,10 +954,180 @@ def send_killed_run(self,group=None,runid=None,desc="killed"): + +class PortManager(object): + """Cross-platform port manager for parallel processes.""" + def __init__(self, + port_range=(4004, 65535), + lock_dir=None, + max_retries=50, + lock_timeout=5, + log_level=logging.INFO): + """ + Initialize the port manager. + Args: + port_range: Tuple of (min_port, max_port) to search within + lock_dir: Directory to store lock files (default: system temp dir) + max_retries: Maximum attempts to find an available port + lock_timeout: Time in seconds after which a lock is considered stale + """ + # Set up instance-specific logger + self.logger = logging.getLogger(f"{__name__}.PortManager.{id(self)}") + self.logger.setLevel(log_level) + # Add a handler if none exists + if not self.logger.handlers: + handler = logging.StreamHandler() + formatter = logging.Formatter( + '%(asctime)s - %(processName)s - %(levelname)s - %(message)s' + ) + handler.setFormatter(formatter) + self.logger.addHandler(handler) + self.min_port, self.max_port = port_range + self.lock_dir = lock_dir or os.path.join(tempfile.gettempdir(), "port_locks") + self.max_retries = max_retries + self.lock_timeout = lock_timeout + # Ensure lock directory exists + os.makedirs(self.lock_dir, exist_ok=True) + # Generate a unique ID for this process instance + self.instance_id = str(uuid.uuid4()) + + def _is_port_available(self, port): + """Check if a port is available by attempting to bind to it.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + # Set socket to reuse address to handle TIME_WAIT state + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + s.bind(('localhost', port)) + return True + except (socket.error, OSError): + return False + + def _get_lock_file(self, port): + """Get the path to the lock file for a specific port.""" + return os.path.join(self.lock_dir, f"port_{port}.lock") + + def _clean_stale_locks(self): + """Remove stale lock files based on timeout.""" + now = time.time() + try: + for filename in os.listdir(self.lock_dir): + if filename.startswith("port_") and 
filename.endswith(".lock"): + lock_path = os.path.join(self.lock_dir, filename) + if os.path.exists(lock_path): + # Check if lock is stale + if now - os.path.getmtime(lock_path) > self.lock_timeout: + try: + os.remove(lock_path) + self.logger.debug(f"Removed stale lock file: {lock_path}") + except OSError: + # Another process might have removed it already + pass + except Exception as e: + self.logger.warning(f"Error cleaning stale locks: {e}") + + @contextmanager + def _try_lock_port(self, port): + """ + Try to create a lock file for a port using a cross-platform approach. + Uses atomic file creation to implement locking. + """ + lock_file = self._get_lock_file(port) + lock_acquired = False + try: + # Try to create the lock file - will only succeed if it doesn't exist + lock_data = { + "pid": os.getpid(), + "instance_id": self.instance_id, + "timestamp": time.time() + } + try: + # Try exclusive creation of the file (atomic operation) + with open(lock_file, 'x') as f: + json.dump(lock_data, f) + lock_acquired = True + yield True + except FileExistsError: + # Lock file already exists + try: + # Check if lock file is stale + if os.path.exists(lock_file): + if time.time() - os.path.getmtime(lock_file) > self.lock_timeout: + # Lock is stale, try to replace it + try: + os.remove(lock_file) + with open(lock_file, 'x') as f: + json.dump(lock_data, f) + lock_acquired = True + yield True + return + except (FileExistsError, OSError): + # Failed to acquire lock + pass + except OSError: + pass + yield False + finally: + # Clean up the lock file if we created it + if lock_acquired: + try: + if os.path.exists(lock_file): + os.remove(lock_file) + except OSError as e: + self.logger.warning(f"Error removing lock file for port {port}: {e}") + + def get_available_port(self): + """ + Find and reserve an available port. + Returns: + An available port number. + Raises: + RuntimeError: If no available port can be found after max_retries. 
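+        Example:
+            # minimal usage sketch: reserve a port directly, or use the
+            # reserved_port() context manager to release the lock file when done
+            pm = PortManager()
+            port = pm.get_available_port()
+            with pm.reserved_port() as port:
+                ...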
+ """ + # Clean up stale locks first + self._clean_stale_locks() + # Shuffle port range to distribute port selection + port_list = list(range(self.min_port, self.max_port + 1)) + random.shuffle(port_list) + attempts = 0 + while attempts < self.max_retries: + # Pick a random port from our shuffled list + if not port_list: + raise RuntimeError("Exhausted all ports in range") + port = port_list.pop(0) + attempts += 1 + # First check if port is available + if not self._is_port_available(port): + continue + # Try to acquire a lock + with self._try_lock_port(port) as locked: + if not locked: + # Another process got this port + continue + # Double-check port is still available after locking + if self._is_port_available(port): + self.logger.info(f"Reserved port {port} for process {os.getpid()}") + return port + raise RuntimeError(f"Could not find available port after {self.max_retries} attempts") + + @contextmanager + def reserved_port(self): + """Context manager that reserves a port and releases it after use.""" + port = self.get_available_port() + lock_file = self._get_lock_file(port) + try: + yield port + finally: + # Release the port by removing the lock file + if os.path.exists(lock_file): + try: + os.remove(lock_file) + self.logger.info(f"Released port {port}") + except OSError as e: + self.logger.warning(f"Error releasing port {port}: {e}") + + if __name__ == "__main__": host = "localhost" - port = 4004 + port = PortManager().get_available_port() ppw = PyPestWorker(None,host,port) - #ppw.initialize() - - + #ppw.initialize() \ No newline at end of file From 069fadc1ab1a03aee5707f0c48a38f12f23817fd Mon Sep 17 00:00:00 2001 From: rhugman Date: Tue, 17 Jun 2025 13:54:38 +0100 Subject: [PATCH 12/58] adding ies_exe path arg to dsivc_fwd run fnx to deal with pytest --- pyemu/utils/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index 24cf748dc..b109fb7e4 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -4426,7 +4426,7 @@ def dsi_forward_run(pvals,dsi,write_csv=False): sim_vals.to_csv("dsi_sim_vals.csv") return sim_vals -def dsivc_forward_run(md_ies="."): +def dsivc_forward_run(md_ies=".",ies_exe_path="pestpp-ies"): import pandas as pd import pyemu import os @@ -4487,7 +4487,7 @@ def dsivc_forward_run(md_ies="."): worker_root="." 
dsi = pickle.load(open(os.path.join(md_ies,"dsi.pickle"),"rb")) num_workers = dsi.dsivc_args.get("num_pyworkers",1) - pyemu.os_utils.start_workers(md_ies,"pestpp-ies","dsi.pst", + pyemu.os_utils.start_workers(md_ies,ies_exe_path,"dsi.pst", num_workers=num_workers, worker_root=worker_root, port = PortManager().get_available_port(), From eb26410c8996cb3b7fb41e69427c93ddf942952b Mon Sep 17 00:00:00 2001 From: rhugman Date: Tue, 17 Jun 2025 13:56:22 +0100 Subject: [PATCH 13/58] updates to dsivc for pytest' --- pyemu/emulators/dsi.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py index 6891f18e8..02f3e3277 100755 --- a/pyemu/emulators/dsi.py +++ b/pyemu/emulators/dsi.py @@ -80,7 +80,7 @@ def __init__(self, super().__init__(verbose=verbose) - self.__org_observation_data = pst.observation_data.copy() if pst is not None else None + self.observation_data = pst.observation_data.copy() if pst is not None else None #self.__org_parameter_data = pst.parameter_data.copy() if pst is not None else None #self.__org_control_data = pst.control_data.copy() #breaks pickling if isinstance(sim_ensemble, ObservationEnsemble): @@ -102,35 +102,31 @@ def __init__(self, # check for quadratic_extrapolation if 'quadratic_extrapolation' in t: assert isinstance(t['quadratic_extrapolation'], bool), "'quadratic_extrapolation' must be a boolean" - self.transforms = transforms + self.transforms = transforms self.fitted = False self.data_transformed = None self.decision_variable_names = None #used for DSIVC def prepare_training_data(self, data=None): """ - Prepare training data by applying transformations and computing the projection matrix. - - This method follows these steps: - 1. Apply feature transformations (log transform, normal score transform) - 2. Compute projection matrix using SVD + Prepare and transform training data for model fitting. Parameters ---------- data : pandas.DataFrame, optional - Data to prepare. If None, uses self.data. + Raw training data. If None, uses self.data. Returns ------- - pandas.DataFrame - The prepared data. + tuple + Processed data ready for model fitting. 
""" if data is None: data = self.data - if data is None: raise ValueError("No data provided and no data stored in the emulator") - + + self.logger.statement("applying feature transforms") # Always use the base class transformation method for consistency if self.transforms is not None: self.data_transformed = self.apply_feature_transforms(data, self.transforms) @@ -342,8 +338,10 @@ def prepare_pestpp(self, t_d=None, observation_data=None): obs = pst.observation_data - if observation_data is None: - observation_data = self.__org_observation_data + if observation_data is not None: + self.observation_data = observation_data + else: + observation_data = self.observation_data assert isinstance(observation_data, pd.DataFrame), "observation_data must be a pandas DataFrame" for col in observation_data.columns: obs.loc[sim_vals.index,col] = observation_data.loc[:,col] @@ -382,7 +380,7 @@ def prepare_pestpp(self, t_d=None, observation_data=None): pickle.dump(self,f) return pst - def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=False, dsi_args=None, percentiles=[0.25,0.75,0.5], mou_population_size=None): + def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=False, dsi_args=None, percentiles=[0.25,0.75,0.5], mou_population_size=None,ies_exe_path="pestpp-ies"): """ Prepare Data Space Inversion Variable Control (DSIVC) control files. @@ -532,6 +530,7 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F par.loc[decvar_names,"partrans"] = "none" par.loc[decvar_names,"parubnd"] = self.data.loc[:,decvar_names].max() par.loc[decvar_names,"parlbnd"] = self.data.loc[:,decvar_names].min() + par.loc[decvar_names,"parval1"] = self.data.loc[:,decvar_names].quantile(.5) self.logger.statement(f"zero-weighting observation data...") # prepemtpively set obs weights 0.0 @@ -545,6 +544,9 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F obs.loc[obs.obsnme.str.startswith(o), columns] = obsorg.loc[obsorg.obsnme==o, columns].values obs.loc[stack_stats.index,"obgnme"] = "stack_stats" + obs.loc[stack_stats.index,"org_obsnme"] = [i.split("_stat:")[0] for i in stack_stats.index.values] + pst_dsivc.try_parse_name_metadata() + #obs.loc[stack.index,"obgnme"] = "stack" self.logger.statement(f"building dsivc_forward_run.py...") @@ -555,7 +557,7 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F file.write(function_source) file.write("\n\n") file.write("if __name__ == \"__main__\":\n") - file.write(f" {function_source.split('(')[0].split('def ')[1]}()\n") + file.write(f" {function_source.split('(')[0].split('def ')[1]}(ies_exe_path='{ies_exe_path}')\n") self.logger.statement(f"preparing nominal initial population...") if mou_population_size is None: From 4dcbeb47e0667e73896336e4969218d46c25d9c8 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 09:41:34 +0100 Subject: [PATCH 14/58] checkin dsi --- autotest/dsi_tests.py | 102 ++++++++++++++++++++++++++++++++++++++--- pyemu/emulators/dsi.py | 3 +- 2 files changed, 98 insertions(+), 7 deletions(-) diff --git a/autotest/dsi_tests.py b/autotest/dsi_tests.py index 724973c09..7e267a214 100644 --- a/autotest/dsi_tests.py +++ b/autotest/dsi_tests.py @@ -60,7 +60,7 @@ # else: # Should be unchanged # assert np.isclose(transformed_means[col], orig_means[col]) -def test_dsi_freyberg(tmp_d): +def dsi_freyberg(tmp_d,transforms=None,tag=""): test_d = "ends_master" test_d = setup_tmp(test_d, tmp_d) @@ -75,21 +75,26 @@ def 
test_dsi_freyberg(tmp_d): oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=oe_name).iloc[:100, :] data = oe._df.copy() - dsi = DSI(sim_ensemble=data) + dsi = DSI(sim_ensemble=data,transforms=transforms) dsi.apply_feature_transforms() dsi.fit() # history match obsdata = pst.observation_data.copy() + if "quadratic_extrapolation" in transforms[0].keys(): + nzobs = obsdata.loc[obsdata.weight>0].obsnme.tolist() + ovals = oe.loc[:,nzobs].max(axis=0) * 1.1 + obsdata.loc[nzobs,"obsval"] = ovals.values + td = "template_dsi" pstdsi = dsi.prepare_pestpp(td,observation_data=obsdata) - pstdsi.control_data.noptmax = 3 + pstdsi.control_data.noptmax = 1 pstdsi.pestpp_options["ies_num_reals"] = 100 pstdsi.write(os.path.join(td, "dsi.pst"),version=2) pvals = pd.read_csv(os.path.join(td, "dsi_pars.csv"), index_col=0) - md = "master_dsi" - num_workers= 3 + md = f"master_dsi{tag}" + num_workers = 1 worker_root = "." pyemu.os_utils.start_workers( td,ies_exe_path,"dsi.pst", num_workers=num_workers, @@ -99,10 +104,95 @@ def test_dsi_freyberg(tmp_d): "dsi": dsi, "pvals": pvals, } ) + return + +def test_dsi_basic(tmp_d="temp"): + dsi_freyberg(tmp_d,transforms=None) + return +def test_dsi_nst(tmp_d="temp"): + transforms = [ + {"type": "normal_score", } + ] + dsi_freyberg(tmp_d,transforms=transforms) + return +def test_dsi_nst_extrap(tmp_d="temp"): + transforms = [ + {"type": "normal_score", "quadratic_extrapolation":True} + ] + dsi_freyberg(tmp_d,transforms=transforms) return +def test_dsi_mixed(tmp_d="temp"): + transforms = [ + {"type": "log10", "columns": ["headwater_20171130", "tailwater_20161130"]}, + {"type": "normal_score", } + ] + dsi_freyberg(tmp_d,transforms=transforms) + return + +def test_dsivc_freyberg(): + + md_hm = "master_dsi" + assert os.path.exists(md_hm), f"Master directory {md_hm} does not exist." + td = "template_dsivc" + if os.path.exists(td): + shutil.rmtree(td) + shutil.copytree(md_hm, td) + + dsi = DSI.load(os.path.join(td, "dsi.pickle")) + + pst = pyemu.Pst(os.path.join(td, "dsi.pst")) + oe = pyemu.ObservationEnsemble.from_binary(pst,os.path.join(td, "dsi.1.obs.jcb")) + + obsdata = dsi.observation_data + decvars = obsdata.loc[obsdata.obgnme=="out_wel"].obsnme.tolist() + pstdsivc = dsi.prepare_dsivc(t_d=td, + oe=oe, + decvar_names=decvars, + track_stack=False, + percentiles=[0.05, 0.25, 0.5, 0.75, 0.95], + dsi_args={ + "noptmax":3, + "decvar_weight":10.0, + "num_pyworkers":1, + } + ) + + obs = pstdsivc.observation_data + obs.org_obsnme.unique() + + obsnme = obsdata.loc[obsdata.obgnme=="tailwater"].obsnme.tolist()[-1] + mou_objectives = obs.loc[(obs.org_obsnme==obsnme) & (obs.stat=="50%")].obsnme.tolist() + + pstdsivc.pestpp_options["mou_objectives"] = mou_objectives + obs.loc[mou_objectives, "weight"] = 1.0 + obs.loc[mou_objectives, "obgnme"] = "less_than_obj" + + pstdsivc.control_data.noptmax = 1 #just for testing + pstdsivc.pestpp_options["mou_population_size"] = 10 #just for testing + + pstdsivc.write(os.path.join(td, "dsivc.pst"),version=2) + + md = "master_dsivc" + num_workers = 1 + worker_root = "." 
+ + pyemu.os_utils.start_workers(td, + "pestpp-mou", + "dsivc.pst", + num_workers=num_workers, + worker_root=worker_root, + master_dir=md, + port=_get_port(),) + + + if __name__ == "__main__": - test_dsi_freyberg("temp") \ No newline at end of file + #test_dsi_basic() + #test_dsi_nst() + #test_dsi_nst_extrap() + #test_dsi_mixed() + test_dsivc_freyberg() \ No newline at end of file diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py index 02f3e3277..eb3be1914 100755 --- a/pyemu/emulators/dsi.py +++ b/pyemu/emulators/dsi.py @@ -402,7 +402,8 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F Percentiles to calculate. Default is [0.25, 0.75, 0.5]. mou_population_size : int, optional Population size for multi-objective optimization. - + ies_exe_path : str, optional + Path to the PEST++ IES executable. Default is "pestpp-ies". Returns ------- Pst From 5b1ad836b20df4b56ce588843891945565966195 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 09:47:41 +0100 Subject: [PATCH 15/58] docstrings --- pyemu/emulators/dsi.py | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py index eb3be1914..ec4c4a599 100755 --- a/pyemu/emulators/dsi.py +++ b/pyemu/emulators/dsi.py @@ -15,33 +15,9 @@ class DSI(Emulator): """ - Data Space Inversion emulator class. - - #TODO: add more docstring details - - Parameters - ---------- - pst : Pst, optional - A Pst object. If provided, the emulator will be initialized with the - information from the Pst object. - sim_ensemble : ObservationEnsemble, optional - An ensemble of simulated observations. If provided, the emulator will - be initialized with the information from the ensemble. - transforms : list of dict, optional - List of transformation specifications. Each dict should have: - - 'type': str - Type of transformation (e.g.,'log10', 'normal_score'). - - 'columns': list of str,optional - Columns to apply the transformation to. If not supplied, transformation is applied to all columns. - - Additional kwargs for the transformation (e.g., 'quadratic_extrapolation' for normal score transform). - Example: - transforms = [ - {'type': 'log10', 'columns': ['obs1', 'obs2']}, - {'type': 'normal_score', 'quadratic_extrapolation': True} - ] - Default is None, which means no transformations will be applied. - energy_threshold : float, optional - The energy threshold for the SVD. Default is 1.0, no truncation. - verbose : bool, optional - If True, enable verbose logging. Default is False. + Data Space Inversion (DS) emulator class. Based on DSI as described in Sun & + Durlofsky (2017) and Sun et al (2017). 
+ """ def __init__(self, From 7d3fbff783f94654142ed71e99b88e383ca98e74 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 09:50:05 +0100 Subject: [PATCH 16/58] init --- pyemu/emulators/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyemu/emulators/__init__.py b/pyemu/emulators/__init__.py index 3bd39b1da..5b521dceb 100755 --- a/pyemu/emulators/__init__.py +++ b/pyemu/emulators/__init__.py @@ -8,9 +8,11 @@ AutobotsAssemble ) from .base import Emulator +from .dsi import DSI __all__ = [ 'Emulator', #base Emulator Class + 'DSI', # DSI Emulator Class 'BaseTransformer', 'Log10Transformer', 'RowWiseMinMaxScaler', From 8f57091edaa27b7e8c3171f0e01a79fb2d0a3495 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 09:51:10 +0100 Subject: [PATCH 17/58] init --- pyemu/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pyemu/__init__.py b/pyemu/__init__.py index 9b88113c7..85e9fe45d 100644 --- a/pyemu/__init__.py +++ b/pyemu/__init__.py @@ -20,7 +20,13 @@ from .sc import Schur from .utils import (geostats, gw_utils, helpers, metrics, optimization, os_utils, pp_utils, smp_utils) -from .emulators import (Emulator, BaseTransformer, Log10Transformer, +from .emulators import ( + #emulators + Emulator, DSI, + + + #transformers + BaseTransformer, Log10Transformer, RowWiseMinMaxScaler, StandardScalerTransformer, NormalScoreTransformer, TransformerPipeline, AutobotsAssemble) #from .prototypes import * From e83f18f8f3bfe4f833e529ab41dc2bf22e779cc5 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 10:43:02 +0100 Subject: [PATCH 18/58] fix to dsi tests --- autotest/dsi_tests.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/autotest/dsi_tests.py b/autotest/dsi_tests.py index 7e267a214..3f0fa04a8 100644 --- a/autotest/dsi_tests.py +++ b/autotest/dsi_tests.py @@ -81,10 +81,11 @@ def dsi_freyberg(tmp_d,transforms=None,tag=""): # history match obsdata = pst.observation_data.copy() - if "quadratic_extrapolation" in transforms[0].keys(): - nzobs = obsdata.loc[obsdata.weight>0].obsnme.tolist() - ovals = oe.loc[:,nzobs].max(axis=0) * 1.1 - obsdata.loc[nzobs,"obsval"] = ovals.values + if transforms is not None: + if "quadratic_extrapolation" in transforms[0].keys(): + nzobs = obsdata.loc[obsdata.weight>0].obsnme.tolist() + ovals = oe.loc[:,nzobs].max(axis=0) * 1.1 + obsdata.loc[nzobs,"obsval"] = ovals.values td = "template_dsi" pstdsi = dsi.prepare_pestpp(td,observation_data=obsdata) @@ -157,7 +158,8 @@ def test_dsivc_freyberg(): "noptmax":3, "decvar_weight":10.0, "num_pyworkers":1, - } + }, + ies_exe_path=ies_exe_path, ) obs = pstdsivc.observation_data From 61e8d9528542fcc7060981f0ced4dcc6dbff4804 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 10:45:54 +0100 Subject: [PATCH 19/58] moved dsi tests to dsi_tests.py --- autotest/la_tests.py | 157 ------------------------------------------- 1 file changed, 157 deletions(-) diff --git a/autotest/la_tests.py b/autotest/la_tests.py index 5ec2f8640..9b426c8f2 100644 --- a/autotest/la_tests.py +++ b/autotest/la_tests.py @@ -595,167 +595,10 @@ def ends_freyberg_test(tmp_path): -def ends_run_freyberg_dsi(tmp_d, nst=False, nst_extrap=None, ztz=False, energy=1.0): - import pyemu - import os - import pandas as pd - import numpy as np - test_d = "ends_master" - test_d = setup_tmp(test_d, tmp_d) - case = "freyberg6_run_ies" - pst_name = os.path.join(test_d, case + ".pst") - pst = pyemu.Pst(pst_name) - predictions = ["headwater_20171130", "tailwater_20161130", 
"trgw_0_9_1_20161130"] - pst.pestpp_options["predictions"] = predictions - - oe_name = pst_name.replace(".pst", ".0.obs.csv") - oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=oe_name).iloc[:100, :] - - ends = pyemu.EnDS(pst=pst, sim_ensemble=oe,verbose=True) - t_d = os.path.join(tmp_d, "dsi_template") - - ends.prep_for_dsi(t_d=t_d, - use_ztz=ztz, - apply_normal_score_transform=nst, - nst_extrap=nst_extrap, - energy=energy) - # copy exe to dsi_template - #shutil.copy2(os.path.join(test_d,"pestpp-ies.exe"),os.path.join(t_d,"pestpp-ies.exe")) - filename=os.path.join(t_d,"dsi.0.obs.csv") - if os.path.exists(filename): - os.remove(filename) - pst = pyemu.Pst(os.path.join(t_d,"dsi.pst")) - pst.control_data.noptmax = -1 - pst.pestpp_options["overdue_giveup_fac"] = 100000000 - pst.write(os.path.join(t_d,"dsi.pst"),version=2) - #pyemu.os_utils.run("pestpp-ies dsi.pst",cwd=t_d) - - pvals = pd.read_csv(os.path.join(t_d,"dsi_pars.csv"),index_col=0) - pmat = np.load(os.path.join(t_d,"dsi_proj_mat.npy")) - ovals = pd.read_csv(os.path.join(t_d,"dsi_pr_mean.csv"),index_col=0) - - - m_d = t_d.replace("template","master") - port = _get_port() - pyemu.os_utils.start_workers(t_d, ies_exe_path,"dsi.pst", - worker_root=tmp_d, - master_dir=m_d, num_workers=10, port=port, - ppw_function=pyemu.helpers.dsi_pyworker, - ppw_kwargs={"pmat":pmat,"ovals":ovals,"pvals":pvals}) - #read in the results - oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=os.path.join(m_d,"dsi.0.obs.csv")) - assert oe.shape[0]==50, f"{50-oe.shape[0]} failed runs" - phi_vector = oe.phi_vector.sort_values().values - assert phi_vector[0] != phi_vector[1],phi_vector - -def ends_freyberg_dsi_test(tmp_path): - ends_run_freyberg_dsi(tmp_path) - -def ends_freyberg_dsi_nst_test(tmp_path): - ends_run_freyberg_dsi(tmp_path,nst=True,nst_extrap=None) - -def ends_freyberg_dsi_extrap_test(tmp_path): - ends_run_freyberg_dsi(tmp_path,nst=True,nst_extrap='quadratic') - -def ends_freyberg_dsi_ztz_test(tmp_path): - ends_run_freyberg_dsi(tmp_path,ztz=True) - -def ends_freyberg_dsi_svd_test(tmp_path): - ends_run_freyberg_dsi(tmp_path,ztz=True,energy=0.999) - - -def plot_freyberg_dsi(): - import pandas as pd - import pyemu - import matplotlib.pyplot as plt - - test_d = "ends_master" - case = "freyberg6_run_ies" - pst_name = os.path.join(test_d, case + ".pst") - pst = pyemu.Pst(pst_name) - predictions = ["headwater_20171130", "tailwater_20161130", "trgw_0_9_1_20161130"] - oe_name = pst_name.replace(".pst", ".0.obs.csv") - pr_oe = pd.read_csv(os.path.join(test_d,"freyberg6_run_ies.0.obs.csv"),index_col=0) - pt_oe = pd.read_csv(os.path.join(test_d, "freyberg6_run_ies.3.obs.csv"), index_col=0) - - m_d = os.path.join("dsi", "master_dsi") - pst = pyemu.Pst(os.path.join(m_d,"dsi.pst")) - pr_oe_dsi = pd.read_csv(os.path.join(m_d,"dsi.0.obs.csv"),index_col=0) - pt_oe_dsi = pd.read_csv(os.path.join(m_d, "dsi.3.obs.csv"), index_col=0) - - pv = pyemu.ObservationEnsemble(pst=pst,df=pt_oe).phi_vector - pv_dsi = pyemu.ObservationEnsemble(pst=pst, df=pt_oe_dsi).phi_vector - #print(pt_oe.shape) - pt_oe = pt_oe.loc[pv<25, :] - pt_oe_dsi = pt_oe_dsi.loc[pv_dsi < 25, :] - - # print(pt_oe.shape) - # fig,ax = plt.subplots(1,1,figsize=(5,5)) - # ax.hist(pv,bins=10,facecolor="b",alpha=0.5,density=True) - # ax.hist(pv_dsi, bins=10, facecolor="m", alpha=0.5,density=True) - # ax.set_yticks([]) - # plt.tight_layout() - # plt.show() - - - - fig,axes = plt.subplots(len(predictions),1,figsize=(10,10)) - for p,ax in zip(predictions,axes): - 
ax.hist(pr_oe.loc[:,p].values,bins=10,alpha=0.5,facecolor="0.5",density=True,label="prior") - ax.hist(pt_oe.loc[:, p].values, bins=10, alpha=0.5, facecolor="b",density=True,label="posterior") - ax.hist(pr_oe_dsi.loc[:, p].values, bins=10, facecolor="none",hatch="/",edgecolor="0.5", - lw=2.5,density=True,label="dsi prior") - ax.hist(pt_oe_dsi.loc[:, p].values, bins=10, facecolor="none",density=True,hatch="/",edgecolor="b",lw=2.5, - label="dsi posterior") - ax.set_title(p,loc="left") - ax.legend(loc="upper right") - ax.set_yticks([]) - plt.tight_layout() - plt.savefig("dsi_pred.pdf") - - -def dsi_normscoretransform_test(): - import numpy as np - import pyemu - from pyemu.utils.helpers import randrealgen_optimized,normal_score_transform,inverse_normal_score_transform - test_d = "ends_master" - case = "freyberg6_run_ies" - pst_name = os.path.join(test_d, case + ".pst") - pst = pyemu.Pst(pst_name) - - oe_name = pst_name.replace(".pst", ".0.obs.csv") - oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=oe_name).iloc[:100, :] - - nstval = randrealgen_optimized(oe.shape[0], 1e-7, 1e4) - window_size=3 - if oe.shape[0]>40: - window_size=5 - if oe.shape[0]>90: - window_size=7 - if oe.shape[0]>200: - window_size=9 - for name in oe.columns: - print("transforming:",name) - sorted_values = oe._df.loc[:,name].sort_values().copy() - #if all values are the same, skip - if sorted_values.iloc[0] == sorted_values.iloc[-1]: - print("all values are the same, skipping") - continue - sorted_values.loc[:] = pyemu.eds.moving_average_with_endpoints(sorted_values.values, window_size) - transformed_values = np.asarray([normal_score_transform(nstval, sorted_values, value)[0] for value in sorted_values]) - backtransformed_values = np.asarray([inverse_normal_score_transform(nstval, sorted_values, value)[0] for value in transformed_values]) - - diff = backtransformed_values-sorted_values - assert max(abs(diff))<1e-7, backtransformed_values - if __name__ == "__main__": - #dsi_normscoretransform_test() #ends_freyberg_test("temp") - ends_freyberg_dsi_test("temp") #ends_freyberg_dev() - #ends_freyberg_dsi_test("temp") - #plot_freyberg_dsi() #obscomp_test() #alternative_dw() #freyberg_verf_test() From 042c96a8355e918bf77dd45c3947af62507a3d9a Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 10:46:16 +0100 Subject: [PATCH 20/58] moved dsi tests to dsi_tests.py --- autotest/la_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/la_tests.py b/autotest/la_tests.py index 9b426c8f2..6e79ec9c7 100644 --- a/autotest/la_tests.py +++ b/autotest/la_tests.py @@ -597,7 +597,7 @@ def ends_freyberg_test(tmp_path): if __name__ == "__main__": - #ends_freyberg_test("temp") + ends_freyberg_test("temp") #ends_freyberg_dev() #obscomp_test() #alternative_dw() From 8a817a59f20a41f3d3399c50b75f93eec5cecaac Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 10:57:50 +0100 Subject: [PATCH 21/58] docstrings --- pyemu/emulators/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyemu/emulators/base.py b/pyemu/emulators/base.py index f088d91ee..c00809dc1 100755 --- a/pyemu/emulators/base.py +++ b/pyemu/emulators/base.py @@ -99,7 +99,8 @@ def prepare_training_data(self, data=None): def apply_feature_transforms(self, data=None, transforms=None): """ Apply feature transformations to data with customizable transformer sequence. - + This function is not intended to be used directly by users. 
+ Parameters ---------- data : pandas.DataFrame, optional From 18ed8ed90886dfd19945e970e2b99fee97d70ab5 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 10:58:05 +0100 Subject: [PATCH 22/58] use class save instead of pickle --- pyemu/emulators/dsi.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py index ec4c4a599..a3b699d73 100755 --- a/pyemu/emulators/dsi.py +++ b/pyemu/emulators/dsi.py @@ -6,7 +6,6 @@ import pandas as pd import inspect from pyemu.utils.helpers import dsi_forward_run, series_to_insfile -import pickle import os import shutil from pyemu.pst.pst_handler import Pst @@ -351,9 +350,8 @@ def prepare_pestpp(self, t_d=None, observation_data=None): pst.write(os.path.join(t_d,"dsi.pst"),version=2) self.logger.statement("saved pst to {0}".format(os.path.join(t_d,"dsi.pst"))) - #self.pst_dsi = pst #breaks pickling #TODO: add save/load methods to Emulator class - with open(os.path.join(t_d,"dsi.pickle"),"wb") as f: - pickle.dump(self,f) + self.logger.statement("pickling dsi object to {0}".format(os.path.join(t_d,"dsi.pickle"))) + self.save(os.path.join(t_d,"dsi.pickle")) return pst def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=False, dsi_args=None, percentiles=[0.25,0.75,0.5], mou_population_size=None,ies_exe_path="pestpp-ies"): @@ -570,8 +568,7 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F self.logger.statement("overwriting dsi.pickle file...") self.decision_variable_names = decvar_names # re-pickle dsi to track dsivc args - with open(os.path.join(t_d,"dsi.pickle"),"wb") as f: - pickle.dump(self,f) + self.save(os.path.join(t_d,"dsi.pickle")) self.logger.statement("DSIVC control files created...the user still needs to specify objectives and constraints...") return pst_dsivc \ No newline at end of file From 19b3801affd5fd11ff69f3032cb50f461e01d248 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 18 Jun 2025 14:31:21 +0100 Subject: [PATCH 23/58] checkin baseline ldfa with sklearn --- pyemu/emulators/ldfa.py | 505 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 505 insertions(+) create mode 100644 pyemu/emulators/ldfa.py diff --git a/pyemu/emulators/ldfa.py b/pyemu/emulators/ldfa.py new file mode 100644 index 000000000..707ba80f3 --- /dev/null +++ b/pyemu/emulators/ldfa.py @@ -0,0 +1,505 @@ +""" +Learning-based pattern-data-driven forecast approach (LDFA) emulator implementation. + +""" +from __future__ import print_function, division +import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.decomposition import PCA +from sklearn.neural_network import MLPRegressor + +from .base import Emulator +from .transformers import RowWiseMinMaxScaler + +# Define scikit-learn based model class +class LDFAModel: + """ + Scikit-learn MLPRegressor wrapper for LDFA neural network model. 
+ """ + def __init__(self, input_dim, output_dim, hidden_units=None, activation='relu', + dropout_rate=0.0, learning_rate=0.01, max_iter=200, early_stopping=True): + + if hidden_units is None: + hidden_units = (2 * input_dim,) + elif isinstance(hidden_units, list): + hidden_units = tuple(hidden_units) + + # Map activation functions from PyTorch to scikit-learn + activation_map = { + 'relu': 'relu', + 'tanh': 'tanh', + 'sigmoid': 'logistic' + } + + self.model = MLPRegressor( + hidden_layer_sizes=hidden_units, + activation=activation_map.get(activation, 'relu'), + learning_rate_init=learning_rate, + max_iter=max_iter, + early_stopping=early_stopping, + validation_fraction=0.2, + n_iter_no_change=20, # Patience for early stopping + random_state=42, + warm_start=False, + alpha=dropout_rate if dropout_rate > 0 else 0.0001 # Use L2 regularization instead of dropout + ) + + def fit(self, X, y): + """Fit the model""" + return self.model.fit(X, y) + + def predict(self, X): + """Make predictions""" + return self.model.predict(X) + + @property + def loss_curve_(self): + """Get training loss curve""" + return getattr(self.model, 'loss_curve_', []) + + +class LDFA(Emulator): + """ + Class for the Learning-based pattern-data-driven forecast approach from Kim et al (2025). + + This emulator uses neural networks to learn the relationships between inputs + and forecast outputs, with dimensionality reduction via PCA. + + Parameters + ---------- + data : pandas.DataFrame + The training data with input and forecast columns. + input_cols : list + List of column names to use as inputs. + groups : dict + Dictionary mapping group names to lists of column names. Used for row-wise min-max scaling. + fit_groups : dict + Dictionary mapping group names to lists of column names used to fit the scaling. + forecast_names : list, optional + List of column names to forecast. If None, all columns in data will be used. + energy_threshold : float, optional + Energy threshold for the PCA. Default is 1.0. + seed : int, optional + Random seed for reproducibility. Default is None. + early_stop : bool, optional + Whether to use early stopping during training. Default is True. + apply_std_scaler : bool, optional + Whether to apply standard scaling before min-max scaling. Default is False. + verbose : bool, optional + If True, enable verbose logging. Default is True. + """ + + def __init__(self, + data, + input_cols, + groups, + fit_groups, + forecast_names=None, + energy_threshold=1.0, + seed=None, + early_stop=True, + transforms=None, + verbose=True): + """ + Initialize the Learning-based pattern-data-driven NN emulator. + + Parameters + ---------- + data : pandas.DataFrame + The training data with input and forecast columns. + input_cols : list + List of column names to use as inputs. + groups : dict + Dictionary mapping group names to lists of column names. Used for row-wise min-max scaling. + fit_groups : dict + Dictionary mapping group names to lists of column names used to fit the scaling. + forecast_names : list, optional + List of column names to forecast. If None, all columns in data will be used. + energy_threshold : float, optional + Energy threshold for the PCA. Default is 1.0. + seed : int, optional + Random seed for reproducibility. Default is None. + early_stop : bool, optional + Whether to use early stopping during training. Default is True. + transforms : list of dict, optional + List of transformation specifications. Each dict should have: + - 'type': str - Type of transformation (e.g.,'log10', 'normal_score'). 
+ - 'columns': list of str,optional - Columns to apply the transformation to. If not supplied, transformation is applied to all columns. + - Additional kwargs for the transformation (e.g., 'quadratic_extrapolation' for normal score transform). + Example: + transforms = [ + {'type': 'log10', 'columns': ['obs1', 'obs2']}, + {'type': 'normal_score', 'quadratic_extrapolation': True} + ] + Default is None, which means no transformations will be applied. + verbose : bool, optional + If True, enable verbose logging. Default is True. + """ + + + super().__init__(verbose=verbose) + + self.seed = seed + self.data = data + self.input_cols = input_cols + self.groups = groups + self.fit_groups = fit_groups + + if forecast_names is None: + forecast_names = data.columns + self.forecast_names = forecast_names + + self.energy_threshold = energy_threshold + + # Store early stopping preference + self.use_early_stopping = early_stop + + self.transforms = transforms + self.noise_model = None + self.model = None + self.train_data = None + self.test_data = None + + def prepare_training_data(self, data=None, test_size=0.2): + """ + Prepare the training data for model fitting. + + This method: + 1. Splits the data into training and test sets + 2. Applies transform pipelines if specified + 3. Applies row-wise min-max scaling + 4. Performs PCA dimensionality reduction + + Parameters + ---------- + data : pandas.DataFrame, optional + Data to prepare. If None, uses self.data. Default is None. + test_size : float, optional + Fraction of data to use for testing. Default is 0.2. + + Returns + ------- + dict + Dictionary containing prepared data components: + - X_train: Input training data after transformation and PCA + - y_train: Target training data after transformation and PCA + - X_test: Input testing data after transformation and PCA + - y_test: Target testing data after transformation and PCA + """ + if data is None: + data = self.data + + if data is None: + raise ValueError("No data provided and no data stored in the emulator") + + # Split the data into training and test sets + train, test = train_test_split( + data, + test_size=test_size, + random_state=self.seed + ) + + self.logger.statement("preparing training data: data split complete") + + # Store for later use + self.train_data = train.copy() + self.test_data = test.copy() + + + # TODO: Apply feature transformations if specified + # Always use the base class transformation method for consistency + if self.transforms is None: + from .transformers import AutobotsAssemble + self.feature_transformer = AutobotsAssemble(train.copy()) + train_transformed = train + test_transformed = test + else: + train_transformed = self.apply_feature_transforms(train, self.transforms) + test_transformed = self.feature_transformer.transform(test) + + + # Apply row-wise min-max scaling directly (not through the pipeline) + # We need to keep train and test separate; there may be a more elegant solution to this....
+ # training data + self.logger.statement("applying row-wise min-max scaling") + self.rowwise_mm_scalers ={ + "train": RowWiseMinMaxScaler( + feature_range=(-1, 1), + groups=self.groups, + fit_groups=self.fit_groups ) + } + self.rowwise_mm_scalers["train"].fit(train_transformed) + train_scaled = self.rowwise_mm_scalers["train"].transform(train_transformed) + + # test data + # We need to fit a new scaler on the test data + self.rowwise_mm_scalers["test"] = RowWiseMinMaxScaler( + feature_range=(-1, 1), + groups=self.groups, + fit_groups=self.fit_groups ) + self.rowwise_mm_scalers["test"].fit(test_transformed) + test_scaled = self.rowwise_mm_scalers["test"].transform(test_transformed) + + self.logger.statement("row-wise min-max scaling complete") + + # Split datasets into input (X) and target (y) variables + X_train = train_scaled.loc[:, self.input_cols].copy() + y_train = train_scaled.loc[:, self.forecast_names].copy() + + X_test = test_scaled.loc[:, self.input_cols].copy() + y_test = test_scaled.loc[:, self.forecast_names].copy() + + # Apply PCA to reduce the dimensionality of the data + self.logger.statement("applying PCA dimensionality reduction") + self.pcaX = PCA()#n_components=X_test.shape[1]) + self.pcay = PCA()#n_components=y_test.shape[1]) + + self.X = self.pcaX.fit_transform(X_train) + self.y = self.pcay.fit_transform(y_train) + + self.X_test = self.pcaX.transform(X_test) + self.y_test = self.pcay.transform(y_test) + + self.logger.statement("PCA dimensionality reduction complete") + + return { + 'X_train': self.X, + 'y_train': self.y, + 'X_test': self.X_test, + 'y_test': self.y_test + } + + def _build_model(self, params=None, prob=False): + """ + Build a neural network model with the specified parameters. + + Parameters + ---------- + params : dict or pandas.Series, optional + Dictionary with model parameters including: + - activation: Activation function to use + - hidden_units: List of units in each hidden layer + - dropout_rate: Rate of dropout for regularization + - learning_rate: Learning rate for optimizer + If None, uses default parameters. Default is None. + prob : bool, optional + Whether to build a probabilistic model. Default is False. + + Returns + ------- + LDFAModel + The scikit-learn MLPRegressor wrapper instance. + """ + if params is None: + params = { + 'activation': 'relu', + 'hidden_units': None, + 'dropout_rate': 0.0, + 'learning_rate': 0.01 + } + + if isinstance(params, pd.Series): + params = params.to_dict() + + input_dim = self.X.shape[1] + output_dim = self.y.shape[1] + + # Create the model architecture + model = LDFAModel( + input_dim=input_dim, + output_dim=output_dim, + hidden_units=params['hidden_units'], + activation=params['activation'], + dropout_rate=params['dropout_rate'], + learning_rate=params['learning_rate'], + early_stopping=self.use_early_stopping + ) + + return model + + def create_model(self, params=None): + """ + Create and store the main model. + + Parameters
 + ---------- + params : dict, optional + Dictionary of model parameters. Default is None. + + Returns + ------- + self : LDFA + The emulator instance with model created. + """ + self.model = self._build_model(params) + return self + + def add_noise_model(self, params=None): + """ + Add a noise model to capture residuals. + + Parameters + ---------- + params : dict, optional + Dictionary of model parameters for the noise model. Default is None. + + Returns + ------- + self : LDFA + The emulator instance with noise model added.
+ """ + # Create noise model + self.noise_model = self._build_model(params) + + # Get residuals from main model + self.logger.statement("calculating residuals for noise model") + + # Get predictions from main model + pred_train = self.model.predict(self.X) + residuals_train = self.y - pred_train + + # Train noise model on residuals + self.logger.statement("training noise model on residuals") + self.noise_model.fit(self.X, residuals_train) + + return self + + def fit(self, epochs=200, batch_size=32, X=None, y=None, prepare_data=True): + """ + Fit the model to the training data. + + Parameters + ---------- + epochs : int, optional + Number of training epochs. Default is 200. + batch_size : int, optional + Batch size for training. Default is 32. + X : pandas.DataFrame, optional + Input data for training. If None and prepare_data is True, + will run prepare_training_data(). Default is None. + y : pandas.DataFrame, optional + Not used directly but included for API consistency. Default is None. + prepare_data : bool, optional + Whether to prepare training data if not already done. Default is True. + + Returns + ------- + self : LDFA + The fitted emulator. + """ + if prepare_data and (X is None or self.X is None): + self.prepare_training_data() + + if self.model is None: + self.create_model() + + # Update max_iter for the model + self.model.model.max_iter = epochs + + # Simple fit - scikit-learn handles batching, early stopping, etc. + self.logger.statement(f"fitting model with MLPRegressor: {epochs} epochs") + + X_train = self.X if X is None else X + y_train = self.y + + # Fit the model + self.model.fit(X_train, y_train) + + # Store training history + self.history = { + 'loss': self.model.loss_curve_, + 'val_loss': [] # MLPRegressor doesn't provide separate validation loss + } + + # Log final training info + n_iter = getattr(self.model.model, 'n_iter_', epochs) + final_loss = self.model.loss_curve_[-1] if self.model.loss_curve_ else "N/A" + self.logger.statement(f"Training completed in {n_iter} iterations, final loss: {final_loss}") + + self.fitted = True + return self + + def predict(self, data): + """ + Generate predictions for new data. + + Parameters + ---------- + data : pandas.DataFrame + New data to generate predictions for. + + Returns + ------- + pandas.DataFrame + Predictions for the input data. + """ + if not self.fitted: + raise ValueError("Emulator must be fitted before prediction") + + if self.model is None: + raise ValueError("No model has been created. 
Call create_model() first") + + self.logger.statement("generating predictions from fitted model") + + # Make a copy of the input data to avoid modifying the original + truth = data.copy() + predictions = truth.copy() + predictions[:] = np.nan + + # STEP 1: Apply the same sequence of transformations used during training + self.logger.statement("applying transformations to input data") + + # Apply transfrom pipeline if it was used during training + truth_transformed = self.feature_transformer.transform(truth) + + + # Apply row-wise min-max scaling + # We need to fit a new scaler on the truth data + forecast_rowwise_mm_scaler = RowWiseMinMaxScaler( + feature_range=(-1, 1), + groups=self.groups, + fit_groups=self.fit_groups + ) + forecast_rowwise_mm_scaler.fit(truth_transformed) + truth_scaled = forecast_rowwise_mm_scaler.transform(truth_transformed) + + # Extract input columns and apply PCA transformation + X_truth = truth_scaled.loc[:, self.input_cols].copy() + y_truth = truth_scaled.loc[:, self.forecast_names].copy() + + # Apply PCA transform + truth_pca = self.pcaX.transform(X_truth.values) + + # Run model prediction + self.logger.statement("running model prediction") + + # Get model prediction + pred_pca = self.model.predict(truth_pca) + + # Add noise prediction if available + if self.noise_model is not None: + self.logger.statement("adding noise model prediction") + noise_pred = self.noise_model.predict(truth_pca) + pred_pca = pred_pca + noise_pred + + # Apply inverse transformations in REVERSE order of the original transformations + self.logger.statement("performing inverse transformations") + + # First inverse the PCA transform (was the last transform applied) + pred_scaled = pd.DataFrame( + self.pcay.inverse_transform(pred_pca), + columns=y_truth.columns, + index=y_truth.index + ) + + # Then inverse the row-wise min-max scaling (applied before PCA) + pred_transformed = forecast_rowwise_mm_scaler.inverse_transform(pred_scaled) + + # Assign predictions to output + predictions.loc[:, self.forecast_names] = pred_transformed.loc[:, self.forecast_names] + + # Finally, inverse the transform pipeline if it was applied (was the first transform) + predictions = self.feature_transformer.inverse_transform(predictions) + + return predictions \ No newline at end of file From 133e0dea84c0218cd332b87c2d5d38e20365b79c Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 19 Jun 2025 09:30:40 +0100 Subject: [PATCH 24/58] rename test file --- autotest/{dsi_tests.py => emulator_tests.py} | 103 ++++++++++--------- 1 file changed, 53 insertions(+), 50 deletions(-) rename autotest/{dsi_tests.py => emulator_tests.py} (68%) diff --git a/autotest/dsi_tests.py b/autotest/emulator_tests.py similarity index 68% rename from autotest/dsi_tests.py rename to autotest/emulator_tests.py index 3f0fa04a8..459e3d0c7 100644 --- a/autotest/dsi_tests.py +++ b/autotest/emulator_tests.py @@ -10,55 +10,6 @@ from pyemu.emulators import DSI -#def test_dsi_feature_transforms(): -# """Test feature transforms in DSI emulator""" -# # Create test data simulating an ensemble -# np.random.seed(42) -# n_reals = 10 -# n_obs = 5 -# sim_names = [f"obs{i}" for i in range(n_obs)] -# sim_data = np.random.lognormal(mean=0, sigma=1, size=(n_reals, n_obs)) -# sim_ensemble = pd.DataFrame(sim_data, columns=sim_names) -# -# # Create DSI emulator -# pst = pyemu.Pst.from_par_obs_names(["p1"], sim_names) -# dsi = pyemu.emulators.DSI( -# pst=pst, -# sim_ensemble=sim_ensemble, -# transforms = [{"type": "log10", "columns": sim_names}, -# {"type": 
"normal_score", "columns": sim_names}], -# -# ) -# -# # Test feature transforms -# dsi.apply_feature_transforms() -# -# # Check that transformed data exists -# assert dsi.data_transformed is not None -# -# # Check log transform was applied (values should be smaller than original lognormal data) -# assert dsi.data_transformed.mean().mean() < sim_ensemble.mean().mean() -# -# # Check the feature transformer object exists -# assert hasattr(dsi, "feature_transformer") -# -# # Test with specific columns for log transform -# dsi2 = pyemu.emulators.DSI( -# pst=pst, -# sim_ensemble=sim_ensemble, -# transforms = [{"type": "log10", "columns": sim_names[:2]}] -# ) -# dsi2.apply_feature_transforms() -# -# # Check only specified columns were log transformed -# orig_means = sim_ensemble.mean() -# transformed_means = dsi2.data_transformed.mean() -# -# for i, col in enumerate(sim_names): -# if i < 2: # Should be log transformed -# assert transformed_means[col] < orig_means[col] -# else: # Should be unchanged -# assert np.isclose(transformed_means[col], orig_means[col]) def dsi_freyberg(tmp_d,transforms=None,tag=""): @@ -190,6 +141,57 @@ def test_dsivc_freyberg(): port=_get_port(),) +def plot_freyberg_dsi(): + import pandas as pd + import pyemu + import matplotlib.pyplot as plt + + test_d = "ends_master" + case = "freyberg6_run_ies" + pst_name = os.path.join(test_d, case + ".pst") + pst = pyemu.Pst(pst_name) + predictions = ["headwater_20171130", "tailwater_20161130", "trgw_0_9_1_20161130"] + oe_name = pst_name.replace(".pst", ".0.obs.csv") + pr_oe = pd.read_csv(os.path.join(test_d,"freyberg6_run_ies.0.obs.csv"),index_col=0) + #pt_oe = pd.read_csv(os.path.join(test_d, "freyberg6_run_ies.3.obs.csv"), index_col=0) + pt_oe = pr_oe.copy() + + + m_d = os.path.join( "master_dsi") + pst = pyemu.Pst(os.path.join(m_d,"dsi.pst")) + pr_oe_dsi = pyemu.ObservationEnsemble.from_binary(pst=pst, filename=os.path.join(m_d,"dsi.0.obs.jcb"))._df + pt_oe_dsi = pyemu.ObservationEnsemble.from_binary(pst=pst, filename=os.path.join(m_d,"dsi.1.obs.jcb"))._df + + pv = pyemu.ObservationEnsemble(pst=pst,df=pt_oe).phi_vector + pv_dsi = pyemu.ObservationEnsemble(pst=pst, df=pt_oe_dsi).phi_vector + #print(pt_oe.shape) + pt_oe = pt_oe.loc[pv<25, :] + pt_oe_dsi = pt_oe_dsi.loc[pv_dsi < 25, :] + + # print(pt_oe.shape) + # fig,ax = plt.subplots(1,1,figsize=(5,5)) + # ax.hist(pv,bins=10,facecolor="b",alpha=0.5,density=True) + # ax.hist(pv_dsi, bins=10, facecolor="m", alpha=0.5,density=True) + # ax.set_yticks([]) + # plt.tight_layout() + # plt.show() + + + + fig,axes = plt.subplots(len(predictions),1,figsize=(10,10)) + for p,ax in zip(predictions,axes): + ax.hist(pr_oe.loc[:,p].values,bins=10,alpha=0.5,facecolor="0.5",density=True,label="prior") + ax.hist(pt_oe.loc[:, p].values, bins=10, alpha=0.5, facecolor="b",density=True,label="posterior") + ax.hist(pr_oe_dsi.loc[:, p].values, bins=10, facecolor="none",hatch="/",edgecolor="0.5", + lw=2.5,density=True,label="dsi prior") + ax.hist(pt_oe_dsi.loc[:, p].values, bins=10, facecolor="none",density=True,hatch="/",edgecolor="b",lw=2.5, + label="dsi posterior") + ax.set_title(p,loc="left") + ax.legend(loc="upper right") + ax.set_yticks([]) + plt.tight_layout() + plt.savefig("dsi_pred.pdf") + if __name__ == "__main__": @@ -197,4 +199,5 @@ def test_dsivc_freyberg(): #test_dsi_nst() #test_dsi_nst_extrap() #test_dsi_mixed() - test_dsivc_freyberg() \ No newline at end of file + #test_dsivc_freyberg() + plot_freyberg_dsi() \ No newline at end of file From c7cfadc7e9f6536ebe6681a339df112ffd28ae90 Mon Sep 17 
00:00:00 2001 From: rhugman Date: Thu, 19 Jun 2025 09:38:49 +0100 Subject: [PATCH 25/58] rename ldfa to lpfa --- pyemu/__init__.py | 2 +- pyemu/emulators/__init__.py | 3 +- pyemu/emulators/{ldfa.py => lpfa.py} | 49 ++++++++++++++++++++-------- 3 files changed, 38 insertions(+), 16 deletions(-) rename pyemu/emulators/{ldfa.py => lpfa.py} (89%) diff --git a/pyemu/__init__.py b/pyemu/__init__.py index 85e9fe45d..a53c116ac 100644 --- a/pyemu/__init__.py +++ b/pyemu/__init__.py @@ -22,7 +22,7 @@ os_utils, pp_utils, smp_utils) from .emulators import ( #emulators - Emulator, DSI, + Emulator, DSI, LPFA, #transformers diff --git a/pyemu/emulators/__init__.py b/pyemu/emulators/__init__.py index 5b521dceb..5bb861e71 100755 --- a/pyemu/emulators/__init__.py +++ b/pyemu/emulators/__init__.py @@ -9,10 +9,11 @@ ) from .base import Emulator from .dsi import DSI - +from .lpfa import LPFA __all__ = [ 'Emulator', #base Emulator Class 'DSI', # DSI Emulator Class + 'LPFA', 'BaseTransformer', 'Log10Transformer', 'RowWiseMinMaxScaler', diff --git a/pyemu/emulators/ldfa.py b/pyemu/emulators/lpfa.py similarity index 89% rename from pyemu/emulators/ldfa.py rename to pyemu/emulators/lpfa.py index 707ba80f3..104bb24bd 100644 --- a/pyemu/emulators/ldfa.py +++ b/pyemu/emulators/lpfa.py @@ -1,5 +1,5 @@ """ -Learning-based pattern-data-driven forecast approach (LDFA) emulator implementation. +Learning-based pattern-data-driven forecast approach (LPFA) emulator implementation. """ from __future__ import print_function, division @@ -13,9 +13,9 @@ from .transformers import RowWiseMinMaxScaler # Define scikit-learn based model class -class LDFAModel: +class LPFAModel: """ - Scikit-learn MLPRegressor wrapper for LDFA neural network model. + Scikit-learn MLPRegressor wrapper for LPFA neural network model. """ def __init__(self, input_dim, output_dim, hidden_units=None, activation='relu', dropout_rate=0.0, learning_rate=0.01, max_iter=200, early_stopping=True): @@ -59,7 +59,7 @@ def loss_curve_(self): return getattr(self.model, 'loss_curve_', []) -class LDFA(Emulator): +class LPFA(Emulator): """ Class for the Learning-based pattern-data-driven forecast approach from Kim et al (2025). 
@@ -252,11 +252,32 @@ def prepare_training_data(self, data=None, test_size=0.2): # Apply PCA to reduce the dimensionality of the data self.logger.statement("applying PCA dimensionality reduction") - self.pcaX = PCA()#n_components=X_test.shape[1]) - self.pcay = PCA()#n_components=y_test.shape[1]) + self.pcaX = PCA() + self.pcay = PCA() - self.X = self.pcaX.fit_transform(X_train) - self.y = self.pcay.fit_transform(y_train) + X_transformed = self.pcaX.fit_transform(X_train) + y_transformed = self.pcay.fit_transform(y_train) + + # Apply energy-based truncation + if self.energy_threshold < 1.0: + self.logger.statement("applying energy-based PCA truncation") + # For input PCA + explained_var_ratio_X = np.cumsum(self.pcaX.explained_variance_ratio_) + n_components_X = np.argmax(explained_var_ratio_X >= self.energy_threshold) + 1 + self.pcaX = PCA(n_components=n_components_X) + X_transformed = self.pcaX.fit_transform(X_train) + + # For output PCA + explained_var_ratio_y = np.cumsum(self.pcay.explained_variance_ratio_) + n_components_y = np.argmax(explained_var_ratio_y >= self.energy_threshold) + 1 + self.pcay = PCA(n_components=n_components_y) + y_transformed = self.pcay.fit_transform(y_train) + + self.logger.statement(f"Reduced X from {X_train.shape[1]} to {n_components_X} components") + self.logger.statement(f"Reduced y from {y_train.shape[1]} to {n_components_y} components") + + self.X = X_transformed + self.y = y_transformed self.X_test = self.pcaX.transform(X_test) self.y_test = self.pcay.transform(y_test) @@ -288,7 +309,7 @@ def _build_model(self, params=None, prob=False): Returns ------- - LDFAModel + LPFAModel The scikit-learn MLPRegressor wrapper instance. """ if params is None: @@ -306,7 +327,7 @@ def _build_model(self, params=None, prob=False): output_dim = self.y.shape[1] # Create the model architecture - model = LDFAModel( + model = LPFAModel( input_dim=input_dim, output_dim=output_dim, hidden_units=params['hidden_units'], @@ -329,7 +350,7 @@ def create_model(self, params=None): Returns ------- - self : LDFA + self : LPFA The emulator instance with model created. """ self.model = self._build_model(params) @@ -346,7 +367,7 @@ def add_noise_model(self, params=None): Returns ------- - self : LDFA + self : LPFA The emulator instance with noise model added. """ # Create noise model @@ -385,7 +406,7 @@ def fit(self, epochs=200, batch_size=32, X=None, y=None, prepare_data=True): Returns ------- - self : LDFA + self : LPFA The fitted emulator. 
""" if prepare_data and (X is None or self.X is None): @@ -500,6 +521,6 @@ def predict(self, data): predictions.loc[:, self.forecast_names] = pred_transformed.loc[:, self.forecast_names] # Finally, inverse the transform pipeline if it was applied (was the first transform) - predictions = self.feature_transformer.inverse_transform(predictions) + predictions = self.feature_transformer.inverse(predictions) return predictions \ No newline at end of file From d4ae84eac0efb3e74b5840e255e009631222e467 Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 19 Jun 2025 10:11:09 +0100 Subject: [PATCH 26/58] lpfa test --- autotest/emulator_tests.py | 119 ++++++++++++++++++++++++++++++++++++- 1 file changed, 117 insertions(+), 2 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index 459e3d0c7..879f03987 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -7,7 +7,7 @@ import platform import pyemu from pst_from_tests import setup_tmp, ies_exe_path, _get_port -from pyemu.emulators import DSI +from pyemu.emulators import DSI, LPFA @@ -193,6 +193,120 @@ def plot_freyberg_dsi(): plt.savefig("dsi_pred.pdf") +def test_lpfa(tmp_d,transforms=None): + + test_d = "ends_master" + test_d = setup_tmp(test_d, tmp_d) + + case = "freyberg6_run_ies" + pst_name = os.path.join(test_d, case + ".pst") + pst = pyemu.Pst(pst_name) + predictions = ["headwater_20171130", "tailwater_20161130", "trgw_0_9_1_20161130"] + pst.pestpp_options["predictions"] = predictions + + oe_name = pst_name.replace(".pst", ".0.obs.csv") + oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=oe_name).iloc[:100, :] + data = oe._df.copy() + + obs = pst.observation_data.copy() + #obs["date"] = pd.to_datetime(obs.obsnme.str.split("_")[-1]) + #obs.sort_values(by=["obgnme", "date"], inplace=True) + + fit_groups = { + o: obs.loc[obs.obgnme == o, "obsnme"].tolist()[:12] for o in obs.obgnme.unique() + } + groups ={ + o: obs.loc[obs.obgnme == o, "obsnme"].tolist() for o in obs.obgnme.unique() + } + + input_cols = obs.loc[obs.weight>0, "obsnme"].tolist() + forecast_names = obs.obsnme.tolist() + + # Create LPFA emulator + lpfa = LPFA( + data=data, + input_cols=input_cols, + groups=groups, + fit_groups=fit_groups, + forecast_names=forecast_names, + energy_threshold=0.9999, # Keep most variance in PCA + seed=42, + early_stop=True, + #transforms=None, # No additional transforms for this demo + transforms = transforms, + verbose=True + ) + + training_data = lpfa.prepare_training_data(test_size=0.2) + + # Define model parameters + model_params = { + 'activation': 'relu', + 'hidden_units': [128, 64], # Two hidden layers + 'dropout_rate': 0.1, + 'learning_rate': 0.01 + } + + # Create the model + lpfa.create_model(model_params) + + # Train the model + lpfa.fit(epochs=200) + + # Add noise model to capture residuals + noise_params = { + 'activation': 'relu', + 'hidden_units': [64, 32], # Smaller network for residuals + 'dropout_rate': 0.05, + 'learning_rate': 0.005 + } + + lpfa.add_noise_model(noise_params) + + # Generate predictions + predictions = lpfa.predict(obs[["obsval"]].T) + + + # Create scatter plot comparing predictions vs truth + import matplotlib.pyplot as plt + fig, ax = plt.subplots(1, 1, figsize=(8, 6)) + + # Get non-zero weight observations for comparison + comparison_obs = obs.loc[obs.weight > 0].obsnme.values + + # Extract values for plotting + nzobsnmes = obs.loc[obs.weight>0].obsnme.tolist() + truth_values = obs.loc[nzobsnmes].obsval.values.flatten() + pred_values = 
predictions.loc[:,nzobsnmes].values.flatten() + + # Create scatter plot + ax.scatter(truth_values, pred_values, alpha=0.6, s=20) + ax.set_xlabel('Truth Values') + ax.set_ylabel('Predicted Values') + ax.set_title('lpfa Emulator: Predicted vs Truth') + + # Add 1:1 line + min_val = min(ax.get_xlim()[0], ax.get_ylim()[0]) + max_val = max(ax.get_xlim()[1], ax.get_ylim()[1]) + ax.plot([min_val, max_val], [min_val, max_val], 'k-', lw=1, alpha=0.7) + ax.set_xlim(min_val, max_val) + ax.set_ylim(min_val, max_val) + + # Calculate R² + correlation = np.corrcoef(truth_values, pred_values)[0, 1] + r_squared = correlation ** 2 + assert r_squared >= 0.9, "R-squared should deccent" + ax.text(0.05, 0.95, f'R² = {r_squared:.3f}', transform=ax.transAxes, + bbox=dict(boxstyle='round', facecolor='white', alpha=0.8)) + + plt.tight_layout() + #plt.show() + + print(f"Correlation coefficient: {correlation:.3f}") + print(f"R-squared: {r_squared:.3f}") + + return + if __name__ == "__main__": #test_dsi_basic() @@ -200,4 +314,5 @@ def plot_freyberg_dsi(): #test_dsi_nst_extrap() #test_dsi_mixed() #test_dsivc_freyberg() - plot_freyberg_dsi() \ No newline at end of file + #plot_freyberg_dsi() + test_lpfa(tmp_d="temp",) \ No newline at end of file From 6a8900ba44d486320e550a73294abfc6c50e9c72 Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 19 Jun 2025 10:16:18 +0100 Subject: [PATCH 27/58] added transform pipeline test for ldfa --- autotest/emulator_tests.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index 879f03987..3645cf341 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -193,7 +193,7 @@ def plot_freyberg_dsi(): plt.savefig("dsi_pred.pdf") -def test_lpfa(tmp_d,transforms=None): +def lpfa_freyberg(tmp_d="temp",transforms=None): test_d = "ends_master" test_d = setup_tmp(test_d, tmp_d) @@ -307,6 +307,15 @@ def test_lpfa(tmp_d,transforms=None): return +def test_lpfa_basic(): + lpfa_freyberg(tmp_d="temp",transforms=None) + return + +def test_lpfa_std(): + lpfa_freyberg(tmp_d="temp",transforms=[ + {"type": "standard_scaler"} + ]) + return if __name__ == "__main__": #test_dsi_basic() @@ -315,4 +324,4 @@ def test_lpfa(tmp_d,transforms=None): #test_dsi_mixed() #test_dsivc_freyberg() #plot_freyberg_dsi() - test_lpfa(tmp_d="temp",) \ No newline at end of file + test_lpfa_std() \ No newline at end of file From 54985370010dc72f2838da9123af3d0123797a73 Mon Sep 17 00:00:00 2001 From: jwhite Date: Fri, 20 Jun 2025 08:43:46 -0600 Subject: [PATCH 28/58] chasing CI issue on linux --- .github/workflows/ci.yml | 2 +- etc/environment.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 03ab32948..62749abb1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -128,7 +128,7 @@ jobs: working-directory: ./examples run: | micromamba install --name pyemu jupyter jupytext - pytest -v -rP -rx --capture=no -n=auto --nbmake --cov=pyemu --cov-report=lcov:../autotest/coverage.lcov \ + pytest -v -s --nbmake --cov=pyemu --cov-report=lcov:../autotest/coverage.lcov \ --cov-config=../autotest/.coveragerc *.ipynb env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/etc/environment.yml b/etc/environment.yml index d4ba654e2..5df38d2a5 100644 --- a/etc/environment.yml +++ b/etc/environment.yml @@ -1,6 +1,7 @@ name: pyemu channels: - conda-forge + - nodefaults dependencies: # required - python>=3.8 From 
ad8893d70225c8d0cfc63a855e88d80ec34e78d4 Mon Sep 17 00:00:00 2001 From: jwhite Date: Fri, 20 Jun 2025 12:37:14 -0600 Subject: [PATCH 29/58] more chasing --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 62749abb1..1a5a598dc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -118,7 +118,7 @@ jobs: shell: bash -l {0} working-directory: ./autotest run: | - pytest -rP -rx --capture=no -v -n=auto --tb=native --cov=pyemu --cov-report=lcov ${{ matrix.test-path }} + pytest -v -s --cov=pyemu --cov-report=lcov ${{ matrix.test-path }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From b2761dc72ce88a5646dc41e6905c9a9420bb9a1f Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 30 Jun 2025 15:39:44 +0100 Subject: [PATCH 30/58] refactor StandardSclaer to use sklearn --- pyemu/emulators/transformers.py | 84 +++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 30 deletions(-) diff --git a/pyemu/emulators/transformers.py b/pyemu/emulators/transformers.py index 22c1bbb02..f786f252b 100755 --- a/pyemu/emulators/transformers.py +++ b/pyemu/emulators/transformers.py @@ -4,6 +4,8 @@ from __future__ import print_function, division import numpy as np import pandas as pd +from sklearn.preprocessing import StandardScaler + class BaseTransformer: """Base class for all transformers providing a consistent interface.""" @@ -56,6 +58,7 @@ class RowWiseMinMaxScaler(BaseTransformer): groups : dict or None, default=None Dict mapping group names to lists of column names to be scaled together (entire timeseries for that group). If None, all columns will be treated as a single group. + Example: {'group1': ['col1', 'col2'], 'group2': ['col3', 'col4']} fit_groups : dict or None, default=None Dict mapping group names to lists of column names (subset of groups) used to compute row-wise min and max. If None, defaults to using the same columns as in groups. 
@@ -315,40 +318,61 @@ def inverse_transform(self, X): return result class StandardScalerTransformer(BaseTransformer): - """Apply standard scaling (zero mean, unit variance) to data.""" - - def __init__(self): - self.means = {} - self.stds = {} - + def __init__(self, with_mean=True, with_std=True, copy=True): + self.with_mean = with_mean + self.with_std = with_std + self.copy = copy + self._sklearn_scaler = None + self._columns = None + def fit(self, X): - """Compute mean and standard deviation for each feature.""" - for col in X.columns: - self.means[col] = X[col].mean() - self.stds[col] = X[col].std() - if self.stds[col] == 0: - self.stds[col] = 1.0 # Avoid division by zero + # Store column names for DataFrame reconstruction + self._columns = X.columns.tolist() + + # Create sklearn StandardScaler + self._sklearn_scaler = StandardScaler( + with_mean=self.with_mean, + with_std=self.with_std, + copy=self.copy + ) + + # Fit on numpy array (sklearn expects this) + self._sklearn_scaler.fit(X.values) return self - + def transform(self, X): - """Transform the data using mean and std from fit.""" - result = X.copy() - for col in X.columns: - if col in self.means: - mean = self.means[col] - std = self.stds[col] - result[col] = (X[col] - mean) / std - return result - + if self._sklearn_scaler is None: + raise ValueError("Transformer must be fitted before transform") + + # Transform using sklearn + transformed_values = self._sklearn_scaler.transform(X.values) + + # Reconstruct DataFrame with original structure + if isinstance(X, pd.DataFrame): + return pd.DataFrame( + transformed_values, + index=X.index, + columns=X.columns + ) + else: + return transformed_values + def inverse_transform(self, X): - """Inverse transform data back to original scale.""" - result = X.copy() - for col in X.columns: - if col in self.means: - mean = self.means[col] - std = self.stds[col] - result[col] = (X[col] * std) + mean - return result + if self._sklearn_scaler is None: + raise ValueError("Transformer must be fitted before inverse_transform") + + # Inverse transform using sklearn + inverse_values = self._sklearn_scaler.inverse_transform(X.values) + + # Reconstruct DataFrame + if isinstance(X, pd.DataFrame): + return pd.DataFrame( + inverse_values, + index=X.index, + columns=X.columns + ) + else: + return inverse_values class NormalScoreTransformer(BaseTransformer): """A transformer for normal score transformation.""" From 9e62644abe714639a1cd2fe585971fe690a2edc0 Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 30 Jun 2025 15:40:09 +0100 Subject: [PATCH 31/58] fix imports --- pyemu/emulators/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyemu/emulators/transformers.py b/pyemu/emulators/transformers.py index f786f252b..39345159e 100755 --- a/pyemu/emulators/transformers.py +++ b/pyemu/emulators/transformers.py @@ -198,7 +198,7 @@ class MinMaxScaler(BaseTransformer): Parameters ---------- - feature_range : tuple (min, max), default=(0, 1) + feature_range : tuple (min, max), default=(-1, 1) The range to scale features into. columns : list, optional List of column names to be scaled. If None, all columns will be scaled. 
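A minimal round-trip sketch of the sklearn-backed scaler introduced above (assuming pyemu.emulators.transformers is importable; the DataFrame is made up):

    import pandas as pd
    from pyemu.emulators.transformers import StandardScalerTransformer

    df = pd.DataFrame({"obs1": [1.0, 2.0, 3.0], "obs2": [10.0, 20.0, 30.0]})
    scaler = StandardScalerTransformer().fit(df)   # wraps sklearn's StandardScaler
    scaled = scaler.transform(df)                  # zero mean, unit variance per column
    restored = scaler.inverse_transform(scaled)    # recovers the original values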
From 444591bb0e548db120052c1175f8c1f97311d3b8 Mon Sep 17 00:00:00 2001 From: rhugman Date: Tue, 1 Jul 2025 16:03:54 +0100 Subject: [PATCH 32/58] refactor naming and streamline emulator building workflow --- autotest/emulator_tests.py | 15 +++--- pyemu/emulators/base.py | 90 +++++++++++++++++++++++--------- pyemu/emulators/dsi.py | 52 +++++++++---------- pyemu/emulators/lpfa.py | 102 ++++++++++++++++++++----------------- 4 files changed, 151 insertions(+), 108 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index 3645cf341..b2aa7fcb9 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -7,7 +7,7 @@ import platform import pyemu from pst_from_tests import setup_tmp, ies_exe_path, _get_port -from pyemu.emulators import DSI, LPFA +from pyemu.emulators import DSI, LPFA, GPR @@ -26,8 +26,8 @@ def dsi_freyberg(tmp_d,transforms=None,tag=""): oe = pyemu.ObservationEnsemble.from_csv(pst=pst, filename=oe_name).iloc[:100, :] data = oe._df.copy() - dsi = DSI(sim_ensemble=data,transforms=transforms) - dsi.apply_feature_transforms() + dsi = DSI(data=data,transforms=transforms) + #dsi._fit_transformer_pipeline() dsi.fit() # history match @@ -225,10 +225,10 @@ def lpfa_freyberg(tmp_d="temp",transforms=None): # Create LPFA emulator lpfa = LPFA( data=data, - input_cols=input_cols, + input_names=input_cols, groups=groups, fit_groups=fit_groups, - forecast_names=forecast_names, + output_names=forecast_names, energy_threshold=0.9999, # Keep most variance in PCA seed=42, early_stop=True, @@ -237,7 +237,7 @@ def lpfa_freyberg(tmp_d="temp",transforms=None): verbose=True ) - training_data = lpfa.prepare_training_data(test_size=0.2) + #training_data = lpfa.prepare_training_data(test_size=0.2) # Define model parameters model_params = { @@ -312,6 +312,7 @@ def test_lpfa_basic(): return def test_lpfa_std(): + #NOTE: fit with standard scaler transform are worse than without lpfa_freyberg(tmp_d="temp",transforms=[ {"type": "standard_scaler"} ]) @@ -324,4 +325,4 @@ def test_lpfa_std(): #test_dsi_mixed() #test_dsivc_freyberg() #plot_freyberg_dsi() - test_lpfa_std() \ No newline at end of file + test_lpfa_std() diff --git a/pyemu/emulators/base.py b/pyemu/emulators/base.py index c00809dc1..f1dab6951 100755 --- a/pyemu/emulators/base.py +++ b/pyemu/emulators/base.py @@ -14,18 +14,25 @@ class Emulator: This class defines the common interface for all emulator implementations and provides shared functionality used by multiple emulator types. - Parameters - ---------- - verbose : bool, optional - If True, enable verbose logging. Default is True. """ - def __init__(self, verbose=True): + def __init__(self,transforms=None, verbose=True): """ Initialize the Emulator base class. Parameters ---------- + transforms : list of dict, optional + List of transformation specifications. Each dict should have: + - 'type': str - Type of transformation (e.g.,'log10', 'normal_score'). + - 'columns': list of str,optional - Columns to apply the transformation to. If not supplied, transformation is applied to all columns. + - Additional kwargs for the transformation (e.g., 'quadratic_extrapolation' for normal score transform). + Example: + transforms = [ + {'type': 'log10', 'columns': ['obs1', 'obs2']}, + {'type': 'normal_score', 'quadratic_extrapolation': True} + ] + Default is None, which means no transformations will be applied. verbose : bool, optional If True, enable verbose logging. Default is True. 
""" @@ -34,9 +41,8 @@ def __init__(self, verbose=True): self.fitted = False self.data = None self.data_transformed = None - self.feature_scaler = None - self.energy_threshold = 1.0 - self.feature_transformer = None + self.transforms = transforms + self.transformer_pipeline = None def fit(self, X, y=None): """ @@ -74,32 +80,46 @@ def predict(self, X): raise ValueError("Emulator must be fitted before prediction") raise NotImplementedError("Subclasses must implement predict method") - def prepare_training_data(self, data=None): + def _prepare_training_data(self): """ Prepare and transform training data for model fitting. Parameters ---------- - data : pandas.DataFrame, optional - Raw training data. If None, uses self.data. - + self : Emulator + The emulator instance. Returns ------- tuple Processed data ready for model fitting. """ + data = self.data if data is None: - if self.data is None: - raise ValueError("No data provided and no data stored in the emulator") - data = self.data + raise ValueError("No data provided and no data stored in the emulator") + + # Common preprocessing logic could go here + self.logger.statement("preparing training data") - # Common preprocessing logic could go here - return data + # apply feature transformations if they exist, etc.. + # Always use the base class transformation method for consistency + if self.transforms is not None: + self.logger.statement("applying feature transforms") + self.data_transformed = self._fit_transformer_pipeline(data, self.transforms) + else: + # Still need to set up a dummy transformer for inverse operations + from .transformers import AutobotsAssemble + self.feature_transformer = AutobotsAssemble(data.copy()) + self.data_transformed = data.copy() + + return self.data_transformed + + return - def apply_feature_transforms(self, data=None, transforms=None): + def _fit_transformer_pipeline(self, data=None, transforms=None): """ Apply feature transformations to data with customizable transformer sequence. This function is not intended to be used directly by users. + External data must be accepted to handle train/test spliting for certain emulators (e.g., LPFA). Parameters ---------- @@ -137,10 +157,13 @@ def apply_feature_transforms(self, data=None, transforms=None): # Import AutobotsAssemble here to avoid circular import from .transformers import AutobotsAssemble - ft = AutobotsAssemble(data.copy()) + transformer_pipeline = AutobotsAssemble(data.copy()) # Process the transforms parameter if provided + if transforms is None: + transforms = self.transforms if transforms: + self._validate_transforms(transforms) for transform in transforms: transform_type = transform.get('type') columns = transform.get('columns') @@ -149,13 +172,12 @@ def apply_feature_transforms(self, data=None, transforms=None): if k not in ('type', 'columns')} self.logger.statement(f"applying {transform_type} transform") - ft.apply(transform_type, columns=columns, **kwargs) + transformer_pipeline.apply(transform_type, columns=columns, **kwargs) - transformed_data = ft.df.copy() - self.feature_transformer = ft - self.data_transformed = transformed_data + self.transformer_pipeline = transformer_pipeline + self.data_transformed = transformer_pipeline.df.copy() - return transformed_data + return self.data_transformed def save(self, filename): """ @@ -185,4 +207,22 @@ def load(cls, filename): The loaded emulator instance. 
""" with open(filename, "rb") as f: - return pickle.load(f) \ No newline at end of file + return pickle.load(f) + + + def _validate_transforms(self, transforms): + """Validate the transforms parameter.""" + if not isinstance(transforms, list): + raise ValueError("transforms must be a list of dicts or None") + + for t in transforms: + if not isinstance(t, dict): + raise ValueError("each transform must be a dict") + if 'type' not in t: + raise ValueError("each transform dict must have a 'type' key") + if 'columns' in t and not isinstance(t['columns'], list): + raise ValueError("'columns' must be a list of column names") + + + + #TODO: implment helper function that scrapes directory and collates training data from Pst ensemble files + control file information. \ No newline at end of file diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py index a3b699d73..940868a57 100755 --- a/pyemu/emulators/dsi.py +++ b/pyemu/emulators/dsi.py @@ -14,14 +14,14 @@ class DSI(Emulator): """ - Data Space Inversion (DS) emulator class. Based on DSI as described in Sun & + Data Space Inversion (DSI) emulator class. Based on DSI as described in Sun & Durlofsky (2017) and Sun et al (2017). """ def __init__(self, pst=None, - sim_ensemble=None, + data=None, transforms=None, energy_threshold=1.0, verbose=False): @@ -33,7 +33,7 @@ def __init__(self, pst : Pst, optional A Pst object. If provided, the emulator will be initialized with the information from the Pst object. - sim_ensemble : ObservationEnsemble, optional + data : DataFrame or ObservationEnsemble, optional An ensemble of simulated observations. If provided, the emulator will be initialized with the information from the ensemble. transforms : list of dict, optional @@ -58,11 +58,12 @@ def __init__(self, self.observation_data = pst.observation_data.copy() if pst is not None else None #self.__org_parameter_data = pst.parameter_data.copy() if pst is not None else None #self.__org_control_data = pst.control_data.copy() #breaks pickling - if isinstance(sim_ensemble, ObservationEnsemble): - sim_ensemble = sim_ensemble._df.copy() - #self.__org_sim_ensemble = sim_ensemble.copy() if sim_ensemble is not None else None - self.data = sim_ensemble.copy() if sim_ensemble is not None else None - #self.feature_scaler = None + if isinstance(data, ObservationEnsemble): + data = data._df.copy() + # set all data to be floats + data = data.astype(float) if data is not None else None + #self.__org_data = data.copy() if data is not None else None + self.data = data.copy() if data is not None else None self.energy_threshold = energy_threshold assert isinstance(transforms, list) or transforms is None, "transforms must be a list of dicts or None" if transforms is not None: @@ -79,32 +80,31 @@ def __init__(self, assert isinstance(t['quadratic_extrapolation'], bool), "'quadratic_extrapolation' must be a boolean" self.transforms = transforms self.fitted = False - self.data_transformed = None + self.data_transformed = self._prepare_training_data() self.decision_variable_names = None #used for DSIVC - def prepare_training_data(self, data=None): + def _prepare_training_data(self): """ Prepare and transform training data for model fitting. Parameters ---------- - data : pandas.DataFrame, optional - Raw training data. If None, uses self.data. + self : DSI + The DSI emulator instance. Returns ------- tuple Processed data ready for model fitting. 
""" + data = self.data if data is None: - data = self.data - if data is None: - raise ValueError("No data provided and no data stored in the emulator") + raise ValueError("No data stored in the emulator") self.logger.statement("applying feature transforms") # Always use the base class transformation method for consistency if self.transforms is not None: - self.data_transformed = self.apply_feature_transforms(data, self.transforms) + self.data_transformed = self._fit_transformer_pipeline(data, self.transforms) else: # Still need to set up a dummy transformer for inverse operations from .transformers import AutobotsAssemble @@ -164,30 +164,24 @@ def compute_projection_matrix(self, energy_threshold=None): self.s = s return - def fit(self, X=None, y=None): + def fit(self): """ Fit the emulator to training data. Parameters ---------- - X : pandas.DataFrame - Input data to fit the emulator on. - y : None - Not used, present for API consistency. + self : DSI + The DSI emulator instance. Returns ------- self : DSI The fitted emulator. """ - if X is not None: - self.data = X - self.logger.statement("transforming new training data") - self.data_transformed = self.prepare_training_data() if self.data_transformed is None: self.logger.statement("transforming training data") - self.data_transformed = self.prepare_training_data() + self.data_transformed = self._prepare_training_data() # Compute projection matrix self.compute_projection_matrix() @@ -211,7 +205,7 @@ def predict(self, pvals): if not self.fitted: raise ValueError("Emulator must be fitted before prediction") - if not hasattr(self, 'feature_transformer') or self.feature_transformer is None: + if not hasattr(self, 'transformer_pipeline') or self.transformer_pipeline is None: raise ValueError("Emulator must be fitted and have valid transformations before prediction") if isinstance(pvals, pd.Series): @@ -221,8 +215,8 @@ def predict(self, pvals): pmat = self.pmat ovals = self.ovals sim_vals = ovals + np.dot(pmat,pvals) - ft = self.feature_transformer - sim_vals = ft.inverse(sim_vals) + pipeline = self.transformer_pipeline + sim_vals = pipeline.inverse(sim_vals) sim_vals.index.name = 'obsnme' sim_vals.name = "obsval" self.sim_vals = sim_vals diff --git a/pyemu/emulators/lpfa.py b/pyemu/emulators/lpfa.py index 104bb24bd..4252a61d7 100644 --- a/pyemu/emulators/lpfa.py +++ b/pyemu/emulators/lpfa.py @@ -70,36 +70,46 @@ class LPFA(Emulator): ---------- data : pandas.DataFrame The training data with input and forecast columns. - input_cols : list + input_names : list List of column names to use as inputs. groups : dict Dictionary mapping group names to lists of column names. Used for row-wise min-max scaling. fit_groups : dict Dictionary mapping group names to lists of column names used to fit the scaling. - forecast_names : list, optional - List of column names to forecast. If None, all columns in data will be used. + output_names : list, optional + List of column names to forecast. If None, all columns not in input_names are used. energy_threshold : float, optional Energy threshold for the PCA. Default is 1.0. seed : int, optional Random seed for reproducibility. Default is None. early_stop : bool, optional Whether to use early stopping during training. Default is True. - apply_std_scaler : bool, optional - Whether to apply standard scaling before min-max scaling. Default is False. + transforms : list of dict, optional + List of transformation specifications. 
Each dict should have: + - 'type': str - Type of transformation (e.g., 'log10', 'normal_score'). + - 'columns': list of str, optional - Columns to apply the transformation to. If not supplied, transformation is applied to all columns. + - Additional kwargs for the transformation (e.g., 'quadratic_extrapolation' for normal score transform). + Example: + transforms = [ + {'type': 'log10', 'columns': ['obs1', 'obs2']}, + {'type': 'normal_score', 'quadratic_extrapolation': True} + ] + Default is None, which means no transformations will be applied. verbose : bool, optional If True, enable verbose logging. Default is True. """ def __init__(self, data, - input_cols, + input_names, groups, fit_groups, - forecast_names=None, + output_names=None, energy_threshold=1.0, seed=None, early_stop=True, transforms=None, + test_size=0.2, verbose=True): """ Initialize the Learning-based pattern-data-driven NN emulator. @@ -108,13 +118,13 @@ def __init__(self, ---------- data : pandas.DataFrame The training data with input and forecast columns. - input_cols : list + input_names : list List of column names to use as inputs. groups : dict Dictionary mapping group names to lists of column names. Used for row-wise min-max scaling. fit_groups : dict Dictionary mapping group names to lists of column names used to fit the scaling. - forecast_names : list, optional + output_names : list, optional List of column names to forecast. If None, all columns in data will be used. energy_threshold : float, optional Energy threshold for the PCA. Default is 1.0. @@ -133,6 +143,8 @@ def __init__(self, {'type': 'normal_score', 'quadratic_extrapolation': True} ] Default is None, which means no transformations will be applied. + test_size : float, optional + Fraction of data to use for testing. Default is 0.2. verbose : bool, optional If True, enable verbose logging. Default is True. """ @@ -142,13 +154,13 @@ def __init__(self, self.seed = seed self.data = data - self.input_cols = input_cols + self.input_names = input_names self.groups = groups self.fit_groups = fit_groups - if forecast_names is None: - forecast_names = data.columns - self.forecast_names = forecast_names + if output_names is None: + output_names = data.columns + self.output_names = output_names self.energy_threshold = energy_threshold @@ -160,8 +172,13 @@ def __init__(self, self.model = None self.train_data = None self.test_data = None + self.test_size = test_size + + # Prepare the training data - def prepare_training_data(self, data=None, test_size=0.2): + self._prepare_training_data() + + def _prepare_training_data(self): """ Prepare the training data for model fitting. 
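A minimal usage sketch of the renamed constructor arguments, mirroring the lpfa_freyberg test (the DataFrame, column lists, and group dicts are placeholders built by that test, not fixed API values):

    lpfa = LPFA(data=data,                  # DataFrame of ensemble inputs and outputs
                input_names=input_cols,
                groups=groups,
                fit_groups=fit_groups,
                output_names=forecast_names,
                energy_threshold=0.9999,
                seed=42,
                early_stop=True,
                transforms=[{"type": "standard_scaler"}],
                test_size=0.2)              # train/test split and transforms run in __init__
    lpfa.fit(epochs=200)
    predictions = lpfa.predict(data)        # back-transformed to the original space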
@@ -187,36 +204,36 @@ def prepare_training_data(self, data=None, test_size=0.2): - X_test: Input testing data after transformation and PCA - y_test: Target testing data after transformation and PCA """ - if data is None: - data = self.data - + + self.logger.statement("preparing training data") + data = self.data if data is None: raise ValueError("No data provided and no data stored in the emulator") # Split the data into training and test sets train, test = train_test_split( data, - test_size=test_size, + test_size=self.test_size, random_state=self.seed ) - self.logger.statement("preparing training data: data split complete") + self.logger.statement("train/test data split complete") # Store for later use self.train_data = train.copy() self.test_data = test.copy() - - # TODO: Apply feature transformations if specified + self.logger.statement("applying feature transformation pipeline") + # Apply feature transformations if specified # Always use the base class transformation method for consistency if self.transforms is None: from .transformers import AutobotsAssemble - self.feature_transformer = AutobotsAssemble(train.copy()) + self.transformer_pipeline = AutobotsAssemble(train.copy()) train_transformed = train test_transformed = test else: - train_transformed = self.apply_feature_transforms(train, self.transforms) - test_transformed = self.feature_transformer.transform(test) + train_transformed = self._fit_transformer_pipeline(train, self.transforms) + test_transformed = self.transformer_pipeline.transform(test) # Apply row-wise min-max scaling directly (not through the pipeline) @@ -244,11 +261,11 @@ def prepare_training_data(self, data=None, test_size=0.2): self.logger.statement("row-wise min-max scaling complete") # Split datasets into input (X) and target (y) variables - X_train = train_scaled.loc[:, self.input_cols].copy() - y_train = train_scaled.loc[:, self.forecast_names].copy() + X_train = train_scaled.loc[:, self.input_names].copy() + y_train = train_scaled.loc[:, self.output_names].copy() - X_test = test_scaled.loc[:, self.input_cols].copy() - y_test = test_scaled.loc[:, self.forecast_names].copy() + X_test = test_scaled.loc[:, self.input_names].copy() + y_test = test_scaled.loc[:, self.output_names].copy() # Apply PCA to reduce the dimensionality of the data self.logger.statement("applying PCA dimensionality reduction") @@ -386,7 +403,7 @@ def add_noise_model(self, params=None): return self - def fit(self, epochs=200, batch_size=32, X=None, y=None, prepare_data=True): + def fit(self, epochs=200): """ Fit the model to the training data. @@ -394,23 +411,14 @@ def fit(self, epochs=200, batch_size=32, X=None, y=None, prepare_data=True): ---------- epochs : int, optional Number of training epochs. Default is 200. - batch_size : int, optional - Batch size for training. Default is 32. - X : pandas.DataFrame, optional - Input data for training. If None and prepare_data is True, - will run prepare_training_data(). Default is None. - y : pandas.DataFrame, optional - Not used directly but included for API consistency. Default is None. - prepare_data : bool, optional - Whether to prepare training data if not already done. Default is True. - Returns ------- self : LPFA The fitted emulator. 
""" - if prepare_data and (X is None or self.X is None): - self.prepare_training_data() + if self.data_transformed is None: + self.logger.statement("transforming training data") + self.data_transformed = self._prepare_training_data() if self.model is None: self.create_model() @@ -421,7 +429,7 @@ def fit(self, epochs=200, batch_size=32, X=None, y=None, prepare_data=True): # Simple fit - scikit-learn handles batching, early stopping, etc. self.logger.statement(f"fitting model with MLPRegressor: {epochs} epochs") - X_train = self.X if X is None else X + X_train = self.X y_train = self.y # Fit the model @@ -472,7 +480,7 @@ def predict(self, data): self.logger.statement("applying transformations to input data") # Apply transfrom pipeline if it was used during training - truth_transformed = self.feature_transformer.transform(truth) + truth_transformed = self.transformer_pipeline.transform(truth) # Apply row-wise min-max scaling @@ -486,8 +494,8 @@ def predict(self, data): truth_scaled = forecast_rowwise_mm_scaler.transform(truth_transformed) # Extract input columns and apply PCA transformation - X_truth = truth_scaled.loc[:, self.input_cols].copy() - y_truth = truth_scaled.loc[:, self.forecast_names].copy() + X_truth = truth_scaled.loc[:, self.input_names].copy() + y_truth = truth_scaled.loc[:, self.output_names].copy() # Apply PCA transform truth_pca = self.pcaX.transform(X_truth.values) @@ -518,9 +526,9 @@ def predict(self, data): pred_transformed = forecast_rowwise_mm_scaler.inverse_transform(pred_scaled) # Assign predictions to output - predictions.loc[:, self.forecast_names] = pred_transformed.loc[:, self.forecast_names] + predictions.loc[:, self.output_names] = pred_transformed.loc[:, self.output_names] # Finally, inverse the transform pipeline if it was applied (was the first transform) - predictions = self.feature_transformer.inverse(predictions) + predictions = self.transformer_pipeline.inverse(predictions) return predictions \ No newline at end of file From 23f57acfd049c5b20db95cdf6a6feb78f79c5c78 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 2 Jul 2025 15:20:17 +0100 Subject: [PATCH 33/58] functional gpr class + pestpp setup --- pyemu/emulators/gpr.py | 495 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 495 insertions(+) create mode 100644 pyemu/emulators/gpr.py diff --git a/pyemu/emulators/gpr.py b/pyemu/emulators/gpr.py new file mode 100644 index 000000000..ac43b567a --- /dev/null +++ b/pyemu/emulators/gpr.py @@ -0,0 +1,495 @@ +""" +Gaussian Process Regression (GPR) emulator implementation. +""" +from __future__ import print_function, division +import numpy as np +import pandas as pd +import os +import shutil +import inspect +from .base import Emulator +from .transformers import AutobotsAssemble +from sklearn.gaussian_process import GaussianProcessRegressor +from pyemu.utils import run + +from pyemu.pst import Pst + + +class GPR(Emulator): + """ + Gaussian Process Regression (GPR) emulator class. + + This class implements a GPR-based emulator that trains separate Gaussian Process + models for each output variable. It supports various kernel types, feature + transformations, and provides uncertainty quantification. + + Parameters + ---------- + data : pandas.DataFrame, optional + Input and output features for training. + input_names : list of str, optional + Names of input features to use. If None, all columns in input_data are used. + output_names : list of str, optional + Names of output variables to emulate. If None, all columns in output_data are used. 
+ kernel : sklearn kernel object, optional + Kernel to use for GP regression. If None, defaults to Matern kernel. + transforms : list of dict, optional. Defaults to [{'type': 'standard_scaler'}] + n_restarts_optimizer : int, optional + Number of restarts for kernel hyperparameter optimization. Default is 10. + return_std : bool, optional + Whether to return prediction uncertainties. Default is True. + verbose : bool, optional + Enable verbose logging. Default is True. + """ + + def __init__(self, + data, + input_names=None, + output_names=None, + kernel=None, + transforms=[{'type': 'standard_scaler'}], + n_restarts_optimizer=10, + return_std=True, + verbose=True): + """Initialize the GPR emulator.""" + + super().__init__(verbose=verbose) + + # Store initialization parameters + # check data is a DataFrame + if not isinstance(data, pd.DataFrame): + raise ValueError("data must be a pandas DataFrame") + self.data = data.copy() + + # Check input and output names + # check input_names and output_names are lists or None + if input_names is not None and not isinstance(input_names, list): + raise ValueError("input_names must be a list or None") + if output_names is not None and not isinstance(output_names, list): + raise ValueError("output_names must be a list or None") + self.input_names = input_names + self.output_names = output_names + + self.kernel = kernel + self.transforms = transforms + self.n_restarts_optimizer = n_restarts_optimizer + self.return_std = return_std + + # Initialize data + self.data = data + + # Model storage + self.models = {} + self.model_info = None + self.verification_results = {} + + # PEST++ integration + self.template_dir = None + + # Validate transforms parameter + if transforms is not None: + self._validate_transforms(transforms) + self._validate_transforms_for_gpr() + + def _validate_transforms_for_gpr(self): + """Validate transforms parameter for GPR. 
Make sure transforms are only applied to input data.""" + # Validate transforms parameter + transforms = self.transforms + if transforms is not None: + # For the speicif case of GPR, we only transform input data + for t in transforms: + if 'columns' in t: + # check if any columns are in output_names + if self.output_names is not None: + common_cols = set(t['columns']).intersection(self.output_names) + if common_cols: + self.logger.statement(f"Transform {t['type']} will not be applied to output columns: {common_cols}") + # remove these columns from transforms + t['columns'] = [col for col in t['columns'] if col not in common_cols] + if not t['columns']: + self.logger.statement(f"Transform {t['type']} has no columns left after removing output columns: {common_cols}") + # remove this transform + self.logger.statement(f"Removing transform {t['type']} as it has no columns left") + self.transforms.remove(t) + else: + self.logger.statement(f"Transform {t['type']} has no specified columns, applying to all input columns") + t['columns'] = self.input_names if self.input_names is not None else [] + return transforms + +# def _combine_input_output_data(self, input_data, output_data): +# """Combine input and output data into a single DataFrame.""" +# if input_data.shape[0] != output_data.shape[0]: +# raise ValueError("Input and output data must have the same number of rows") +# +# combined = input_data.copy() +# for col in output_data.columns: +# if col not in combined.columns: +# combined[col] = output_data[col] +# else: +# self.logger.statement(f"Warning: column '{col}' exists in both input and output data, using output data") +# combined[col] = output_data[col] +# +# return combined + + def _setup_kernel(self): + """Set up the GP kernel if not provided.""" + if self.kernel is None: + try: + from sklearn.gaussian_process.kernels import Matern,ConstantKernel,RBF + self.kernel = ConstantKernel(1.0, (1e-3, 1e3)) * Matern( + length_scale=np.ones(len(self.input_names)) * 2.0, + length_scale_bounds=(1e-4, 1e4), + nu=1.5) + self.logger.statement("Using default Matern kernel") + except ImportError: + raise ImportError("scikit-learn is required for GPR emulator") + + # Log kernel hyperparameters + self.logger.statement(f"Using kernel: {self.kernel}") + + + def _prepare_training_data(self): + """ + Prepare and transform training data for model fitting. + + Parameters + ---------- + self : GPR + The GPR emulator instance containing the data and configuration. + + Returns + ------- + pandas.DataFrame + Processed data ready for model fitting. + """ + + if self.data is None: + raise ValueError("No data provided and no data stored in the emulator") + data = self.data + + # Apply feature transformations if specified + if self.transforms is not None: + self._validate_transforms_for_gpr() + self.logger.statement("applying feature transforms") + self.data_transformed = self._fit_transformer_pipeline(data, self.transforms) + else: + # Still need to set up a dummy transformer for consistency + from .transformers import AutobotsAssemble + self.transformer_pipeline = AutobotsAssemble(data.copy()) + self.data_transformed = data.copy() + + return self.data_transformed + + + def fit(self): + """ + Fit the emulator to training data. + + Parameters + ---------- + self: GPR + The GPR emulator instance containing the data and configuration. + + Returns + ------- + self : GPR + Fitted GPR emulator instance. 
+ """ + + if self.data_transformed is None: + self.logger.statement("transforming training data") + self.data_transformed = self._prepare_training_data() + if self.kernel is None: + self._setup_kernel() + # transformed input data + X_transformed = self.data_transformed.loc[:,self.input_names].copy() + y_transformed = self.data_transformed.loc[:,self.output_names].copy() #Note that these are actualy not transformed + + assert X_transformed.shape[0] == y_transformed.shape[0], \ + "Input and output data must have the same number of rows" + assert X_transformed.shape[1] > 0, "Input data must have at least one feature" + assert y_transformed.shape[1] > 0, "Output data must have at least one variable" + + # Create and fit separate GPR model for each output + self.gpr_models = {} + for output_name in self.output_names: + gpr = GaussianProcessRegressor( + kernel=self.kernel, + #alpha=self.alpha, + n_restarts_optimizer=self.n_restarts_optimizer, + #random_state=self.random_state + ) + + # Fit the GPR model for this output + gpr.fit(X_transformed.loc[:,self.input_names].values, y_transformed.loc[:,output_name].values) + self.gpr_models[output_name] = gpr + + self.fitted = True + return self + + def predict(self, X, return_std=False): + """ + Make predictions using the fitted GPR emulators. + + Parameters + ---------- + X : pandas.DataFrame + Input features for prediction + return_std : bool, default False + Whether to return prediction standard deviation + + Returns + ------- + predictions : pandas.DataFrame + Predicted values for each output + std : pandas.DataFrame, optional + Prediction standard deviations (if return_std=True) + """ + if not self.fitted: + raise ValueError("Emulator must be fitted before making predictions") + + if not hasattr(self, 'transformer_pipeline') or self.transformer_pipeline is None: + raise ValueError("Emulator must be fitted and have valid transformations before prediction") + + # Apply same transforms as training data + X_transformed = self.transformer_pipeline.transform(X.copy()) + + + # Make predictions for each output + predictions_dict = {} + std_dict = {} + + for output_name in self.output_names: + gpr = self.gpr_models[output_name] + + if return_std: + pred, std = gpr.predict(X_transformed.values, return_std=True) + predictions_dict[output_name] = pred + std_dict[output_name] = std + else: + pred = gpr.predict(X_transformed.values) + predictions_dict[output_name] = pred + + # Convert to DataFrame + predictions_df = pd.DataFrame(predictions_dict, index=X.index) + + if return_std: + std_df = pd.DataFrame(std_dict, index=X.index) + return predictions_df, std_df + else: + return predictions_df + + + def scrape_pst_dir(self,pst_dir,casename): + + if not os.path.exists(pst_dir): + raise FileNotFoundError(f"PEST control file {pst_dir} does not exist") + + pst = Pst(os.path.join(pst_dir,casename + ".pst")) + + # work out input variable names + input_groups = pst.pestpp_options.get("opt_dec_var_groups",None) + par = pst.parameter_data + if input_groups is None: + print("using all adjustable parameters as inputs") + input_names = pst.adj_par_names + else: + input_groups = set([i.strip() for i in input_groups.lower().strip().split(",")]) + print("input groups:",input_groups) + adj_par = par.loc[pst.adj_par_names,:].copy() + adj_par = adj_par.loc[adj_par.pargp.apply(lambda x: x in input_groups),:] + input_names = adj_par.parnme.tolist() + print("input names:",input_names) + + #work out constraints and objectives + ineq_names = pst.less_than_obs_constraints.tolist() + 
+        ineq_names.extend(pst.greater_than_obs_constraints.tolist())
+        obs = pst.observation_data
+        objs = pst.pestpp_options.get("mou_objectives",None)
+        constraints = []
+
+        if objs is None:
+            print("'mou_objectives' not found in ++ options, using all ineq tagged non-zero weighted obs as objectives")
+            objs = ineq_names
+        else:
+            objs = objs.lower().strip().split(',')
+            constraints = [n for n in ineq_names if n not in objs]
+
+        print("objectives:",objs)
+        print("constraints:",constraints)
+        output_names = objs
+        output_names.extend(constraints)
+
+        return pst, input_names, output_names, objs, constraints
+
+
+    def prepare_pestpp(self,pst_dir,casename,gpr_t_d="gpr_template"):
+        """
+        Prepare a PEST++ template directory for the GPR emulator.
+
+        Parameters
+        ----------
+        pst_dir : str
+            Path to the directory containing the existing PEST control file. The
+            assumption is that an existing PEST setup exists for the process-based model.
+        casename : str
+            Case name of the existing control file (without the ".pst" extension).
+        gpr_t_d : str
+            Path to the PEST++ template directory to create.
+
+        Returns
+        -------
+        None
+        """
+
+        #TODO: it may be more logical to pass in a Pst object instead of a file path; assume the user loads Pst and training data beforehand?
+        # Give Emulators a "harvest" function that returns a Pst object with the necessary information?
+
+        # what are the things we need to get from Pst?
+        # 1. decision variable names (parameters) a.k.a. input_names
+        # 2. observation names (outputs) aka output_names
+        # 3. which obs are objectives; subset of output_names
+        # 4. which obs are constraints; subset of output_names
+
+        pst, input_names, output_names, objs, constraints = self.scrape_pst_dir(pst_dir,casename)
+
+        # check that all input_names are in par data
+        if self.input_names is None:
+            raise ValueError("input_names must be provided")
+        missing_inputs = set(self.input_names) - set(pst.parameter_data.index)
+        if missing_inputs:
+            raise ValueError(f"Input names {missing_inputs} not found in parameter data")
+        # check that all input names are adjustable
+        fixed_inputs = pst.parameter_data.loc[self.input_names, "partrans"].str.contains("fixed|tied", case=False, na=False)
+        if fixed_inputs.any():
+            raise ValueError(f"Input names {self.input_names[fixed_inputs]} cannot be fixed or tied")
+        self.logger.statement(f"Decision variable parameter names: {self.input_names}")
+
+        # check that all self.output_names are in observation_data
+        if self.output_names is None:
+            raise ValueError("output_names must be provided")
+        missing_outputs = set(self.output_names) - set(pst.observation_data.index)
+        if missing_outputs:
+            raise ValueError(f"Output names {missing_outputs} not found in observation data")
+        self.logger.statement(f"Observation names: {self.output_names}")
+
+        # prepare the GPR template directory
+        if os.path.exists(gpr_t_d):
+            self.logger.statement(f"Removing existing template directory {gpr_t_d}")
+            shutil.rmtree(gpr_t_d)
+        self.logger.statement(f"Creating template directory {gpr_t_d}")
+        os.makedirs(gpr_t_d)
+
+        # pickle
+        self.save(os.path.join(gpr_t_d, "gpr_emulator.pkl"))
+        self.logger.statement(f"Saved GPR emulator to {os.path.join(gpr_t_d, 'gpr_emulator.pkl')}")
+
+        # prepare template files
+        self.logger.statement("Preparing PEST++ template files")
+
+        #write a template file
+        tpl_fname = os.path.join(gpr_t_d,"gpr_input.csv.tpl")
+        with open(tpl_fname,'w') as f:
+            f.write("ptf ~\nparnme,parval1\n")
+            for input_name in self.input_names:
+                f.write("{0},~ {0} ~\n".format(input_name))
+        # keep track of other non-decvar parameters
+        other_pars =
list(set(pst.parameter_data.parnme.tolist())-set(self.input_names)) + aux_tpl_fname = None + if len(other_pars) > 0: + aux_tpl_fname = os.path.join(gpr_t_d,"aux_par.csv.tpl") + print("writing aux par tpl file: ",aux_tpl_fname) + with open(aux_tpl_fname,'w') as f: + f.write("ptf ~\n") + for input_name in other_pars: + f.write("{0},~ {0} ~\n".format(input_name)) + + #write an ins file + ins_fname = os.path.join(gpr_t_d,"gpr_output.csv.ins") + with open(ins_fname,'w') as f: + f.write("pif ~\nl1\n") + for output_name in self.output_names: + if self.return_std: + f.write("l1 ~,~ !{0}! ~,~ !{0}_gprstd!\n".format(output_name)) + else: + f.write("l1 ~,~ !{0}!\n".format(output_name)) + + # build the GPR Pst object + self.logger.statement("Building PEST++ control file") + tpl_list = [tpl_fname] + if aux_tpl_fname is not None: + tpl_list.append(aux_tpl_fname) + input_list = [f.replace(".tpl","") for f in tpl_list] + gpst = Pst.from_io_files(tpl_list,input_list, + [ins_fname],[ins_fname.replace(".ins","")],pst_path=".") + + + def fix_df_col_type(orgdf,fixdf): + for col in orgdf.columns: + # this gross thing is to avoid a future error warning in pandas - + # why is it getting so strict?! isn't python duck-typed? + if col in fixdf.columns and\ + fixdf.dtypes[col] != orgdf.dtypes[col]: + fixdf[col] = fixdf[col].astype(orgdf.dtypes[col]) + fixdf.loc[orgdf.index,col] = orgdf.loc[orgdf.index,col].values + return + + fix_df_col_type(orgdf=pst.parameter_data,fixdf=gpst.parameter_data) + fix_df_col_type(orgdf=pst.observation_data,fixdf=gpst.observation_data) + + if self.return_std: + stdobs = [o for o in gpst.obs_names if o.endswith("_gprstd")] + assert len(stdobs) > 0 + gpst.observation_data.loc[stdobs,"weight"] = 0.0 + + gpst.pestpp_options = pst.pestpp_options + gpst.prior_information = pst.prior_information.copy() + + gpst.model_command = "python forward_run.py" + frun_lines = inspect.getsource(gpr_forward_run) + with open(os.path.join(gpr_t_d, "forward_run.py"), 'w') as f: + f.write("\n") + for import_name in ["pandas as pd","os","numpy as np"]: + f.write("import {0}\n".format(import_name)) + for line in frun_lines: + f.write(line) + f.write("if __name__ == '__main__':\n") + f.write(" gpr_forward_run()\n") + + + + gpst.control_data.noptmax = 0 + + gpst_fname = f"{casename}_gpr.pst" + gpst.write(os.path.join(gpr_t_d,gpst_fname),version=2) + print("saved gpr pst:",gpst_fname,"in gpr_t_d",gpr_t_d) + try: + run("pestpp-mou {0}".format(gpst_fname),cwd=gpr_t_d) + except Exception as e: + print("WARNING: pestpp-mou test run failed: {0}".format(str(e))) + gpst.control_data.noptmax = pst.control_data.noptmax + gpst.write(os.path.join(gpr_t_d, gpst_fname), version=2) + + return + +def gpr_forward_run(): + """the function to evaluate a set of inputs thru the GPR emulators.\ + This function gets added programmatically to the forward run process""" + import pandas as pd + from pyemu.emulators import GPR + input_df = pd.read_csv("gpr_input.csv",index_col=0).T + + gpr = GPR.load("gpr_emulator.pkl") + df = pd.DataFrame(index=gpr.output_names, + columns=["sim","sim_std"]) + df.index.name = "output_name" + if gpr.return_std: + predmean,predstdv = gpr.predict(input_df.loc[:,gpr.input_names], return_std=True) + df.loc[:,"sim"] = predmean[df.index].values + df.loc[:,"sim_std"] = predstdv[df.index].values + else: + predmean = gpr.predict(input_df.loc[:,gpr.input_names]) + df.loc[:,"sim"] = predmean[df.index].values + df.to_csv("gpr_output.csv",index=True) + return df \ No newline at end of file From 
a50fe5147b49131414b06e81c79f5ddf9322d208 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 2 Jul 2025 16:46:58 +0100 Subject: [PATCH 34/58] gpr tests --- autotest/emulator_tests.py | 600 ++++++++++++++++++++++++++++++++++++- 1 file changed, 596 insertions(+), 4 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index b2aa7fcb9..72370d827 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -6,10 +6,11 @@ import pandas as pd import platform import pyemu -from pst_from_tests import setup_tmp, ies_exe_path, _get_port +from pst_from_tests import setup_tmp, bin_path, _get_port from pyemu.emulators import DSI, LPFA, GPR - +ies_exe_path = os.path.join(bin_path, "pestpp-ies") +mou_exe_path = os.path.join(bin_path, "pestpp-mou") def dsi_freyberg(tmp_d,transforms=None,tag=""): @@ -133,7 +134,7 @@ def test_dsivc_freyberg(): worker_root = "." pyemu.os_utils.start_workers(td, - "pestpp-mou", + mou_exe_path, "dsivc.pst", num_workers=num_workers, worker_root=worker_root, @@ -318,6 +319,595 @@ def test_lpfa_std(): ]) return + +def gpr_compare_invest(): + import numpy as np + from sklearn.gaussian_process import GaussianProcessRegressor + case = "zdt1" + use_chances = False + m_d = os.path.join(case+"_gpr_baseline") + org_d = os.path.join("utils",case+"_template") + t_d = case+"_template" + if os.path.exists(t_d): + shutil.rmtree(t_d) + shutil.copytree(org_d,t_d) + if os.path.exists(m_d): + shutil.rmtree(m_d) + + pst = pyemu.Pst(os.path.join(t_d, case+".pst")) + pst.pestpp_options["mou_generator"] = "pso" + if use_chances: + pst.pestpp_options["opt_risk"] = 0.95 + pst.pestpp_options["opt_stack_size"] = 50 + pst.pestpp_options["opt_recalc_chance_every"] = 10000 + pst.pestpp_options["opt_chance_points"] = "single" + else: + pst.pestpp_options["opt_risk"] = 0.5 + + pop_size = 60 + num_workers = 60 + noptmax_full = 30 + noptmax_inner = 10 + noptmax_outer = 5 + port = 4554 + pst.control_data.noptmax = noptmax_full + pst.pestpp_options["mou_population_size"] = pop_size + pst.pestpp_options["mou_save_population_every"] = 1 + pst.write(os.path.join(t_d, case+".pst")) + if not os.path.exists(m_d): + pyemu.os_utils.start_workers(t_d, mou_exe_path, case+".pst", num_workers, worker_root=".", + master_dir=m_d, verbose=True, port=port) + #shutil.copytree(t_d,m_d) + #pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=m_d) + # use the initial population files for training + dv_pops = [os.path.join(m_d,"{0}.0.dv_pop.csv".format(case))] + obs_pops = [f.replace("dv_","obs_") for f in dv_pops] + + pst_fname = os.path.join(m_d,case+".pst") + gpr_t_d = os.path.join(case+"_gpr_template") + pyemu.helpers.prep_for_gpr(pst_fname,dv_pops,obs_pops,t_d=m_d,gpr_t_d=gpr_t_d,nverf=int(pop_size*.1),\ + plot_fits=True,apply_standard_scalar=False,include_emulated_std_obs=True) + gpst = pyemu.Pst(os.path.join(gpr_t_d,case+".pst")) + shutil.copy2(os.path.join(m_d,case+".0.dv_pop.csv"),os.path.join(gpr_t_d,"initial_dv_pop.csv")) + gpst.pestpp_options["mou_dv_population_file"] = "initial_dv_pop.csv" + gpst.control_data.noptmax = noptmax_full + gpst.write(os.path.join(gpr_t_d,case+".pst"),version=2) + gpr_m_d = gpr_t_d.replace("template","master") + if os.path.exists(gpr_m_d): + shutil.rmtree(gpr_m_d) + pyemu.os_utils.start_workers(gpr_t_d, mou_exe_path, case+".pst", num_workers, worker_root=".", + master_dir=gpr_m_d, verbose=True, port=port) + + #o1 = pd.read_csv(os.path.join(m_d,case+".{0}.obs_pop.csv".format(max(0,pst.control_data.noptmax)))) + o1 = 
pd.read_csv(os.path.join(m_d,case+".pareto.archive.summary.csv")) + o1 = o1.loc[o1.generation == o1.generation.max(), :] + o1 = o1.loc[o1.is_feasible == True, :] + o1 = o1.loc[o1.nsga2_front == 1, :] + + + import matplotlib.pyplot as plt + o2 = pd.read_csv(os.path.join(gpr_m_d, case + ".{0}.obs_pop.csv".format(max(0, gpst.control_data.noptmax)))) + fig,ax = plt.subplots(1,1,figsize=(5,5)) + ax.scatter(o1.obj_1,o1.obj_2,c="r",s=10) + ax.scatter(o2.obj_1,o2.obj_2,c="0.5",s=10,alpha=0.5) + plt.tight_layout() + plt.savefig("gpr_{0}_compare_noiter.pdf".format(case)) + plt.close(fig) + + # now lets try an inner-outer scheme... + + gpst.control_data.noptmax = noptmax_inner + gpst.write(os.path.join(gpr_t_d,case+".pst"),version=2) + gpr_t_d_iter = gpr_t_d+"_outeriter{0}".format(0) + if os.path.exists(gpr_t_d_iter): + shutil.rmtree(gpr_t_d_iter) + shutil.copytree(gpr_t_d,gpr_t_d_iter) + for iouter in range(1,noptmax_outer+1): + #run the gpr emulator + gpr_m_d_iter = gpr_t_d_iter.replace("template","master") + complex_m_d_iter = t_d.replace("template", "master_complex_retrain_outeriter{0}".format(iouter)) + if os.path.exists(gpr_m_d_iter): + shutil.rmtree(gpr_m_d_iter) + pyemu.os_utils.start_workers(gpr_t_d_iter, mou_exe_path, case+".pst", num_workers, worker_root=".", + master_dir=gpr_m_d_iter, verbose=True, port=port) + o2 = pd.read_csv(os.path.join(gpr_m_d_iter,case+".{0}.obs_pop.csv".format(gpst.control_data.noptmax))) + + # now run the final dv pop thru the "complex" model + final_gpr_dvpop_fname = os.path.join(gpr_m_d_iter,case+".archive.dv_pop.csv") + assert os.path.exists(final_gpr_dvpop_fname) + complex_model_dvpop_fname = os.path.join(t_d,"gpr_outeriter{0}_dvpop.csv".format(iouter)) + if os.path.exists(complex_model_dvpop_fname): + os.remove(complex_model_dvpop_fname) + # load the gpr archive and do something clever to pick new points to eval + # with the complex model + dvpop = pd.read_csv(final_gpr_dvpop_fname,index_col=0) + if dvpop.shape[0] > pop_size: + arc_sum = pd.read_csv(os.path.join(gpr_m_d_iter,case+".pareto.archive.summary.csv")) + as_front_map = {member:front for member,front in zip(arc_sum.member,arc_sum.nsga2_front)} + as_crowd_map = {member: crowd for member, crowd in zip(arc_sum.member, arc_sum.nsga2_crowding_distance)} + as_feas_map = {member: feas for member, feas in zip(arc_sum.member, arc_sum.feasible_distance)} + as_gen_map = {member: gen for member, gen in zip(arc_sum.member, arc_sum.generation)} + + dvpop.loc[:,"front"] = dvpop.index.map(lambda x: as_front_map.get(x,np.nan)) + dvpop.loc[:, "crowd"] = dvpop.index.map(lambda x: as_crowd_map.get(x, np.nan)) + dvpop.loc[:,"feas"] = dvpop.index.map(lambda x: as_feas_map.get(x,np.nan)) + dvpop.loc[:, "gen"] = dvpop.index.map(lambda x: as_gen_map.get(x, np.nan)) + #drop members that have missing archive info + dvpop = dvpop.dropna() + if dvpop.shape[0] > pop_size: + dvpop.sort_values(by=["gen","feas","front","crowd"],ascending=[False,True,True,False],inplace=True) + dvpop = dvpop.iloc[:pop_size,:] + dvpop.drop(["gen","feas","front","crowd"],axis=1,inplace=True) + + #shutil.copy2(final_gpr_dvpop_fname,complex_model_dvpop_fname) + dvpop.to_csv(complex_model_dvpop_fname) + pst.pestpp_options["mou_dv_population_file"] = os.path.split(complex_model_dvpop_fname)[1] + pst.control_data.noptmax = -1 + pst.write(os.path.join(t_d,case+".pst"),version=2) + + pyemu.os_utils.start_workers(t_d, mou_exe_path, case+".pst", num_workers, worker_root=".", + master_dir=complex_m_d_iter, verbose=True, port=port) + + # plot the complex model 
results... + o2 = pd.read_csv(os.path.join(complex_m_d_iter, case + ".pareto.archive.summary.csv")) + o2 = o2.loc[o2.generation == o2.generation.max(), :] + #o2 = o2.loc[o2.is_feasible==True,:] + o2 = o2.loc[o2.nsga2_front == 1, :] + fig, ax = plt.subplots(1, 1, figsize=(5, 5)) + ax.scatter(o1.obj_1, o1.obj_2,c="r",s=10,label="full complex") + ax.scatter(o2.obj_1, o2.obj_2,c="0.5",s=10,alpha=0.5,label="mixed emulated-complex") + ax.legend(loc="upper right") + ax.set_xlim(0,10) + ax.set_ylim(0,20) + plt.tight_layout() + plt.savefig("gpr_{0}_compare_iterscheme_{1}.pdf".format(case,iouter)) + plt.close(fig) + + # now add those complex model input-output pop files to the list and retrain + # the gpr + dv_pops.append(os.path.join(complex_m_d_iter,case+".0.dv_pop.csv")) + obs_pops.append(os.path.join(complex_m_d_iter,case+".0.obs_pop.csv")) + gpr_t_d_iter = gpr_t_d+"_outeriter{0}".format(iouter) + pyemu.helpers.prep_for_gpr(pst_fname,dv_pops,obs_pops,t_d=gpr_t_d,gpr_t_d=gpr_t_d_iter,nverf=int(pop_size*.1), + plot_fits=True,apply_standard_scalar=False,include_emulated_std_obs=True) + gpst_iter = pyemu.Pst(os.path.join(gpr_t_d_iter,case+".pst")) + #aggdf = pd.read_csv(os.path.join(gpr_t_d,"gpr_aggregate_training_data.csv"),index_col=0) + #aggdf.index = ["outeriter{0}_member{1}".format(iouter,i) for i in range(aggdf.shape[0])] + restart_gpr_dvpop_fname = "gpr_restart_dvpop_outeriter{0}.csv".format(iouter) + #aggdf.to_csv(os.path.join(gpr_t_d_iter,restart_gpr_dvpop_fname)) + shutil.copy2(os.path.join(complex_m_d_iter,case+".0.dv_pop.csv"),os.path.join(gpr_t_d_iter,restart_gpr_dvpop_fname)) + gpst_iter.pestpp_options["mou_dv_population_file"] = restart_gpr_dvpop_fname + gpst_iter.control_data.noptmax = gpst.control_data.noptmax + gpst_iter.write(os.path.join(gpr_t_d_iter,case+".pst"),version=2) + + +def gpr_constr_invest(): + import numpy as np + from sklearn.gaussian_process import GaussianProcessRegressor + case = "constr" + use_chances = False + m_d = os.path.join(case + "_gpr_baseline") + org_d = os.path.join("utils", case + "_template") + t_d = case + "_template" + if os.path.exists(t_d): + shutil.rmtree(t_d) + shutil.copytree(org_d, t_d) + if os.path.exists(m_d): + shutil.rmtree(m_d) + + pst = pyemu.Pst(os.path.join(t_d, case + ".pst")) + pst.pestpp_options["mou_generator"] = "pso" + if use_chances: + pst.pestpp_options["opt_risk"] = 0.95 + pst.pestpp_options["opt_stack_size"] = 50 + pst.pestpp_options["opt_recalc_chance_every"] = 10000 + pst.pestpp_options["opt_chance_points"] = "single" + else: + pst.pestpp_options["opt_risk"] = 0.5 + + pop_size = 15 + num_workers = 5 + noptmax_full = 3 + noptmax_inner = 2 + noptmax_outer = 2 + port = 4554 + pst.control_data.noptmax = -1 + pst.pestpp_options["mou_population_size"] = pop_size + pst.pestpp_options["mou_save_population_every"] = 1 + pst.write(os.path.join(t_d, case + ".pst")) + #if not os.path.exists(m_d): + # pyemu.os_utils.start_workers(t_d, mou_exe_path, case + ".pst", num_workers, worker_root=".", + # master_dir=m_d, verbose=True, port=port) + if os.path.exists(m_d): + shutil.rmtree(m_d) + shutil.copytree(t_d,m_d) + pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=m_d) + # use the initial population files for training + dv_pops = [os.path.join(m_d, "{0}.0.dv_pop.csv".format(case))] + obs_pops = [f.replace("dv_", "obs_") for f in dv_pops] + + pst_fname = os.path.join(m_d, case + ".pst") + gpr_t_d = os.path.join(case + "_gpr_template") + pyemu.helpers.prep_for_gpr(pst_fname, dv_pops, obs_pops,t_d=m_d, gpr_t_d=gpr_t_d, 
nverf=int(pop_size * .1), \ + plot_fits=True, apply_standard_scalar=False, include_emulated_std_obs=True) + gpst = pyemu.Pst(os.path.join(gpr_t_d, case + ".pst")) + #shutil.copy2(os.path.join(m_d, case + ".0.dv_pop.csv"), os.path.join(gpr_t_d, "initial_dv_pop.csv")) + #gpst.pestpp_options["mou_dv_population_file"] = "initial_dv_pop.csv" + gpst.pestpp_options.pop("mou_dv_population_file",None) #= "initial_dv_pop.csv" + + gpst.control_data.noptmax = noptmax_full + gpst.write(os.path.join(gpr_t_d, case + ".pst"), version=2) + gpr_m_d = gpr_t_d.replace("template", "master") + if os.path.exists(gpr_m_d): + shutil.rmtree(gpr_m_d) + #pyemu.os_utils.start_workers(gpr_t_d, mou_exe_path, case + ".pst", num_workers, worker_root=".", + # master_dir=gpr_m_d, verbose=True, port=port) + shutil.copytree(gpr_t_d,gpr_m_d) + pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=gpr_m_d) + + # o1 = pd.read_csv(os.path.join(m_d,case+".{0}.obs_pop.csv".format(max(0,pst.control_data.noptmax)))) + o1 = pd.read_csv(os.path.join(m_d, case + ".pareto.archive.summary.csv")) + o1 = o1.loc[o1.generation == o1.generation.max(), :] + o1 = o1.loc[o1.is_feasible == True, :] + o1 = o1.loc[o1.nsga2_front == 1, :] + + # import matplotlib.pyplot as plt + # o2 = pd.read_csv(os.path.join(gpr_m_d, case + ".{0}.obs_pop.csv".format(max(0, gpst.control_data.noptmax)))) + # fig, ax = plt.subplots(1, 1, figsize=(5, 5)) + # ax.scatter(o1.obj_1, o1.obj_2, c="r", s=10) + # ax.scatter(o2.obj_1, o2.obj_2, c="0.5", s=10, alpha=0.5) + # plt.tight_layout() + # plt.savefig("gpr_{0}_compare_noiter.pdf".format(case)) + # plt.close(fig) + + # now lets try an inner-outer scheme... + + gpst.control_data.noptmax = noptmax_inner + gpst.write(os.path.join(gpr_t_d, case + ".pst"), version=2) + gpr_t_d_iter = gpr_t_d + "_outeriter{0}".format(0) + if os.path.exists(gpr_t_d_iter): + shutil.rmtree(gpr_t_d_iter) + shutil.copytree(gpr_t_d, gpr_t_d_iter) + for iouter in range(1, noptmax_outer + 1): + # run the gpr emulator + gpr_m_d_iter = gpr_t_d_iter.replace("template", "master") + complex_m_d_iter = t_d.replace("template", "master_complex_retrain_outeriter{0}".format(iouter)) + if os.path.exists(gpr_m_d_iter): + shutil.rmtree(gpr_m_d_iter) + shutil.copytree(gpr_t_d_iter,gpr_m_d_iter) + + pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=gpr_m_d_iter) + + #pyemu.os_utils.start_workers(gpr_t_d_iter, mou_exe_path, case + ".pst", num_workers, worker_root=".", + # master_dir=gpr_m_d_iter, verbose=True, port=port) + + o2 = pd.read_csv(os.path.join(gpr_m_d_iter, case + ".{0}.obs_pop.csv".format(gpst.control_data.noptmax))) + + # now run the final dv pop thru the "complex" model + final_gpr_dvpop_fname = os.path.join(gpr_m_d_iter, case + ".archive.dv_pop.csv") + assert os.path.exists(final_gpr_dvpop_fname) + complex_model_dvpop_fname = os.path.join(t_d, "gpr_outeriter{0}_dvpop.csv".format(iouter)) + if os.path.exists(complex_model_dvpop_fname): + os.remove(complex_model_dvpop_fname) + # load the gpr archive and do something clever to pick new points to eval + # with the complex model + dvpop = pd.read_csv(final_gpr_dvpop_fname, index_col=0) + if dvpop.shape[0] > pop_size: + arc_sum = pd.read_csv(os.path.join(gpr_m_d_iter, case + ".pareto.archive.summary.csv")) + as_front_map = {member: front for member, front in zip(arc_sum.member, arc_sum.nsga2_front)} + as_crowd_map = {member: crowd for member, crowd in zip(arc_sum.member, arc_sum.nsga2_crowding_distance)} + as_feas_map = {member: feas for member, feas in zip(arc_sum.member, 
arc_sum.feasible_distance)} + as_gen_map = {member: gen for member, gen in zip(arc_sum.member, arc_sum.generation)} + + dvpop.loc[:, "front"] = dvpop.index.map(lambda x: as_front_map.get(x, np.nan)) + dvpop.loc[:, "crowd"] = dvpop.index.map(lambda x: as_crowd_map.get(x, np.nan)) + dvpop.loc[:, "feas"] = dvpop.index.map(lambda x: as_feas_map.get(x, np.nan)) + dvpop.loc[:, "gen"] = dvpop.index.map(lambda x: as_gen_map.get(x, np.nan)) + # drop members that have missing archive info + dvpop = dvpop.dropna() + if dvpop.shape[0] > pop_size: + dvpop.sort_values(by=["gen", "feas", "front", "crowd"], ascending=[False, True, True, False], + inplace=True) + dvpop = dvpop.iloc[:pop_size, :] + dvpop.drop(["gen", "feas", "front", "crowd"], axis=1, inplace=True) + + # shutil.copy2(final_gpr_dvpop_fname,complex_model_dvpop_fname) + dvpop.to_csv(complex_model_dvpop_fname) + pst.pestpp_options["mou_dv_population_file"] = os.path.split(complex_model_dvpop_fname)[1] + pst.control_data.noptmax = -1 + pst.write(os.path.join(t_d, case + ".pst"), version=2) + if os.path.exists(complex_m_d_iter): + shutil.rmtree(complex_m_d_iter) + shutil.copytree(t_d,complex_m_d_iter) + #pyemu.os_utils.start_workers(t_d, mou_exe_path, case + ".pst", num_workers, worker_root=".", + # master_dir=complex_m_d_iter, verbose=True, port=port) + pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=complex_m_d_iter) + + # plot the complex model results... + o2 = pd.read_csv(os.path.join(complex_m_d_iter, case + ".pareto.archive.summary.csv")) + o2 = o2.loc[o2.generation == o2.generation.max(), :] + # o2 = o2.loc[o2.is_feasible==True,:] + o2 = o2.loc[o2.nsga2_front == 1, :] + # fig, ax = plt.subplots(1, 1, figsize=(5, 5)) + # ax.scatter(o1.obj_1, o1.obj_2, c="r", s=10, label="full complex") + # ax.scatter(o2.obj_1, o2.obj_2, c="0.5", s=10, alpha=0.5, label="mixed emulated-complex") + # ax.legend(loc="upper right") + # ax.set_xlim(0, 10) + # ax.set_ylim(0, 20) + # plt.tight_layout() + # plt.savefig("gpr_{0}_compare_iterscheme_{1}.pdf".format(case, iouter)) + # plt.close(fig) + + # now add those complex model input-output pop files to the list and retrain + # the gpr + dv_pops.append(os.path.join(complex_m_d_iter, case + ".0.dv_pop.csv")) + obs_pops.append(os.path.join(complex_m_d_iter, case + ".0.obs_pop.csv")) + gpr_t_d_iter = gpr_t_d + "_outeriter{0}".format(iouter) + pyemu.helpers.prep_for_gpr(pst_fname, dv_pops, obs_pops, t_d=gpr_t_d,gpr_t_d=gpr_t_d_iter, nverf=int(pop_size * .1), + plot_fits=True, apply_standard_scalar=False, include_emulated_std_obs=True) + gpst_iter = pyemu.Pst(os.path.join(gpr_t_d_iter, case + ".pst")) + # aggdf = pd.read_csv(os.path.join(gpr_t_d,"gpr_aggregate_training_data.csv"),index_col=0) + # aggdf.index = ["outeriter{0}_member{1}".format(iouter,i) for i in range(aggdf.shape[0])] + #restart_gpr_dvpop_fname = "gpr_restart_dvpop_outeriter{0}.csv".format(iouter) + # aggdf.to_csv(os.path.join(gpr_t_d_iter,restart_gpr_dvpop_fname)) + #shutil.copy2(os.path.join(complex_m_d_iter, case + ".0.dv_pop.csv"), + # os.path.join(gpr_t_d_iter, restart_gpr_dvpop_fname)) + gpst_iter.pestpp_options.pop("mou_dv_population_file",None)# = restart_gpr_dvpop_fname + gpst_iter.control_data.noptmax = gpst.control_data.noptmax + gpst_iter.write(os.path.join(gpr_t_d_iter, case + ".pst"), version=2) + + psum_fname = os.path.join(complex_m_d_iter,case+".pareto.archive.summary.csv") + assert os.path.exists(psum_fname) + psum = pd.read_csv(psum_fname) + #assert 1.0 in psum.obj_1.values + #assert 1.0 in psum.obj_2.values + + +def 
collate_training_data(pst,m_d,case): + + input_fnames = [os.path.join(m_d,"{0}.0.dv_pop.csv".format(case))] + output_fnames = [f.replace("dv_","obs_") for f in input_fnames] + + # work out input variable names + input_groups = pst.pestpp_options.get("opt_dec_var_groups",None) + par = pst.parameter_data + if input_groups is None: + print("using all adjustable parameters as inputs") + input_names = pst.adj_par_names + else: + input_groups = set([i.strip() for i in input_groups.lower().strip().split(",")]) + print("input groups:",input_groups) + adj_par = par.loc[pst.adj_par_names,:].copy() + adj_par = adj_par.loc[adj_par.pargp.apply(lambda x: x in input_groups),:] + input_names = adj_par.parnme.tolist() + print("input names:",input_names) + + #work out constraints and objectives + ineq_names = pst.less_than_obs_constraints.tolist() + ineq_names.extend(pst.greater_than_obs_constraints.tolist()) + obs = pst.observation_data + objs = pst.pestpp_options.get("mou_objectives",None) + constraints = [] + + if objs is None: + print("'mou_objectives' not found in ++ options, using all ineq tagged non-zero weighted obs as objectives") + objs = ineq_names + else: + objs = objs.lower().strip().split(',') + constraints = [n for n in ineq_names if n not in objs] + + print("objectives:",objs) + print("constraints:",constraints) + output_names = objs + output_names.extend(constraints) + + print("loading input and output files") + if isinstance(input_fnames,str): + input_fnames = [input_fnames] + if isinstance(output_fnames,str): + output_fnames = [output_fnames] + if len(output_fnames) != len(input_fnames): + raise Exception("len(input_fnames) != len(output_fnames)") + + + dfs = [] + for input_fname,output_fname in zip(input_fnames,output_fnames): + if input_fname.lower().endswith(".csv"): + input_df = pd.read_csv(os.path.join(input_fname),index_col=0) + elif input_fname.lower().endswith(".jcb"): + input_df = pyemu.ParameterEnsemble.from_binary(pst=pst,filename=input_fname)._df + else: + raise Exception("unrecognized input_fname extension:'{0}', looking for csv or jcb".\ + format(input_fname.lower())) + + if output_fname.lower().endswith(".csv"): + output_df = pd.read_csv(os.path.join(output_fname),index_col=0) + elif output_fname.lower().endswith(".jcb"): + output_df = pyemu.ObservationEnsemble.from_binary(pst=pst,filename=output_fname)._df + else: + raise Exception("unrecognized output_fname extension:'{0}', looking for csv or jcb".\ + format(output_fname.lower())) + + if input_df.shape[0] != output_df.shape[0]: + raise Exception("input rows != output rows for {0} and {1}".\ + format(input_fname,output_fname)) + input_df = input_df.loc[:,input_names] + assert input_df.shape == input_df.dropna().shape + + output_df = output_df.loc[:, output_names] + assert output_df.shape == output_df.dropna().shape + + input_df.loc[:,output_names] = output_df.values + dfs.append(input_df) + print("...loaded",input_fname,output_fname) + + data = pd.concat(dfs) + assert data.shape == data.dropna().shape + #df.to_csv(os.path.join(gpr_t_d,"gpr_aggregate_training_data.csv")) + #print("aggregated training dataset shape",df.shape,"saved to",pst_fname + ".aggresults.csv") + return data, input_names, output_names + +def gpr_zdt1_test(): + import numpy as np + import subprocess as sp + import multiprocessing as mp + from datetime import datetime + from sklearn.gaussian_process import GaussianProcessRegressor + case = "zdt1" + use_chances = False + m_d = os.path.join(case + "_gpr_baseline") + org_d = os.path.join("utils", case + 
"_template") + t_d = case + "_template" + if os.path.exists(t_d): + shutil.rmtree(t_d) + shutil.copytree(org_d, t_d) + if os.path.exists(m_d): + shutil.rmtree(m_d) + + pst = pyemu.Pst(os.path.join(t_d, case + ".pst")) + pst.pestpp_options["mou_generator"] = "pso" + pst.pestpp_options["overdue_giveup_fac"] = 1e10 + pst.pestpp_options["overdue_resched_fac"] = 1e10 + if use_chances: + pst.pestpp_options["opt_risk"] = 0.95 + pst.pestpp_options["opt_stack_size"] = 50 + pst.pestpp_options["opt_recalc_chance_every"] = 10000 + pst.pestpp_options["opt_chance_points"] = "single" + else: + pst.pestpp_options["opt_risk"] = 0.5 + + pop_size = 20 + num_workers = 10 + noptmax_full = 1 + + port = 4569 + pst.control_data.noptmax = -1 + pst.pestpp_options["mou_population_size"] = pop_size + pst.pestpp_options["mou_save_population_every"] = 1 + pst.write(os.path.join(t_d, case + ".pst")) + #if not os.path.exists(m_d): + # pyemu.os_utils.start_workers(t_d, mou_exe_path, case + ".pst", num_workers, worker_root=".", + # master_dir=m_d, verbose=True, port=port) + + pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=t_d) + + + m_d = t_d + dv_pops = [os.path.join(m_d, "{0}.0.dv_pop.csv".format(case))] + obs_pops = [f.replace("dv_", "obs_") for f in dv_pops] + + pst_fname = os.path.join(m_d, case + ".pst") + gpr_t_d = os.path.join(case + "_gpr_template") + + data, input_names, output_names = collate_training_data(pst,m_d,case) + from pyemu.emulators.gpr import GPR + gpr = GPR(data=data.copy(), + input_names=input_names, + output_names=output_names, + #transforms=transforms, + #kernel=gp_kernel, + n_restarts_optimizer=20, + ); + gpr.fit() + gpr.prepare_pestpp(m_d,case,gpr_t_d=gpr_t_d) + + #pyemu.helpers.prep_for_gpr(pst_fname, dv_pops, obs_pops, t_d=m_d,gpr_t_d=gpr_t_d, nverf=int(pop_size * .1), \ + # plot_fits=True, apply_standard_scalar=False, include_emulated_std_obs=True) + gpst = pyemu.Pst(os.path.join(gpr_t_d, case + "_gpr.pst")) + shutil.copy2(os.path.join(m_d, case + ".0.dv_pop.csv"), os.path.join(gpr_t_d, "initial_dv_pop.csv")) + gpst.pestpp_options["mou_dv_population_file"] = "initial_dv_pop.csv" + gpst.control_data.noptmax = noptmax_full + gpst.write(os.path.join(gpr_t_d, case + ".pst"), version=2) + gpr_m_d = gpr_t_d.replace("template", "master") + if os.path.exists(gpr_m_d): + shutil.rmtree(gpr_m_d) + start = datetime.now() + #pyemu.os_utils.start_workers(gpr_t_d, mou_exe_path, case + ".pst", num_workers, worker_root=".", + # master_dir=gpr_m_d, verbose=True, port=port) + pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=gpr_t_d) + + gpr_m_d = gpr_t_d + + finish = datetime.now() + duration1 = (finish - start).total_seconds() + arcorg = pd.read_csv(os.path.join(gpr_m_d,"zdt1.archive.obs_pop.csv"),index_col=0) + + + psum_fname = os.path.join(gpr_m_d,case+".pareto.archive.summary.csv") + assert os.path.exists(psum_fname) + psum = pd.read_csv(psum_fname) + print(psum.obj_1.min()) + print(psum.obj_2.min()) + assert psum.obj_1.min() < 0.05 + + gpr_t_d2 = gpr_t_d + "_ppw" + if os.path.exists(gpr_t_d2): + shutil.rmtree(gpr_t_d2) + shutil.copytree(gpr_t_d,gpr_t_d2) + + gpr_m_d2 = gpr_t_d2.replace("template","master") + gpr_d2 = GPR.load(os.path.join(gpr_m_d2,"gpr_emulator.pkl")) + input_df = pd.read_csv(os.path.join(gpr_t_d2,"gpr_input.csv"),index_col=0) + #mdf = pd.read_csv(os.path.join(gpr_t_d2,"gprmodel_info.csv"),index_col=0) + #mdf["model_fname"] = mdf.model_fname.apply(lambda x: os.path.join(gpr_t_d2,x)) + pyemu.os_utils.start_workers(gpr_t_d2, mou_exe_path, case + ".pst", 
num_workers, worker_root=".", + master_dir=gpr_m_d2, verbose=True, port=port, + ppw_function=pyemu.helpers.gpr_pyworker, + ppw_kwargs={"input_df":input_df, + #"mdf":mdf, + "gpr":gpr_d2}) + + + arcppw = pd.read_csv(os.path.join(gpr_m_d2,"zdt1.archive.obs_pop.csv"),index_col=0) + diff = np.abs(arcppw.values - arcorg.values) + print(diff.max()) + assert diff.max() < 1e-6 + + + start = datetime.now() + b_d = os.getcwd() + os.chdir(gpr_t_d2) + p = sp.Popen([mou_exe_path,"{0}.pst".format(case),"/h",":{0}".format(port)]) + os.chdir(b_d) + #p.wait() + #return + + # looper over and start the workers - in this + # case they dont need unique dirs since they aren't writing + # anything + procs = [] + # try this test with 1 worker as an edge case + num_workers = 1 + for i in range(num_workers): + pp = mp.Process(target=gpr_zdt1_ppw) + pp.start() + procs.append(pp) + # if everything worked, the the workers should receive the + # shutdown signal from the master and exit gracefully... + for pp in procs: + pp.join() + + # wait for the master to finish...but should already be finished + p.wait() + finish = datetime.now() + print("ppw` took",(finish-start).total_seconds()) + print("org took",duration1) + + arcppw = pd.read_csv(os.path.join(gpr_t_d2,"zdt1.archive.obs_pop.csv"),index_col=0) + diff = np.abs(arcppw.values - arcorg.values) + print(diff.max()) + assert diff.max() < 1e-6 + + + +def gpr_zdt1_ppw(): + t_d = "zdt1_gpr_template" + os.chdir(t_d) + pst_name = "zdt1.pst" + ppw = pyemu.helpers.gpr_pyworker(pst_name,"localhost",4569,gpr=True) + os.chdir("..") + + if __name__ == "__main__": #test_dsi_basic() #test_dsi_nst() @@ -325,4 +915,6 @@ def test_lpfa_std(): #test_dsi_mixed() #test_dsivc_freyberg() #plot_freyberg_dsi() - test_lpfa_std() + #test_lpfa_std() + gpr_zdt1_test() + From a7f3a6fc21c092b8bdd9cf03e981c33be852d07d Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 2 Jul 2025 16:48:32 +0100 Subject: [PATCH 35/58] general fixes to ppw --- pyemu/emulators/gpr.py | 115 +++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 57 deletions(-) diff --git a/pyemu/emulators/gpr.py b/pyemu/emulators/gpr.py index ac43b567a..6473090fe 100644 --- a/pyemu/emulators/gpr.py +++ b/pyemu/emulators/gpr.py @@ -280,47 +280,7 @@ def predict(self, X, return_std=False): return predictions_df - def scrape_pst_dir(self,pst_dir,casename): - if not os.path.exists(pst_dir): - raise FileNotFoundError(f"PEST control file {pst_dir} does not exist") - - pst = Pst(os.path.join(pst_dir,casename + ".pst")) - - # work out input variable names - input_groups = pst.pestpp_options.get("opt_dec_var_groups",None) - par = pst.parameter_data - if input_groups is None: - print("using all adjustable parameters as inputs") - input_names = pst.adj_par_names - else: - input_groups = set([i.strip() for i in input_groups.lower().strip().split(",")]) - print("input groups:",input_groups) - adj_par = par.loc[pst.adj_par_names,:].copy() - adj_par = adj_par.loc[adj_par.pargp.apply(lambda x: x in input_groups),:] - input_names = adj_par.parnme.tolist() - print("input names:",input_names) - - #work out constraints and objectives - ineq_names = pst.less_than_obs_constraints.tolist() - ineq_names.extend(pst.greater_than_obs_constraints.tolist()) - obs = pst.observation_data - objs = pst.pestpp_options.get("mou_objectives",None) - constraints = [] - - if objs is None: - print("'mou_objectives' not found in ++ options, using all ineq tagged non-zero weighted obs as objectives") - objs = ineq_names - else: - objs = 
objs.lower().strip().split(',') - constraints = [n for n in ineq_names if n not in objs] - - print("objectives:",objs) - print("constraints:",constraints) - output_names = objs - output_names.extend(constraints) - - return pst, input_names, output_names, objs, constraints def prepare_pestpp(self,pst_dir,casename,gpr_t_d="gpr_template"): @@ -350,7 +310,7 @@ def prepare_pestpp(self,pst_dir,casename,gpr_t_d="gpr_template"): # 3. which obs are objectives; subset of output_names # 4. which obs are constraints; subset of output_names - pst, input_names, output_names, objs, constraints = self.scrape_pst_dir(pst_dir,casename) + pst, input_names, output_names, objs, constraints = scrape_pst_dir(pst_dir,casename) # check that all input_names ar ein par data @@ -381,9 +341,6 @@ def prepare_pestpp(self,pst_dir,casename,gpr_t_d="gpr_template"): self.logger.statement(f"Creating template directory {gpr_t_d}") os.makedirs(gpr_t_d) - # pickle - self.save(os.path.join(gpr_t_d, "gpr_emulator.pkl")) - self.logger.statement(f"Saved GPR emulator to {os.path.join(gpr_t_d, 'gpr_emulator.pkl')}") # preapre template files self.logger.statement("Preparing PEST++ template files") @@ -457,7 +414,9 @@ def fix_df_col_type(orgdf,fixdf): f.write("if __name__ == '__main__':\n") f.write(" gpr_forward_run()\n") - + # pickle + self.save(os.path.join(gpr_t_d, "gpr_emulator.pkl")) + self.logger.statement(f"Saved GPR emulator to {os.path.join(gpr_t_d, 'gpr_emulator.pkl')}") gpst.control_data.noptmax = 0 @@ -471,6 +430,8 @@ def fix_df_col_type(orgdf,fixdf): gpst.control_data.noptmax = pst.control_data.noptmax gpst.write(os.path.join(gpr_t_d, gpst_fname), version=2) + + return def gpr_forward_run(): @@ -478,18 +439,58 @@ def gpr_forward_run(): This function gets added programmatically to the forward run process""" import pandas as pd from pyemu.emulators import GPR - input_df = pd.read_csv("gpr_input.csv",index_col=0).T - + input_df = pd.read_csv("gpr_input.csv",index_col=0) gpr = GPR.load("gpr_emulator.pkl") - df = pd.DataFrame(index=gpr.output_names, - columns=["sim","sim_std"]) - df.index.name = "output_name" + simdf = pd.DataFrame(index=gpr.output_names,columns=["sim","sim_std"]) + simdf.index.name = "output_name" if gpr.return_std: - predmean,predstdv = gpr.predict(input_df.loc[:,gpr.input_names], return_std=True) - df.loc[:,"sim"] = predmean[df.index].values - df.loc[:,"sim_std"] = predstdv[df.index].values + predmean,predstdv = gpr.predict(input_df.loc[gpr.input_names].T, return_std=True) + simdf.loc[:,"sim"] = predmean[simdf.index].values + simdf.loc[:,"sim_std"] = predstdv[simdf.index].values else: - predmean = gpr.predict(input_df.loc[:,gpr.input_names]) - df.loc[:,"sim"] = predmean[df.index].values - df.to_csv("gpr_output.csv",index=True) - return df \ No newline at end of file + predmean = gpr.predict(input_df.loc[gpr.input_names].T) + simdf.loc[:,"sim"] = predmean[simdf.index].values + simdf.to_csv("gpr_output.csv",index=True) + return simdf + +def scrape_pst_dir(self,pst_dir,casename): + + if not os.path.exists(pst_dir): + raise FileNotFoundError(f"PEST control file {pst_dir} does not exist") + + pst = Pst(os.path.join(pst_dir,casename + ".pst")) + + # work out input variable names + input_groups = pst.pestpp_options.get("opt_dec_var_groups",None) + par = pst.parameter_data + if input_groups is None: + print("using all adjustable parameters as inputs") + input_names = pst.adj_par_names + else: + input_groups = set([i.strip() for i in input_groups.lower().strip().split(",")]) + print("input groups:",input_groups) + 
adj_par = par.loc[pst.adj_par_names,:].copy() + adj_par = adj_par.loc[adj_par.pargp.apply(lambda x: x in input_groups),:] + input_names = adj_par.parnme.tolist() + print("input names:",input_names) + + #work out constraints and objectives + ineq_names = pst.less_than_obs_constraints.tolist() + ineq_names.extend(pst.greater_than_obs_constraints.tolist()) + obs = pst.observation_data + objs = pst.pestpp_options.get("mou_objectives",None) + constraints = [] + + if objs is None: + print("'mou_objectives' not found in ++ options, using all ineq tagged non-zero weighted obs as objectives") + objs = ineq_names + else: + objs = objs.lower().strip().split(',') + constraints = [n for n in ineq_names if n not in objs] + + print("objectives:",objs) + print("constraints:",constraints) + output_names = objs + output_names.extend(constraints) + + return pst, input_names, output_names, objs, constraints \ No newline at end of file From 993742012f29410864584b65190bfd8f4965a7de Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 2 Jul 2025 16:49:07 +0100 Subject: [PATCH 36/58] refactored gpr helper fnxs to maintain legacy, but also use new GPR class --- pyemu/utils/helpers.py | 73 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index b109fb7e4..d993d642a 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -4349,13 +4349,13 @@ def emulate_with_gpr(input_df,mdf,gpr_model_dict): mdf.loc[output_name,"sim_std"] = sim[1] return mdf - -def gpr_pyworker(pst,host,port,input_df=None,mdf=None): +def gpr_pyworker_legacy(pst,host,port,input_df=None,mdf=None): import os import pandas as pd import numpy as np import pickle + # if explicit args weren't passed, get the default ones... if input_df is None: input_df = pd.read_csv("gpr_input.csv",index_col=0) @@ -4402,6 +4402,75 @@ def gpr_pyworker(pst,host,port,input_df=None,mdf=None): # if None, we are done if parameters is None: break + + +def gpr_pyworker(pst,host,port,input_df=None,mdf=None,gpr=False): + + if gpr is False: + print("WARNING: using legacy gpr_pyworker function, which is deprecated") + gpr_pyworker_legacy(pst,host,port,input_df=input_df,mdf=mdf) + elif gpr is True: + gpr = None + + import pandas as pd + from pyemu.emulators import GPR + + # if explicit args weren't passed, get the default ones... + if input_df is None: + input_df = pd.read_csv("gpr_input.csv",index_col=0) + if gpr is None: + gpr = GPR.load("gpr_emulator.pkl") + simdf = pd.DataFrame(index=gpr.output_names,columns=["sim","sim_std"],dtype=float) + simdf.index.name = "output_name" + + ppw = PyPestWorker(pst,host,port,verbose=False) + + # we can only get parameters once the worker has initialize and + # is ready to run, so getting the first of pars here + # essentially blocks until the worker is ready + parameters = ppw.get_parameters() + # if its None, the master already quit... 
+ if parameters is None: + return + + obs = ppw._pst.observation_data.copy() + # align the obsval series with the order sent from the master + obs = obs.loc[ppw.obs_names,"obsval"] + + # work out which par values sent from the master we need to run the emulator + par = ppw._pst.parameter_data.copy() + usepar_idx = [] + ppw_par_names = list(ppw.par_names) + for i,pname in enumerate(input_df.index.values): + usepar_idx.append(ppw_par_names.index(pname)) + + + while True: + # map the current dv values in parameters into the + # df needed to run the emulator + input_df["parval1"] = parameters.values[usepar_idx] + # do the emulation + if gpr.return_std: + predmean,predstdv = gpr.predict(input_df.loc[gpr.input_names].T, return_std=True) + simdf.loc[:,"sim"] = predmean[simdf.index].values + simdf.loc[:,"sim_std"] = predstdv[simdf.index].values + else: + predmean = gpr.predict(input_df.loc[gpr.input_names].T) + simdf.loc[:,"sim"] = predmean[simdf.index].values + + + # replace the emulated quantities in the obs series + obs.loc[simdf.index] = simdf.sim.values + obs.loc[simdf.index.map(lambda x: x+"_gprstd")] = simdf.sim_std.values + + #send the obs series to the master + ppw.send_observations(obs.values) + + #try to get more pars + parameters = ppw.get_parameters() + # if None, we are done + if parameters is None: + break From 3c942a0af33a826832562f4a1f9c58ee578869c6 Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 2 Jul 2025 17:25:16 +0100 Subject: [PATCH 37/58] init updates --- pyemu/__init__.py | 2 +- pyemu/emulators/__init__.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pyemu/__init__.py b/pyemu/__init__.py index a53c116ac..15dd1e1dc 100644 --- a/pyemu/__init__.py +++ b/pyemu/__init__.py @@ -22,7 +22,7 @@ os_utils, pp_utils, smp_utils) from .emulators import ( #emulators - Emulator, DSI, LPFA, + Emulator, DSI, LPFA, GPR, #transformers diff --git a/pyemu/emulators/__init__.py b/pyemu/emulators/__init__.py index 5bb861e71..4833fd494 100755 --- a/pyemu/emulators/__init__.py +++ b/pyemu/emulators/__init__.py @@ -10,10 +10,12 @@ from .base import Emulator from .dsi import DSI from .lpfa import LPFA +from .gpr import GPR __all__ = [ 'Emulator', #base Emulator Class 'DSI', # DSI Emulator Class 'LPFA', + 'GPR', # GPR Emulator Class 'BaseTransformer', 'Log10Transformer', 'RowWiseMinMaxScaler', From 7bd18073b6000af8441ce48f2ccb25b48c869fcf Mon Sep 17 00:00:00 2001 From: rhugman Date: Wed, 2 Jul 2025 17:27:32 +0100 Subject: [PATCH 38/58] fix to utils gpr test --- autotest/utils_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/utils_tests.py b/autotest/utils_tests.py index c492848bc..c1ab978ff 100644 --- a/autotest/utils_tests.py +++ b/autotest/utils_tests.py @@ -3161,7 +3161,7 @@ def gpr_zdt1_test(): pst_fname = os.path.join(m_d, case + ".pst") gpr_t_d = os.path.join(case + "_gpr_template") - pyemu.helpers.prep_for_gpr(pst_fname, dv_pops, obs_pops, t_d=m_d,gpr_t_d=gpr_t_d, nverf=int(pop_size * .1), \ + pyemu.helpers.prep_for_gpr(pst_fname, dv_pops, obs_pops, gpr_t_d=gpr_t_d, nverf=int(pop_size * .1), \ plot_fits=True, apply_standard_scalar=False, include_emulated_std_obs=True) gpst = pyemu.Pst(os.path.join(gpr_t_d, case + ".pst")) shutil.copy2(os.path.join(m_d, case + ".0.dv_pop.csv"), os.path.join(gpr_t_d, "initial_dv_pop.csv")) From 3133d09ad844c3fa13243177148f1fd5ba2f5aa6 Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 3 Jul 2025 12:00:12 +0100 Subject: [PATCH 39/58] fi to grp_pyworker --- pyemu/utils/helpers.py | 11 ++++++----- 1 file 
changed, 6 insertions(+), 5 deletions(-) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index d993d642a..a862f2497 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -4406,11 +4406,13 @@ def gpr_pyworker_legacy(pst,host,port,input_df=None,mdf=None): def gpr_pyworker(pst,host,port,input_df=None,mdf=None,gpr=False): - if gpr is False: + if gpr == False: print("WARNING: using legacy gpr_pyworker function, which is deprecated") gpr_pyworker_legacy(pst,host,port,input_df=input_df,mdf=mdf) - elif gpr is True: - gpr = None + elif gpr == True: + gpr = GPR.load("gpr_emulator.pkl") + else: + assert isinstance(gpr, GPR), "gpr must be a GPR object or True to load from 'gpr_emulator.pkl'" import pandas as pd from pyemu.emulators import GPR @@ -4418,8 +4420,7 @@ def gpr_pyworker(pst,host,port,input_df=None,mdf=None,gpr=False): # if explicit args weren't passed, get the default ones... if input_df is None: input_df = pd.read_csv("gpr_input.csv",index_col=0) - if gpr is None: - gpr = GPR.load("gpr_emulator.pkl") + simdf = pd.DataFrame(index=gpr.output_names,columns=["sim","sim_std"],dtype=float) simdf.index.name = "output_name" From c6b1f0c37c83db76f6be7652e0a7530f029313fe Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 3 Jul 2025 12:08:48 +0100 Subject: [PATCH 40/58] fix legacy gpr oyworker handling --- pyemu/utils/helpers.py | 105 +++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 52 deletions(-) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index a862f2497..c6ab64772 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -4406,72 +4406,73 @@ def gpr_pyworker_legacy(pst,host,port,input_df=None,mdf=None): def gpr_pyworker(pst,host,port,input_df=None,mdf=None,gpr=False): - if gpr == False: + if gpr is False: print("WARNING: using legacy gpr_pyworker function, which is deprecated") gpr_pyworker_legacy(pst,host,port,input_df=input_df,mdf=mdf) - elif gpr == True: - gpr = GPR.load("gpr_emulator.pkl") else: + if gpr is True: + gpr = GPR.load("gpr_emulator.pkl") + assert isinstance(gpr, GPR), "gpr must be a GPR object or True to load from 'gpr_emulator.pkl'" - import pandas as pd - from pyemu.emulators import GPR - - # if explicit args weren't passed, get the default ones... - if input_df is None: - input_df = pd.read_csv("gpr_input.csv",index_col=0) + import pandas as pd + from pyemu.emulators import GPR + + # if explicit args weren't passed, get the default ones... + if input_df is None: + input_df = pd.read_csv("gpr_input.csv",index_col=0) - simdf = pd.DataFrame(index=gpr.output_names,columns=["sim","sim_std"],dtype=float) - simdf.index.name = "output_name" + simdf = pd.DataFrame(index=gpr.output_names,columns=["sim","sim_std"],dtype=float) + simdf.index.name = "output_name" - ppw = PyPestWorker(pst,host,port,verbose=False) + ppw = PyPestWorker(pst,host,port,verbose=False) - # we can only get parameters once the worker has initialize and - # is ready to run, so getting the first of pars here - # essentially blocks until the worker is ready - parameters = ppw.get_parameters() - # if its None, the master already quit... - if parameters is None: - return + # we can only get parameters once the worker has initialize and + # is ready to run, so getting the first of pars here + # essentially blocks until the worker is ready + parameters = ppw.get_parameters() + # if its None, the master already quit... 
+ if parameters is None: + return - obs = ppw._pst.observation_data.copy() - # align the obsval series with the order sent from the master - obs = obs.loc[ppw.obs_names,"obsval"] - - # work out which par values sent from the master we need to run the emulator - par = ppw._pst.parameter_data.copy() - usepar_idx = [] - ppw_par_names = list(ppw.par_names) - for i,pname in enumerate(input_df.index.values): - usepar_idx.append(ppw_par_names.index(pname)) - + obs = ppw._pst.observation_data.copy() + # align the obsval series with the order sent from the master + obs = obs.loc[ppw.obs_names,"obsval"] + + # work out which par values sent from the master we need to run the emulator + par = ppw._pst.parameter_data.copy() + usepar_idx = [] + ppw_par_names = list(ppw.par_names) + for i,pname in enumerate(input_df.index.values): + usepar_idx.append(ppw_par_names.index(pname)) + - while True: - # map the current dv values in parameters into the - # df needed to run the emulator - input_df["parval1"] = parameters.values[usepar_idx] - # do the emulation - if gpr.return_std: - predmean,predstdv = gpr.predict(input_df.loc[gpr.input_names].T, return_std=True) - simdf.loc[:,"sim"] = predmean[simdf.index].values - simdf.loc[:,"sim_std"] = predstdv[simdf.index].values - else: - predmean = gpr.predict(input_df.loc[gpr.input_names].T) - simdf.loc[:,"sim"] = predmean[simdf.index].values + while True: + # map the current dv values in parameters into the + # df needed to run the emulator + input_df["parval1"] = parameters.values[usepar_idx] + # do the emulation + if gpr.return_std: + predmean,predstdv = gpr.predict(input_df.loc[gpr.input_names].T, return_std=True) + simdf.loc[:,"sim"] = predmean[simdf.index].values + simdf.loc[:,"sim_std"] = predstdv[simdf.index].values + else: + predmean = gpr.predict(input_df.loc[gpr.input_names].T) + simdf.loc[:,"sim"] = predmean[simdf.index].values - # replace the emulated quantities in the obs series - obs.loc[simdf.index] = simdf.sim.values - obs.loc[simdf.index.map(lambda x: x+"_gprstd")] = simdf.sim_std.values + # replace the emulated quantities in the obs series + obs.loc[simdf.index] = simdf.sim.values + obs.loc[simdf.index.map(lambda x: x+"_gprstd")] = simdf.sim_std.values - #send the obs series to the master - ppw.send_observations(obs.values) + #send the obs series to the master + ppw.send_observations(obs.values) - #try to get more pars - parameters = ppw.get_parameters() - # if None, we are done - if parameters is None: - break + #try to get more pars + parameters = ppw.get_parameters() + # if None, we are done + if parameters is None: + break From 11569036dcf730ddcd5bb3be94b2481841fc6860 Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 3 Jul 2025 13:59:01 +0100 Subject: [PATCH 41/58] mystery of the disapearing t_d argument --- pyemu/utils/helpers.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index c6ab64772..9e921ffe8 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -4043,7 +4043,7 @@ def get_current_prop(_cur_thresh): return thresh, prop -def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",gp_kernel=None,nverf=0, +def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",t_d="template",gp_kernel=None,nverf=0, plot_fits=False,apply_standard_scalar=False, include_emulated_std_obs=False): """helper function to setup a gaussian-process-regression (GPR) emulator for outputs of interest. 
This is primarily targeted at low-dimensional settings like those encountered in PESTPP-MOU @@ -4054,6 +4054,7 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",gp_ output_fnames (str | list[str]): usually a list of observation population files that corresponds to the simulation results associated with `input_fnames` gpr_t_d (str): the template file dir to create that will hold the GPR emulators + t_d (str): the template dir containing the PESTPP-MOU outputs that the GPR emulators are trained on gp_kernel (sklearn GaussianProcess kernel): the kernel to use. if None, a standard RBF kernel is created and used nverf (int): the number of input-output pairs to hold back for a simple verification test @@ -4180,7 +4181,7 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",gp_ import matplotlib.pyplot as plt from matplotlib.backends.backend_pdf import PdfPages pdf = PdfPages(os.path.join(gpr_t_d,"gpr_fits.pdf")) - for output_name in output_names: + for i,output_name in enumerate(output_names): y_verf = df.loc[:,output_name].values.copy()[cut:] y_train = df.loc[:, output_name].values.copy()[:cut] @@ -4220,8 +4221,8 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",gp_ plt.close(fig) - - model_fname = os.path.split(pst_fname)[1]+"."+output_name+".pkl" + objname = f'obj_{i}' + model_fname = os.path.split(pst_fname)[1]+"."+objname+".pkl" if os.path.exists(os.path.join(gpr_t_d,model_fname)): print("WARNING: model_fname '{0}' exists, overwriting...".format(model_fname)) with open(os.path.join(gpr_t_d,model_fname),'wb') as f: @@ -4323,6 +4324,13 @@ def prep_for_gpr(pst_fname,input_fnames,output_fnames,gpr_t_d="gpr_template",gp_ gpst_fname = os.path.split(pst_fname)[1] gpst.write(os.path.join(gpr_t_d,gpst_fname),version=2) print("saved gpr pst:",gpst_fname,"in gpr_t_d",gpr_t_d) + + #if they exist, copy pestpp bins from t_d over to gpr_t_d. 
otherwise, we assume bin is in path + pp_bins = [f for f in os.listdir(t_d) if 'pestpp-' in f] + if len(pp_bins)>0: + for pp_bin in pp_bins: + shutil.copy2(os.path.join(t_d,pp_bin),os.path.join(gpr_t_d,pp_bin)) + try: pyemu.os_utils.run("pestpp-mou {0}".format(gpst_fname),cwd=gpr_t_d) except Exception as e: From b27d95f13acc7a7bff522eba910d010798c48ee2 Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 3 Jul 2025 14:02:43 +0100 Subject: [PATCH 42/58] checkin tests --- autotest/utils_tests.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/autotest/utils_tests.py b/autotest/utils_tests.py index c1ab978ff..7bd45c2dd 100644 --- a/autotest/utils_tests.py +++ b/autotest/utils_tests.py @@ -3158,10 +3158,10 @@ def gpr_zdt1_test(): m_d = t_d dv_pops = [os.path.join(m_d, "{0}.0.dv_pop.csv".format(case))] obs_pops = [f.replace("dv_", "obs_") for f in dv_pops] - + pst_fname = os.path.join(m_d, case + ".pst") gpr_t_d = os.path.join(case + "_gpr_template") - pyemu.helpers.prep_for_gpr(pst_fname, dv_pops, obs_pops, gpr_t_d=gpr_t_d, nverf=int(pop_size * .1), \ + pyemu.helpers.prep_for_gpr(pst_fname, dv_pops, obs_pops,t_d=m_d,gpr_t_d=gpr_t_d, nverf=int(pop_size * .1), \ plot_fits=True, apply_standard_scalar=False, include_emulated_std_obs=True) gpst = pyemu.Pst(os.path.join(gpr_t_d, case + ".pst")) shutil.copy2(os.path.join(m_d, case + ".0.dv_pop.csv"), os.path.join(gpr_t_d, "initial_dv_pop.csv")) @@ -3176,26 +3176,25 @@ def gpr_zdt1_test(): # master_dir=gpr_m_d, verbose=True, port=port) pyemu.os_utils.run("{0} {1}.pst".format(mou_exe_path,case),cwd=gpr_t_d) gpr_m_d = gpr_t_d - + finish = datetime.now() duration1 = (finish - start).total_seconds() arcorg = pd.read_csv(os.path.join(gpr_m_d,"zdt1.archive.obs_pop.csv"),index_col=0) - + psum_fname = os.path.join(gpr_m_d,case+".pareto.archive.summary.csv") assert os.path.exists(psum_fname) psum = pd.read_csv(psum_fname) print(psum.obj_1.min()) print(psum.obj_2.min()) - assert psum.obj_1.min() < 0.05 - + assert psum.obj_1.min() < 0.05 gpr_t_d2 = gpr_t_d + "_ppw" if os.path.exists(gpr_t_d2): shutil.rmtree(gpr_t_d2) shutil.copytree(gpr_t_d,gpr_t_d2) - + gpr_m_d2 = gpr_t_d2.replace("template","master") - + input_df = pd.read_csv(os.path.join(gpr_t_d2,"gpr_input.csv"),index_col=0) mdf = pd.read_csv(os.path.join(gpr_t_d2,"gprmodel_info.csv"),index_col=0) mdf["model_fname"] = mdf.model_fname.apply(lambda x: os.path.join(gpr_t_d2,x)) @@ -3210,8 +3209,7 @@ def gpr_zdt1_test(): diff = np.abs(arcppw.values - arcorg.values) print(diff.max()) assert diff.max() < 1e-6 - - + start = datetime.now() b_d = os.getcwd() os.chdir(gpr_t_d2) @@ -3234,13 +3232,13 @@ def gpr_zdt1_test(): # shutdown signal from the master and exit gracefully... 
for pp in procs: pp.join() - + # wait for the master to finish...but should already be finished p.wait() finish = datetime.now() print("ppw` took",(finish-start).total_seconds()) print("org took",duration1) - + arcppw = pd.read_csv(os.path.join(gpr_t_d2,"zdt1.archive.obs_pop.csv"),index_col=0) diff = np.abs(arcppw.values - arcorg.values) print(diff.max()) @@ -3258,7 +3256,8 @@ def gpr_zdt1_ppw(): if __name__ == "__main__": #ppu_geostats_test(".") - gpr_compare_invest() + gpr_zdt1_test() + #gpr_compare_invest() #gpr_constr_test() # import sys # t_d = "constr_ppw_template" From 0817aa3ddb252a2a7e682d099c6631a625b418b7 Mon Sep 17 00:00:00 2001 From: rhugman Date: Thu, 3 Jul 2025 15:32:53 +0100 Subject: [PATCH 43/58] fix to scrape dir fnx --- pyemu/emulators/gpr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyemu/emulators/gpr.py b/pyemu/emulators/gpr.py index 6473090fe..a79035ed9 100644 --- a/pyemu/emulators/gpr.py +++ b/pyemu/emulators/gpr.py @@ -453,7 +453,7 @@ def gpr_forward_run(): simdf.to_csv("gpr_output.csv",index=True) return simdf -def scrape_pst_dir(self,pst_dir,casename): +def scrape_pst_dir(pst_dir,casename): if not os.path.exists(pst_dir): raise FileNotFoundError(f"PEST control file {pst_dir} does not exist") From a3adfa006c2393f41d5e1e81a0e5225ea3e4317b Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 07:21:52 -0600 Subject: [PATCH 44/58] trying a lower max port number --- pyemu/utils/os_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyemu/utils/os_utils.py b/pyemu/utils/os_utils.py index d16183662..680f323ed 100644 --- a/pyemu/utils/os_utils.py +++ b/pyemu/utils/os_utils.py @@ -958,7 +958,7 @@ def send_killed_run(self,group=None,runid=None,desc="killed"): class PortManager(object): """Cross-platform port manager for parallel processes.""" def __init__(self, - port_range=(4004, 65535), + port_range=(4004, 4999), lock_dir=None, max_retries=50, lock_timeout=5, From 324a88cacb6485ddbe099fb4d616cddddb1613c2 Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 08:14:06 -0600 Subject: [PATCH 45/58] turned off zdt1 test in utils_test - this functionality has moved to emulator tests --- autotest/utils_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/utils_tests.py b/autotest/utils_tests.py index 7bd45c2dd..cd962dad4 100644 --- a/autotest/utils_tests.py +++ b/autotest/utils_tests.py @@ -3110,7 +3110,7 @@ def gpr_constr_invest(): #assert 1.0 in psum.obj_2.values -def gpr_zdt1_test(): +def gpr_zdt1_invest(): import numpy as np import subprocess as sp import multiprocessing as mp From 2adf7f1654211555363642b7f86ccc628cd83063 Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 7 Jul 2025 17:08:09 +0100 Subject: [PATCH 46/58] refactro fixes for dsivc --- pyemu/emulators/dsi.py | 29 ++++++++++++++++++++--------- pyemu/emulators/lpfa.py | 6 ++---- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py index 940868a57..dc773fa45 100755 --- a/pyemu/emulators/dsi.py +++ b/pyemu/emulators/dsi.py @@ -108,7 +108,7 @@ def _prepare_training_data(self): else: # Still need to set up a dummy transformer for inverse operations from .transformers import AutobotsAssemble - self.feature_transformer = AutobotsAssemble(data.copy()) + self.transformer_pipeline = AutobotsAssemble(data.copy()) self.data_transformed = data.copy() return self.data_transformed @@ -387,7 +387,7 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F 
#track dsivc args for forward run self.dsivc_args = {"percentiles":percentiles, - "decvar_names":decvar_names, + "decvar_names":decvar_names, "track_stack":track_stack, } @@ -403,8 +403,7 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F assert os.path.exists(os.path.join(t_d,"dsi.pst")), f"dsi.pst not found in {t_d}" pst = Pst(os.path.join(t_d,"dsi.pst")) if oe is None: - self.logger.statement("no posterior DSI observation ensemble provided, using dsi.3.obs.jcb in DSI template dir...") - self.logger.statement(f"using dsi.{dsi_args['noptmax']}.obs.jcb in DSI template dir...") + self.logger.statement(f"no posterior DSI observation ensemble provided, using dsi.{dsi_args['noptmax']}.obs.jcb in DSI template dir...") assert os.path.exists(os.path.join(t_d,f"dsi.{dsi_args['noptmax']}.obs.jcb")), f"dsi.{dsi_args['noptmax']}.obs.jcb not found in {t_d}" oe = ObservationEnsemble.from_binary(pst,os.path.join(t_d,f"dsi.{dsi_args['noptmax']}.obs.jcb")) else: @@ -429,13 +428,22 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F } # ensure it's a dict if dsi_args is None: - dsi_args = {} + dsi_args = default_dsi_args elif not isinstance(dsi_args, dict): raise TypeError("Expected a dictionary for 'options'") # merge with defaults (user values override defaults) - dsi_args = {**default_dsi_args, **dsi_args} - - + #dsi_args = {**default_dsi_args, **dsi_args} + else: + for key, value in default_dsi_args.items(): + if key not in dsi_args: + dsi_args[key] = value + + # check that dsi_args has the required keys + required_keys = ["noptmax", "decvar_weight", "num_pyworkers"] + for key in required_keys: + if key not in dsi_args: + raise KeyError(f"Missing required key '{key}' in 'dsi_args'") + self.dsi_args = dsi_args out_files = [] self.logger.statement(f"preparing stack stats observations...") @@ -556,6 +564,9 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F self.logger.statement("overwriting dsi.pst file...") pst.observation_data.loc[decvar_names, "weight"] = dsi_args["decvar_weight"] pst.control_data.noptmax = dsi_args["noptmax"] + + #TODO: ensure no noise for dvars obs + pst.write(os.path.join(t_d,"dsi.pst"), version=2) @@ -563,6 +574,6 @@ def prepare_dsivc(self, decvar_names, t_d=None, pst=None, oe=None, track_stack=F self.decision_variable_names = decvar_names # re-pickle dsi to track dsivc args self.save(os.path.join(t_d,"dsi.pickle")) - + self.logger.statement("DSIVC control files created...the user still needs to specify objectives and constraints...") return pst_dsivc \ No newline at end of file diff --git a/pyemu/emulators/lpfa.py b/pyemu/emulators/lpfa.py index 4252a61d7..a1a2da5f8 100644 --- a/pyemu/emulators/lpfa.py +++ b/pyemu/emulators/lpfa.py @@ -190,10 +190,8 @@ def _prepare_training_data(self): Parameters ---------- - data : pandas.DataFrame, optional - Data to prepare. If None, uses self.data. Default is None. - test_size : float, optional - Fraction of data to use for testing. Default is 0.2. + self: LPFA + The emulator instance containing the data and configuration. 
Returns ------- From 323fb46eded85dd79d60a56d5c478bef184c66d3 Mon Sep 17 00:00:00 2001 From: rhugman Date: Mon, 7 Jul 2025 17:08:48 +0100 Subject: [PATCH 47/58] dsivc fix --- pyemu/utils/helpers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index 9e921ffe8..2eb966ee8 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -4505,7 +4505,7 @@ def dsi_forward_run(pvals,dsi,write_csv=False): sim_vals.to_csv("dsi_sim_vals.csv") return sim_vals -def dsivc_forward_run(md_ies=".",ies_exe_path="pestpp-ies"): +def dsivc_forward_run(md_ies=".",ies_exe_path="pestpp-ies",num_workers=1): import pandas as pd import pyemu import os @@ -4562,10 +4562,11 @@ def dsivc_forward_run(md_ies=".",ies_exe_path="pestpp-ies"): # deploy dsi... pvals = pd.read_csv(os.path.join(md_ies,"dsi_pars.csv"),index_col=0) - num_workers=1 + worker_root="." dsi = pickle.load(open(os.path.join(md_ies,"dsi.pickle"),"rb")) - num_workers = dsi.dsivc_args.get("num_pyworkers",1) + num_workers = dsi.dsi_args.get("num_pyworkers",1) + print(num_workers,"workers requested for dsi") pyemu.os_utils.start_workers(md_ies,ies_exe_path,"dsi.pst", num_workers=num_workers, worker_root=worker_root, From 1bf5432001a42136c67b76b8fbed02e8102cec1d Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 11:59:44 -0600 Subject: [PATCH 48/58] changed port on pyworkertest... --- autotest/utils_tests.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/autotest/utils_tests.py b/autotest/utils_tests.py index cd962dad4..9c8f6b6c2 100644 --- a/autotest/utils_tests.py +++ b/autotest/utils_tests.py @@ -2692,7 +2692,7 @@ def pypestworker_test(): import subprocess as sp import multiprocessing as mp host = "localhost" - port = 4004 + port = 4111 case = "constr" org_d = os.path.join("utils","{0}_template".format(case)) t_d = "{0}_ppw_template".format(case) @@ -3256,7 +3256,8 @@ def gpr_zdt1_ppw(): if __name__ == "__main__": #ppu_geostats_test(".") - gpr_zdt1_test() + pypestworker_test() + #gpr_zdt1_test() #gpr_compare_invest() #gpr_constr_test() # import sys From 14f8c77c2b851973fdf5060c4d486cf7b26caea3 Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 12:33:55 -0600 Subject: [PATCH 49/58] trying to speedup tests --- autotest/emulator_tests.py | 2 +- autotest/utils_tests.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index 72370d827..92bd82882 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -777,7 +777,7 @@ def gpr_zdt1_test(): pst.pestpp_options["opt_risk"] = 0.5 pop_size = 20 - num_workers = 10 + num_workers = 3 noptmax_full = 1 port = 4569 diff --git a/autotest/utils_tests.py b/autotest/utils_tests.py index 9c8f6b6c2..4391c0e7c 100644 --- a/autotest/utils_tests.py +++ b/autotest/utils_tests.py @@ -2706,7 +2706,7 @@ def pypestworker_test(): pst.pestpp_options["overdue_giveup_fac"] = 1e10 pst.pestpp_options["overdue_resched_fac"] = 1e10 - pst.control_data.noptmax = 5 + pst.control_data.noptmax = 2 pst.write(os.path.join(t_d,"{0}.pst".format(case)),version=2) import sys sys.path.insert(0,t_d) @@ -3255,6 +3255,7 @@ def gpr_zdt1_ppw(): if __name__ == "__main__": + maha_pdc_test('.') #ppu_geostats_test(".") pypestworker_test() #gpr_zdt1_test() From dd1f5c634a74f9694d469c19c924765ae8617c92 Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 12:35:19 -0600 Subject: [PATCH 50/58] serial pytest --- .github/workflows/ci.yml | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3e2efecad..60ad36e99 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -118,7 +118,7 @@ jobs: shell: bash -l {0} working-directory: ./autotest run: | - pytest -rP -rx --capture=no -v -n=auto --tb=native --durations=20 --cov=pyemu --cov-report=lcov ${{ matrix.test-path }} + pytest -rP -rx --capture=no -v -n=1 --tb=native --durations=20 --cov=pyemu --cov-report=lcov ${{ matrix.test-path }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From cba560457548629e262d4a943238beacd7fc7df4 Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 12:35:29 -0600 Subject: [PATCH 51/58] serial pytest --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 60ad36e99..fc01f2dca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -118,7 +118,7 @@ jobs: shell: bash -l {0} working-directory: ./autotest run: | - pytest -rP -rx --capture=no -v -n=1 --tb=native --durations=20 --cov=pyemu --cov-report=lcov ${{ matrix.test-path }} + pytest -rP -rx --capture=no -v --tb=native --durations=20 --cov=pyemu --cov-report=lcov ${{ matrix.test-path }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 0cbdd6ca5f49da752f3ac87618fb9c4a2021f030 Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 13:22:28 -0600 Subject: [PATCH 52/58] trying to speed up dsivc test --- autotest/emulator_tests.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index 92bd82882..8e17498a1 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -107,7 +107,7 @@ def test_dsivc_freyberg(): track_stack=False, percentiles=[0.05, 0.25, 0.5, 0.75, 0.95], dsi_args={ - "noptmax":3, + "noptmax":1, "decvar_weight":10.0, "num_pyworkers":1, }, @@ -124,13 +124,13 @@ def test_dsivc_freyberg(): obs.loc[mou_objectives, "weight"] = 1.0 obs.loc[mou_objectives, "obgnme"] = "less_than_obj" - pstdsivc.control_data.noptmax = 1 #just for testing - pstdsivc.pestpp_options["mou_population_size"] = 10 #just for testing + pstdsivc.control_data.noptmax = -1 #just for testing + pstdsivc.pestpp_options["mou_population_size"] = 3 #just for testing pstdsivc.write(os.path.join(td, "dsivc.pst"),version=2) md = "master_dsivc" - num_workers = 1 + num_workers = 3 worker_root = "." pyemu.os_utils.start_workers(td, From 5a9af48af876f422c60366197049440cc7cf4f13 Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 14:10:48 -0600 Subject: [PATCH 53/58] more speed up --- autotest/emulator_tests.py | 4 ++-- autotest/pst_from_tests.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index 8e17498a1..cb267ac67 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -909,12 +909,12 @@ def gpr_zdt1_ppw(): if __name__ == "__main__": - #test_dsi_basic() + test_dsi_basic() #test_dsi_nst() #test_dsi_nst_extrap() #test_dsi_mixed() #test_dsivc_freyberg() #plot_freyberg_dsi() #test_lpfa_std() - gpr_zdt1_test() + #gpr_zdt1_test() diff --git a/autotest/pst_from_tests.py b/autotest/pst_from_tests.py index e7ab7f2e0..3bbc8c090 100644 --- a/autotest/pst_from_tests.py +++ b/autotest/pst_from_tests.py @@ -5088,9 +5088,9 @@ def mf6_freyberg_thresh_test(tmp_path): # reset away from the truth... 
pst.parameter_data.loc[:,"parval1"] = org_par.parval1.values.copy() - pst.control_data.noptmax = 2 + pst.control_data.noptmax = 1 pst.pestpp_options["ies_par_en"] = "prior.jcb" - pst.pestpp_options["ies_num_reals"] = 30 + pst.pestpp_options["ies_num_reals"] = 10 pst.pestpp_options["ies_subset_size"] = -10 pst.pestpp_options["ies_no_noise"] = True #pst.pestpp_options["ies_bad_phi_sigma"] = 2.0 @@ -5110,7 +5110,7 @@ def mf6_freyberg_thresh_test(tmp_path): m_d = "master_thresh" port = _get_port() pyemu.os_utils.start_workers(pf.new_d, ies_exe_path, "freyberg.pst", - worker_root=".", master_dir=m_d, num_workers=10, + worker_root=".", master_dir=m_d, num_workers=5, port=port) phidf = pd.read_csv(os.path.join(m_d,"freyberg.phi.actual.csv")) # print(phidf["mean"]) From 31db71040c57cf7abd25952449eec36d4a9d6304 Mon Sep 17 00:00:00 2001 From: jwhite Date: Mon, 7 Jul 2025 15:31:30 -0600 Subject: [PATCH 54/58] skipping zdt1 test for now - something is up. Tried to fix dsi predict if case where transforms is None --- autotest/emulator_tests.py | 4 ++++ pyemu/emulators/dsi.py | 8 +++++--- pyemu/emulators/gpr.py | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index cb267ac67..94c35977b 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -31,6 +31,7 @@ def dsi_freyberg(tmp_d,transforms=None,tag=""): #dsi._fit_transformer_pipeline() dsi.fit() + # history match obsdata = pst.observation_data.copy() if transforms is not None: @@ -747,6 +748,8 @@ def collate_training_data(pst,m_d,case): #print("aggregated training dataset shape",df.shape,"saved to",pst_fname + ".aggresults.csv") return data, input_names, output_names + +@pytest.mark.skip(reason="seems like it still in dev") def gpr_zdt1_test(): import numpy as np import subprocess as sp @@ -909,6 +912,7 @@ def gpr_zdt1_ppw(): if __name__ == "__main__": + test_dsi_basic() #test_dsi_nst() #test_dsi_nst_extrap() diff --git a/pyemu/emulators/dsi.py b/pyemu/emulators/dsi.py index 940868a57..755462a2a 100755 --- a/pyemu/emulators/dsi.py +++ b/pyemu/emulators/dsi.py @@ -205,7 +205,7 @@ def predict(self, pvals): if not self.fitted: raise ValueError("Emulator must be fitted before prediction") - if not hasattr(self, 'transformer_pipeline') or self.transformer_pipeline is None: + if self.transforms is not None and (not hasattr(self, 'transformer_pipeline') or self.transformer_pipeline is None): raise ValueError("Emulator must be fitted and have valid transformations before prediction") if isinstance(pvals, pd.Series): @@ -215,8 +215,9 @@ def predict(self, pvals): pmat = self.pmat ovals = self.ovals sim_vals = ovals + np.dot(pmat,pvals) - pipeline = self.transformer_pipeline - sim_vals = pipeline.inverse(sim_vals) + if self.transforms is not None: + pipeline = self.transformer_pipeline + sim_vals = pipeline.inverse(sim_vals) sim_vals.index.name = 'obsnme' sim_vals.name = "obsval" self.sim_vals = sim_vals @@ -274,6 +275,7 @@ def prepare_pestpp(self, t_d=None, observation_data=None): # run once to get the dsi_pars.csv file pvals = np.zeros_like(self.s) + sim_vals = self.predict(pvals) self.logger.log("creating ins file") diff --git a/pyemu/emulators/gpr.py b/pyemu/emulators/gpr.py index 6473090fe..7e2e6ea5c 100644 --- a/pyemu/emulators/gpr.py +++ b/pyemu/emulators/gpr.py @@ -310,7 +310,7 @@ def prepare_pestpp(self,pst_dir,casename,gpr_t_d="gpr_template"): # 3. which obs are objectives; subset of output_names # 4. 
which obs are constraints; subset of output_names
 
-        pst, input_names, output_names, objs, constraints = scrape_pst_dir(pst_dir,casename)
+        pst, input_names, output_names, objs, constraints = scrape_pst_dir(self,pst_dir,casename)
 
 
         # check that all input_names ar ein par data
 

From e2f6f4ea8b33052fd07042f6a334c25869dfdc6e Mon Sep 17 00:00:00 2001
From: rhugman
Date: Tue, 8 Jul 2025 11:29:12 +0100
Subject: [PATCH 55/58] checkin before merge

---
 autotest/emulator_tests.py | 6 +++---
 autotest/pst_from_tests.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py
index 72370d827..a3e088444 100644
--- a/autotest/emulator_tests.py
+++ b/autotest/emulator_tests.py
@@ -97,7 +97,7 @@ def test_dsivc_freyberg():
     dsi = DSI.load(os.path.join(td, "dsi.pickle"))
     pst = pyemu.Pst(os.path.join(td, "dsi.pst"))
-    oe = pyemu.ObservationEnsemble.from_binary(pst,os.path.join(td, "dsi.1.obs.jcb"))
+    oe = pyemu.ObservationEnsemble.from_binary(pst,os.path.join(td, "dsi.0.obs.jcb"))
 
     obsdata = dsi.observation_data
     decvars = obsdata.loc[obsdata.obgnme=="out_wel"].obsnme.tolist()
@@ -107,7 +107,7 @@ def test_dsivc_freyberg():
         track_stack=False,
         percentiles=[0.05, 0.25, 0.5, 0.75, 0.95],
         dsi_args={
-            "noptmax":3,
+            "noptmax":-1, #just for testing
             "decvar_weight":10.0,
             "num_pyworkers":1,
         },
@@ -125,7 +125,7 @@ def test_dsivc_freyberg():
     obs.loc[mou_objectives, "obgnme"] = "less_than_obj"
 
     pstdsivc.control_data.noptmax = 1 #just for testing
-    pstdsivc.pestpp_options["mou_population_size"] = 10 #just for testing
+    pstdsivc.pestpp_options["mou_population_size"] = 1 #just for testing
 
     pstdsivc.write(os.path.join(td, "dsivc.pst"),version=2)
 
diff --git a/autotest/pst_from_tests.py b/autotest/pst_from_tests.py
index e7ab7f2e0..05772529b 100644
--- a/autotest/pst_from_tests.py
+++ b/autotest/pst_from_tests.py
@@ -11,7 +11,7 @@
 import pytest
 
 ext = ''
-local_bins = False # change if wanting to test with local binary exes
+local_bins = True # change if wanting to test with local binary exes
 if local_bins:
     bin_path = os.path.join("..", "..", "bin")
     if "linux" in platform.system().lower():

From 87a1fb8ef6087179e14c0c32de5a404382590c74 Mon Sep 17 00:00:00 2001
From: rhugman
Date: Tue, 8 Jul 2025 12:54:56 +0100
Subject: [PATCH 56/58] dangerous boolean flag...
--- autotest/pst_from_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/pst_from_tests.py b/autotest/pst_from_tests.py index 72e17d60c..3bbc8c090 100644 --- a/autotest/pst_from_tests.py +++ b/autotest/pst_from_tests.py @@ -11,7 +11,7 @@ import pytest ext = '' -local_bins = True # change if wanting to test with local binary exes +local_bins = False # change if wanting to test with local binary exes if local_bins: bin_path = os.path.join("..", "..", "bin") if "linux" in platform.system().lower(): From 2d4c7789b5a844b74a365b4ef8f386ae69e7cd93 Mon Sep 17 00:00:00 2001 From: rhugman Date: Sat, 26 Jul 2025 18:02:28 -0500 Subject: [PATCH 57/58] fix handling of columns for transformer pipeline --- pyemu/emulators/transformers.py | 142 ++++++++++++++++++++++---------- 1 file changed, 99 insertions(+), 43 deletions(-) diff --git a/pyemu/emulators/transformers.py b/pyemu/emulators/transformers.py index 39345159e..2c1ac7b29 100755 --- a/pyemu/emulators/transformers.py +++ b/pyemu/emulators/transformers.py @@ -4,7 +4,16 @@ from __future__ import print_function, division import numpy as np import pandas as pd -from sklearn.preprocessing import StandardScaler + + +# Check sklearn availability at module level +try: + from sklearn.preprocessing import StandardScaler + HAS_SKLEARN = True +except ImportError: + HAS_SKLEARN = False + # Create dummy classes or set to None + StandardScaler = None class BaseTransformer: @@ -27,14 +36,24 @@ def inverse_transform(self, X): raise NotImplementedError class Log10Transformer(BaseTransformer): - """Apply log10 transformation.""" + """Apply log10 transformation. + + Parameters + ---------- + columns : list, optional + List of column names to be transformed. If None, all columns will be transformed. + """ - def __init__(self): + def __init__(self, columns=None): + self.columns = columns self.shifts = {} def transform(self, X): result = X.copy() - for col in X.columns: + columns = self.columns if self.columns is not None else X.columns + columns = [col for col in columns if col in X.columns] + + for col in columns: min_val = X[col].min() shift = -min_val + 1e-6 if min_val <= 0 else 0 self.shifts[col] = shift @@ -43,9 +62,10 @@ def transform(self, X): def inverse_transform(self, X): result = X.copy() - for col in X.columns: - shift = self.shifts.get(col, 0) - result[col] = (10 ** X[col]) - shift + for col in self.shifts.keys(): + if col in X.columns: + shift = self.shifts.get(col, 0) + result[col] = (10 ** X[col]) - shift return result class RowWiseMinMaxScaler(BaseTransformer): @@ -318,16 +338,33 @@ def inverse_transform(self, X): return result class StandardScalerTransformer(BaseTransformer): - def __init__(self, with_mean=True, with_std=True, copy=True): + """Wrapper around sklearn's StandardScaler for DataFrame compatibility. + + Parameters + ---------- + with_mean : bool, default=True + If True, center the data before scaling. + with_std : bool, default=True + If True, scale the data to unit variance. + copy : bool, default=True + If True, a copy of X will be created. If False, centering and scaling happen in-place. + columns : list, optional + List of column names to be transformed. If None, all columns will be transformed. 
+ """ + + def __init__(self, with_mean=True, with_std=True, copy=True, columns=None): self.with_mean = with_mean self.with_std = with_std self.copy = copy + self.columns = columns self._sklearn_scaler = None - self._columns = None + self._fitted_columns = None def fit(self, X): - # Store column names for DataFrame reconstruction - self._columns = X.columns.tolist() + # Determine which columns to fit + columns = self.columns if self.columns is not None else X.columns + columns = [col for col in columns if col in X.columns] + self._fitted_columns = columns # Create sklearn StandardScaler self._sklearn_scaler = StandardScaler( @@ -337,56 +374,69 @@ def fit(self, X): ) # Fit on numpy array (sklearn expects this) - self._sklearn_scaler.fit(X.values) + if columns: + self._sklearn_scaler.fit(X[columns].values) return self def transform(self, X): if self._sklearn_scaler is None: raise ValueError("Transformer must be fitted before transform") - - # Transform using sklearn - transformed_values = self._sklearn_scaler.transform(X.values) - # Reconstruct DataFrame with original structure - if isinstance(X, pd.DataFrame): - return pd.DataFrame( - transformed_values, - index=X.index, - columns=X.columns - ) - else: - return transformed_values + result = X.copy() + + if self._fitted_columns: + # Transform using sklearn + transformed_values = self._sklearn_scaler.transform(X[self._fitted_columns].values) + + # Update only the fitted columns in the result + result[self._fitted_columns] = transformed_values + + return result def inverse_transform(self, X): if self._sklearn_scaler is None: raise ValueError("Transformer must be fitted before inverse_transform") - - # Inverse transform using sklearn - inverse_values = self._sklearn_scaler.inverse_transform(X.values) - # Reconstruct DataFrame - if isinstance(X, pd.DataFrame): - return pd.DataFrame( - inverse_values, - index=X.index, - columns=X.columns - ) - else: - return inverse_values + result = X.copy() + + if self._fitted_columns: + # Inverse transform using sklearn + inverse_values = self._sklearn_scaler.inverse_transform(X[self._fitted_columns].values) + + # Update only the fitted columns in the result + result[self._fitted_columns] = inverse_values + + return result class NormalScoreTransformer(BaseTransformer): - """A transformer for normal score transformation.""" + """A transformer for normal score transformation. + + Parameters + ---------- + tol : float, default=1e-7 + Tolerance for convergence in random generation. + max_samples : int, default=1000000 + Maximum number of samples for random generation. + quadratic_extrapolation : bool, default=False + Whether to use quadratic extrapolation for values outside the fitted range. + columns : list, optional + List of column names to be transformed. If None, all columns will be transformed. 
+    """
 
-    def __init__(self, tol=1e-7, max_samples=1000000, quadratic_extrapolation=False):
+    def __init__(self, tol=1e-7, max_samples=1000000, quadratic_extrapolation=False, columns=None):
         self.tol = tol
         self.max_samples = max_samples
         self.quadratic_extrapolation = quadratic_extrapolation
+        self.columns = columns
         self.column_parameters = {}
         self.shared_z_scores = {}
 
     def fit(self, X):
         """Fit the transformer to the data."""
-        for col in X.columns:
+        columns = self.columns if self.columns is not None else X.columns
+        columns = [col for col in columns if col in X.columns]
+
+        for col in columns:
             values = X[col].values
             sorted_vals = np.sort(values)
             smoothed_vals = self._moving_average_with_endpoints(sorted_vals)
@@ -417,7 +467,10 @@ def transform(self, X):
         The transformed DataFrame with normal scores.
         """
         result = X.copy()
-        for col in X.columns:
+        for col in self.column_parameters.keys():
+            if col not in X.columns:
+                continue
+
             params = self.column_parameters.get(col, {})
             z_scores = params.get('z_scores', [])
             originals = params.get('originals', [])
@@ -476,7 +529,10 @@ def inverse_transform(self, X):
         The inverse-transformed DataFrame.
         """
         result = X.copy()
-        for col in X.columns:
+        for col in self.column_parameters.keys():
+            if col not in X.columns:
+                continue
+
             params = self.column_parameters.get(col, {})
             z_scores = params.get('z_scores', [])
             originals = params.get('originals', [])
@@ -747,13 +803,13 @@ def inverse_on_external_df(self, df, columns=None):
     def _create_transformer(self, transform_type, **kwargs):
         """Factory method to create appropriate transformer."""
         if transform_type == "log10":
-            return Log10Transformer()
+            return Log10Transformer(**kwargs)
         elif transform_type == "normal_score":
             return NormalScoreTransformer(**kwargs)
         elif transform_type == "row_wise_minmax":
             return RowWiseMinMaxScaler(**kwargs)
         elif transform_type == "standard_scaler":
-            return StandardScalerTransformer()
+            return StandardScalerTransformer(**kwargs)
         elif transform_type == "minmax_scaler":
             return MinMaxScaler(**kwargs)
         else:

From 2180ebd993b1ab066e3b8e9df6f7011bbcebe054 Mon Sep 17 00:00:00 2001
From: rhugman
Date: Sat, 26 Jul 2025 18:02:54 -0500
Subject: [PATCH 58/58] make sklearn an optional dependency

---
 pyemu/emulators/__init__.py | 39 ++++++++++++++++++++++++++++++-----
 pyemu/emulators/gpr.py      |  2 +-
 pyemu/emulators/lpfa.py     | 17 +++++++++++++---
 3 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/pyemu/emulators/__init__.py b/pyemu/emulators/__init__.py
index 4833fd494..3fc5f847e 100755
--- a/pyemu/emulators/__init__.py
+++ b/pyemu/emulators/__init__.py
@@ -2,25 +2,54 @@
     BaseTransformer,
     Log10Transformer,
     RowWiseMinMaxScaler,
-    StandardScalerTransformer,
+    #StandardScalerTransformer,
     NormalScoreTransformer,
     TransformerPipeline,
     AutobotsAssemble
 )
 from .base import Emulator
 from .dsi import DSI
-from .lpfa import LPFA
-from .gpr import GPR
+#from .lpfa import LPFA
+#from .gpr import GPR
+
+
 __all__ = [
     'Emulator', #base Emulator Class
     'DSI', # DSI Emulator Class
     'LPFA',
-    'GPR', # GPR Emulator Class
+## 'GPR', # GPR Emulator Class
     'BaseTransformer',
     'Log10Transformer',
     'RowWiseMinMaxScaler',
-    'StandardScalerTransformer',
+# 'StandardScalerTransformer',
     'NormalScoreTransformer',
     'TransformerPipeline',
    'AutobotsAssemble'
 ]
+
+# Check sklearn availability
+try:
+    import sklearn
+    HAS_SKLEARN = True
+except ImportError:
+    HAS_SKLEARN = False
+
+# Conditional imports
+if HAS_SKLEARN:
+    from .lpfa import LPFA
+    from .gpr import GPR
+    from .transformers import StandardScalerTransformer
+    __all__.extend(['GPR', 'StandardScalerTransformer'])
+else:
+    # Create placeholder classes that raise informative errors
+    class LPFA:
+        def __init__(self, *args, **kwargs):
+            raise ImportError("LPFA emulator requires scikit-learn. Install with: pip install scikit-learn")
+
+    class GPR:
+        def __init__(self, *args, **kwargs):
+            raise ImportError("GPR emulator requires scikit-learn. Install with: pip install scikit-learn")
+
+    class StandardScalerTransformer:
+        def __init__(self, *args, **kwargs):
+            raise ImportError("StandardScalerTransformer requires scikit-learn. Install with: pip install scikit-learn")
diff --git a/pyemu/emulators/gpr.py b/pyemu/emulators/gpr.py
index a07e2797c..a79035ed9 100644
--- a/pyemu/emulators/gpr.py
+++ b/pyemu/emulators/gpr.py
@@ -310,7 +310,7 @@ def prepare_pestpp(self,pst_dir,casename,gpr_t_d="gpr_template"):
     # 3. which obs are objectives; subset of output_names
     # 4. which obs are constraints; subset of output_names
 
-    pst, input_names, output_names, objs, constraints = scrape_pst_dir(self,pst_dir,casename)
+    pst, input_names, output_names, objs, constraints = scrape_pst_dir(pst_dir,casename)
 
     # check that all input_names ar ein par data
 
diff --git a/pyemu/emulators/lpfa.py b/pyemu/emulators/lpfa.py
index a1a2da5f8..2de198858 100644
--- a/pyemu/emulators/lpfa.py
+++ b/pyemu/emulators/lpfa.py
@@ -3,11 +3,22 @@
 """
 
 from __future__ import print_function, division
 
+
+# Check sklearn availability at module level
+try:
+    from sklearn.model_selection import train_test_split
+    from sklearn.decomposition import PCA
+    from sklearn.neural_network import MLPRegressor
+    HAS_SKLEARN = True
+except ImportError:
+    HAS_SKLEARN = False
+    # Create dummy classes or set to None
+    train_test_split = None
+    PCA = None
+    MLPRegressor = None
+
 import numpy as np
 import pandas as pd
-from sklearn.model_selection import train_test_split
-from sklearn.decomposition import PCA
-from sklearn.neural_network import MLPRegressor
 from .base import Emulator
 from .transformers import RowWiseMinMaxScaler
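
A minimal usage sketch for the column-aware transformers introduced in [PATCH 57/58]. This is editor-added illustration, not part of the patch itself: the DataFrame, the column names ("hk", "rch") and the values are invented, and it assumes a pyemu build that already contains these changes. Note that neither transformer below needs scikit-learn, which is what [PATCH 58/58] relies on.

    # illustrative only: a made-up parameter ensemble with one log-friendly column
    import numpy as np
    import pandas as pd
    from pyemu.emulators.transformers import Log10Transformer, NormalScoreTransformer

    df = pd.DataFrame({
        "hk": np.random.uniform(1e-4, 1e-1, 100),   # strictly positive, so log10 is natural
        "rch": np.random.normal(1e-3, 2e-4, 100),   # left untouched by the transformers below
    })

    # only "hk" is transformed; "rch" passes through unchanged
    log_t = Log10Transformer(columns=["hk"])
    logged = log_t.transform(df)
    recovered = log_t.inverse_transform(logged)
    assert np.allclose(recovered["hk"].values, df["hk"].values)

    # the normal score transformer follows the same fit/transform pattern
    ns_t = NormalScoreTransformer(columns=["hk"])
    ns_t.fit(df)
    scores = ns_t.transform(df)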
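
And a sketch of how the optional scikit-learn dependency from [PATCH 58/58] behaves at run time, again editor-added and assuming this branch of pyemu: with scikit-learn installed the conditional imports succeed, otherwise the placeholder classes raise an informative ImportError as soon as they are constructed.

    import pyemu.emulators as emulators

    if emulators.HAS_SKLEARN:
        # real implementations are available
        from pyemu.emulators import StandardScalerTransformer
        scaler = StandardScalerTransformer(columns=["hk"])  # "hk" as above, illustrative
    else:
        # the package still imports; only construction of sklearn-backed classes fails
        try:
            emulators.GPR()
        except ImportError as err:
            print(err)  # points the user at "pip install scikit-learn"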