16 changes: 10 additions & 6 deletions autotest/pst_from_tests.py
@@ -473,9 +473,12 @@ def freyberg_prior_build_test():

def generic_function():
import pandas as pd
onames = ["generic_obs_{0}".format(i) for i in range(100)]
df = pd.DataFrame({"obsval":1,"simval":2,"obsnme":onames},index=onames)
df.to_csv("generic.csv",index=False)
import numpy as np
#onames = ["generic_obs_{0}".format(i) for i in range(100)]
onames = pd.date_range("1-1-2020",periods=100,freq='d')
df = pd.DataFrame({"index_2":np.arange(100),"simval1":1,"simval2":2,"datetime":onames})
df.index = df.pop("datetime")
df.to_csv("generic.csv",date_format="%d-%m-%Y %H:%M:%S")
return df


@@ -589,7 +592,7 @@ def mf6_freyberg_test():
df = generic_function()
os.chdir("..")
# add the values in generic to the ctl file
pf.add_observations("generic.csv",insfile="generic.csv.ins",index_cols="obsnme",use_cols="simval")
pf.add_observations("generic.csv",insfile="generic.csv.ins",index_cols=["datetime","index_2"],use_cols=["simval1","simval2"])
# add the function call to make generic to the forward run script
pf.add_py_function("pst_from_tests.py","generic_function()",is_pre_cmd=False)

@@ -720,6 +723,7 @@ def mf6_freyberg_test():
sfr_pars[['inst', 'usecol', '#rno']] = sfr_pars.parnme.apply(
lambda x: pd.DataFrame([s.split(':') for s in x.split('_')
if ':' in s]).set_index(0)[1])

sfr_pars['#rno'] = sfr_pars['#rno'].astype(int)
os.chdir(pf.new_d)
pst.write_input_files()
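
The pattern in this hunk, recovering metadata columns from parameter names, is the same key:value convention used throughout the package; a short sketch of what the apply produces for a single name (the name itself is invented):

# sketch (not part of the diff): splitting one parnme into metadata fields
import pandas as pd

parnme = "sfrcondgr_inst:0_usecol:3_#rno:17"
fields = pd.DataFrame([s.split(":") for s in parnme.split("_")
                       if ":" in s]).set_index(0)[1]
print(fields["inst"], fields["usecol"], fields["#rno"])  # 0 3 17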
@@ -1299,7 +1303,7 @@ def mf6_freyberg_direct_test():
if __name__ == "__main__":
#freyberg_test()
#freyberg_prior_build_test()
mf6_freyberg_test()
# mf6_freyberg_shortnames_test()
#mf6_freyberg_test()
mf6_freyberg_shortnames_test()
#mf6_freyberg_da_test()
#mf6_freyberg_direct_test()
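
For orientation, here is a self-contained sketch of the data layout the reworked generic_function writes and that the two-index-column add_observations call above consumes; the pf call is left commented because it needs a full PstFrom setup, and the 200-observation count follows from 100 rows times two use_cols:

# sketch (not part of the diff): the generic.csv layout used above
import numpy as np
import pandas as pd

onames = pd.date_range("1-1-2020", periods=100, freq="d")
df = pd.DataFrame({"index_2": np.arange(100),
                   "simval1": 1, "simval2": 2,
                   "datetime": onames})
df.index = df.pop("datetime")
df.to_csv("generic.csv", date_format="%d-%m-%Y %H:%M:%S")

# with index_cols=["datetime", "index_2"] and use_cols=["simval1", "simval2"],
# each of the 100 rows yields one observation per use_col (200 total):
# pf.add_observations("generic.csv", insfile="generic.csv.ins",
#                     index_cols=["datetime", "index_2"],
#                     use_cols=["simval1", "simval2"])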
26 changes: 14 additions & 12 deletions autotest/utils_tests.py
@@ -1792,20 +1792,22 @@ def maha_pdc_test():
l1_critical_value = 6.4 #chi squared value at df=1,p=0.01
l2_critical_value = 9.2 #chi squared value at df=2,p=0.01
pst = pyemu.Pst(os.path.join("la", "pest.pst"))
pst.observation_data.loc[:,"weight"] = 1.0
en = pyemu.ObservationEnsemble.from_gaussian_draw(pst=pst,num_reals=20)
level_1,level_2 = pyemu.helpers.get_maha_obs_summary(en)
assert level_1.max() < l1_critical_value
assert level_2.sq_distance.max() < l2_critical_value

# pst = pyemu.Pst(os.path.join("pst","zoned_nz_64.pst"))
# en = pyemu.ObservationEnsemble.from_gaussian_draw(pst=pst, num_reals=20)
# level_1, level_2 = pyemu.helpers.get_maha_obs_summary(en)
# level_1.sort_values(inplace=True)
# level_2.sort_values(by="sq_distance",inplace=True)
# print(level_1)
# print(level_2)
# assert level_1.max() < l1_critical_value
# assert level_2.sq_distance.max() < l2_critical_value
assert level_1.shape[0] == 0
assert level_2.shape[0] == 0


pst = pyemu.Pst(os.path.join("pst","zoned_nz_64.pst"))
en = pyemu.ObservationEnsemble.from_gaussian_draw(pst=pst, num_reals=20)
level_1, level_2 = pyemu.helpers.get_maha_obs_summary(en)
level_1.sort_values(inplace=True)
level_2.sort_values(by="sq_distance",inplace=True)
print(level_1)
print(level_2)
assert level_1.shape[0] == 0
assert level_2.shape[0] == 0


if __name__ == "__main__":
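As the revised test shows, get_maha_obs_summary now returns only the distances that exceed the critical values, so an ensemble consistent with its observations yields empty containers. A minimal usage sketch, with paths borrowed from the test:

# sketch (not part of the diff): calling the filtered get_maha_obs_summary
import os
import pyemu

pst = pyemu.Pst(os.path.join("la", "pest.pst"))
pst.observation_data.loc[:, "weight"] = 1.0
en = pyemu.ObservationEnsemble.from_gaussian_draw(pst=pst, num_reals=20)
level_1, level_2 = pyemu.helpers.get_maha_obs_summary(
    en, l1_crit_val=6.34, l2_crit_val=9.2)
# only distances above the chi-squared thresholds are returned
assert level_1.shape[0] == 0
assert level_2.shape[0] == 0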
24 changes: 14 additions & 10 deletions pyemu/pst/pst_handler.py
@@ -2818,14 +2818,18 @@ def try_parse_name_metadata(self):
obs_cols = pst_utils.pst_config["obs_fieldnames"]

for df,name,fieldnames in zip([par,obs],["parnme","obsnme"],[par_cols,obs_cols]):
meta_dict = df.loc[:,name].apply(lambda x: dict([item.split(':') for item in x.split('_') if ':' in item]))
unique_keys = []
for k,v in meta_dict.items():
for kk,vv in v.items():
if kk not in fieldnames and kk not in unique_keys:
unique_keys.append(kk)
for uk in unique_keys:
if uk not in df.columns:
df.loc[:,uk] = np.NaN
df.loc[:,uk] = meta_dict.apply(lambda x: x.get(uk,np.NaN))
try:
meta_dict = df.loc[:,name].apply(lambda x: dict([item.split(':') for item in x.split('_') if ':' in item]))
unique_keys = []
for k,v in meta_dict.items():
for kk,vv in v.items():
if kk not in fieldnames and kk not in unique_keys:
unique_keys.append(kk)
for uk in unique_keys:
if uk not in df.columns:
df.loc[:,uk] = np.NaN
df.loc[:,uk] = meta_dict.apply(lambda x: x.get(uk,np.NaN))
except Exception as e:
print("error parsing metadata from '{0}', continuing".format(name))


2 changes: 1 addition & 1 deletion pyemu/utils/geostats.py
@@ -11,7 +11,6 @@
from pyemu.mat.mat_handler import Cov
from pyemu.utils.pp_utils import pp_file_to_dataframe
from ..pyemu_warnings import PyemuWarning
from flopy.discretization import StructuredGrid

EPSILON = 1.0e-7

@@ -587,6 +586,7 @@ def draw_conditional(self, seed, obs_points, sg, base_values_file, local=True, f
in arithmetic space
"""


# get a dataframe for the observation points, from file unless passed
if isinstance(obs_points, str):
obs_points = pp_file_to_dataframe(obs_points)
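
For reference, draw_conditional accepts either a DataFrame or a path to a pilot-point file; a minimal sketch of the whitespace-delimited five-column format that pp_file_to_dataframe reads (file contents invented):

# sketch (not part of the diff): the pilot point format read above
from pyemu.utils.pp_utils import pp_file_to_dataframe

with open("points.dat", "w") as f:
    f.write("pp_0000 100.0 200.0 1 1.0\n")
    f.write("pp_0001 150.0 200.0 1 1.0\n")

obs_points = pp_file_to_dataframe("points.dat")
print(obs_points.columns.tolist())  # ['name', 'x', 'y', 'zone', 'parval1']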
87 changes: 70 additions & 17 deletions pyemu/utils/helpers.py
@@ -5971,26 +5971,29 @@ def _set_vertices(self):
#
# return get_spatialreference(epsg, text='proj4')

def get_maha_obs_summary(sim_en):
def get_maha_obs_summary(sim_en,l1_crit_val=6.34,l2_crit_val=9.2):
""" calculate the 1-D and 2-D mahalanobis distance

Args:
sim_en (`pyemu.ObservationEnsemble`): a simulated outputs ensemble
l1_crit_val (`float`): the chi squared critical value for the 1-D
mahalanobis distance. Default is 6.34 (p=0.01,df=1)
l2_crit_val (`float`): the chi squared critical value for the 2-D
mahalanobis distance. Default is 9.2 (p=0.01,df=2)

Returns:

tuple containing

- **pandas.Series**: 1-D subspace squared mahalanobis distances
that exceed the `l1_crit_val` threshold
- **pandas.DataFrame**: 2-D subspace squared mahalanobis distances
that exceed the `l2_crit_val` threshold

Note:
Noise realizations are added to `sim_en` to account for measurement
noise.
"""

if not isinstance(sim_en,pyemu.ObservationEnsemble):
@@ -6019,25 +6022,75 @@ def get_maha_obs_summary(sim_en):
#obsval_dict = obs.loc[nnz_en.columns.values,"obsval"].to_dict()

# first calculate the 1-D subspace maha distances
print("calculating L-1 maha distances")
sim_mean = nnz_en.mean()
obs_mean = obs.loc[nnz_en.columns.values,"obsval"]
simvar_inv = 1. / (nnz_en.std()**2)
res_mean = sim_mean - obs_mean
l1_maha_sq_df = res_mean**2 * simvar_inv

l1_maha_sq_df = l1_maha_sq_df.loc[l1_maha_sq_df > l1_crit_val]
# now calculate the 2-D subspace maha distances
onames1,onames2,l2_maha_sq_vals = [],[],[]
for i1,o1 in enumerate(nz_names):
r1 = res_mean[o1]
print("{0} of {1}".format(i1+1,len(nz_names)))
for i2,o2 in enumerate(nz_names[i1+1:]):
c_inv = np.linalg.inv(nz_cov_df.loc[[o1,o2],[o1,o2]].values)
r2 = res_mean[o2]
r_vec = np.array([r1,r2])
l2_maha_sq_val = np.dot(np.dot(r_vec,c_inv),r_vec.transpose())
onames1.append(o1)
onames2.append(o2)
l2_maha_sq_vals.append(l2_maha_sq_val)
print("preparing L-2 maha distance containers")
manager = mp.Manager()
ns = manager.Namespace()
results = manager.dict()
mean = manager.dict(res_mean.to_dict())
var = manager.dict()
cov = manager.dict()
var_arr = np.diag(nz_cov_df.values)
for i1, o1 in enumerate(nz_names):
var[o1] = var_arr[i1]

cov_vals = nz_cov_df.loc[o1, :].values[i1+1:]
ostr_vals = ["{0}_{1}".format(o1, o2) for o2 in nz_names[i1+1:]]
cd = {o:c for o,c in zip(ostr_vals,cov_vals)}
cov.update(cd)
print("starting L-2 maha distance parallel calcs")
#pool = mp.Pool(processes=5)
with mp.get_context("spawn").Pool() as pool:
    rresults = []
    for i1, o1 in enumerate(nz_names):
        o2names = list(nz_names[i1+1:])
        # queue every pair block first so the workers actually run concurrently
        rresults.append(pool.apply_async(
            _l2_maha_worker,
            args=(o1, o2names, mean, var, cov, results, l2_crit_val)))
    # block until all submitted workers have finished
    [r.get() for r in rresults]

    print("closing pool")
    pool.close()

    print("joining pool")
    pool.join()

#print(results)
#print(len(results),len(ostr_vals))

keys = list(results.keys())
onames1 = [k.split('|')[0] for k in keys]
onames2 = [k.split('|')[1] for k in keys]
l2_maha_sq_vals = [results[k] for k in keys]
l2_maha_sq_df = pd.DataFrame({"obsnme_1":onames1,"obsnme_2":onames2,"sq_distance":l2_maha_sq_vals})

return l1_maha_sq_df,l2_maha_sq_df


def _l2_maha_worker(o1,o2names,mean,var,cov,results,l2_crit_val):

rresults = {}
v1 = var[o1]
c = np.zeros((2, 2))
c[0, 0] = v1
r1 = mean[o1]
for o2 in o2names:
ostr = "{0}_{1}".format(o1,o2)
cv = cov[ostr]
v2 = var[o2]
c[1,1] = v2
c[0,1] = cv
c[1,0] = cv
c_inv = np.linalg.inv(c)

r2 = mean[o2]
r_vec = np.array([r1, r2])
l2_maha_sq_val = np.dot(np.dot(r_vec, c_inv), r_vec.transpose())
if l2_maha_sq_val > l2_crit_val:
rresults[ostr] = l2_maha_sq_val
results.update(rresults)
print(o1,"done")
12 changes: 9 additions & 3 deletions pyemu/utils/pst_from.py
@@ -1902,13 +1902,13 @@ def _write_direct_df_tpl(in_filename, tpl_filename,df,name,index_cols,typ,use_co
# get some index strings for naming
if longnames:
j = '_'
fmt = "{0}:{1}"
fmt = "{0}|{1}"
if isinstance(index_cols[0], str):
inames = index_cols
else:
inames = ["idx{0}".format(i) for i in range(len(index_cols))]
else:
fmt = "{1:3}"
fmt = "{1|3}"
j = ''
if isinstance(index_cols[0], str):
inames = index_cols
@@ -1922,6 +1922,9 @@
lambda x: j.join([fmt.format(iname, xx)
for xx, iname in zip(x, inames)])).str.replace(' ', '')

df_ti.loc[:, "idx_strs"] = df_ti.idx_strs.str.replace(":", "")
df_ti.loc[:, "idx_strs"] = df_ti.idx_strs.str.replace("|", ":")

if get_xy is not None:
if xy_in_idx is not None:
# x and y already in index cols
@@ -2098,7 +2101,7 @@ def _get_tpl_or_ins_df(filenames, dfs, name, index_cols, typ, use_cols=None,
# get some index strings for naming
if longnames:
j = '_'
fmt = "{0}:{1}"
fmt = "{0}|{1}"
if isinstance(index_cols[0], str):
inames = index_cols
else:
@@ -2114,9 +2117,12 @@
if not zero_based:
df_ti.loc[:, "sidx"] = df_ti.sidx.apply(
lambda x: tuple(xx - 1 for xx in x))

df_ti.loc[:, "idx_strs"] = df_ti.sidx.apply(
lambda x: j.join([fmt.format(iname, xx)
for xx, iname in zip(x, inames)])).str.replace(' ', '')
df_ti.loc[:,"idx_strs"] = df_ti.idx_strs.str.replace(":","")
df_ti.loc[:, "idx_strs"] = df_ti.idx_strs.str.replace("|", ":")

if get_xy is not None:
if xy_in_idx is not None:
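
Taken in isolation, the placeholder swap added in this file works like this: index values are joined with a literal "|" instead of ":", colons inside the values themselves (e.g. timestamps) are stripped, and the "|" placeholders are then restored to the conventional ":" separators, with regex=False so pandas treats "|" literally rather than as regex alternation. A standalone sketch with invented values:

# sketch (not part of the diff): the "|" placeholder round-trip
import pandas as pd

idx_strs = pd.Series(["datetime|01-01-2020 00:00:00_idx1|3"])
idx_strs = idx_strs.str.replace(" ", "")
idx_strs = idx_strs.str.replace(":", "", regex=False)   # drop colons inside values
idx_strs = idx_strs.str.replace("|", ":", regex=False)  # restore key:value separators
print(idx_strs[0])  # datetime:01-01-2020000000_idx1:3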