16 changes: 10 additions & 6 deletions autotest/pst_from_tests.py
@@ -473,9 +473,12 @@ def freyberg_prior_build_test():

def generic_function():
import pandas as pd
onames = ["generic_obs_{0}".format(i) for i in range(100)]
df = pd.DataFrame({"obsval":1,"simval":2,"obsnme":onames},index=onames)
df.to_csv("generic.csv",index=False)
import numpy as np
#onames = ["generic_obs_{0}".format(i) for i in range(100)]
onames = pd.date_range("1-1-2020",periods=100,freq='d')
df = pd.DataFrame({"index_2":np.arange(100),"simval1":1,"simval2":2,"datetime":onames})
df.index = df.pop("datetime")
df.to_csv("generic.csv",date_format="%d-%m-%Y %H:%M:%S")
return df


@@ -589,7 +592,7 @@ def mf6_freyberg_test():
df = generic_function()
os.chdir("..")
# add the values in generic to the ctl file
pf.add_observations("generic.csv",insfile="generic.csv.ins",index_cols="obsnme",use_cols="simval")
pf.add_observations("generic.csv",insfile="generic.csv.ins",index_cols=["datetime","index_2"],use_cols=["simval1","simval2"])
# add the function call to make generic to the forward run script
pf.add_py_function("pst_from_tests.py","generic_function()",is_pre_cmd=False)

@@ -720,6 +723,7 @@ def mf6_freyberg_test():
sfr_pars[['inst', 'usecol', '#rno']] = sfr_pars.parnme.apply(
lambda x: pd.DataFrame([s.split(':') for s in x.split('_')
if ':' in s]).set_index(0)[1])

sfr_pars['#rno'] = sfr_pars['#rno'].astype(int)
os.chdir(pf.new_d)
pst.write_input_files()
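
The pattern in this hunk, recovering metadata columns from parameter names, is the same key:value convention used throughout the package; a short sketch of what the apply produces for a single name (the name itself is invented):

# sketch (not part of the diff): splitting one parnme into metadata fields
import pandas as pd

parnme = "sfrcondgr_inst:0_usecol:3_#rno:17"
fields = pd.DataFrame([s.split(":") for s in parnme.split("_")
                       if ":" in s]).set_index(0)[1]
print(fields["inst"], fields["usecol"], fields["#rno"])  # 0 3 17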
@@ -1299,7 +1303,7 @@ def mf6_freyberg_direct_test():
if __name__ == "__main__":
#freyberg_test()
#freyberg_prior_build_test()
mf6_freyberg_test()
# mf6_freyberg_shortnames_test()
#mf6_freyberg_test()
mf6_freyberg_shortnames_test()
#mf6_freyberg_da_test()
#mf6_freyberg_direct_test()
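
For orientation, here is a self-contained sketch of the data layout the reworked generic_function writes and that the two-index-column add_observations call above consumes; the pf call is left commented because it needs a full PstFrom setup, and the 200-observation count follows from 100 rows times two use_cols:

# sketch (not part of the diff): the generic.csv layout used above
import numpy as np
import pandas as pd

onames = pd.date_range("1-1-2020", periods=100, freq="d")
df = pd.DataFrame({"index_2": np.arange(100),
                   "simval1": 1, "simval2": 2,
                   "datetime": onames})
df.index = df.pop("datetime")
df.to_csv("generic.csv", date_format="%d-%m-%Y %H:%M:%S")

# with index_cols=["datetime", "index_2"] and use_cols=["simval1", "simval2"],
# each of the 100 rows yields one observation per use_col (200 total):
# pf.add_observations("generic.csv", insfile="generic.csv.ins",
#                     index_cols=["datetime", "index_2"],
#                     use_cols=["simval1", "simval2"])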
26 changes: 14 additions & 12 deletions autotest/utils_tests.py
@@ -1792,20 +1792,22 @@ def maha_pdc_test():
l1_critical_value = 6.4 #chi squared value at df=1,p=0.01
l2_critical_value = 9.2 #chi squared value at df=2,p=0.01
pst = pyemu.Pst(os.path.join("la", "pest.pst"))
pst.observation_data.loc[:,"weight"] = 1.0
en = pyemu.ObservationEnsemble.from_gaussian_draw(pst=pst,num_reals=20)
level_1,level_2 = pyemu.helpers.get_maha_obs_summary(en)
assert level_1.max() < l1_critical_value
assert level_2.sq_distance.max() < l2_critical_value

# pst = pyemu.Pst(os.path.join("pst","zoned_nz_64.pst"))
# en = pyemu.ObservationEnsemble.from_gaussian_draw(pst=pst, num_reals=20)
# level_1, level_2 = pyemu.helpers.get_maha_obs_summary(en)
# level_1.sort_values(inplace=True)
# level_2.sort_values(by="sq_distance",inplace=True)
# print(level_1)
# print(level_2)
# assert level_1.max() < l1_critical_value
# assert level_2.sq_distance.max() < l2_critical_value
assert level_1.shape[0] == 0
assert level_2.shape[0] == 0


pst = pyemu.Pst(os.path.join("pst","zoned_nz_64.pst"))
en = pyemu.ObservationEnsemble.from_gaussian_draw(pst=pst, num_reals=20)
level_1, level_2 = pyemu.helpers.get_maha_obs_summary(en)
level_1.sort_values(inplace=True)
level_2.sort_values(by="sq_distance",inplace=True)
print(level_1)
print(level_2)
assert level_1.shape[0] == 0
assert level_2.shape[0] == 0


if __name__ == "__main__":
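As the revised test shows, get_maha_obs_summary now returns only the distances that exceed the critical values, so an ensemble consistent with its observations yields empty containers. A minimal usage sketch, with paths borrowed from the test:

# sketch (not part of the diff): calling the filtered get_maha_obs_summary
import os
import pyemu

pst = pyemu.Pst(os.path.join("la", "pest.pst"))
pst.observation_data.loc[:, "weight"] = 1.0
en = pyemu.ObservationEnsemble.from_gaussian_draw(pst=pst, num_reals=20)
level_1, level_2 = pyemu.helpers.get_maha_obs_summary(
    en, l1_crit_val=6.34, l2_crit_val=9.2)
# only distances above the chi-squared thresholds are returned
assert level_1.shape[0] == 0
assert level_2.shape[0] == 0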
24 changes: 14 additions & 10 deletions pyemu/pst/pst_handler.py
@@ -2818,14 +2818,18 @@ def try_parse_name_metadata(self):
obs_cols = pst_utils.pst_config["obs_fieldnames"]

for df,name,fieldnames in zip([par,obs],["parnme","obsnme"],[par_cols,obs_cols]):
meta_dict = df.loc[:,name].apply(lambda x: dict([item.split(':') for item in x.split('_') if ':' in item]))
unique_keys = []
for k,v in meta_dict.items():
for kk,vv in v.items():
if kk not in fieldnames and kk not in unique_keys:
unique_keys.append(kk)
for uk in unique_keys:
if uk not in df.columns:
df.loc[:,uk] = np.NaN
df.loc[:,uk] = meta_dict.apply(lambda x: x.get(uk,np.NaN))
try:
meta_dict = df.loc[:,name].apply(lambda x: dict([item.split(':') for item in x.split('_') if ':' in item]))
unique_keys = []
for k,v in meta_dict.items():
for kk,vv in v.items():
if kk not in fieldnames and kk not in unique_keys:
unique_keys.append(kk)
for uk in unique_keys:
if uk not in df.columns:
df.loc[:,uk] = np.NaN
df.loc[:,uk] = meta_dict.apply(lambda x: x.get(uk,np.NaN))
except Exception as e:
print("error parsing metadata from '{0}', continuing".format(name))


2 changes: 1 addition & 1 deletion pyemu/utils/geostats.py
@@ -11,7 +11,6 @@
from pyemu.mat.mat_handler import Cov
from pyemu.utils.pp_utils import pp_file_to_dataframe
from ..pyemu_warnings import PyemuWarning
from flopy.discretization import StructuredGrid

EPSILON = 1.0e-7

@@ -587,6 +586,7 @@ def draw_conditional(self, seed, obs_points, sg, base_values_file, local=True, f
in arithmetic space
"""


# get a dataframe for the observation points, from file unless passed
if isinstance(obs_points, str):
obs_points = pp_file_to_dataframe(obs_points)
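
For reference, draw_conditional accepts either a DataFrame or a path to a pilot-point file; a minimal sketch of the whitespace-delimited five-column format that pp_file_to_dataframe reads (file contents invented):

# sketch (not part of the diff): the pilot point format read above
from pyemu.utils.pp_utils import pp_file_to_dataframe

with open("points.dat", "w") as f:
    f.write("pp_0000 100.0 200.0 1 1.0\n")
    f.write("pp_0001 150.0 200.0 1 1.0\n")

obs_points = pp_file_to_dataframe("points.dat")
print(obs_points.columns.tolist())  # ['name', 'x', 'y', 'zone', 'parval1']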
87 changes: 70 additions & 17 deletions pyemu/utils/helpers.py
@@ -5971,26 +5971,29 @@ def _set_vertices(self):
#
# return get_spatialreference(epsg, text='proj4')

def get_maha_obs_summary(sim_en):
def get_maha_obs_summary(sim_en,l1_crit_val=6.34,l2_crit_val=9.2):
""" calculate the 1-D and 2-D mahalanobis distance

Args:
sim_en (`pyemu.ObservationEnsemble`): a simulated outputs ensemble
l1_crit_val (`float`): the chi squared critical value for the 1-D
mahalanobis distance. Default is 6.34 (p=0.01,df=1)
l2_crit_val (`float`): the chi squared critical value for the 2-D
mahalanobis distance. Default is 9.2 (p=0.01,df=2)

Returns:

tuple containing

- **pandas.Series**: 1-D subspace squared mahalanobis distances
that exceed the `l1_crit_val` threshold
- **pandas.DataFrame**: 2-D subspace squared mahalanobis distances
that exceed the `l2_crit_val` threshold

Note:
Noise realizations are added to `sim_en` to account for measurement
noise.
"""

if not isinstance(sim_en,pyemu.ObservationEnsemble):
@@ -6019,25 +6022,75 @@ def get_maha_obs_summary(sim_en):
#obsval_dict = obs.loc[nnz_en.columns.values,"obsval"].to_dict()

# first calculate the 1-D subspace maha distances
print("calculating L-1 maha distances")
sim_mean = nnz_en.mean()
obs_mean = obs.loc[nnz_en.columns.values,"obsval"]
simvar_inv = 1. / (nnz_en.std()**2)
res_mean = sim_mean - obs_mean
l1_maha_sq_df = res_mean**2 * simvar_inv

l1_maha_sq_df = l1_maha_sq_df.loc[l1_maha_sq_df > l1_crit_val]
# now calculate the 2-D subspace maha distances
onames1,onames2,l2_maha_sq_vals = [],[],[]
for i1,o1 in enumerate(nz_names):
r1 = res_mean[o1]
print("{0} of {1}".format(i1+1,len(nz_names)))
for i2,o2 in enumerate(nz_names[i1+1:]):
c_inv = np.linalg.inv(nz_cov_df.loc[[o1,o2],[o1,o2]].values)
r2 = res_mean[o2]
r_vec = np.array([r1,r2])
l2_maha_sq_val = np.dot(np.dot(r_vec,c_inv),r_vec.transpose())
onames1.append(o1)
onames2.append(o2)
l2_maha_sq_vals.append(l2_maha_sq_val)
print("preparing L-2 maha distance containers")
manager = mp.Manager()
ns = manager.Namespace()
results = manager.dict()
mean = manager.dict(res_mean.to_dict())
var = manager.dict()
cov = manager.dict()
var_arr = np.diag(nz_cov_df.values)
for i1, o1 in enumerate(nz_names):
var[o1] = var_arr[i1]

cov_vals = nz_cov_df.loc[o1, :].values[i1+1:]
ostr_vals = ["{0}_{1}".format(o1, o2) for o2 in nz_names[i1+1:]]
cd = {o:c for o,c in zip(ostr_vals,cov_vals)}
cov.update(cd)
print("starting L-2 maha distance parallel calcs")
#pool = mp.Pool(processes=5)
with mp.get_context("spawn").Pool() as pool:
    rresults = []
    for i1, o1 in enumerate(nz_names):
        o2names = list(nz_names[i1+1:])
        # queue every pair block first so the workers actually run concurrently
        rresults.append(pool.apply_async(
            _l2_maha_worker,
            args=(o1, o2names, mean, var, cov, results, l2_crit_val)))
    # block until all submitted workers have finished
    [r.get() for r in rresults]

    print("closing pool")
    pool.close()

    print("joining pool")
    pool.join()

#print(results)
#print(len(results),len(ostr_vals))

keys = list(results.keys())
onames1 = [k.split('|')[0] for k in keys]
onames2 = [k.split('|')[1] for k in keys]
l2_maha_sq_vals = [results[k] for k in keys]
l2_maha_sq_df = pd.DataFrame({"obsnme_1":onames1,"obsnme_2":onames2,"sq_distance":l2_maha_sq_vals})

return l1_maha_sq_df,l2_maha_sq_df


def _l2_maha_worker(o1,o2names,mean,var,cov,results,l2_crit_val):

rresults = {}
v1 = var[o1]
c = np.zeros((2, 2))
c[0, 0] = v1
r1 = mean[o1]
for o2 in o2names:
ostr = "{0}_{1}".format(o1,o2)
cv = cov[ostr]
v2 = var[o2]
c[1,1] = v2
c[0,1] = cv
c[1,0] = cv
c_inv = np.linalg.inv(c)

r2 = mean[o2]
r_vec = np.array([r1, r2])
l2_maha_sq_val = np.dot(np.dot(r_vec, c_inv), r_vec.transpose())
if l2_maha_sq_val > l2_crit_val:
rresults[ostr] = l2_maha_sq_val
results.update(rresults)
print(o1,"done")
12 changes: 9 additions & 3 deletions pyemu/utils/pst_from.py
@@ -1902,13 +1902,13 @@ def _write_direct_df_tpl(in_filename, tpl_filename,df,name,index_cols,typ,use_co
# get some index strings for naming
if longnames:
j = '_'
fmt = "{0}:{1}"
fmt = "{0}|{1}"
if isinstance(index_cols[0], str):
inames = index_cols
else:
inames = ["idx{0}".format(i) for i in range(len(index_cols))]
else:
fmt = "{1:3}"
fmt = "{1|3}"
j = ''
if isinstance(index_cols[0], str):
inames = index_cols
@@ -1922,6 +1922,9 @@
lambda x: j.join([fmt.format(iname, xx)
for xx, iname in zip(x, inames)])).str.replace(' ', '')

df_ti.loc[:, "idx_strs"] = df_ti.idx_strs.str.replace(":", "")
df_ti.loc[:, "idx_strs"] = df_ti.idx_strs.str.replace("|", ":")

if get_xy is not None:
if xy_in_idx is not None:
# x and y already in index cols
@@ -2098,7 +2101,7 @@ def _get_tpl_or_ins_df(filenames, dfs, name, index_cols, typ, use_cols=None,
# get some index strings for naming
if longnames:
j = '_'
fmt = "{0}:{1}"
fmt = "{0}|{1}"
if isinstance(index_cols[0], str):
inames = index_cols
else:
@@ -2114,9 +2117,12 @@
if not zero_based:
df_ti.loc[:, "sidx"] = df_ti.sidx.apply(
lambda x: tuple(xx - 1 for xx in x))

df_ti.loc[:, "idx_strs"] = df_ti.sidx.apply(
lambda x: j.join([fmt.format(iname, xx)
for xx, iname in zip(x, inames)])).str.replace(' ', '')
df_ti.loc[:,"idx_strs"] = df_ti.idx_strs.str.replace(":","")
df_ti.loc[:, "idx_strs"] = df_ti.idx_strs.str.replace("|", ":")

if get_xy is not None:
if xy_in_idx is not None:
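
Taken in isolation, the placeholder swap added in this file works like this: index values are joined with a literal "|" instead of ":", colons inside the values themselves (e.g. timestamps) are stripped, and the "|" placeholders are then restored to the conventional ":" separators, with regex=False so pandas treats "|" literally rather than as regex alternation. A standalone sketch with invented values:

# sketch (not part of the diff): the "|" placeholder round-trip
import pandas as pd

idx_strs = pd.Series(["datetime|01-01-2020 00:00:00_idx1|3"])
idx_strs = idx_strs.str.replace(" ", "")
idx_strs = idx_strs.str.replace(":", "", regex=False)   # drop colons inside values
idx_strs = idx_strs.str.replace("|", ":", regex=False)  # restore key:value separators
print(idx_strs[0])  # datetime:01-01-2020000000_idx1:3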