Skip to content

Commit

Permalink
solved bug in get_dt for dataframes with repeated entries, improve coverage of the same function
Browse files Browse the repository at this point in the history
  • Loading branch information
pm-blanco committed Jun 4, 2024
1 parent 91da8c7 commit be1c5fe
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 102 deletions.
220 changes: 119 additions & 101 deletions lib/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,76 +57,7 @@ def analyze_time_series(path_to_datafolder):
index=[len(data)])
return data

def get_params_from_dir_name(name):
    """
    Extracts parameter labels and values from a directory name.
    The name is expected to follow the pattern
    name=obsname1-value1_obsname2-value2...
    Args:
        name(`str`): name of the directory
    Returns:
        params(`dict`): dictionary mapping each parameter label to its value (kept as a string)
    """
    # split on '_' to get label-value tokens, then on the first '-' of each token;
    # a token without '-' maps to itself, matching the original behaviour
    pairs = (token.split('-', 1) for token in name.split('_'))
    return {pieces[0]: pieces[-1] for pieces in pairs}


def split_dataframe_in_equal_blocks(df, start_row, end_row, block_size):
    """
    Splits a Pandas dataframe in equally spaced blocks.
    Args:
        - df (`obj`): PandasDataframe
        - start_row (`int`): index of the first row
        - end_row (`int`): index of the last row
        - block_size (`int`): number of rows per block
    Returns:
        - (`list`): array of PandasDataframe of equal size
    """
    blocks = []
    for first in range(start_row, end_row, block_size):
        blocks.append(df[first:first + block_size])
    return blocks

def split_dataframe(df,n_blocks):
    """
    Splits a Pandas Dataframe in n_blocks of approximately the same size.
    Args:
        - df (`obj`): PandasDataframe
        - n_blocks (`int`): Number of blocks
    Returns:
        - (`list`): array of PandasDataframe
    Notes:
        - For a `df` of length `l` that should be split into n_blocks, it returns l % n_blocks sub-arrays of size l//n_blocks + 1 and the rest of size l//n_blocks.
        - The behaviour of this function is the same as numpy.array_split for backwards compatibility, see [docs](https://numpy.org/doc/stable/reference/generated/numpy.array_split.html)
    """
    total_rows = df.shape[0]
    small_size = total_rows // n_blocks     # size of the "short" blocks
    large_size = small_size + 1             # size of the "long" blocks
    n_large = total_rows % n_blocks         # how many blocks carry one extra row
    boundary = n_large * large_size         # row where the long blocks end

    # long blocks first, then short ones, exactly like numpy.array_split
    blocks = [df[row:row + large_size] for row in range(0, boundary, large_size)]
    blocks += [df[row:row + small_size] for row in range(boundary, total_rows, small_size)]
    return blocks

def block_analyze(full_data, n_blocks=16, time_col = "time", equil=0.1, columns_to_analyze = "all", verbose = False):
def block_analyze(full_data, n_blocks=16, time_col = "time", equil=0.1, columns_to_analyze = "all", verbose = False, dt=None):
"""
Analyzes the data in `full_data` using a binning procedure.
Expand All @@ -137,12 +68,16 @@ def block_analyze(full_data, n_blocks=16, time_col = "time", equil=0.1, columns
- equil(`float`,opt): fraction of the data discarded as part of the equilibration. Defaults to 0.1.
- columns_to_analyze(`list`): array of column names to be analyzed. Defaults to "all".
- verbose(`bool`): switch to activate/deactivate printing the block size. Defaults to False.
- dt(`float`): time step at which the data was stored. If no value is provided, it is calculated from the time series. Defaults to None.
Returns:
`result`: pandas dataframe with the mean (mean), statistical error (err_mean), number of effective samples (n_eff) and correlation time (tau_int) of each observable.
"""
if dt is None:
dt, n_warnings = get_dt(full_data, time_col, verbose = verbose) # check that the data was stored with the same time interval dt
if n_warnings > 0 and verbose:
print(f"Warning: looks like a repeated time value was encountered {n_warnings} times")

dt = get_dt(full_data) # check that the data was stored with the same time interval dt
drop_rows = int(full_data.shape[0]*equil) # calculate how many rows should be dropped as equilibration
# drop the rows that will be discarded as equilibration
data = full_data.drop(range(0,drop_rows))
Expand Down Expand Up @@ -184,34 +119,82 @@ def block_analyze(full_data, n_blocks=16, time_col = "time", equil=0.1, columns
result = pd.concat( [ pd.Series([n_blocks,block_size], index=[('n_blocks',),('block_size',)]), result])
return result

def built_output_name(input_dict):
    """
    Builds the output name for a given set of input parameters.
    Args:
        input_dict (`dict`): dictionary with all terminal inputs.
    Returns:
        output_name (`str`): name used for the output files
    Note:
        The standard formatting rule is parametername1-parametervalue1_parametername2-parametervalue2
    """
    fragments = []
    for label in input_dict:
        value = input_dict[label]
        # strings and booleans are printed verbatim; numbers are compacted to 3 significant digits
        if type(value) in [str, bool]:
            formatted_variable = f"{value:}"
        else:
            formatted_variable = f"{value:.3g}"
        fragments.append(f"{label}-{formatted_variable}")
    return "_".join(fragments)

def get_dt(data, time_col = "time", relative_tolerance = 0.01, verbose = False):
    """
    Calculate the time difference between consecutive lines of the time series.
    Args:
        - data (`obj`): PandasDataFrame containing the time series
        - time_col (`str`): string denoting the name of the column which contains the time data
        - relative_tolerance (`float`): threshold up to which an inconsistency between consecutive dt values is considered acceptable (see Notes below for more details)
        - verbose (`bool`): Switch to activate/deactivate printing of warning messages.
    Returns:
        - (`float`, `int`) 1. value of time difference between consecutive lines, 2. number of repeated time values
    Raises:
        ValueError: if `time_col` is missing, if fewer than two rows are present, if all time
            values are identical, or if time intervals are inconsistent beyond `relative_tolerance`.
    Notes:
        relative_tolerance is used in two ways to check for irregularities in the time intervals between consecutive lines:
        1. Small inconsistencies in time intervals may occur due to rounding errors or because a short tuning of cells or Ewald parameters was run. These inconsistencies are considered negligible, as long as they do not exceed dt * relative_tolerance.
        2. Consecutive rows may have identical times, e.g. if only reactions are performed but there is no MD evolution in between. In this case, dt is ill-defined; such rows only produce warnings (not errors) to retain backward compatibility. The user can always override get_dt() by providing a custom value on the input.
    """
    if time_col in data.columns.to_list():
        time = data[ time_col ]
    else:
        raise ValueError(f"Column \'{time_col}\' not found in columns: "+str( data.columns.to_list() ) )
    imax = data.shape[0]
    if imax < 2:
        raise ValueError("get_dt: at least two rows are needed to calculate the time step")
    warn_lines = []
    # NOTE(review): time[i] is label-based indexing; assumes a default 0..N-1 index on `data` — confirm with callers
    dt_init = 0  # reference time step; stays 0 until two rows with distinct times are found
    dt = 0
    for i in range(1,imax):
        dt = time[i] - time[i-1]
        if dt_init == 0:
            # No reference dt yet: leading repeated time values only produce warnings.
            # (The previous code divided by dt_init here, so a repeated time in the very
            # first rows crashed with a spurious error or returned dt = 0 silently.)
            if dt == 0:
                warn_lines.append(f"Row {i} dt = {dt} = {time[i]} - {time[i-1]} is a repeated time value")
            else:
                dt_init = dt
            dt = dt_init
            continue
        if np.abs( (dt_init - dt)/dt_init ) > relative_tolerance : # dt is different from before
            if np.abs( dt/dt_init ) < relative_tolerance : # looks like a repeated value
                warn_lines.append(f"Row {i} dt = {dt} = {time[i]} - {time[i-1]} not equal to dt_init = {dt_init}")
                dt = max(dt,dt_init)
            else:
                raise ValueError(f"\nget_dt: Row {i} dt = {dt} = {time[i]} - {time[i-1]} not equal to dt_init = {dt_init}\n")
    if dt_init == 0:
        raise ValueError("get_dt: all values in the time column are identical, dt is ill-defined")
    if verbose:
        print(warn_lines)
    return dt, len(warn_lines)

def get_params_from_dir_name(name):
    """
    Gets the parameters from name assuming a structure
    name=obsname1-value1_obsname2-value2...
    Args:
        name(`str`): name of the directory
    Returns:
        params(`dict`): dictionary with the labels and values of the parameters
    """
    params = {}
    for entry in name.split('_'):
        key, sep, value = entry.partition('-')
        # an entry without '-' maps to itself, matching split('-', 1)[-1]
        params[key] = value if sep else key
    return params

def read_csv_file(path):
"""
Expand All @@ -229,26 +212,61 @@ def read_csv_file(path):
else:
return None

def split_dataframe(df,n_blocks):
    """
    Splits a Pandas Dataframe in n_blocks of approximately the same size.
    Args:
        - df (`obj`): PandasDataframe
        - n_blocks (`int`): Number of blocks
    Returns:
        - (`list`): array of PandasDataframe
    Notes:
        - For a `df` of length `l` that should be split into n_blocks, it returns l % n_blocks sub-arrays of size l//n_blocks + 1 and the rest of size l//n_blocks.
        - The behaviour of this function is the same as numpy.array_split for backwards compatibility, see [docs](https://numpy.org/doc/stable/reference/generated/numpy.array_split.html)
    """
    length = df.shape[0]
    small = length // n_blocks          # rows per "short" block
    large = small + 1                   # rows per "long" block
    cut = (length % n_blocks) * large   # first row after the long blocks

    first_part = [df[i:i + large] for i in range(0, cut, large)]
    second_part = [df[i:i + small] for i in range(cut, length, small)]
    return first_part + second_part

def split_dataframe_in_equal_blocks(df, start_row, end_row, block_size):
    """
    Splits a Pandas dataframe in equally spaced blocks.
    Args:
        - df (`obj`): PandasDataframe
        - start_row (`int`): index of the first row
        - end_row (`int`): index of the last row
        - block_size (`int`): number of rows per block
    Returns:
        - (`list`): array of PandasDataframe of equal size
    """
    block_starts = range(start_row, end_row, block_size)
    return [df[start:start + block_size] for start in block_starts]








27 changes: 26 additions & 1 deletion testsuite/analysis_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,34 @@
class Serialization(ut.TestCase):
    # Unit tests for lib.analysis.get_dt

    def test_get_dt(self):
        print("*** Unit test: test that analysis.get_dt returns the right time step ***")
        df = pd.DataFrame.from_dict( {'time': [0, 1, 2,], 'obs': ['1.0', '2.0', '4.0']} )
        dt, n_warnings = ana.get_dt(df)
        self.assertAlmostEqual(dt, 1.0, delta = 1e-7)
        self.assertEqual(n_warnings, 0)
        print("*** Unit passed ***")

        print("*** Unit test: test that analysis.get_dt prints a warning if there are values with repeated time steps ***")
        df = pd.DataFrame.from_dict( {'time': [0, 1, 1,], 'obs': ['1.0', '2.0', '4.0']} )
        dt, n_warnings = ana.get_dt(df, verbose=True)
        self.assertAlmostEqual(dt, 1.0, delta = 1e-7)
        self.assertEqual(n_warnings, 1)
        print("*** Unit passed ***")

        print("*** Unit test: test that analysis.get_dt raises a ValueError if the column with the time is not found ***")
        df = pd.DataFrame.from_dict( {'ns': [0, 1, 2,], 'obs': ['1.0', '2.0', '4.0']} )
        with self.assertRaises(ValueError):
            ana.get_dt(data=df)
        print("*** Unit passed ***")

        print("*** Unit test: test that analysis.get_dt raises a ValueError if the time is stored at uneven intervals ***")
        df = pd.DataFrame.from_dict( {'time': [0, 1, 4,], 'obs': ['1.0', '2.0', '4.0']} )
        with self.assertRaises(ValueError):
            ana.get_dt(data=df)
        print("*** Unit passed ***")

if __name__ == "__main__":
    print("*** lib.analysis unit tests ***")
    ut.main()

0 comments on commit be1c5fe

Please sign in to comment.