Skip to content

Commit

Permalink
solved bug in get_dt for dataframes with repeated entries, improve coverage of the same function
Browse files Browse the repository at this point in the history
  • Loading branch information
pm-blanco committed Jun 4, 2024
1 parent 91da8c7 commit be1c5fe
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 102 deletions.
220 changes: 119 additions & 101 deletions lib/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,76 +57,7 @@ def analyze_time_series(path_to_datafolder):
index=[len(data)])
return data

def get_params_from_dir_name(name):
    """
    Extracts parameter labels and values from a directory name.
    The name is expected to follow the pattern
    name=obsname1-value1_obsname2-value2...
    Args:
        name(`str`): name of the directory
    Returns:
        params(`dict`): dictionary mapping each parameter label to its value (kept as a string)
    """
    # split on '_' to get label-value tokens, then on the first '-' of each token;
    # a token without '-' maps to itself, matching the original behaviour
    pairs = (token.split('-', 1) for token in name.split('_'))
    return {pieces[0]: pieces[-1] for pieces in pairs}


def split_dataframe_in_equal_blocks(df, start_row, end_row, block_size):
    """
    Splits a Pandas dataframe in equally spaced blocks.
    Args:
        - df (`obj`): PandasDataframe
        - start_row (`int`): index of the first row
        - end_row (`int`): index of the last row
        - block_size (`int`): number of rows per block
    Returns:
        - (`list`): array of PandasDataframe of equal size
    """
    blocks = []
    for first in range(start_row, end_row, block_size):
        blocks.append(df[first:first + block_size])
    return blocks

def split_dataframe(df,n_blocks):
    """
    Splits a Pandas Dataframe in n_blocks of approximately the same size.
    Args:
        - df (`obj`): PandasDataframe
        - n_blocks (`int`): Number of blocks
    Returns:
        - (`list`): array of PandasDataframe
    Notes:
        - For a `df` of length `l` that should be split into n_blocks, it returns l % n_blocks sub-arrays of size l//n_blocks + 1 and the rest of size l//n_blocks.
        - The behaviour of this function is the same as numpy.array_split for backwards compatibility, see [docs](https://numpy.org/doc/stable/reference/generated/numpy.array_split.html)
    """
    total_rows = df.shape[0]
    small_size = total_rows // n_blocks     # size of the "short" blocks
    large_size = small_size + 1             # size of the "long" blocks
    n_large = total_rows % n_blocks         # how many blocks carry one extra row
    boundary = n_large * large_size         # row where the long blocks end

    # long blocks first, then short ones, exactly like numpy.array_split
    blocks = [df[row:row + large_size] for row in range(0, boundary, large_size)]
    blocks += [df[row:row + small_size] for row in range(boundary, total_rows, small_size)]
    return blocks

def block_analyze(full_data, n_blocks=16, time_col = "time", equil=0.1, columns_to_analyze = "all", verbose = False):
def block_analyze(full_data, n_blocks=16, time_col = "time", equil=0.1, columns_to_analyze = "all", verbose = False, dt=None):
"""
Analyzes the data in `full_data` using a binning procedure.
Expand All @@ -137,12 +68,16 @@ def block_analyze(full_data, n_blocks=16, time_col = "time", equil=0.1, columns
- equil(`float`,opt): fraction of the data discarded as part of the equilibration. Defaults to 0.1.
- columns_to_analyze(`list`): array of column names to be analyzed. Defaults to "all".
- verbose(`bool`): switch to activate/deactivate printing the block size. Defaults to False.
- dt(`float`): time step at which the data was stored. If no value is provided, it is calculated from the time series. Defaults to None.
Returns:
`result`: pandas dataframe with the mean (mean), statistical error (err_mean), number of effective samples (n_eff) and correlation time (tau_int) of each observable.
"""
if dt is None:
dt, n_warnings = get_dt(full_data, time_col, verbose = verbose) # check that the data was stored with the same time interval dt
if n_warnings > 0 and verbose:
print(f"Warning: looks like a repeated time value was encountered {n_warnings} times")

dt = get_dt(full_data) # check that the data was stored with the same time interval dt
drop_rows = int(full_data.shape[0]*equil) # calculate how many rows should be dropped as equilibration
# drop the rows that will be discarded as equilibration
data = full_data.drop(range(0,drop_rows))
Expand Down Expand Up @@ -184,34 +119,82 @@ def block_analyze(full_data, n_blocks=16, time_col = "time", equil=0.1, columns
result = pd.concat( [ pd.Series([n_blocks,block_size], index=[('n_blocks',),('block_size',)]), result])
return result

def built_output_name(input_dict):
    """
    Builds the output name for a given set of input parameters.
    Args:
        input_dict (`dict`): dictionary with all terminal inputs.
    Returns:
        output_name (`str`): name used for the output files
    Note:
        The standard formatting rule is parametername1-parametervalue1_parametername2-parametervalue2
    """
    fragments = []
    for label in input_dict:
        value = input_dict[label]
        # strings and booleans are printed verbatim; numbers are compacted to 3 significant digits
        if type(value) in [str, bool]:
            formatted_variable = f"{value:}"
        else:
            formatted_variable = f"{value:.3g}"
        fragments.append(f"{label}-{formatted_variable}")
    return "_".join(fragments)

def get_dt(data, time_col = "time", relative_tolerance = 0.01, verbose = False):
    """
    Calculate the time difference between consecutive lines of the time series.
    Args:
        - data (`obj`): PandasDataFrame containing the time series
        - time_col (`str`): string denoting the name of the column which contains the time data
        - relative_tolerance (`float`): threshold up to which an inconsistency between consecutive dt values is considered acceptable (see Notes below for more details)
        - verbose (`bool`): Switch to activate/deactivate printing of warning messages.
    Returns:
        - (`float`, `int`) 1. value of time difference between consecutive lines, 2. number of repeated time values
    Raises:
        ValueError: if `time_col` is missing, if fewer than two rows are present, if all time
            values are identical, or if time intervals are inconsistent beyond `relative_tolerance`.
    Notes:
        relative_tolerance is used in two ways to check for irregularities in the time intervals between consecutive lines:
        1. Small inconsistencies in time intervals may occur due to rounding errors or because a short tuning of cells or Ewald parameters was run. These inconsistencies are considered negligible, as long as they do not exceed dt * relative_tolerance.
        2. Consecutive rows may have identical times, e.g. if only reactions are performed but there is no MD evolution in between. In this case, dt is ill-defined; such rows only produce warnings (not errors) to retain backward compatibility. The user can always override get_dt() by providing a custom value on the input.
    """
    if time_col in data.columns.to_list():
        time = data[ time_col ]
    else:
        raise ValueError(f"Column \'{time_col}\' not found in columns: "+str( data.columns.to_list() ) )
    imax = data.shape[0]
    if imax < 2:
        raise ValueError("get_dt: at least two rows are needed to calculate the time step")
    warn_lines = []
    # NOTE(review): time[i] is label-based indexing; assumes a default 0..N-1 index on `data` — confirm with callers
    dt_init = 0  # reference time step; stays 0 until two rows with distinct times are found
    dt = 0
    for i in range(1,imax):
        dt = time[i] - time[i-1]
        if dt_init == 0:
            # No reference dt yet: leading repeated time values only produce warnings.
            # (The previous code divided by dt_init here, so a repeated time in the very
            # first rows crashed with a spurious error or returned dt = 0 silently.)
            if dt == 0:
                warn_lines.append(f"Row {i} dt = {dt} = {time[i]} - {time[i-1]} is a repeated time value")
            else:
                dt_init = dt
            dt = dt_init
            continue
        if np.abs( (dt_init - dt)/dt_init ) > relative_tolerance : # dt is different from before
            if np.abs( dt/dt_init ) < relative_tolerance : # looks like a repeated value
                warn_lines.append(f"Row {i} dt = {dt} = {time[i]} - {time[i-1]} not equal to dt_init = {dt_init}")
                dt = max(dt,dt_init)
            else:
                raise ValueError(f"\nget_dt: Row {i} dt = {dt} = {time[i]} - {time[i-1]} not equal to dt_init = {dt_init}\n")
    if dt_init == 0:
        raise ValueError("get_dt: all values in the time column are identical, dt is ill-defined")
    if verbose:
        print(warn_lines)
    return dt, len(warn_lines)

def get_params_from_dir_name(name):
    """
    Gets the parameters from name assuming a structure
    name=obsname1-value1_obsname2-value2...
    Args:
        name(`str`): name of the directory
    Returns:
        params(`dict`): dictionary with the labels and values of the parameters
    """
    params = {}
    for entry in name.split('_'):
        key, sep, value = entry.partition('-')
        # an entry without '-' maps to itself, matching split('-', 1)[-1]
        params[key] = value if sep else key
    return params

def read_csv_file(path):
"""
Expand All @@ -229,26 +212,61 @@ def read_csv_file(path):
else:
return None

def split_dataframe(df,n_blocks):
    """
    Splits a Pandas Dataframe in n_blocks of approximately the same size.
    Args:
        - df (`obj`): PandasDataframe
        - n_blocks (`int`): Number of blocks
    Returns:
        - (`list`): array of PandasDataframe
    Notes:
        - For a `df` of length `l` that should be split into n_blocks, it returns l % n_blocks sub-arrays of size l//n_blocks + 1 and the rest of size l//n_blocks.
        - The behaviour of this function is the same as numpy.array_split for backwards compatibility, see [docs](https://numpy.org/doc/stable/reference/generated/numpy.array_split.html)
    """
    length = df.shape[0]
    small = length // n_blocks          # rows per "short" block
    large = small + 1                   # rows per "long" block
    cut = (length % n_blocks) * large   # first row after the long blocks

    first_part = [df[i:i + large] for i in range(0, cut, large)]
    second_part = [df[i:i + small] for i in range(cut, length, small)]
    return first_part + second_part

def split_dataframe_in_equal_blocks(df, start_row, end_row, block_size):
    """
    Splits a Pandas dataframe in equally spaced blocks.
    Args:
        - df (`obj`): PandasDataframe
        - start_row (`int`): index of the first row
        - end_row (`int`): index of the last row
        - block_size (`int`): number of rows per block
    Returns:
        - (`list`): array of PandasDataframe of equal size
    """
    block_starts = range(start_row, end_row, block_size)
    return [df[start:start + block_size] for start in block_starts]








27 changes: 26 additions & 1 deletion testsuite/analysis_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,34 @@
class Serialization(ut.TestCase):
    # Unit tests for lib.analysis.get_dt

    def test_get_dt(self):
        print("*** Unit test: test that analysis.get_dt returns the right time step ***")
        df = pd.DataFrame.from_dict( {'time': [0, 1, 2,], 'obs': ['1.0', '2.0', '4.0']} )
        dt, n_warnings = ana.get_dt(df)
        self.assertAlmostEqual(dt, 1.0, delta = 1e-7)
        self.assertEqual(n_warnings, 0)
        print("*** Unit passed ***")

        print("*** Unit test: test that analysis.get_dt prints a warning if there are values with repeated time steps ***")
        df = pd.DataFrame.from_dict( {'time': [0, 1, 1,], 'obs': ['1.0', '2.0', '4.0']} )
        dt, n_warnings = ana.get_dt(df, verbose=True)
        self.assertAlmostEqual(dt, 1.0, delta = 1e-7)
        self.assertEqual(n_warnings, 1)
        print("*** Unit passed ***")

        print("*** Unit test: test that analysis.get_dt raises a ValueError if the column with the time is not found ***")
        df = pd.DataFrame.from_dict( {'ns': [0, 1, 2,], 'obs': ['1.0', '2.0', '4.0']} )
        with self.assertRaises(ValueError):
            ana.get_dt(data=df)
        print("*** Unit passed ***")

        print("*** Unit test: test that analysis.get_dt raises a ValueError if the time is stored at uneven intervals ***")
        df = pd.DataFrame.from_dict( {'time': [0, 1, 4,], 'obs': ['1.0', '2.0', '4.0']} )
        with self.assertRaises(ValueError):
            ana.get_dt(data=df)
        print("*** Unit passed ***")

if __name__ == "__main__":
    print("*** lib.analysis unit tests ***")
    ut.main()

0 comments on commit be1c5fe

Please sign in to comment.