In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from itertools import combinations
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)

In [None]:
# Global figure styling: render text through LaTeX, use the 'cmr10' font
# family at 12 pt, and enable mathtext in the axis tick formatters.
# (To go back to the stock settings:
#   matplotlib.rcParams.update(matplotlib.rcParamsDefault))
plt.rcParams.update({
    "text.usetex": True,
    "font.family": "cmr10",
    "font.size": 12,
    "axes.formatter.use_mathtext": True,
})
# Optional global figure-size override:
#   plt.rcParams["figure.figsize"] = (20,3)
#   plt.rcParams["figure.figsize"] = plt.rcParamsDefault["figure.figsize"] # to reset

In [None]:
import glob

# Select which benchmark run to analyse. `suffix` is also appended to the
# names of the PDF figures written by the plotting cells.
single_thread = False
if single_thread:
    prefix = "."
    suffix = ""
else:
    prefix = ""
    suffix = "_8threads"

# Every JSON report sits one sub-directory below the benchmarks folder.
files = glob.glob(prefix+"benchmarks"+suffix+"/*/*_*.json")

# Parse each report. The context manager closes every file handle as soon as
# it has been read instead of leaving that to the garbage collector.
data_combined = []
for file in files:
    with open(file) as report_file:
        data_combined.append(json.load(report_file))


In [None]:
# Flatten the loaded reports into one record per benchmark entry. Every record
# carries the CPU details of the machine that produced its report.
benchmarks = []
for report in data_combined:
    cpu_info = report["machine_info"]["cpu"]
    machine_tags = {
        'cpu_arch': cpu_info["arch"],
        'brand': cpu_info["brand_raw"],
        'cpu_cores': cpu_info["count"],
    }
    benchmarks.extend(
        {
            'group': bench['group'],
            **machine_tags,
            'stats': bench['stats'],
            'params': bench['params'],
            'extra_info': bench["extra_info"],
        }
        for bench in report["benchmarks"]
    )

In [None]:
# Flatten the nested stats / params / extra_info dictionaries into dotted
# column names (e.g. 'stats.median').
df = pd.json_normalize(benchmarks)
# Split params.width into three columns, labelled width_2-body, width_3-body, width_4-body
df = pd.concat([df, df['params.width'].apply(pd.Series)], axis=1)
df = df.rename(columns={0: 'width_2-body', 1: 'width_3-body', 2: 'width_4-body'})

# Drop the original params.width column
df = df.drop(columns=['params.width'])

# Strip the 'params.' and 'extra_info.' prefixes from the column names
df = df.rename(columns=lambda x: x.replace('params.', ''))
df = df.rename(columns=lambda x: x.replace('extra_info.', ''))
# Stripping the prefixes can produce repeated names; keep the first of each
df = df.loc[:, ~df.columns.duplicated()]

# Tidy the CPU brand strings. regex=False is required here: read as regular
# expressions, '(R)' and '(TM)' are capture groups that match a bare 'R' / 'TM'
# anywhere in the string (regex matching was the default before pandas 2.0).
df['brand'] = df['brand'].str.replace('12th Gen ', '', regex=False)
df['brand'] = df['brand'].str.replace('(R)', u"\u00AE", regex=False)
df['brand'] = df['brand'].str.replace('(TM)', u"\u2122", regex=False)

# Reorder so brand is in alphabetical order
df = df.sort_values(by='brand').reset_index(drop=True)


In [None]:
# Display the normalised benchmark table (one row per benchmark record).
df

In [None]:
# Map each benchmark group name to the sub-table holding that group's rows.
grouped_dfs = dict(iter(df.groupby('group')))

In [None]:
# List the benchmark group names available for plotting.
grouped_dfs.keys()

In [None]:
# Disabled draft: box plot of the timing statistics (min / q1 / median / q3 / max).
# The whole cell is commented out and kept for reference only.
# NOTE(review): as written it uses `group_name` and `column`, which the cells above
# do not define, and it tests membership in `times` while the list it maintains is
# `brands` -- both need fixing before this is re-enabled.
# filtered_grouped_dfs = {group: df_group[df_group['kBT'] == 0.4][df_group['xyz_file'].str.contains("diamond.xyz")] for group, df_group in grouped_dfs.items()}
# filtered_grouped_dfs

# box_data = []
# x_values = []
# positions = []
# brands = []
# ibrand = 0
# for idx, row in filtered_grouped_dfs[group_name].iterrows():
# 	x_values.append(row[column])
# 	if row['brand'] not in times:
# 		brands.append(row['brand'])
# 		ibrand = len(brands) - 1
# 	else:
# 		ibrand = brands.index(row['brand'])
# 	positions.append(row[column] + 0.1 * ibrand)
# 	box_data.append((row['stats.min'], row['stats.q1'], row['stats.median'], row['stats.q3'], row['stats.max']))
# box_data = np.array(box_data).T

# plt.figure(figsize=(8, 5))

# plt.boxplot(box_data, vert=True, patch_artist=True, positions=positions, widths=0.1)
# plt.xticks(ticks=np.arange(min(positions), max(positions) + 0.1, 0.5))
# plt.ylabel("Time (s)")
# plt.title("Boxplot for Each Index")
# plt.grid(axis="y", linestyle="--", alpha=0.7)
# plt.show()

In [None]:
# Gather, per CPU brand, the (cutoff_max, median time) pairs of the
# 'cutoff_in_database' benchmarks recorded with kBT == 0.4.
column = "cutoff_max"
tmp_df = grouped_dfs['cutoff_in_database']
filtered_df = tmp_df.loc[tmp_df['kBT'] == 0.4]
times = {}
for idx, row in filtered_df.iterrows():
    # setdefault creates the brand's list the first time that brand is seen
    times.setdefault(row['brand'], []).append((row[column], row['stats.median']))

In [None]:
# Median execution time versus upper bond-length cutoff, one dashed line per
# CPU brand, for the series gathered in `times`.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111)

for brand, samples in times.items():
    # order the points by cutoff so each line runs left to right
    cutoffs, medians = zip(*sorted(samples, key=lambda pair: pair[0]))
    # the medians are multiplied by 1000 to match the "(ms)" axis label
    ax.plot(cutoffs, [1000 * t for t in medians], marker='o', linestyle='--', label=brand)

# Ticks on all four sides, pointing inwards; 20 pt labels on the major ticks
ax.tick_params(axis='x', labelbottom=True, top=True, labelsize=20)
ax.tick_params(axis='y', labelbottom=True, right=True, labelsize=20)
ax.tick_params(axis='both', which='both', direction='in', width=1, length=6)
# minor ticks are half as long as the major ones
ax.tick_params(which='minor', width=1, length=3)

# major y ticks every 50, with one minor tick between neighbouring majors
ax.yaxis.set_major_locator(MultipleLocator(50))
ax.yaxis.set_minor_locator(AutoMinorLocator(2))

ax.set_ylabel("Median execution time (ms)", fontsize=25)
ax.set_xlabel("Upper bond length cutoff (Å)", fontsize=25)
ax.legend(facecolor='white', framealpha=1.0, edgecolor='black', fancybox=False, fontsize=16)
# pick the aspect ratio that makes the axes box square
ax.set_aspect(1.0/ax.get_data_ratio(), adjustable='box')
fig.show()

fig.savefig('benchmark_cutoff_max_database'+suffix+'.pdf', bbox_inches='tight', pad_inches=0, facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
# Gather, per CPU brand, the (cutoff_max, median time) pairs of the
# 'cutoff_in_evaluator' benchmarks recorded with kBT == 0.4.
column = "cutoff_max"
tmp_df = grouped_dfs['cutoff_in_evaluator']
filtered_df = tmp_df.loc[tmp_df['kBT'] == 0.4]
times = {}
for idx, row in filtered_df.iterrows():
    # setdefault creates the brand's list the first time that brand is seen
    times.setdefault(row['brand'], []).append((row[column], row['stats.median']))

In [None]:
# Median execution time versus upper bond-length cutoff, one dashed line per
# CPU brand, for the series gathered in `times`.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111)

for brand, samples in times.items():
    # order the points by cutoff so each line runs left to right
    cutoffs, medians = zip(*sorted(samples, key=lambda pair: pair[0]))
    # the medians are multiplied by 1000 to match the "(ms)" axis label
    ax.plot(cutoffs, [1000 * t for t in medians], marker='o', linestyle='--', label=brand)

# Ticks on all four sides, pointing inwards; 20 pt labels on the major ticks
ax.tick_params(axis='x', labelbottom=True, top=True, labelsize=20)
ax.tick_params(axis='y', labelbottom=True, right=True, labelsize=20)
ax.tick_params(axis='both', which='both', direction='in', width=1, length=6)
# minor ticks are half as long as the major ones
ax.tick_params(which='minor', width=1, length=3)

# major y ticks every 0.2, with one minor tick between neighbouring majors
ax.yaxis.set_major_locator(MultipleLocator(0.2))
ax.yaxis.set_minor_locator(AutoMinorLocator(2))

ax.set_ylabel("Median execution time (ms)", fontsize=25)
ax.set_xlabel("Upper bond length cutoff (Å)", fontsize=25)
ax.legend(facecolor='white', framealpha=1.0, edgecolor='black', fancybox=False, loc='lower right', fontsize=16)
# pick the aspect ratio that makes the axes box square
ax.set_aspect(1.0/ax.get_data_ratio(), adjustable='box')
fig.show()

fig.savefig('benchmark_cutoff_max_evaluator'+suffix+'.pdf', bbox_inches='tight', pad_inches=0, facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
# Gather, per CPU brand, the (grid_spacing, median time) pairs of the
# 'placement_methods' benchmarks whose 'method_ratio.rand' equals 1.0.
column = "grid_spacing"
tmp_df = grouped_dfs['placement_methods']
filtered_df = tmp_df.loc[tmp_df['method_ratio.rand'] == 1.0]
times = {}
for idx, row in filtered_df.iterrows():
    # setdefault creates the brand's list the first time that brand is seen
    times.setdefault(row['brand'], []).append((row[column], row['stats.median']))

In [None]:
# Median execution time versus grid spacing, one dashed line per CPU brand,
# for the series held in `times`.
# NOTE(review): the preceding cell builds `times` from rows with
# method_ratio.rand == 1.0, yet the figure is saved as 'benchmark_min_placement...';
# confirm which placement method this figure is meant to show.
fig = plt.figure(figsize=(10, 6))
ax = plt.gca()

for brand, time_data in times.items():
    time_data = sorted(time_data, key=lambda x: x[0])
    x, y = zip(*time_data)
    # the medians are multiplied by 1000 to match the "(ms)" axis label
    plt.plot(x, [val * 1000 for val in y], marker='o', linestyle='--', label=brand)

# Ticks on all four sides, pointing inwards; 20 pt labels on the major ticks.
# (labelleft replaces the original labelbottom, which tick_params discards
# when axis='y'.)
plt.tick_params(axis='x', labelbottom=True, top=True, labelsize=20)
plt.tick_params(axis='y', labelleft=True, right=True, labelsize=20)
plt.tick_params(axis='both', which="both", direction='in')
# make major and minor ticks longer
plt.tick_params(which='both', width=1, length=6)
plt.tick_params(which='minor', width=1, length=3)

# Derive the major y-tick spacing from the autoscaled range: start at a tenth
# of the range's order of magnitude and double it until at most six intervals
# fit; halve it again should fewer than three remain.
ymin, ymax = ax.get_ylim()
yrange = ymax - ymin
order_of_magnitude = 10 ** int(np.floor(np.log10(yrange)))
tick_spacing = order_of_magnitude / 10
while yrange / tick_spacing > 6:
    tick_spacing *= 2
while yrange / tick_spacing < 3:
    tick_spacing /= 2
ax.yaxis.set_major_locator(MultipleLocator(tick_spacing))
# one minor tick between neighbouring major ticks
ax.yaxis.set_minor_locator(AutoMinorLocator(2))

plt.ylabel("Median execution time (ms)", fontsize=25)
plt.xlabel("Grid spacing (Å)", fontsize=25)
plt.legend(facecolor='white', framealpha=1.0, edgecolor='black', fancybox=False, loc='center right', fontsize=16)
# pick the aspect ratio that makes the axes box square
plt.gca().set_aspect(1.0/plt.gca().get_data_ratio(), adjustable='box')
fig.show()

plt.savefig('benchmark_min_placement'+suffix+'.pdf', bbox_inches='tight', pad_inches=0, facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
# One figure per placement method: median execution time versus grid spacing,
# one dashed line per CPU brand, using the rows where that method's ratio is 1.0.
column = "grid_spacing"
tmp_df = grouped_dfs['placement_methods']

for method in ['void', 'rand', 'walk', 'grow', 'min']:
    filtered_df = tmp_df[tmp_df['method_ratio.'+method] == 1.0]

    # brand -> list of (grid spacing, median time) pairs
    times = {}
    for idx, row in filtered_df.iterrows():
        times.setdefault(row['brand'], []).append((row[column], row['stats.median']))

    fig = plt.figure(figsize=(10, 6))
    ax = plt.gca()

    for brand, time_data in times.items():
        time_data = sorted(time_data, key=lambda x: x[0])
        x, y = zip(*time_data)
        # the medians are multiplied by 1000 to match the "(ms)" axis label
        plt.plot(x, [val * 1000 for val in y], marker='o', linestyle='--', label=brand)

    # Derive the major y-tick spacing from the autoscaled range: start at a
    # tenth of the range's order of magnitude and double it until at most six
    # intervals fit; halve it again should fewer than three remain.
    ymin, ymax = ax.get_ylim()
    yrange = ymax - ymin
    order_of_magnitude = 10 ** int(np.floor(np.log10(yrange)))
    tick_spacing = order_of_magnitude / 10
    while yrange / tick_spacing > 6:
        tick_spacing *= 2
    while yrange / tick_spacing < 3:
        tick_spacing /= 2
    ax.yaxis.set_major_locator(MultipleLocator(tick_spacing))
    # one minor tick between neighbouring major ticks
    ax.yaxis.set_minor_locator(AutoMinorLocator(2))

    # Ticks on all four sides, pointing inwards; 20 pt labels on the major
    # ticks. (labelleft replaces the original labelbottom, which tick_params
    # discards when axis='y'.)
    plt.tick_params(axis='x', labelbottom=True, top=True, labelsize=20)
    plt.tick_params(axis='y', labelleft=True, right=True, labelsize=20)
    plt.tick_params(axis='both', which="both", direction='in')
    # make major and minor ticks longer
    plt.tick_params(which='both', width=1, length=6)
    plt.tick_params(which='minor', width=1, length=3)

    # The legend position depends on the method. This value was previously
    # computed but never passed on: the legend call hard-coded 'upper right'.
    if method in ['void', 'min']:
        loc = 'upper right'
    else:  # 'rand', 'walk', 'grow'
        loc = 'center right'

    plt.ylabel("Median execution time (ms)", fontsize=25)
    plt.xlabel("Grid spacing (Å)", fontsize=25)
    plt.legend(facecolor='white', framealpha=1.0, edgecolor='black', fancybox=False, loc=loc, fontsize=16)
    # pick the aspect ratio that makes the axes box square
    plt.gca().set_aspect(1.0/plt.gca().get_data_ratio(), adjustable='box')
    fig.show()

    plt.savefig('benchmark_placement_method_'+method+suffix+'.pdf', bbox_inches='tight', pad_inches=0, facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
# Scaling of the 'distributions' benchmarks: median execution time against one
# varying column while the remaining columns are held at a fixed combination.
columns_to_plot = ["kBT", "width_2-body", "width_3-body", "width_4-body", "avg_num_atoms"]
column_label = ["$k_{\\mathrm{B}}T$ (eV)", "2-body width (Å)", "3-body width (Å)", "4-body width (Å)", "Total number of atoms in dataset"]
tmp_df = grouped_dfs['distributions']
# Filter rows where kBT is 0.4
tmp_df = tmp_df[tmp_df['kBT'] == 0.4]

# For each candidate x-axis column, record every distinct combination of the
# other four columns that occurs in the data.
unique_vals_dict = {}
for varying_col in columns_to_plot:
    remaining_cols = [col for col in columns_to_plot if col != varying_col]
    unique_vals_dict[varying_col] = set(
        tmp_df[remaining_cols].drop_duplicates().itertuples(index=False, name=None)
    )

for column, label in zip(columns_to_plot, column_label):
    # kBT and the width columns are deliberately skipped, so only the
    # avg_num_atoms scan is plotted.
    if column not in tmp_df.columns or "kBT" in column or "width" in column:
        continue

    remaining_cols = [col for col in columns_to_plot if col != column]
    for value in unique_vals_dict[column]:
        # Keep the rows whose other columns equal this combination
        filtered_df = tmp_df
        for col, val in zip(remaining_cols, value):
            filtered_df = filtered_df[filtered_df[col] == val]

        # A combination containing NaN matches no rows (NaN != NaN). Skip it
        # rather than save an empty figure over the PDF.
        if filtered_df.empty:
            continue

        # Only plot combinations whose width_2-body is within 1e-2 of 0.025
        if not (abs(filtered_df['width_2-body'] - 0.025) < 1e-2).all():
            continue

        # brand -> list of (x value, median time) pairs
        times = {}
        for idx, row in filtered_df.iterrows():
            if column == "avg_num_atoms":
                # total atoms = average atoms per structure * number of structures
                x_value = row[column] * row['num_structures']
            else:
                x_value = row[column]
            times.setdefault(row['brand'], []).append((x_value, row['stats.median']))

        fig = plt.figure(figsize=(10, 6))
        ax = fig.add_subplot(111)

        for brand, time_data in times.items():
            time_data = sorted(time_data, key=lambda x: x[0])
            x, y = zip(*time_data)
            # Drop the second-to-last point when its x value lies in
            # (20000, 35000). The length check avoids an IndexError for series
            # with fewer than two points.
            # NOTE(review): the reason for excluding this point is not evident here.
            if len(x) >= 2 and 20000 < x[-2] < 35000:
                x = x[:-2] + (x[-1],)
                y = y[:-2] + (y[-1],)
            plt.plot(x, list(y), marker='o', linestyle='--', label=brand)

        if column == "avg_num_atoms":
            ax.set_ylim([-1,40])

        # Ticks on all four sides, pointing inwards; 20 pt labels on the major
        # ticks. (labelleft replaces the original labelbottom, which
        # tick_params discards when axis='y'.)
        plt.tick_params(axis='x', labelbottom=True, top=True, labelsize=20)
        plt.tick_params(axis='y', labelleft=True, right=True, labelsize=20)
        plt.tick_params(axis='both', which="both", direction='in')
        # set major ticks to every 10
        ax.yaxis.set_major_locator(MultipleLocator(10))
        # add one minor tick for each major tick
        ax.yaxis.set_minor_locator(AutoMinorLocator(2))
        # make major and minor ticks longer
        plt.tick_params(which='both', width=1, length=6)
        plt.tick_params(which='minor', width=1, length=3)

        plt.ylabel("Median execution time (s)", fontsize=25)
        plt.xlabel(label, fontsize=25)
        plt.legend(facecolor='white', framealpha=1.0, edgecolor='black', fancybox=False, loc='upper left', fontsize=16)
        # pick the aspect ratio that makes the axes box square
        plt.gca().set_aspect(1.0/plt.gca().get_data_ratio(), adjustable='box')

        # NOTE: the file name depends only on `column`, so when several
        # combinations pass the filters each one overwrites the previous PDF.
        plt.savefig(f'benchmark_distributions_{column}{suffix}.pdf', bbox_inches='tight', pad_inches=0, facecolor=fig.get_facecolor(), edgecolor='none')
        # Report the parameters of the last row that went into this figure
        print(row['kBT'], row['width_2-body'], row['width_3-body'], row['width_4-body'], row['avg_num_atoms'])

In [None]:
# Print `tmp_df` in full as plain text. In notebook order `tmp_df` is the
# 'distributions' group filtered to kBT == 0.4.
# Temporarily set display options to show all rows and columns; the
# option_context restores the previous pandas display settings on exit.
with pd.option_context('display.max_rows', None, 
                      'display.max_columns', None,
                      'display.width', None):
    print(tmp_df)