In [11]:
# load data
import os
from tqdm import tqdm
import pandas as pd

if os.path.exists("df_speedups.csv"):
    df_speedups = pd.read_csv("df_speedups.csv")
else:
    df_list = []
    wd = 'speedups' 
    for file in tqdm(os.listdir(wd)):
        if file.endswith(".csv"):
            df_speedups = pd.read_csv(f"{wd}/{file}")
            df_speedups['file'] = file
            df_list.append(df_speedups)

    df_speedups = pd.concat(df_list)

    # add edge labels
    labels = {
        -1: "others",
        0: "HA_9_H",
        1: "HA_6_HA",
        2: "C_4_CA",    
    }

    df_speedups['label'] = df_speedups['code'].map(labels)

    df_speedups.to_csv("df_speedups.csv", index=False)

display(df_speedups.head())

display(df_speedups.columns)

Unnamed: 0,i,j,code,edge_time_dfs,edge_niters_dfs,edge_time_fbs,edge_niters_fbs,speed_up,speed_up_niters,file,label
0,1,7,1,2.720001e-07,6,1.72e-07,4,1.581396,1.5,1qfr_model1_chainA_segment8.csv,HA_6_HA
1,2,6,-1,0.0,0,0.0,0,1.0,1.0,1qfr_model1_chainA_segment8.csv,others
2,5,9,-1,5.099992e-08,1,7.299968e-08,1,0.698632,1.0,1qfr_model1_chainA_segment8.csv,others
3,7,13,1,9.099995e-08,7,1.589997e-07,6,0.572328,1.166667,1qfr_model1_chainA_segment8.csv,HA_6_HA
4,4,10,-1,6.000005e-08,3,1.320004e-07,3,0.454544,1.0,1qfr_model1_chainA_segment8.csv,others


Index(['i', 'j', 'code', 'edge_time_dfs', 'edge_niters_dfs', 'edge_time_fbs',
       'edge_niters_fbs', 'speed_up', 'speed_up_niters', 'file', 'label'],
      dtype='object')

In [35]:
# cols (file, label, edge_time_dfs, edge_time_fbs), group (file, label), agg=(sum, count)
df_sections = df_speedups.groupby(['file', 'label']).agg({'edge_time_dfs': 'sum', 'edge_time_fbs': 'sum'}).reset_index()
# display(df_sections.head())

# group by file and sum the edge_time_dfs and edge_time_fbs
df_total_time = df_sections.groupby('file').agg({'edge_time_dfs': 'sum', 'edge_time_fbs': 'sum'}).reset_index()
# display(df_total_time.head())

print("Total elapsed time:")
print(df_total_time[['edge_time_dfs', 'edge_time_fbs']].sum())

# add total time to df_sections
df_sections = df_sections.merge(df_total_time, on='file', suffixes=('', '_sum'))
# display(df_sections.head())

df_sections['relative_time_dfs'] = df_sections['edge_time_dfs'] / df_sections['edge_time_dfs_sum']
df_sections['relative_time_fbs'] = df_sections['edge_time_fbs'] / df_sections['edge_time_fbs_sum']

# display(df_sections.head())

# average relative time for each label
df_sections_avg = df_sections.groupby('label').agg({'relative_time_dfs': 'mean', 'relative_time_fbs': 'mean'}).reset_index()
df_sections_median = df_sections.groupby('label').agg({'relative_time_dfs': 'median', 'relative_time_fbs': 'median'}).reset_index()
df_sections_std = df_sections.groupby('label').agg({'relative_time_dfs': 'std', 'relative_time_fbs': 'std'}).reset_index()
df_sections = df_sections_avg.merge(df_sections_std, on='label', suffixes=('', '_std'))
df_sections = df_sections.merge(df_sections_median, on='label', suffixes=('', '_median'))
df_sections = df_sections.rename(
    columns={
        'relative_time_dfs': 'dfs_avg', 
        'relative_time_fbs': 'fbs_avg', 
        'relative_time_dfs_std': 'dfs_std', 
        'relative_time_fbs_std': 'fbs_std', 
        'relative_time_dfs_median': 'dfs_median', 
        'relative_time_fbs_median': 'fbs_median'
    }
)
display(df_sections)

print("Sum of relative times:")
# sum (avg, median)
print(df_sections[['dfs_avg', 'fbs_avg', 'dfs_median', 'fbs_median']].sum())

Total elapsed time:
edge_time_dfs    0.047504
edge_time_fbs    0.042141
dtype: float64


Unnamed: 0,label,dfs_avg,fbs_avg,dfs_std,fbs_std,dfs_median,fbs_median
0,C_4_CA,0.019439,0.029843,0.028993,0.034611,0.011457,0.018193
1,HA_6_HA,0.274364,0.280092,0.247755,0.215749,0.172904,0.219538
2,HA_9_H,0.400652,0.172843,0.215264,0.14344,0.390783,0.123563
3,others,0.454331,0.633444,0.227076,0.22594,0.40055,0.642875


Sum of relative times:
dfs_avg       1.148785
fbs_avg       1.116222
dfs_median    0.975695
fbs_median    1.004170
dtype: float64


In [52]:
import plotly.express as px

df_ = []
for _, row in df_sections.iterrows():
    df_.append({"label": row['label'], "algorithm": "dfs", "median": row['dfs_median'], "std": row['dfs_std']})
    df_.append({"label": row['label'], "algorithm": "fbs", "median": row['fbs_median'], "std": row['fbs_std']})
df_ = pd.DataFrame(df_)
# round 2
# df_['median'] = df_['median'].round(2)
#df_['std'] = df_['std'].round(2)
display(df_)

# bar, x=label, y=median, color=algorithm, title="Relative Time by Label"
fig = px.bar(df_, x='label', y='median', color='algorithm', title="Relative Time by Label", barmode='group',
             text='median')
# set fig size
fig.update_layout(autosize=False, width=600, height=400)
# set text format
fig.update_traces(texttemplate='%{text:.2f}')
# set y-title="Median Relative Time"
fig.update_yaxes(title="Median Relative Time")
fig.show()

Unnamed: 0,label,algorithm,median,std
0,C_4_CA,dfs,0.011457,0.028993
1,C_4_CA,fbs,0.018193,0.034611
2,HA_6_HA,dfs,0.172904,0.247755
3,HA_6_HA,fbs,0.219538,0.215749
4,HA_9_H,dfs,0.390783,0.215264
5,HA_9_H,fbs,0.123563,0.14344
6,others,dfs,0.40055,0.227076
7,others,fbs,0.642875,0.22594


min: 0.0006162173478962, max: 1148.334367066628
log_min_floor: -4, log_max_ceil: 4


In [117]:
import numpy as np
import plotly.graph_objects as go

data = df_speedups['speed_up']
print(f"min: {data.min()}, max: {data.max()}")

# Define the number of bins
num_bins = 10

# Ensure that all data points are positive since log scale cannot handle zero or negative values
if (data <= 0).any():
    raise ValueError("All data points must be positive for a logarithmic scale.")

# Create log-spaced bin edges
log_min_floor = int(np.floor(np.log10(data.min())))
log_max_ceil = int(np.ceil(np.log10(data.max())))
print(f"log_min_floor: {log_min_floor}, log_max_ceil: {log_max_ceil}")

bin_size = log_max_ceil/ num_bins
bin_edge = log_max_ceil
bins = [bin_edge]
while bin_edge > log_min_floor:
    bin_edge -= bin_size
    bins.append(bin_edge)
bins = 10**np.array(bins)
bins = sorted(bins)

xticks = 10**np.linspace(log_min_floor, log_max_ceil, log_max_ceil - log_min_floor + 1)

## Compute the histogram using numpy
hist, bin_edges = np.histogram(data, bins=bins)

# Calculate bin centers for the x-axis
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

# Calculate bin widths for the bar widths
bin_widths = np.diff(bin_edges)

# Create the bar plot
fig = go.Figure()

fig.add_trace(go.Bar(
    x=bin_centers,
    y=hist,
    width=bin_widths,
    marker_color='teal',
    opacity=0.75,
    hovertemplate=
        '<b>Value:</b> %{x:.2f}<br>' +
        '<b>Count:</b> %{y}<extra></extra>',
    name='Histogram'
))

# Create a mask for non-empty bins
non_empty_mask = hist > 0
x = bin_centers[non_empty_mask]
y = hist[non_empty_mask]

# Add the smooth line trace
fig.add_trace(go.Scatter(
    x=x,
    y=y,
    mode='lines',
    name='Smooth Line (Non-Empty Bins)',
    line=dict(color='red', width=2, smoothing=1.3),
    line_shape='spline'  # This makes the line smooth
))

# Update layout to use logarithmic scales
fig.update_layout(
    title='Speed Up',
    xaxis=dict(
        title='Speed Up',
        type='log',
        tickmode='array',
        tickvals=xticks,
        ticktext=[f'{tick:g}' for tick in xticks],  # Changed to scientific notation
        showgrid=True
    ),
    yaxis=dict(
        title='Count',
        type='log',
        rangemode='tozero',
        showgrid=True
    ),
    bargap=0.1,
    bargroupgap=0.1,
    template='plotly_white',
    showlegend=False,  # Turn off the legends
    width=600,  # Set figure width to 600 pixels
    height=400  # Set figure height to 400 pixels
)

fig.show()


min: 0.0006162173478962, max: 1148.334367066628
log_min_floor: -4, log_max_ceil: 4
