In [None]:
import plotly.express as px
import pandas as pd
from pathlib import Path
from IPython.display import display
from typing import List, Dict, Any, Tuple
from pathlib import Path
import plotly.io as pio
import re

In [None]:
# Folder containing the raw ``*.logs.csv`` files read by the loading cells.
logs_path = Path("logs")
# Folder the exported figures are written to (two levels up, under report/images).
img_path = Path("..") / ".." / "report" / "images"

# Useful functions

In [None]:
def compute_time_elapsed(df: pd.DataFrame, columns_pairs: List[Tuple[str, str]]) -> pd.DataFrame:
    """Add a ``time_elapsed.<event>`` column, in seconds, for each start/end pair.

    Args:
        df: Frame holding the datetime columns named in ``columns_pairs``.
            It is modified in place.
        columns_pairs: ``(start_column, end_column)`` tuples. The event name is
            the start column with its ``"start."`` text removed.

    Returns:
        The same ``df`` object, with the new duration columns added.
    """
    for begin_col, finish_col in columns_pairs:
        duration = df[finish_col] - df[begin_col]
        elapsed_col = "time_elapsed." + begin_col.replace("start.", "")
        df[elapsed_col] = duration.dt.total_seconds()
    return df

def convert_all_pairs_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
    """Convert every ``start*`` / ``end*`` column to pandas datetimes, in place.

    The values are parsed with ``unit="s"``, i.e. as numbers of seconds.

    Args:
        df: Frame whose column labels are strings. It is modified in place.

    Returns:
        The same ``df`` object with the matching columns converted.
    """
    timestamp_columns = [label for label in df.columns if label.startswith(("start", "end"))]
    for label in timestamp_columns:
        df[label] = pd.to_datetime(df[label], unit="s")
    return df

def retrieve_start_end_pairs(df: pd.DataFrame) -> List[Tuple[str, str]]:
    """List the ``(start_column, end_column)`` name pairs found in ``df``.

    Every column whose label begins with ``"start"`` yields one pair; the end
    label is the same text with that leading ``"start"`` swapped for ``"end"``.
    The end column is not checked for existence.

    Args:
        df: Frame whose column labels are strings.

    Returns:
        The pairs, in column order. Empty when no column starts with ``"start"``.
    """
    return [
        # count=1: only the prefix is rewritten, so an event whose own name
        # contains "start" (e.g. "start.restart") keeps that name intact.
        (column, column.replace("start", "end", 1))
        for column in df.columns
        if column.startswith("start")
    ]

def dataset_for_every_events(df: pd.DataFrame, columns_pairs: List[Tuple[str, str]], name: str) -> pd.DataFrame:
    """Reshape wide start/end timestamp columns into one long table of events.

    For each ``(start_column, end_column)`` pair, the rows where both
    timestamps are present become rows of the result.

    Args:
        df: Frame holding the datetime columns named in ``columns_pairs`` and a
            ``"dataset"`` column. It is not modified.
        columns_pairs: ``(start_column, end_column)`` tuples. The event name is
            the start column with its ``"start."`` text removed.
        name: Label stored in the ``"name"`` column and appended to ``"legend"``.

    Returns:
        A frame with the columns ``start``, ``end``, ``event``, ``dataset``,
        ``time_elapsed`` (seconds), ``name``, ``legend`` and ``index`` (the row
        label in ``df``). It is empty, with those columns, when
        ``columns_pairs`` is empty.
    """
    output_columns = ["start", "end", "event", "dataset", "time_elapsed", "name", "legend", "index"]
    dfs: List[pd.DataFrame] = []
    for start_column, end_column in columns_pairs:
        event_name = start_column.replace("start.", "")
        # Work on an explicit copy: columns are added below, and they must never
        # be written onto a selection that may still be tied to ``df``.
        df_event = df[[start_column, end_column]].dropna().copy()
        df_event.columns = ["start", "end"]
        df_event["event"] = event_name
        # Assigning a Series aligns on the row labels, so each kept row gets its
        # own dataset value.
        df_event["dataset"] = df["dataset"]
        # Duration of the event itself: end minus start, in seconds.
        df_event["time_elapsed"] = (df_event["end"] - df_event["start"]).dt.total_seconds()
        df_event["name"] = name
        df_event["legend"] = f"{event_name} - {name}"
        df_event["index"] = df_event.index
        dfs.append(df_event)
    if not dfs:
        # pd.concat() raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=output_columns)
    return pd.concat(dfs)

def align_start_times(diff_time: float, df: pd.DataFrame) -> pd.DataFrame:
    """Shift every ``start*`` / ``end*`` column of ``df`` by ``diff_time`` seconds.

    Args:
        diff_time: Offset in seconds; a negative value moves timestamps earlier.
        df: Frame whose column labels are strings. It is modified in place.

    Returns:
        The same ``df`` object with the matching columns shifted.
    """
    shifted_columns = [label for label in df.columns if label.startswith(("start", "end"))]
    for label in shifted_columns:
        df[label] = df[label] + pd.Timedelta(seconds=diff_time)
    return df

def convert_name_to_human_friendly(name: str) -> str:
    """Build the display label of an actor from a dot-separated log name.

    The last dot-separated token is discarded, then the label is chosen from
    the first of ``"worker"``, ``"server"``, ``"standalone"`` found in ``name``:

    * worker: ``"Worker <last token> (<token 2>, <token 1>)"``
    * server: ``"Server (<token 2>, <token 1>)"``
    * standalone: ``"Standalone (<token 0>)"``

    Args:
        name: Dot-separated name, e.g. the stem of a log file.

    Returns:
        The human-friendly label.

    Raises:
        ValueError: If ``name`` mentions none of the three actor kinds.
        IndexError: If ``name`` has fewer tokens than the chosen label needs.
    """
    args = name.split(".")
    args.pop()  # drop the trailing token (e.g. "logs" when given a file stem)
    if "worker" in name:
        return f"Worker {args[-1]} ({args[2]}, {args[1]})"
    if "server" in name:
        return f"Server ({args[2]}, {args[1]})"
    if "standalone" in name:
        return f"Standalone ({args[0]})"
    # Fail loudly instead of falling through and returning None, which would
    # end up as a missing actor label in the data.
    raise ValueError(f"Cannot derive an actor label from log name {name!r}")

# Load log files

In [None]:
# Worker logs match "*.*.*.worker.*.logs.csv". Read from the end, the stem tokens
# are: ..., world_size, dataset, "worker", worker_id, "logs".
# Files are processed in ascending worker-id order.
workers_files = list(logs_path.glob("*.*.*.worker.*.logs.csv"))
workers_files.sort(key=lambda x: int(x.stem.split(".")[-2]))
print(workers_files)

workers_events_dfs: List[pd.DataFrame] = []
workers_dfs: List[pd.DataFrame] = []
for log in workers_files:
    # Parse all identifiers from the stem, the same string used by the sort key
    # and by convert_name_to_human_friendly(). Indexing the full filename with
    # [-2] returned the "logs" token instead of the worker id.
    stem_tokens = log.stem.split(".")
    world_size = stem_tokens[-5]
    dataset = stem_tokens[-4]
    worker = stem_tokens[-2]
    with open(log) as f:
        print(log)
        df = pd.read_csv(f)
    name = convert_name_to_human_friendly(log.stem)
    df["dataset"] = f"{dataset} ({world_size})"
    df["worker"] = worker
    df["log"] = name
    df["world_size"] = world_size
    df = convert_all_pairs_to_datetime(df)
    workers_dfs.append(df)
    workers_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), name))

# Wide table (one row per worker and epoch) plus per-event durations.
workers_df = pd.concat(workers_dfs)
workers_events_pairs = retrieve_start_end_pairs(workers_df)
print(workers_events_pairs)
workers_df = compute_time_elapsed(workers_df, workers_events_pairs)
# Long table (one row per worker, epoch and event).
workers_events_df = pd.concat(workers_events_dfs)

display(workers_events_df)
display(workers_df)

In [None]:
# Server logs match "*.*.*.server.logs.csv"; they are processed in ascending
# order of the second stem token (used below as the world size).
server_files = sorted(
    logs_path.glob("*.*.*.server.logs.csv"),
    key=lambda x: int(x.stem.split(".")[1]),
)
print(server_files)

server_events_dfs: List[pd.DataFrame] = []
server_dfs = []
for log in server_files:
    path_tokens = str(log).split(".")
    dataset, world_size = path_tokens[-4], path_tokens[-5]
    with open(log) as f:
        df = pd.read_csv(f)
    name = convert_name_to_human_friendly(log.stem)
    df["dataset"] = f"{dataset} ({world_size})"
    df["world_size"] = world_size
    df["log"] = name
    # The log has no explicit end for "recv_data": use the start of the
    # gradient aggregation as its end timestamp.
    df["end.recv_data"] = df["start.agg_gradients"]
    df = convert_all_pairs_to_datetime(df)

    if server_dfs:
        # Shift every run after the first so that it starts at the same instant
        # as the first server run.
        start_time_server: pd.Timestamp = server_dfs[0]["start.epoch"].min()
        start_time_standalone: pd.Timestamp = df["start.epoch"].min()
        diff_time = start_time_server - start_time_standalone
        print(f"diff_time: {diff_time}")
        df = align_start_times(diff_time.total_seconds(), df)

    server_dfs.append(df)
    server_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), name))

# Wide table (one row per server run and epoch) plus per-event durations.
server_df = pd.concat(server_dfs)
server_events_pairs = retrieve_start_end_pairs(server_df)
print(server_events_pairs)
server_df = compute_time_elapsed(server_df, server_events_pairs)

# Long table (one row per server run, epoch and event).
server_events_df = pd.concat(server_events_dfs)

display(server_events_df)
display(server_df)

In [None]:
# Payload sizes taken from the first logged row of the server / worker tables.
data_size = server_df["size.data"].iat[0]
feedback_size = server_df["size.feedback"].iat[0]
model_size = workers_df["size.model"].iat[0]

print(f"Data size: {data_size:.2f}MB")
print(f"Feedback size: {feedback_size:.2f}MB")
print(f"Model size: {model_size:.2f}MB")

In [None]:
# Standalone logs match "*.standalone.logs.csv"; the first stem token is the dataset.
logs_standalones = list(logs_path.glob("*.standalone.logs.csv"))

standalone_dfs = []
standalone_events_dfs = []
for log in logs_standalones:
    dataset = log.stem.split(".")[0]
    with open(log) as f:
        df = pd.read_csv(f)
    name = convert_name_to_human_friendly(log.stem)
    df["dataset"] = f"{dataset}"
    df["log"] = name
    df = convert_all_pairs_to_datetime(df)

    # Shift the standalone run so that it starts at the earliest server epoch.
    # The reference is the whole server table. The per-dataset filter that used
    # to be computed here was never used, and it could not match anyway: server
    # "dataset" values have the form "<dataset> (<world_size>)".
    start_time_server: pd.Timestamp = server_df["start.epoch"].min()
    start_time_standalone: pd.Timestamp = df["start.epoch"].min()
    diff_time = start_time_server - start_time_standalone
    print(f"diff_time: {diff_time}")
    # align_start_times() shifts ``df`` in place and returns it.
    df = align_start_times(diff_time.total_seconds(), df)

    standalone_dfs.append(df)
    standalone_events_dfs.append(dataset_for_every_events(df, retrieve_start_end_pairs(df), name))

# Wide table (one row per standalone run and epoch) plus per-event durations.
standalone_df = pd.concat(standalone_dfs)
standalone_events_pairs = retrieve_start_end_pairs(standalone_df)
print(standalone_events_pairs)
standalone_df = compute_time_elapsed(standalone_df, standalone_events_pairs)

# Long table (one row per standalone run, epoch and event).
standalone_events_df = pd.concat(standalone_events_dfs)

display(standalone_df)
display(standalone_events_df)

In [None]:
# Stack the standalone, worker and server tables (in that order) into one long
# per-event table and one wide per-epoch table.
event_tables = [standalone_events_df, workers_events_df, server_events_df]
epoch_tables = [standalone_df, workers_df, server_df]
all_events_df = pd.concat(event_tables)
all_df = pd.concat(epoch_tables)
display(all_events_df, all_df)

In [None]:
# Column name -> axis/legend text used by the plots.
labels = {
    "epoch": "Epoch",
    "mean_d_loss": "Mean Discriminator Loss",
    "mean_g_loss": "Mean Generator Loss",
    "fid": "FID",
    "is": "IS",
    "time_elapsed.epoch_calculation": "Epoch duration (s)",
    "start.epoch": "Epoch start time",
    "log": "Actor",
    "cumsum": "Time elapsed (s)",
    "world_size": "Number of workers",
    "dataset": "Dataset",
}
# Raw loss columns -> names used as series labels in the loss plots.
rename_columns = {
    "mean_d_loss": "Discriminator Loss",
    "mean_g_loss": "Generator Loss",
}
standalone_df = standalone_df.rename(columns=rename_columns)
all_df = all_df.rename(columns=rename_columns)
# Standalone and server runs side by side, with the same renamed loss columns.
standalone_server_df = pd.concat([standalone_df, server_df]).rename(columns=rename_columns)

display(all_df, all_events_df, standalone_server_df)

# Plots

In [None]:
# Shared figure dimensions, passed as ``width`` / ``height`` to every plot.
WIDTH, HEIGHT = 1200, 600
# Default plotly template for all figures created from here on.
pio.templates.default = "plotly_white"

In [None]:
# Rows without a world size (the standalone runs) get 0, then the column is
# cast to integers. Each table is updated in place.
for frame in (all_df, server_df, workers_df):
    missing_world_size = frame["world_size"].isna().to_numpy()
    frame.iloc[missing_world_size, frame.columns.get_loc("world_size")] = 0
    frame["world_size"] = frame["world_size"].astype(int)

In [None]:
# Axis/legend text for the size plots of this cell only. It is kept under its
# own name so that the general ``labels`` mapping defined earlier (FID, IS,
# epoch duration, ...) is not overwritten before the plots that rely on it.
size_labels = {
    "epoch": "Epoch",
    "size.sent": "Size sent (MB)",
    "size.recv": "Size received (MB)",
    "log": "Actor",
    "dataset": "Dataset",
    "world_size": "Number of workers",
}

all_df_size = all_df[["log", "epoch", "size.sent", "size.recv", "dataset"]]
all_df_size = all_df_size.dropna()
px.line(all_df_size, x="epoch", y=["size.sent", "size.recv"], color="log", title="Size sent and received per epoch", width=WIDTH, height=HEIGHT).show()


def _plot_mean_sizes(actor_df: pd.DataFrame, title: str, filename: str):
    """Plot the mean sent/received size per world size, save it and show it.

    Args:
        actor_df: Table with ``epoch``, ``size.sent``, ``size.recv`` and
            ``world_size`` columns.
        title: Figure title.
        filename: Name of the image written under ``img_path``.

    Returns:
        ``(mean_sizes, figure)``: the aggregated table and the plotly figure.
    """
    mean_sizes = (
        actor_df[["epoch", "size.sent", "size.recv", "world_size"]]
        .groupby(["world_size"])
        .mean()
        .reset_index()
        .sort_values(by="size.sent")
    )

    figure = px.line(mean_sizes, x="world_size", y=["size.sent", "size.recv"], title=title, color="world_size", width=WIDTH, height=HEIGHT, labels=size_labels, markers=True)
    figure.update_traces(marker=dict(size=10), textposition='top right')
    for trace in figure.data:
        for x_value, y_value in zip(trace.x, trace.y):
            # Label each marker with its own value (previously every marker of
            # a trace showed the trace's first value).
            figure.add_annotation(x=x_value, y=y_value, text=f"{y_value:.2f}", showarrow=False, xshift=-15, yshift=15)
    # Overlay the default (per-variable) lines on top of the markers.
    overlay = px.line(mean_sizes, x="world_size", y=["size.sent", "size.recv"], title=title, width=WIDTH, height=HEIGHT, labels=size_labels)
    figure.add_traces(overlay.data)
    figure.write_image(str(img_path / filename))
    figure.show()
    return mean_sizes, figure


server_all_sizes, f = _plot_mean_sizes(server_df, "Size sent and received per epoch (Server)", "size_sent_recv_server.png")
workers_all_sizes, f = _plot_mean_sizes(workers_df, "Size sent and received per epoch (Workers)", "size_sent_recv_workers.png")

In [None]:
# Epoch threshold applied to the "cropped" variants of the plots.
MAX_EPOCHS_CROP = 5_000

In [None]:
px.line(standalone_df, x="epoch", y=["Discriminator Loss", "Generator Loss"], title="Losses standalone", labels=labels, width=WIDTH, height=HEIGHT).show()
px.line(all_df, x="epoch", y=["Discriminator Loss"], color="log", title="Losses discriminators", labels=labels, width=WIDTH, height=HEIGHT).show()

# Actors kept in the "filtered" curves.
selected_log = [
    "Standalone (CIFAR10)",
    "Server (CIFAR10, 4)",
    "Server (CIFAR10, 10)",
]
# Only epochs that are multiples of this step are kept in the "filtered" curves.
FILTERED_EPOCH_STEP = 900


def _plot_metric_variants(metric: str, title: str) -> None:
    """Plot one quality metric against the epoch in four variants.

    Each variant is written to ``img_path`` and shown, in this order:
    ``<metric>.png`` (all actors), ``<metric>_4_10_standalone.png`` (10-worker
    server and standalone only), ``<metric>_cropped.png`` (epochs below
    ``MAX_EPOCHS_CROP``) and ``<metric>_filtered.png`` (actors in
    ``selected_log``, one epoch out of ``FILTERED_EPOCH_STEP``).

    Args:
        metric: Column of ``all_df`` to plot (e.g. ``"fid"``).
        title: Title shared by the four figures.
    """
    metric_df = all_df[["epoch", "log", metric]].dropna()

    pair_only = metric_df[metric_df["log"].isin(["Server (CIFAR10, 10)", "Standalone (CIFAR10)"])]
    cropped = metric_df[metric_df["epoch"] < MAX_EPOCHS_CROP]
    filtered = metric_df[metric_df["log"].isin(selected_log)]
    filtered = filtered[filtered["epoch"] % FILTERED_EPOCH_STEP == 0]

    variants = [
        (metric_df, f"{metric}.png"),
        (pair_only, f"{metric}_4_10_standalone.png"),
        (cropped, f"{metric}_cropped.png"),
        (filtered, f"{metric}_filtered.png"),
    ]
    for variant_df, filename in variants:
        figure = px.line(variant_df, x="epoch", y=metric, color="log", title=title, labels=labels, width=WIDTH, height=HEIGHT)
        figure.write_image(img_path / filename)
        figure.show()


_plot_metric_variants("fid", "Fréchet Inception Distance")
_plot_metric_variants("is", "Inception Score")

In [None]:
# Mean epoch duration of the standalone run (no outlier removal).
avg_time_epoch_calculation_standalone = standalone_df["time_elapsed.epoch_calculation"].mean()
print(f"Average time epoch calculation standalone: {avg_time_epoch_calculation_standalone:.5f}s")

# Despite its name, this list collects the outlier-free *server* tables; they
# are concatenated with the standalone table after the loop.
standalone_df_without_outliers_dfs = []
avg_time_per_epoch = {
    "Standalone": avg_time_epoch_calculation_standalone,
}
# World size of each entry of ``avg_time_per_epoch``, in the same order
# (0 for the standalone run).
world_sizes = [0]
for server in server_df["log"].unique():
    print()
    df = server_df[server_df["log"] == server].copy()
    world_sizes.append(int(df["world_size"].iloc[0]))
    # Remove outliers: keep epochs strictly between the 5th and 95th percentile.
    # Both bounds come from the same unfiltered durations; computing the lower
    # bound after the upper cut would shift it.
    epoch_durations = df["time_elapsed.epoch_calculation"]
    lower_bound = epoch_durations.quantile(0.05)
    upper_bound = epoch_durations.quantile(0.95)
    df = df[(epoch_durations > lower_bound) & (epoch_durations < upper_bound)]
    standalone_df_without_outliers_dfs.append(df)

    avg_time_epoch_calculation_server = df["time_elapsed.epoch_calculation"].mean()
    print(f"Average time epoch calculation server ({server}): {avg_time_epoch_calculation_server:.5f}s")
    ratio = avg_time_epoch_calculation_server / avg_time_epoch_calculation_standalone
    print(f"Ratio: {ratio:.2f}, the server {server} is {ratio:.2f} times slower than the standalone")
    avg_time_per_epoch[server] = avg_time_epoch_calculation_server
standalone_df_without_outliers = pd.concat([standalone_df, *standalone_df_without_outliers_dfs])
df_avg_time_per_epoch = pd.DataFrame(avg_time_per_epoch.items(), columns=["Actor", "Average time per epoch (s)"])
df_avg_time_per_epoch["world_size"] = world_sizes

In [None]:
# Running total of the epoch durations, computed separately for each actor and
# stored in the "cumsum" column.
for log in standalone_server_df["log"].unique():
    is_actor = standalone_server_df["log"] == log
    actor_durations = standalone_server_df[is_actor]["time_elapsed.epoch_calculation"]
    standalone_server_df.loc[is_actor, "cumsum"] = actor_durations.cumsum()

In [None]:
# Show the per-actor average epoch duration table as this cell's output.
df_avg_time_per_epoch

In [None]:
# Views restricted to epochs up to MAX_EPOCHS_CROP, used by the "cropped" figures.
standalone_server_df_cropped = standalone_server_df[standalone_server_df["epoch"] <= MAX_EPOCHS_CROP]
standalone_df_without_outliers_cropped = standalone_df_without_outliers[standalone_df_without_outliers["epoch"] <= MAX_EPOCHS_CROP]

# (table, y column, title, output file) of each per-epoch line plot, in display order.
epoch_line_plots = [
    (standalone_server_df, "time_elapsed.epoch_calculation", "Epoch duration", "epoch_duration.png"),
    (standalone_server_df_cropped, "time_elapsed.epoch_calculation", "Epoch duration", "epoch_duration_cropped.png"),
    (standalone_server_df, "cumsum", "Epoch achieve as the time pass", "epoch_start_time.png"),
    (standalone_server_df_cropped, "cumsum", "Epoch achieve as the time pass", "epoch_start_time_cropped.png"),
    (standalone_df_without_outliers, "time_elapsed.epoch_calculation", "Epoch duration (without outliers)", "epoch_duration_without_outliers.png"),
    (standalone_df_without_outliers_cropped, "time_elapsed.epoch_calculation", "Epoch duration (without outliers)", "epoch_duration_without_outliers_cropped.png"),
]
for plot_df, y_column, plot_title, plot_file in epoch_line_plots:
    f = px.line(plot_df, x="epoch", y=y_column, color="log", title=plot_title, labels=labels, width=WIDTH, height=HEIGHT)
    f.write_image(img_path / plot_file)
    f.show()

# Text shown next to each bar / marker: the average formatted as "<x.xx>s".
avg_time_text = df_avg_time_per_epoch["Average time per epoch (s)"].apply(lambda x: f"{x:.2f}s")

f = px.bar(df_avg_time_per_epoch, x="Actor", y="Average time per epoch (s)", title="Average time per epoch (s)", text=avg_time_text, width=WIDTH, height=HEIGHT, text_auto=True, color="Actor")
f.write_image(img_path / "average_time_per_epoch.png")
f.show()

# Average epoch duration against the number of workers: one marker per actor,
# plus a half-transparent line joining all of them.
f = px.line(df_avg_time_per_epoch, x="world_size", y="Average time per epoch (s)", text=avg_time_text, color="Actor", title="Average time per epoch (s)", labels=labels, width=WIDTH, height=HEIGHT, markers=True)
f.update_traces(marker=dict(size=10), textposition='top right')
f2 = px.line(df_avg_time_per_epoch, x="world_size", y="Average time per epoch (s)", title="Average time per epoch (s)", width=WIDTH, height=HEIGHT)
f2.data[0].update(opacity=0.5)
f.add_trace(f2.data[0])
f.write_image(img_path / "average_time_per_epoch_relation.png")
f.show()

In [None]:
# Standalone events shown in the bar chart.
selected_columns = [
    "generate_data",
    "discriminator_train",
    "generator_train",
]
# Event name -> text shown on the x axis.
rename_events = {
    "generate_data": "Generate data",
    "calc_gradients": "Perform optimization step",
    "discriminator_train": "Perform an optimization step on the discriminator",
    "generator_train": "Perform an optimization step on the generator",
}
labels = {
    "time_elapsed": "Average time elapsed (s)",
    "event": "Event",
    "dataset": "Dataset",
}

# Mean duration of each event per dataset, sorted by ascending duration.
mean_time_elapsed = (
    standalone_events_df[["event", "dataset", "time_elapsed"]]
    .groupby(["event", "dataset"])
    .mean()
    .sort_values(by="time_elapsed")
    .reset_index()
)
mean_time_elapsed = mean_time_elapsed[mean_time_elapsed["event"].isin(selected_columns)].reset_index(drop=True)
mean_time_elapsed["event"] = mean_time_elapsed["event"].replace(rename_events)
# The chart shows standalone data and is saved as "..._standalone.png"; the
# title previously said "(Server)".
f = px.bar(mean_time_elapsed, x="event", y="time_elapsed", color="dataset", title="Mean time elapsed per operations (Standalone)", labels=labels, width=WIDTH, height=HEIGHT, text_auto=True)
f.write_image(img_path / "mean_time_elapsed_bar_standalone.png")
f.show()

In [None]:
# Server events shown in the bar charts.
selected_columns = [
    "generate_data",
    "send_data",
    "recv_data",
    "agg_gradients",
    "calc_gradients",
    "swap"
]
# Event name -> text shown on the x axis.
rename_events = {
    "generate_data": "Generate data",
    "send_data": "Send data to workers",
    "recv_data": "Receive data from workers",
    "agg_gradients": "Aggregate gradients",
    "calc_gradients": "Perform optimization step",
    "swap": "Send swap instruction to workers",
}
labels = {
    "time_elapsed": "Average time elapsed (s)",
    "event": "Event",
    "dataset": "Dataset",
}

# Split each server run's events into "outlier" and "normal" epochs. An epoch
# is an outlier when its "epoch_calculation" duration lies below the 5th or
# above the 95th percentile of that run.
outliers_server_epochs_dfs = []
normal_server_epochs_dfs = []
for server in server_events_df["name"].unique():
    df = server_events_df[server_events_df["name"] == server]
    event_df = df[df["event"] == "epoch_calculation"]
    low_cut = event_df["time_elapsed"].quantile(0.05)
    high_cut = event_df["time_elapsed"].quantile(0.95)
    is_outlier = (event_df["time_elapsed"] < low_cut) | (event_df["time_elapsed"] > high_cut)
    outliers_epochs = event_df[is_outlier]["index"]
    in_outlier_epoch = df["index"].isin(outliers_epochs)
    outliers_server_epochs_dfs.append(df[in_outlier_epoch])
    normal_server_epochs_dfs.append(df[~in_outlier_epoch])
outliers_server_epochs_df = pd.concat(outliers_server_epochs_dfs)
normal_server_epochs_df = pd.concat(normal_server_epochs_dfs)

# One bar chart per subset: mean duration of the selected events per dataset.
server_bar_charts = (
    (normal_server_epochs_df, "Mean time elapsed per operations (Server)", "mean_time_elapsed_bar_server.png"),
    (outliers_server_epochs_df, "Outliers - mean time elapsed per operations (Server)", "mean_time_elapsed_bar_server_outliers.png"),
)
for epochs_subset, bar_title, bar_file in server_bar_charts:
    mean_time_elapsed = (
        epochs_subset[["event", "dataset", "time_elapsed"]]
        .groupby(["event", "dataset"])
        .mean()
        .sort_values(by="time_elapsed")
        .reset_index()
    )
    mean_time_elapsed = mean_time_elapsed[mean_time_elapsed["event"].isin(selected_columns)].reset_index(drop=True)
    mean_time_elapsed["event"] = mean_time_elapsed["event"].replace(rename_events)
    f = px.bar(mean_time_elapsed, x="event", y="time_elapsed", color="dataset", title=bar_title, labels=labels, width=WIDTH, height=HEIGHT, text_auto=True)
    f.write_image(img_path / bar_file)
    f.show()

In [None]:
# Worker events shown in the bar charts.
selected_columns = [
    "recv_data",
    "calc_gradients",
    "send",
    "swap_recv_instructions",
    "swap_send",
    "swap_recv",
    "load_state_dict",
]
# Event name -> text shown on the x axis.
rename_events = {
    "recv_data": "Receive data from server",
    "calc_gradients": "Perform optimization step",
    "send": "Send gradients to server",
    "swap_recv_instructions": "Receive swap instructions from server",
    "swap_send": "Send discriminator weights to other worker",
    "swap_recv": "Receive discriminator weights from other worker",
    "load_state_dict": "Load received model (swap)",
}
labels = {
    "time_elapsed": "Average time elapsed (s)",
    "event": "Event",
    "dataset": "Dataset",
}

# Split the worker events of each dataset value into "outlier" and "normal"
# epochs. An epoch is an outlier when its "epoch" event duration lies below the
# 5th or above the 95th percentile within that dataset value.
outliers_workers_epochs_dfs = []
normal_workers_epochs_dfs = []
for worker in workers_events_df["dataset"].unique():
    df = workers_events_df[workers_events_df["dataset"] == worker]
    event_df = df[df["event"] == "epoch"]
    low_cut = event_df["time_elapsed"].quantile(0.05)
    high_cut = event_df["time_elapsed"].quantile(0.95)
    is_outlier = (event_df["time_elapsed"] < low_cut) | (event_df["time_elapsed"] > high_cut)
    outliers_epochs = event_df[is_outlier]["index"]
    in_outlier_epoch = df["index"].isin(outliers_epochs)
    outliers_workers_epochs_dfs.append(df[in_outlier_epoch])
    normal_workers_epochs_dfs.append(df[~in_outlier_epoch])
outliers_workers_epochs_df = pd.concat(outliers_workers_epochs_dfs)
normal_workers_epochs_df = pd.concat(normal_workers_epochs_dfs)

# One bar chart per subset: mean duration of the selected events per dataset.
workers_bar_charts = (
    (normal_workers_epochs_df, "Mean time elapsed per operations (Workers)", "mean_time_elapsed_bar_workers.png"),
    (outliers_workers_epochs_df, "Outliers - mean time elapsed per operations (Worker)", "mean_time_elapsed_bar_workers_outliers.png"),
)
for epochs_subset, bar_title, bar_file in workers_bar_charts:
    mean_time_elapsed = (
        epochs_subset[["event", "dataset", "time_elapsed"]]
        .groupby(["event", "dataset"])
        .mean()
        .sort_values(by="time_elapsed")
        .reset_index()
    )
    mean_time_elapsed = mean_time_elapsed[mean_time_elapsed["event"].isin(selected_columns)].reset_index(drop=True)
    mean_time_elapsed["event"] = mean_time_elapsed["event"].replace(rename_events)
    f = px.bar(mean_time_elapsed, x="event", y="time_elapsed", color="dataset", title=bar_title, labels=labels, width=WIDTH, height=HEIGHT, text_auto=True)
    f.write_image(img_path / bar_file)
    f.show()

In [None]:
# Number of epochs covered by the multi-epoch timeline.
MAX_EPOCHS = 10
# Events drawn on the timelines.
events = [
    "generate_data",
    "send_data",
    "recv_data",
    "calc_gradients",
    "apply_gradients",
    "swap",
    "fid",
    "is",
    "send",
]
# Event name -> row label on the timeline.
rename_events = {
    "generate_data": "(Standalone/Server) Generate data",
    "send_data": "(Server) Send data to workers",
    "recv_data": "(Server/Worker) Receive feedbacks/data from workers/server",
    "apply_gradients": "(Server) Perform optimization step",
    "swap": "Send swap instruction to workers",
    "calc_gradients": "(Standalone/Worker) Perform optimization step / (Server) Aggregate gradients",
    "send": "(Worker) Send gradients to server",
    "fid": "(Standalone/Server) Calculate FID",
    "is": "(Standalone/Server) Calculate IS",
}
labels = {
    "event": "Event",
}
selected_dataset = "CIFAR10 (4)"

# Keep the selected events, relabel them, then keep the selected dataset only.
is_timeline_event = all_events_df["event"].isin(events)
all_events_df_timeline = all_events_df[is_timeline_event].copy()
all_events_df_timeline["event"] = all_events_df_timeline["event"].replace(rename_events)
all_events_df_timeline = all_events_df_timeline[all_events_df_timeline["dataset"] == selected_dataset]

# (number of epochs, title, output file) of each timeline. Every window starts
# right after row index 10.
timeline_windows = (
    (MAX_EPOCHS, f"Timeline of events for {selected_dataset} - {MAX_EPOCHS} epochs", "timeline_10.png"),
    (1, f"Timeline of events for {selected_dataset} - one epoch", "timeline_1.png"),
)
for window_length, timeline_title, timeline_file in timeline_windows:
    row_index = all_events_df_timeline["index"]
    in_window = (row_index > 10) & (row_index <= 10 + window_length)
    timeline_start = px.timeline(
        all_events_df_timeline[in_window],
        x_start="start",
        x_end="end",
        color="name",
        y="event",
        opacity=0.5,
        template="plotly_white",
        labels=labels,
        width=WIDTH*2,
        height=HEIGHT,
        title=timeline_title,
    )
    timeline_start.write_image(img_path / timeline_file)
    timeline_start.show()