In [163]:
import re
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from textwrap import indent

In [164]:
# Directory that holds the run logs analysed by this notebook.
LOG_DIR = Path("logs/runs/exp2")

# Token for one non-negative floating-point number, captured as a single group.
# Accepts "12", "12.", "12.5", ".5" and an optional exponent ("1.5e-05"), so a
# value printed in scientific notation is captured whole instead of being cut
# off at the "e" (the former "[0-9.]+" token captured "1.5" from "1.5e-05").
_FLOAT = r"((?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][+-]?[0-9]+)?)"

# Metric name -> compiled regex. Every regex has exactly one capturing group,
# so `findall` returns the matched numbers as plain strings.
PATTERNS = {
    "wall": re.compile(r"CORE::TIME::WALL\s+" + _FLOAT),
    "scatter": re.compile(r"CORE::TIME::SCATTER\s+" + _FLOAT),
    "gather": re.compile(r"CORE::TIME::GATHER\s+" + _FLOAT),
    "phases": re.compile(r"CORE::PHASES\s+([0-9]+)"),
    "edges_streamed": re.compile(r"CORE::BYTES::EDGES_STREAMED\s+([0-9]+)"),
    "updates_in": re.compile(r"CORE::BYTES::UPDATES_IN\s+([0-9]+)"),
    "updates_out": re.compile(r"CORE::BYTES::UPDATES_OUT\s+([0-9]+)")
}

def parse_log(path: Path, patterns: "dict | None" = None) -> dict:
    """Extract the summed metrics from one log file.

    Parameters
    ----------
    path
        Log file to read; a ``str`` path is accepted as well as a ``Path``.
    patterns
        Mapping of metric name -> compiled regex with one capturing group.
        Defaults to the module-level ``PATTERNS`` table.

    Returns
    -------
    dict
        ``{"log": <file name>, <metric>: <value>, ...}``. A metric's value is
        the sum (as ``float``) of every match of its regex in the file, or
        ``None`` when the regex does not match at all.

    Raises
    ------
    OSError
        If the file cannot be read.
    ValueError
        If a captured token cannot be converted by ``float``.
    """
    if patterns is None:
        patterns = PATTERNS

    path = Path(path)
    # Decode explicitly and tolerate stray bytes: one undecodable byte in the
    # log should not abort the whole analysis.
    text = path.read_text(encoding="utf-8", errors="replace")

    out = {"log": path.name}
    for key, pat in patterns.items():
        matches = pat.findall(text)
        # Every occurrence of a metric is added up; absent metrics stay None.
        out[key] = sum(float(x) for x in matches) if matches else None
    return out


In [165]:
# graph = "cit-Patents"
# Components of the log file name; "*" is a glob wildcard (match anything).
INPUT_GRAPH = "*"
ALGORITHM = "*"
RUN_ID = "*"

PATTERN = f"{INPUT_GRAPH}-{ALGORITHM}-{RUN_ID}.log"
# PATTERN = f"soc-LiveJournal1-*k32-r0.log"

logs = sorted(LOG_DIR.glob(PATTERN))
if not logs:
    # Without this guard an empty match list yields an empty DataFrame and the
    # first column access below fails with an uninformative KeyError('phases').
    raise FileNotFoundError(f"No log files matching {PATTERN!r} under {LOG_DIR}")

records = [parse_log(p) for p in logs]
df = pd.DataFrame(records)

EDGE_REC_BYTES = 12  # type=1 COMPACT (@IIf)
UPDATE_REC_BYTES = 8

# Phase count as a nullable integer (keeps missing values instead of failing).
df["# iters"] = df["phases"].astype("Int64")

# Time spent in the scatter and gather steps, and wall time relative to it.
df["streaming_time"] = df["scatter"] + df["gather"]
df["ratio"] = df["wall"] / df["streaming_time"]

# Byte totals converted to record counts using the record sizes above.
df["edges_streamed_cnt"] = df["edges_streamed"] / EDGE_REC_BYTES
df["updates_out_cnt"] = df["updates_out"] / UPDATE_REC_BYTES
# 100 * (1 - update records written / edge records streamed).
df["wasted %"] = 100 * (1 - (df["updates_out_cnt"] / df["edges_streamed_cnt"]))

df

Unnamed: 0,log,wall,scatter,gather,phases,edges_streamed,updates_in,updates_out,# iters,streaming_time,ratio,edges_streamed_cnt,updates_out_cnt,wasted %
0,soc-LiveJournal1-wcc-mem128m-r0.log,146.298,124.147,21.383,13.0,17927340000.0,5103943000.0,5103943000.0,13,145.53,1.005277,1493945000.0,637992817.0,57.294752
1,soc-LiveJournal1-wcc-mem1g-r0.log,119.243,103.748,13.5582,13.0,17927340000.0,5103943000.0,5103943000.0,13,117.3062,1.016511,1493945000.0,637992817.0,57.294752
2,soc-LiveJournal1-wcc-mem256m-r0.log,131.342,115.915,14.2702,13.0,17927340000.0,5103943000.0,5103943000.0,13,130.1852,1.008886,1493945000.0,637992817.0,57.294752
3,soc-LiveJournal1-wcc-mem2g-r0.log,107.882,92.2,10.8968,13.0,17927340000.0,5103943000.0,5103943000.0,13,103.0968,1.046415,1493945000.0,637992817.0,57.294752
4,soc-LiveJournal1-wcc-mem4g-r0.log,104.438,85.26,10.2867,13.0,17927340000.0,5103943000.0,5103943000.0,13,95.5467,1.093057,1493945000.0,637992817.0,57.294752
5,soc-LiveJournal1-wcc-mem512m-r0.log,115.118,99.586,14.3921,13.0,17927340000.0,5103943000.0,5103943000.0,13,113.9781,1.010001,1493945000.0,637992817.0,57.294752
6,soc-LiveJournal1-wcc-mem64m-r0.log,240.746,181.552,37.1452,13.0,17927340000.0,5103943000.0,5103943000.0,13,218.6972,1.100819,1493945000.0,637992817.0,57.294752


In [166]:
# # Experiment 1b
# df_wcc = df[df["log"].str.contains("-wcc-")]
# df_wcc

In [None]:
# # Experiment 1

# GRAPH_NAMES = [
#     "amazon0601",
#     "cit-Patents",
#     "dimacs-usa",
#     "soc-LiveJournal1",
# ]

# RUN_RE = re.compile(r"-r(?P<run>\d+)\.log$")

# def split_log_name(filename: str) -> dict:
#     # find graph by prefix match (graph names may include '-')
#     graph = None
#     for g in GRAPH_NAMES:
#         prefix = g + "-"
#         if filename.startswith(prefix):
#             graph = g
#             rest = filename[len(prefix):]  # e.g. "wcc-r0.log"
#             break
#     if graph is None:
#         raise ValueError(f"Unknown graph prefix in filename: {filename}")

#     m = RUN_RE.search(filename)
#     if not m:
#         raise ValueError(f"Cannot parse run id from filename: {filename}")
#     run = int(m.group("run"))

#     alg = rest[: rest.rfind(f"-r{run}.log")]

#     return {"graph": graph, "algorithm": alg, "run": run}

# meta = df["log"].apply(split_log_name).apply(pd.Series)
# df = pd.concat([df, meta], axis=1)

# df["run"] = df["run"].astype(int)

# METRICS = [
#     "wall",
#     "scatter",
#     "gather",
#     "streaming_time",
#     "ratio",
#     "wasted %",
#     "# iters",
#     "edges_streamed_cnt",
#     "updates_out_cnt",
# ]

# summary = (
#     df.groupby(["graph", "algorithm"], dropna=False)[METRICS]
#       .agg(["count", "mean", "std"])
#       .reset_index()
# )

# summary

In [None]:
# summary_wcc = summary[summary["algorithm"].str.contains("wcc")]
# with pd.option_context(
#     "display.max_rows", None,
#     "display.max_columns", None,
#     "display.max_colwidth", None,
# ):
#     display(summary_wcc)

In [None]:
# # Experiment 1 (pretty)

# def mean_pm_sd(s: pd.Series) -> str:
#     s = s.dropna()
#     if len(s) == 0:
#         return ""
#     if len(s) == 1:
#         return f"{s.iloc[0]:.4g}"
#     return f"{s.mean():.4g} ± {s.std(ddof=1):.2g}"

# pretty = (
#     df.groupby(["graph", "algorithm"])
#       .agg(
#           n=("wall", lambda x: x.notna().sum()),
#           wall_s=("wall", mean_pm_sd),
#           streaming_s=("streaming_time", mean_pm_sd),
#           wasted_s=("wasted %", mean_pm_sd),
#           iters_s=("# iters", mean_pm_sd),
#           ratio_s=("ratio", mean_pm_sd),
#       )
#       .reset_index()
#       .sort_values(["graph", "algorithm"])
# )

# pretty


In [None]:
# # Experiment 1: Comparison with Paper

# paper_memory = {
#     ("amazon0601", "wcc"): 0.61,
#     ("amazon0601", "scc"): 1.12,
#     ("amazon0601", "sssp"): 0.83,
#     ("amazon0601", "mcst"): 0.37,
#     ("amazon0601", "mis"): 3.31,
#     ("amazon0601", "cond"): 0.07,
#     ("amazon0601", "spmv"): 0.09,
#     ("amazon0601", "pagerank"): 0.25,

#     ("cit-Patents", "wcc"): 2.98,
#     ("cit-Patents", "scc"): 0.69,
#     ("cit-Patents", "sssp"): 0.29,
#     ("cit-Patents", "mcst"): 2.35,
#     ("cit-Patents", "mis"): 3.72,
#     ("cit-Patents", "cond"): 0.19,
#     ("cit-Patents", "spmv"): 0.19,
#     ("cit-Patents", "pagerank"): 0.74,

#     ("soc-LiveJournal1", "wcc"): 7.22,
#     ("soc-LiveJournal1", "scc"): 11.12,
#     ("soc-LiveJournal1", "sssp"): 9.60,
#     ("soc-LiveJournal1", "mcst"): 7.66,
#     ("soc-LiveJournal1", "mis"): 15.54,
#     ("soc-LiveJournal1", "cond"): 0.78,
#     ("soc-LiveJournal1", "spmv"): 0.74,
#     ("soc-LiveJournal1", "pagerank"): 2.90,

#     ("dimacs-usa", "wcc"): 372.0,
#     ("dimacs-usa", "scc"): 594.0,
#     ("dimacs-usa", "sssp"): 2312.0,
#     ("dimacs-usa", "mcst"): 4.68,
#     ("dimacs-usa", "mis"): 9.60,
#     ("dimacs-usa", "cond"): 0.26,
#     ("dimacs-usa", "spmv"): 0.65,
#     ("dimacs-usa", "pagerank"): 2.58,
# }

# paper_df = (
#     pd.Series(paper_memory, name="paper_wall_s")
#       .rename_axis(["graph", "algorithm"])
#       .reset_index()
# )

# agg = (
#     df.groupby(["graph", "algorithm"])
#       .agg(
#           n=("wall", "count"),
#           wall_mean=("wall", "mean"),
#           wall_std=("wall", lambda x: x.std(ddof=1)),
#       )
#       .reset_index()
# )

# agg["ours_wall_s"] = (
#     agg["wall_mean"].map(lambda x: f"{x:.2f}")
#     + " ± "
#     + agg["wall_std"].fillna(0).map(lambda x: f"{x:.2f}")
# )

# compare = (
#     paper_df
#     .merge(agg, on=["graph", "algorithm"], how="left")
#     .sort_values(["graph", "algorithm"])
#     .reset_index(drop=True)
# )

# compare["factor_vs_paper"] = compare["wall_mean"] / compare["paper_wall_s"]
# compare["factor_vs_paper"] = compare["factor_vs_paper"].round(2)

# compare


In [None]:
# import pandas as pd
# import numpy as np

# graphs = ["amazon0601", "cit-Patents", "soc-LiveJournal1", "dimacs-usa"]
# alg_order = ["wcc", "scc", "sssp", "mcst", "mis", "cond", "spmv", "pagerank"]

# cmp = (
#     compare
#     .query("graph in @graphs and algorithm in @alg_order")
#     .copy()
# )

# if "factor_vs_paper" not in cmp.columns:
#     cmp["factor_vs_paper"] = cmp["wall_mean"] / cmp["paper_wall_s"]

# cmp = cmp.set_index(["graph", "algorithm"])

# def fmt_paper_cell(paper_s):
#     if pd.isna(paper_s):
#         return ""
#     return f"{paper_s:.2f}s"

# def fmt_ours_cell(mean_s, std_s, factor):
#     if pd.isna(mean_s):
#         return ""
#     std_s = 0.0 if pd.isna(std_s) else float(std_s)
#     factor = float(factor) if pd.notna(factor) else np.nan

#     return (
#         r"\begin{tabular}{@{}c@{}}"
#         f"{mean_s:.2f}s" r"\\"
#         r"$\pm$ " f"{std_s:.2f}s" r"\\"
#         + (f"({factor:.2f}" + r"$\times$)" if np.isfinite(factor) else "()")
#         + r"\end{tabular}"
#     )

# def row_cells(graph, kind):
#     out = []
#     for alg in alg_order:
#         if (graph, alg) not in cmp.index:
#             out.append("")
#             continue
#         r = cmp.loc[(graph, alg)]
#         if kind == "paper":
#             out.append(fmt_paper_cell(r["paper_wall_s"]))
#         else:
#             out.append(fmt_ours_cell(r["wall_mean"], r.get("wall_std", np.nan), r.get("factor_vs_paper", np.nan)))
#     return out

# lines = []
# lines += [
# r"\begin{table*}[t]",
# r"\centering",
# r"\scriptsize",
# r"\setlength{\tabcolsep}{3pt}",
# r"\renewcommand{\arraystretch}{1.1}",
# r"\begin{tabularx}{\textwidth}{@{} l *{8}{>{\centering\arraybackslash}X} @{} }",
# r"\toprule",
# r"& \textbf{WCC} & \textbf{SCC} & \textbf{SSSP} & \textbf{MCST} & \textbf{MIS} & \textbf{Cond.} & \textbf{SpMV} & \textbf{Pagerank} \\",
# r"\midrule",
# r"\multicolumn{9}{@{}l}{\textbf{memory}} \\",
# r"\midrule",
# ]

# for g in graphs:
#     paper = row_cells(g, "paper")
#     ours  = row_cells(g, "ours")

#     lines.append(rf"\textit{{{g} (paper)}} & " + " & ".join(paper) + r" \\")
#     lines.append(rf"\textbf{{{g} (ours)}} & " + " & ".join(ours)  + r" \\")
#     lines.append(r"\addlinespace")

# lines += [
# r"\bottomrule",
# r"\end{tabularx}",
# r"\caption{Execution times for the \textbf{memory} setting. Paper rows reproduce the X-Stream paper values. Our rows report mean and sample SD over three runs; parentheses show the slowdown factor vs.\ the paper baseline.}",
# r"\label{tab:xstream-memory-paper-vs-ours}",
# r"\end{table*}",
# ]

# print("\n".join(lines))


In [172]:
# Experiment 2
# Recover the run parameters from log names such as
# "soc-LiveJournal1-wcc-mem64m-r0.log".
pattern = "".join(
    [
        r"-",                             # dash before algorithm
        r"(?P<algorithm>[a-zA-Z0-9_]+)",  # algorithm name
        r"-mem(?P<memory>\d+[a-zA-Z])",   # memory (e.g. 1g, 8g)
        r"-r(?P<run>\d+)",                # run id
    ]
)

# One output column per named group; rows whose name does not match get NaN.
df[["algorithm", "memory", "run"]] = df["log"].str.extract(pattern, expand=True)

def mem_to_mb(s: str) -> int:
    """Convert a memory size such as ``"64m"`` or ``"1g"`` to megabytes.

    The last character is the unit (case-insensitive): ``m`` = megabytes,
    ``g`` = gigabytes (x1024), ``t`` = terabytes (x1024x1024). Everything
    before it must be an integer.

    Raises
    ------
    ValueError
        If ``s`` is not a string (e.g. the NaN produced by a failed
        ``str.extract``), has an unsupported unit, or has a non-integer
        number part.
    """
    mb_per_unit = {"m": 1, "g": 1024, "t": 1024 * 1024}

    # Non-strings used to fail with AttributeError on ``.lower()``; report
    # them the same way as any other malformed value.
    if not isinstance(s, str):
        raise ValueError(f"Unknown memory format: {s}")

    s = s.lower()
    number, unit = s[:-1], s[-1:]
    if unit not in mb_per_unit:
        raise ValueError(f"Unknown memory format: {s}")
    try:
        count = int(number)
    except ValueError:
        # Replace int()'s "invalid literal" message with the function's own.
        raise ValueError(f"Unknown memory format: {s}") from None
    return count * mb_per_unit[unit]

# Replace the extracted size strings ("64m", "1g", ...) with megabytes.
df["memory"] = df["memory"].apply(mem_to_mb)

# Show the run parameters and the wall time first, then every other column
# in its existing order.
front = ["algorithm", "memory", "run", "wall"]
df = df[[*front, *(name for name in df.columns if name not in front)]]

# Displayed sorted; `df` itself keeps its current row order.
df.sort_values(["algorithm", "memory"])

Unnamed: 0,algorithm,memory,run,wall,log,scatter,gather,phases,edges_streamed,updates_in,updates_out,# iters,streaming_time,ratio,edges_streamed_cnt,updates_out_cnt,wasted %
6,wcc,64,0,240.746,soc-LiveJournal1-wcc-mem64m-r0.log,181.552,37.1452,13.0,17927340000.0,5103943000.0,5103943000.0,13,218.6972,1.100819,1493945000.0,637992817.0,57.294752
0,wcc,128,0,146.298,soc-LiveJournal1-wcc-mem128m-r0.log,124.147,21.383,13.0,17927340000.0,5103943000.0,5103943000.0,13,145.53,1.005277,1493945000.0,637992817.0,57.294752
2,wcc,256,0,131.342,soc-LiveJournal1-wcc-mem256m-r0.log,115.915,14.2702,13.0,17927340000.0,5103943000.0,5103943000.0,13,130.1852,1.008886,1493945000.0,637992817.0,57.294752
5,wcc,512,0,115.118,soc-LiveJournal1-wcc-mem512m-r0.log,99.586,14.3921,13.0,17927340000.0,5103943000.0,5103943000.0,13,113.9781,1.010001,1493945000.0,637992817.0,57.294752
1,wcc,1024,0,119.243,soc-LiveJournal1-wcc-mem1g-r0.log,103.748,13.5582,13.0,17927340000.0,5103943000.0,5103943000.0,13,117.3062,1.016511,1493945000.0,637992817.0,57.294752
3,wcc,2048,0,107.882,soc-LiveJournal1-wcc-mem2g-r0.log,92.2,10.8968,13.0,17927340000.0,5103943000.0,5103943000.0,13,103.0968,1.046415,1493945000.0,637992817.0,57.294752
4,wcc,4096,0,104.438,soc-LiveJournal1-wcc-mem4g-r0.log,85.26,10.2867,13.0,17927340000.0,5103943000.0,5103943000.0,13,95.5467,1.093057,1493945000.0,637992817.0,57.294752


In [None]:
# # Experiment 3
# pattern = (
#     r"-"                             # dash before algorithm
#     r"(?P<algorithm>[a-zA-Z0-9_]+)"  # algorithm name
#     r"-th(?P<threads>\d+)"           # threads
#     r"-mem(?P<memory>\d+[a-zA-Z])"   # memory (e.g. 1g, 8g)
#     r"-r(?P<run>\d+)"                # run id
# )

# df[["algorithm", "threads", "memory", "run"]] = (
#     df["log"]
#     .str.extract(pattern)
# )

# def mem_to_mb(s: str) -> int:
#     s = s.lower()
#     if s.endswith("g"):
#         return int(s[:-1]) * 1024
#     if s.endswith("m"):
#         return int(s[:-1])
#     raise ValueError(f"Unknown memory format: {s}")

# df["memory"] = df["memory"].apply(mem_to_mb)

# df["threads"] = df["threads"].astype(int)

# front = ["algorithm", "memory", "threads", "run", "wall"]
# df = df[front + [c for c in df.columns if c not in front]]

# df.sort_values(by=["algorithm", "memory", "threads"])

In [None]:
# # Experiment 3 contd.
# df = df.sort_values(["algorithm", "memory", "threads", "run"])

# agg = (df.groupby(["algorithm", "memory", "threads"], as_index=False)
#          .agg(wall_mean=("wall", "mean"),
#               wall_std=("wall", "std"),
#               n=("wall", "size")))

# plt.figure(figsize=(9, 5))

# for (alg, mem), sub in agg.groupby(["algorithm", "memory"]):
#     sub = sub.sort_values("threads")
#     label = f"{alg} @ {mem} MB"
#     plt.plot(sub["threads"], sub["wall_mean"], marker="o", label=label)

#     # Shaded +/- 1 std with multiple runs
#     if sub["wall_std"].notna().any() and (sub["n"] > 1).any():
#         plt.fill_between(
#             sub["threads"],
#             sub["wall_mean"] - sub["wall_std"],
#             sub["wall_mean"] + sub["wall_std"],
#             alpha=0.15
#         )

# plt.xscale("log", base=2)
# plt.xticks(sorted(df["threads"].unique()), sorted(df["threads"].unique()))
# plt.xlabel("Threads")
# plt.ylabel("Wall time (s)")
# plt.title("Scaling with threads (wall time) — all algorithms & memory configs")
# plt.grid(True, which="both", linestyle="--", alpha=0.3)
# plt.legend(ncol=2, fontsize=9)
# plt.tight_layout()
# plt.savefig("./exp3.png")
# plt.show()
