In [1]:
from __future__ import annotations

import pandas as pd


def compute_throughput_performance(csv_file, key: str, data_samples: int):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)

    print(df)
    print(csv_file)

    # Parse the finished-parsing tag to extract the number of workflow tasks
    # Note: this is a boolean mask with True for rows containing the tag
    finished_tasks = df["tags"].str.contains(key)

    # Filter rows where the tag contains "finished-parsing" to get the total runtime
    total_runtime = df[finished_tasks]["elapsed_s"].sum()

    # Calculate the total finished tasks
    total_finished_tasks = finished_tasks.sum()

    # Calculate the total data processed (tasks * num examples per task)
    total_data_processed = data_samples * total_finished_tasks

    print(df["end_unix"].max())

    # Calculate the total time of the workflow
    total_time = df["end_unix"].max() - df["start_unix"].min()

    # Compute the throughput performance
    throughput_performance = total_data_processed / total_time

    # Create a dictionary to store the results
    results = {
        "total_runtime": total_runtime,
        "total_finished_tasks": total_finished_tasks,
        "total_data_processed": total_data_processed,
        "throughput_performance": throughput_performance,
    }

    print(results)

    return results

In [10]:
import matplotlib

matplotlib.use("Agg")
from typing import Any

import matplotlib.pyplot as plt
import numpy as np
from matplotlib import font_manager

for fontpath in font_manager.findSystemFonts(fontpaths=None, fontext="ttf"):
    fontlist = []
    if "avenir" in fontpath.lower():
        print(fontpath)
        font_manager.fontManager.addfont(fontpath)
        # font_manager.fontManager.ttflist.extend(fontpath)
        plt.rcParams["font.family"] = "Avenir"

# Set font sizes globally
plt.rcParams["font.size"] = 24
plt.rcParams["axes.titlesize"] = 24
plt.rcParams["axes.labelsize"] = 18
plt.rcParams["xtick.labelsize"] = 16
plt.rcParams["ytick.labelsize"] = 16
plt.rcParams["legend.fontsize"] = 16


def ideal_scaling(x_values: list[Any], y0: float) -> list[Any]:
    """
    return linear strong scaling curve based on x_values, y_values
    """
    x_values = np.array(x_values)
    return [y0 * (x / x_values[0]) for x in x_values]


def plot_scaling_relations(
    x_values: list[list[Any]],
    y_values: list[list[Any]],
    labels: list[str],
    image_name: str,
    ylabel: str = "Throughput (Samples/sec)",
) -> None:
    """
    plot multiple scaling curves for
        each sublist in x_values and y_values
        add ideal scaling based on x_values[1], y_values[1]
        add legend with labels under plot
        save plot to image_name
    """
    # plt.rcParams["font.family"] = "Avenir Regular"
    plt.rcParams["font.size"] = 14
    markers = ["o", "s", "d", "p", "*", "v", "^", "<"]
    colors = [
        "mediumpurple",
        "cornflowerblue",
        "mediumaquamarine",
        "lightcoral",
    ]
    fig, ax = plt.subplots(figsize=(8, 6))
    for x, y, label, marker, color in zip(x_values, y_values, labels, markers, colors):
        ax.plot(x, y, label=label, marker=marker, color=color)
        ax.plot(x, ideal_scaling(x, y[0]), linestyle="--", color="gray", alpha=0.5)
    # set legend below plot
    ax.legend(
        loc="upper left",
        # bbox_to_anchor=(0.5, -0.1),
        shadow=True,
        ncol=1,
        frameon=False,
    )
    ax.set_ylabel(ylabel)
    ax.set_xlabel("Number of accelerators")
    ax.set_yscale("log")
    # step_size=500
    # max_y = max([max(y) for y in y_values])
    # plt.yticks(np.arange(0, max_y+1, step_size))
    ax.set_xscale("log")
    # ax.set_xticks(x_values[0])
    plt.savefig(image_name, bbox_inches="tight")
    plt.show()

/System/Library/Fonts/Avenir.ttc
/System/Library/Fonts/Avenir Next.ttc
/System/Library/Fonts/Avenir Next Condensed.ttc


In [127]:
polaris_semantic_chunking = [
    "scaling/semantic-chunking-polaris/nougat.sfr-mistral.polaris.nodes2.csv",
    "scaling/semantic-chunking-polaris/nougat.sfr-mistral.polaris.nodes32.csv",
    "scaling/semantic-chunking-polaris/nougat.sfr-mistral.polaris.nodes64.csv",
    "scaling/semantic-chunking-polaris/nougat.sfr-mistral.polaris.nodes128.csv",
    "scaling/semantic-chunking-polaris/nougat.sfr-mistral.polaris.nodes256.csv",
]

sunspot_semantic_chunking = [
    "scaling/semantic-chunking-sunspot/nougat.sfr-mistral.sunspot.nodes1.csv",
    "scaling/semantic-chunking-sunspot/nougat.sfr-mistral.sunspot.nodes4.csv",
    "scaling/semantic-chunking-sunspot/nougat.sfr-mistral.sunspot.nodes16.csv",
    "scaling/semantic-chunking-sunspot/nougat.sfr-mistral.sunspot.nodes32.csv",
    # "scaling/semantic-chunking-sunspot/nougat.sfr-mistral.sunspot.nodes64.csv",
    "scaling/semantic-chunking-sunspot/nougat.sfr-mistral.sunspot.nodes96.csv",
]

polaris_semantic_chunking_throughput = [
    compute_throughput_performance(file, "finished-embedding", 93)[
        "throughput_performance"
    ]
    for file in polaris_semantic_chunking
]

sunspot_semantic_chunking_throughput = [
    compute_throughput_performance(file, "finished-embedding", 93)[
        "throughput_performance"
    ]
    for file in sunspot_semantic_chunking
]

polaris_ngpus = [2 * 4, 32 * 4, 64 * 4, 128 * 4, 256 * 4]
sunspot_ngpus = [1 * 12, 4 * 12, 16 * 12, 32 * 12, 96 * 12]

x_values = [polaris_ngpus, sunspot_ngpus]
y_values = [
    polaris_semantic_chunking_throughput,
    sunspot_semantic_chunking_throughput,
]
labels = ["Polaris Semantic Chunking", "Sunspot Semantic Chunking"]
image_name = "scaling_relations_semantic_chunking.pdf"
plot_scaling_relations(x_values, y_values, labels, image_name)

print(
    "Polaris Semantic Chunking Throughput:",
    polaris_semantic_chunking_throughput,
)
print(
    "Sunspot Semantic Chunking Throughput:",
    sunspot_semantic_chunking_throughput,
)

                                                  tags  elapsed_s  \
0    ['loaded-encoder', '/lus/eagle/projects/tpc/br...      16.56   
1    ['loaded-dataset', '/lus/eagle/projects/tpc/br...       0.17   
2    ['loaded-encoder', '/lus/eagle/projects/tpc/br...      16.76   
3    ['loaded-dataset', '/lus/eagle/projects/tpc/br...       0.19   
4    ['loaded-encoder', '/lus/eagle/projects/tpc/br...      16.75   
..                                                 ...        ...   
338  ['computed-embeddings', '/lus/eagle/projects/t...     207.77   
339  ['computed-embeddings', '/lus/eagle/projects/t...     207.52   
340  ['computed-embeddings', '/lus/eagle/projects/t...     197.77   
341  ['computed-embeddings', '/lus/eagle/projects/t...     198.26   
342  ['computed-embeddings', '/lus/eagle/projects/t...     198.47   

       start_unix      end_unix  
0    1.713085e+09  1.713085e+09  
1    1.713085e+09  1.713085e+09  
2    1.713085e+09  1.713085e+09  
3    1.713085e+09  1.713085e+09  
4

  plt.show()


In [11]:
polaris_oreo = [
    "scaling/pdfparsing-polaris/oreo.polaris.nodes2.csv",
    # "scaling/pdfparsing-polaris/oreo.polaris.nodes8.csv",
    "scaling/pdfparsing-polaris/oreo.polaris.nodes32.csv",
    "scaling/pdfparsing-polaris/oreo.polaris.nodes64.csv",
    "scaling/pdfparsing-polaris/oreo.polaris.nodes128.csv",
    "scaling/pdfparsing-polaris/oreo.polaris.nodes256.csv",
]


polaris_nougat = [
    "scaling/pdfparsing-polaris/nougat.polaris.nodes2.csv",
    # "scaling/pdfparsing-polaris/nougat.polaris.nodes8.csv",
    "scaling/pdfparsing-polaris/nougat.polaris.nodes32.csv",
    "scaling/pdfparsing-polaris/nougat.polaris.nodes64.csv",
    "scaling/pdfparsing-polaris/nougat.polaris.nodes128.csv",
    "scaling/pdfparsing-polaris/nougat.polaris.nodes256.csv",
]

sunspot_nougat = [
    "scaling/pdfparsing-sunspot/nougat.sunspot.nodes1.csv",
    "scaling/pdfparsing-sunspot/nougat.sunspot.nodes2.csv",
    "scaling/pdfparsing-sunspot/nougat.sunspot.nodes4.csv",
    "scaling/pdfparsing-sunspot/nougat.sunspot.nodes8.csv",
    "scaling/pdfparsing-sunspot/nougat.sunspot.nodes16.csv",
    "scaling/pdfparsing-sunspot/nougat.sunspot.nodes32.csv",
    "scaling/pdfparsing-sunspot/nougat.sunspot.nodes64.csv",
    "scaling/pdfparsing-sunspot/nougat.sunspot.nodes96.csv",
]

polaris_oreo_throughput = [
    compute_throughput_performance(file, "finished-parsing", 10)[
        "throughput_performance"
    ]
    for file in polaris_oreo
]

polaris_nougat_throughput = [
    compute_throughput_performance(file, "finished-parsing", 10)[
        "throughput_performance"
    ]
    for file in polaris_nougat
]

sunspot_nougat_throughput = [
    compute_throughput_performance(file, "finished-parsing", 10)[
        "throughput_performance"
    ]
    for file in sunspot_nougat
]

polaris_oreo_ngpus = [2 * 4, 32 * 4, 64 * 4, 128 * 4, 256 * 4]
polaris_nougat_ngpus = [2 * 4, 32 * 4, 64 * 4, 128 * 4, 256 * 4]
sunspot_ngpus = [
    1 * 12,
    2 * 12,
    4 * 12,
    8 * 12,
    16 * 12,
    32 * 12,
    64 * 12,
    96 * 12,
]

x_values = [polaris_oreo_ngpus, polaris_nougat_ngpus, sunspot_ngpus]
y_values = [
    polaris_oreo_throughput,
    polaris_nougat_throughput,
    sunspot_nougat_throughput,
]
labels = ["Polaris Oreo", "Polaris Nougat", "Sunspot Nougat"]
image_name = "scaling_relations_pdfparsing-big-font.pdf"
plot_scaling_relations(x_values, y_values, labels, image_name)

print("Polaris Oreo Throughout: ", polaris_oreo_throughput)
print("Polaris Nougat Throughout: ", polaris_nougat_throughput)
print("Sunspot Nougat Throughout: ", sunspot_nougat_throughput)

                                                  tags  elapsed_s  \
0    ['initialize-parser', '2a2f70d5-03bf-46ab-a71d...     144.50   
1    ['initialize-parser', '9c8a8392-0dd6-456e-90be...     144.12   
2    ['initialize-parser', '32c44aea-13ef-4199-ad3c...     144.40   
3    ['initialize-parser', '47982988-ecdc-4573-a98c...     144.19   
4    ['initialize-parser', 'b08e83e6-0254-4e20-b8b0...     144.12   
..                                                 ...        ...   
443  ['parser-parse', '50bac34d-a26e-4f22-9ba6-363b...      56.82   
444  ['write-jsonl', '50bac34d-a26e-4f22-9ba6-363b6...       0.00   
445  ['parse-pdfs', '/lus/eagle/projects/tpc/bracea...      56.83   
446  ['finished-parsing', '/lus/eagle/projects/tpc/...      58.33   
447  ['initialize-parser', '877ef0ae-a665-4786-8de3...       0.00   

       start_unix      end_unix  
0    1.713122e+09  1.713122e+09  
1    1.713122e+09  1.713122e+09  
2    1.713122e+09  1.713122e+09  
3    1.713122e+09  1.713122e+09  
4

  plt.show()


In [110]:
# Peak flops for pdf parsing

# Peak flops on Polaris
peak_polaris = 44 * 1000 * 16  # TFLOPS * fp16

# Peak flops per GPU
nougat_single_gpu_flops = 430.85 * 0.001  # GFLOPS to TFLOPS

polaris_nougat_ngpus = [2 * 4, 32 * 4, 64 * 4, 128 * 4, 256 * 4]

peak_polaris_flops = [peak_polaris for _ in polaris_oreo_ngpus]
polaris_nougat_peak_flops = [
    nougat_single_gpu_flops * ngpus for ngpus in polaris_nougat_ngpus
]

x_values = [polaris_nougat_ngpus, polaris_oreo_ngpus]
y_values = [polaris_nougat_peak_flops, peak_polaris_flops]
labels = ["Polaris Nougat", "Peak Polaris"]
image_name = "scaling_relations_nougat_flops.pdf"
ylabel = "Peak TFLOPS"
plot_scaling_relations(x_values, y_values, labels, image_name, ylabel)

print("Num accelerators: ", polaris_ngpus)
print("Polaris Nougat Parser Peak TFlops: ", polaris_nougat_peak_flops)
print("Peak Polaris TFlops (fp16): ", peak_polaris)

Num accelerators:  [8, 128, 256, 512, 1024]
Polaris Nougat Parser Peak TFlops:  [3.4468, 55.1488, 110.2976, 220.5952, 441.1904]
Peak Polaris TFlops (fp16):  704000


  plt.show()


In [111]:
# Peak flops on Polaris
peak_polaris = 44 * 1000 * 8  # TFLOPS * fp32

# Peak flops per GPU
oreo_single_gpu_flops = 40.73  # TFLOPS

polaris_oreo_ngpus = [2 * 4, 32 * 4, 64 * 4, 128 * 4, 256 * 4]

peak_polaris_flops = [peak_polaris for _ in polaris_oreo_ngpus]
polaris_oreo_peak_flops = [oreo_single_gpu_flops * ngpu for ngpu in polaris_oreo_ngpus]

x_values = [polaris_oreo_ngpus, polaris_oreo_ngpus]
y_values = [polaris_oreo_peak_flops, peak_polaris_flops]
labels = ["Polaris Oreo", "Peak Polaris"]
image_name = "scaling_relations_oreo_flops.pdf"
ylabel = "Peak TFLOPS"
plot_scaling_relations(x_values, y_values, labels, image_name, ylabel)

print("Num accelerators: ", polaris_ngpus)
print("Polaris Oreo Parser Peak TFlops: ", polaris_oreo_peak_flops)
print("Peak Polaris TFlops (fp32): ", peak_polaris)

Num accelerators:  [8, 128, 256, 512, 1024]
Polaris Oreo Parser Peak TFlops:  [325.84, 5213.44, 10426.88, 20853.76, 41707.52]
Peak Polaris TFlops (fp32):  352000


  plt.show()


In [112]:
# Peak flops for semantic chunking

# Peak flops on Polaris (44pflops double precision)
peak_polaris = 44 * 1000 * 64  # TFLOPS * int8 quantization

# Peak flops per GPU
sfr_mistral_single_gpu_flops = 76.98  # GFLOPS to TFLOPS
pubmedbert_single_gpu_flops = 60.36  # GFLOPS to TFLOPS

polaris_ngpus = [2 * 4, 32 * 4, 64 * 4, 128 * 4, 256 * 4]

peak_polaris_flops = [peak_polaris for _ in polaris_oreo_ngpus]
mistral_peak_flops = [sfr_mistral_single_gpu_flops * ngpus for ngpus in polaris_ngpus]
pubmedbert_peak_flops = [pubmedbert_single_gpu_flops * ngpus for ngpus in polaris_ngpus]

x_values = [polaris_ngpus, polaris_ngpus, polaris_ngpus]
y_values = [mistral_peak_flops, pubmedbert_peak_flops, peak_polaris_flops]
labels = ["Polaris Mistral", "Polaris Pubmedbert", "Peak Polaris"]
image_name = "scaling_relations_peak_flops_semantic_chunking.pdf"
ylabel = "Peak TFLOPS"
plot_scaling_relations(x_values, y_values, labels, image_name, ylabel)

print("Num accelerators: ", polaris_ngpus)
print("Polaris Mistral Peak TFlops: ", mistral_peak_flops)
print("Polaris Pubmedbert Peak TFlops: ", pubmedbert_peak_flops)
print("Peak Polaris TFlops (int8): ", peak_polaris)

Num accelerators:  [8, 128, 256, 512, 1024]
Polaris Mistral Peak TFlops:  [615.84, 9853.44, 19706.88, 39413.76, 78827.52]
Polaris Pubmedbert Peak TFlops:  [482.88, 7726.08, 15452.16, 30904.32, 61808.64]
Peak Polaris TFlops (int8):  2816000


  plt.show()


In [103]:
polaris_nougat_ngpus = [2 * 4, 32 * 4, 64 * 4, 128 * 4, 256 * 4]
sunspot_ngpus = [
    1 * 12,
    2 * 12,
    4 * 12,
    8 * 12,
    16 * 12,
    32 * 12,
    64 * 12,
    96 * 12,
]

print("Polaris Accelerators", polaris_nougat_ngpus)
print("Sunspot Accelerators", sunspot_ngpus)

Polaris Accelerators [8, 128, 256, 512, 1024]
Sunspot Accelerators [12, 24, 48, 96, 192, 384, 768, 1152]
