In [1]:
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
import re
import glob
import numpy as np

In [2]:
def load_gemm_data(path: str):
    """Load the best-kernel rows from every GEMM result file under ``path``.

    ``"/*.jsonl"`` is appended to ``path`` and the result is expanded with
    ``glob.glob``.  Only files whose path contains an ``M<int>_N<int>_K<int>``
    token are read; everything else is skipped.

    Args:
        path: Directory pattern to search.  It may contain ``**`` segments,
            which match any number of nested directories (including none).

    Returns:
        One polars DataFrame: the concatenation, over all matching files, of
        the rows whose ``is_best`` column equals the string ``"true"``.

    Raises:
        RuntimeError: If no matching result file was found.
    """
    # recursive=True is what makes "**" span directory levels; without it glob
    # treats "**" like a single "*".  A pattern with several "**" segments can
    # yield the same file more than once, so de-duplicate, and sort so the row
    # order of the result does not depend on directory listing order.
    files = sorted(set(glob.glob(path + "/*.jsonl", recursive=True)))
    pattern = re.compile(r"M(\d+)_N(\d+)_K(\d+)")

    # NOTE(review): the filter compares against the string "true"; this
    # presumably relies on is_best being stored as a string in the result
    # files -- confirm against the benchmark writer.
    dfs = [
        pl.read_ndjson(file).filter(pl.col("is_best") == "true")
        for file in files
        if pattern.search(file)
    ]

    if not dfs:
        raise RuntimeError("No data loaded")
    return pl.concat(dfs)

In [3]:
def calculate_roofline_data(df, peak_compute, peak_bandwidth):
    """Append roofline-model columns to a frame of GEMM results.

    Reads the ``M``, ``N`` and ``K`` columns of ``df`` and adds, in order:

    * ``gemm_arithmetic_intensity`` -- ``M*N*K / (M*K + N*K + M*N)``
    * ``group`` -- the string ``"<M>_<N>_<K>"``
    * ``memory_constraint`` -- ``peak_bandwidth * gemm_arithmetic_intensity``
    * ``attainable_performance`` -- the row-wise minimum of ``peak_compute``
      and ``peak_bandwidth * gemm_arithmetic_intensity``

    Args:
        df: polars DataFrame with ``M``, ``N`` and ``K`` columns.
        peak_compute: Compute ceiling used as the flat part of the roofline.
        peak_bandwidth: Factor multiplied with the arithmetic intensity to
            form the sloped part of the roofline.

    Returns:
        A new DataFrame with the four columns above added.

    NOTE(review): no unit conversion happens here; confirm that
    ``peak_compute`` and ``peak_bandwidth * intensity`` are expressed in the
    same units before comparing them.
    """
    m, n, k = pl.col("M"), pl.col("N"), pl.col("K")

    product = m * n * k
    pairwise_sum = m * k + n * k + m * n
    shape_label = m.cast(pl.Utf8) + "_" + n.cast(pl.Utf8) + "_" + k.cast(pl.Utf8)

    with_intensity = df.with_columns(
        (product / pairwise_sum).alias("gemm_arithmetic_intensity"),
        shape_label.alias("group"),
    )

    # Built once and reused for both the stored column and the min() below.
    bandwidth_bound = peak_bandwidth * pl.col("gemm_arithmetic_intensity")
    with_bound = with_intensity.with_columns(
        bandwidth_bound.alias("memory_constraint")
    )

    roofline = pl.min_horizontal(peak_compute, bandwidth_bound)
    return with_bound.with_columns(roofline.alias("attainable_performance"))

In [4]:
def generate_roofline_plot(combined_df):
    """Build one log-log roofline scatter plot covering every GEMM result.

    Args:
        combined_df: polars DataFrame holding the columns ``time_ms``,
            ``tflops``, ``M``, ``N``, ``K``, ``gemm_arithmetic_intensity`` and
            ``attainable_performance``.

    Returns:
        A plotly figure with one scatter point per row (coloured by ``M``,
        sized by the square root of ``time_ms``) and a line trace for the
        theoretical roofline.
    """
    # Square-root scaling keeps slow kernels from dwarfing every other marker.
    combined_df = combined_df.with_columns(
        pl.col("time_ms").sqrt().alias("time_scaled")
    )

    # Sort while M is still numeric, then cast to string so plotly colours it
    # as a discrete category rather than on a continuous scale.
    combined_df = combined_df.sort("M").cast({"M": pl.String})

    fig = px.scatter(
        combined_df,
        x="gemm_arithmetic_intensity",
        y="tflops",
        log_x=True,
        log_y=True,
        color="M",
        color_discrete_sequence=px.colors.qualitative.G10,
        # Use the scaled column computed above; it is hidden from the hover
        # box below, where the raw time_ms is shown instead.
        size="time_scaled",
        hover_data={
            "time_scaled": False,
            "time_ms": True,
            "M": True,
            "N": True,
            "K": True
        },
        labels={
            # The key must match the plotted column name or the label is
            # silently ignored.
            "gemm_arithmetic_intensity": "Arithmetic Intensity (TFLOPs/GB/s)",
            "tflops": "Performance (TFLOPs)",
            "time_ms": "Execution Time (ms)",
        },
        title="AMD RX 7900 XTX Roofline Analysis",
        height=600
    )

    # A line trace connects points in row order, so order the roofline by its
    # x value; the M-sorted frame would make the line zig-zag.
    roofline_df = combined_df.sort("gemm_arithmetic_intensity")
    fig.add_trace(go.Scatter(
        x=roofline_df["gemm_arithmetic_intensity"].to_list(),
        y=roofline_df["attainable_performance"].to_list(),
        mode='lines',
        line=dict(color='tomato', width=1),
        opacity=0.45,
        name='Theoretical Roofline'
    ))

    return fig


In [5]:
def _describe_values(values):
    """Format a sorted list of values as ``"v"`` (one value) or ``"lo-hi"``."""
    if len(values) == 1:
        return str(values[0])
    return f"{values[0]}-{values[-1]}"


def generate_roofline_plots_by_group(combined_df):
    """Build one log-log roofline figure per distinct value of ``M``.

    Args:
        combined_df: polars DataFrame holding the columns ``time_ms``,
            ``tflops``, ``M``, ``N``, ``K``, ``gemm_arithmetic_intensity`` and
            ``attainable_performance``.

    Returns:
        A dict mapping each ``M`` value (as a string) to its plotly figure.
        Entries are inserted in ascending numeric order of ``M``.
    """
    # Square-root scaling keeps slow kernels from dwarfing every other marker.
    combined_df = combined_df.with_columns(
        pl.col("time_ms").sqrt().alias("time_scaled")
    )

    # Sort while M is still numeric, then cast to string so plotly colours it
    # as a discrete category.
    combined_df = combined_df.sort("M").cast({"M": pl.String})

    figures = {}
    # maintain_order=True keeps the numeric ordering established by the sort
    # above; a plain unique() gives no ordering guarantee.
    groups = combined_df["M"].unique(maintain_order=True).to_list()

    for group in groups:
        # Order by x so the roofline trace is drawn left to right.
        group_df = combined_df.filter(pl.col("M") == group).sort(
            "gemm_arithmetic_intensity"
        )

        # Summarise the N and K values actually present in this group for the
        # title, instead of relying on a global ``N`` that this function
        # never defines.
        n_desc = _describe_values(group_df["N"].unique().sort().to_list())
        k_desc = _describe_values(group_df["K"].unique().sort().to_list())
        if (group_df["N"] == group_df["K"]).all():
            dims = f"N=K={n_desc}"
        else:
            dims = f"N={n_desc}, K={k_desc}"

        # Create individual figure for each group
        fig = px.scatter(
            group_df,
            x="gemm_arithmetic_intensity",
            y="tflops",
            log_x=True,
            log_y=True,
            color="M",
            color_discrete_sequence=px.colors.qualitative.G10,
            size="time_scaled",
            hover_data={
                "time_scaled": False,
                "time_ms": True,
                "M": True,
                "N": True,
                "K": True
            },
            labels={
                "gemm_arithmetic_intensity": "Arithmetic Intensity (TFLOPs/GB/s)",
                "tflops": "Performance (TFLOPs)",
                "time_ms": "Execution Time (ms)",
            },
            title=f"AMD RX 7900 XTX Roofline Analysis: M = {group}, {dims}",
            height=600
        )

        # Add group-specific roofline
        fig.add_trace(go.Scatter(
            x=group_df["gemm_arithmetic_intensity"].to_list(),
            y=group_df["attainable_performance"].to_list(),
            mode='lines',
            line=dict(width=1),
            opacity=0.45,
            name='Roofline'
        ))

        figures[group] = fig

    return figures


In [6]:
# Hardware peaks for the AMD RX 7900 XTX, passed to calculate_roofline_data.
# Peak compute is given in TFLOPS: 61.44 TFLOPS = 61.44 * 1000 GFLOPS
# NOTE(review): peak_compute is in TFLOPS while peak_bandwidth is in GB/s;
# confirm the roofline calculation accounts for the factor of 1000.
peak_compute = 61.44
# Peak memory bandwidth in GB/s.
peak_bandwidth = 960

# Roofline for RX 7900 XTX GPU

Spec from https://en.wikipedia.org/wiki/List_of_AMD_graphics_processing_units

Processing power for RX 7900 XTX in TFLOPS

|Half|Single|Double|
|---|---|---|
|61.44|61.44|0.960|

Memory bandwidth in GB/s

960

# Kernel Scan Results

# INT8

In [7]:
# Load the best-kernel INT8 results, derive the roofline columns, and build
# one figure per distinct M value.
df = load_gemm_data("results/**/INT8/**/")
df = calculate_roofline_data(df, peak_compute, peak_bandwidth)
figs = generate_roofline_plots_by_group(df)

# Display every per-M figure.
for group_name, fig in figs.items():
    fig.show()

RuntimeError: No data loaded

# FLOAT16

In [None]:
# Load the best-kernel FLOAT16 results, derive the roofline columns, and build
# one figure per distinct M value.
df = load_gemm_data("results/**/FLOAT16/**/")
df = calculate_roofline_data(df, peak_compute, peak_bandwidth)
figs = generate_roofline_plots_by_group(df)

# Display every per-M figure.
for group_name, fig in figs.items():
    fig.show()

# BFLOAT16

In [None]:
# Load the best-kernel BFLOAT16 results, derive the roofline columns, and
# build one figure per distinct M value.
df = load_gemm_data("results/**/BFLOAT16/**/")
df = calculate_roofline_data(df, peak_compute, peak_bandwidth)
figs = generate_roofline_plots_by_group(df)

# Display every per-M figure.
for group_name, fig in figs.items():
    fig.show()

# FLOAT32

`M=1,2,4` failed so we only see 6 scans

In [None]:
# Load the best-kernel FLOAT32 results, derive the roofline columns, and build
# one figure per distinct M value.
df = load_gemm_data("results/**/FLOAT32/**/")
df = calculate_roofline_data(df, peak_compute, peak_bandwidth)
figs = generate_roofline_plots_by_group(df)

# Display every per-M figure.
for group_name, fig in figs.items():
    fig.show()