In [1]:
from pathlib import Path
import re
import pandas as pd
from typing import Union, List, Optional

def _to_number(s: str) -> Union[int, float]:
    """Cast '256.0' → 256 (int) and '256.5' → 256.5 (float)."""
    if "." in s:
        as_float = float(s)
        return int(as_float) if as_float.is_integer() else as_float
    return int(s)

def _parse_cu_mask(raw: str, *, to_int: bool) -> Optional[Union[int, float, str]]:
    """Handle hex/dec/nan mask; return int, float('nan'), or raw string."""
    if raw.lower() == "nan":
        return float("nan") if to_int else raw
    if raw.startswith("0x"):
        return int(raw, 16) if to_int else raw
    return _to_number(raw) if to_int else raw

def load_kernel_traces(
    directory: Path | str = ".",
    *,
    as_concat: bool = True,
    cu_mask_as_int: bool = False,
) -> Union[pd.DataFrame, List[pd.DataFrame]]:
    directory = Path(directory)
    trace_files = sorted(directory.glob("*_kernel_trace.csv"))


    phase_choices = r"(?P<mode>prefill_and_decode|prefill|decode)"

    pattern = re.compile(
        rf"^standalone_attn_{phase_choices}_"        # ← exact phase
        r"(?P<prefill_batch>\d+(?:\.\d+)?)_"
        r"(?P<prefill_len>\d+(?:\.\d+)?)_"
        r"(?P<decode_batch_size>\d+(?:\.\d+)?)_"
        r"(?P<decode_len>\d+(?:\.\d+)?)_"
        r"(?P<cu_mask>0x[0-9A-Fa-f]+|\d+(?:\.\d+)?|nan)"
        r"_kernel_trace\.csv$"
    )



    frames = []
    for f in trace_files:
        m = pattern.match(f.name)
        if not m:
            print(f"⚠️  Skipping file that doesn’t match pattern: {f.name}")
            continue

        df = pd.read_csv(f)
        df["mode"] = m["mode"]

        # numeric pieces
        df["prefill_batch"]     = _to_number(m["prefill_batch"])
        df["prefill_len"]       = _to_number(m["prefill_len"])
        df["decode_batch_size"] = _to_number(m["decode_batch_size"])
        df["decode_len"]        = _to_number(m["decode_len"])

        # mask and phase
        df["cu_mask"] = _parse_cu_mask(m["cu_mask"], to_int=cu_mask_as_int)
        df["source_file"] = f.name

        frames.append(df)

    if not frames:
        raise FileNotFoundError("No *_kernel_trace.csv files matched the new pattern")

    return pd.concat(frames, ignore_index=True) if as_concat else frames


if __name__ == "__main__":
    kernel_df = load_kernel_traces(".", cu_mask_as_int=True)
    print(f"Loaded {kernel_df.source_file.nunique()} files; shape = {kernel_df.shape}")


Loaded 499 files; shape = (24283, 25)


In [2]:
mask = kernel_df["Kernel_Name"].str.startswith("kernel_unified_attention", na=False)
attn_df = kernel_df[mask]

In [3]:
attn_df['mode'].value_counts()

mode
prefill_and_decode    1660
prefill                835
decode                 830
Name: count, dtype: int64

In [4]:
## Ensuring Corerct number of rows and in correct order

import numpy as np
import pandas as pd

# ---------------------------------------------------------------------
# 0)  Filter to attention kernels, keep original order marker
# ---------------------------------------------------------------------
comb_cols = [
    "mode",
    "prefill_batch",
    "prefill_len",
    "decode_batch_size",
    "decode_len",
    "cu_mask",
]

attn_df = (
    kernel_df[kernel_df["Kernel_Name"]
              .str.startswith("kernel_unified_attention", na=False)]
    .copy()
    .reset_index(drop=False)
    .rename(columns={"index": "_orig"})
)

# ---------------------------------------------------------------------
# 1)  Check row-count rule: 5 for prefill/decode, 10 for prefill_and_decode
# ---------------------------------------------------------------------
size_df = (
    attn_df.groupby(comb_cols, sort=False)
           .size()
           .rename("actual")
           .reset_index()
)

# expected size depends on mode
size_df["expected"] = np.where(
    size_df["mode"] == "prefill_and_decode",
    10,
    5,
)

violations = size_df.query("actual != expected")
if not violations.empty:
    raise ValueError(
        "Row-count check failed for these combinations:\n"
        f"{violations.to_string(index=False)}"
    )

# ---------------------------------------------------------------------
# 2)  Re-assemble so each combination’s rows stay together
# ---------------------------------------------------------------------
ordered_blocks = (
    attn_df
    .groupby(comb_cols, sort=False, as_index=False)   # preserve first-seen order
    .apply(lambda g: g.sort_values("_orig"))          # keep original row order
)

ordered_attn_df = (
    ordered_blocks
    .droplevel(0)
    .reset_index(drop=True)
)

print(f"Final shape: {ordered_attn_df.shape}")
# ordered_attn_df now has blocks of 5 or 10 rows in the right order


Final shape: (3325, 26)


  .apply(lambda g: g.sort_values("_orig"))          # keep original row order


In [5]:
ordered_attn_df["duration_us"] = (
    ordered_attn_df["End_Timestamp"] - ordered_attn_df["Start_Timestamp"]
) / 1_000

In [6]:
ordered_attn_df

Unnamed: 0,_orig,Kind,Agent_Id,Queue_Id,Thread_Id,Dispatch_Id,Kernel_Id,Kernel_Name,Correlation_Id,Start_Timestamp,...,Grid_Size_Y,Grid_Size_Z,mode,prefill_batch,prefill_len,decode_batch_size,decode_len,cu_mask,source_file,duration_us
0,42,KERNEL_DISPATCH,7,2,203023,43,273,kernel_unified_attention_2d,43,270611927066532,...,8,1,decode,1,1024,128,256,32,standalone_attn_decode_1.0_1024.0_128.0_256.0_...,741.297
1,43,KERNEL_DISPATCH,7,2,203023,44,273,kernel_unified_attention_2d,44,270611928074879,...,8,1,decode,1,1024,128,256,32,standalone_attn_decode_1.0_1024.0_128.0_256.0_...,754.807
2,44,KERNEL_DISPATCH,7,2,203023,45,273,kernel_unified_attention_2d,45,270611928956255,...,8,1,decode,1,1024,128,256,32,standalone_attn_decode_1.0_1024.0_128.0_256.0_...,738.249
3,45,KERNEL_DISPATCH,7,2,203023,46,273,kernel_unified_attention_2d,46,270611929802391,...,8,1,decode,1,1024,128,256,32,standalone_attn_decode_1.0_1024.0_128.0_256.0_...,754.166
4,46,KERNEL_DISPATCH,7,2,203023,47,273,kernel_unified_attention_2d,47,270611930662118,...,8,1,decode,1,1024,128,256,32,standalone_attn_decode_1.0_1024.0_128.0_256.0_...,760.541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3320,24278,KERNEL_DISPATCH,7,1,245278,48,274,kernel_unified_attention_2d,48,276467464811420,...,8,1,prefill_and_decode,8,512,64,512,32,standalone_attn_prefill_and_decode_8.0_512.0_6...,1908.126
3321,24279,KERNEL_DISPATCH,7,2,245278,49,273,kernel_unified_attention_2d,49,276467466839981,...,8,1,prefill_and_decode,8,512,64,512,32,standalone_attn_prefill_and_decode_8.0_512.0_6...,708.421
3322,24280,KERNEL_DISPATCH,7,1,245278,50,274,kernel_unified_attention_2d,50,276467466923893,...,8,1,prefill_and_decode,8,512,64,512,32,standalone_attn_prefill_and_decode_8.0_512.0_6...,1915.142
3323,24281,KERNEL_DISPATCH,7,2,245278,51,273,kernel_unified_attention_2d,51,276467468953495,...,8,1,prefill_and_decode,8,512,64,512,32,standalone_attn_prefill_and_decode_8.0_512.0_6...,690.339


In [7]:
ordered_attn_df.to_csv('ordered_attn_df.csv')

In [8]:
def _summarise_group(g: pd.DataFrame) -> pd.Series:
    g = g.sort_values("_orig")                # keep original chronological order
    mode = g["mode"].iat[0]

    # initialise with NaN – will be overwritten as needed
    out = {
        "avg_duration_us":          pd.NA,
        "avg_decode_duration_us":   pd.NA,
        "avg_prefill_duration_us":  pd.NA,
    }
    dur = g["duration_us"]
    decode_vals  = dur[g["Kernel_Id"] == 273]
    prefill_vals = dur[g["Kernel_Id"] == 274]

    if mode in {"prefill", "decode"}:
        out["avg_duration_us"] = g["duration_us"].iloc[-3:].mean()
    else:      
        out["avg_decode_duration_us"]  = decode_vals.iloc[-3:].mean()
        out["avg_prefill_duration_us"] = prefill_vals.iloc[-3:].mean()
        
        
        # a=sub.iloc[[0, 2, 4]]["duration_us"].mean()
        # b=sub.iloc[[1, 3, 5]]["duration_us"].mean()
        # out["avg_decode_duration_us"]  = min(a,b)
        # out["avg_prefill_duration_us"] = max(a,b)

    return pd.Series(out)


summary_df = (
    ordered_attn_df
    .groupby(
        ["mode","prefill_batch","prefill_len","decode_batch_size","decode_len","cu_mask"],
        sort=False,
    )
    .apply(_summarise_group)
    .reset_index()
)


# ── 3. build summary – keep keys as columns, THEN reset_index ──────────────────
summary_df = (
    ordered_attn_df
    .groupby(comb_cols, sort=False)      # as_index=True by default
    .apply(_summarise_group)
    .reset_index()                       # keys become real columns
)

print(summary_df.head())

  .apply(_summarise_group)
  .apply(_summarise_group)


     mode  prefill_batch  prefill_len  decode_batch_size  decode_len  cu_mask  \
0  decode              1         1024                128         256       32   
1  decode              1         1024                128         512       32   
2  decode              1         1024                256         256       32   
3  decode              1         1024                256         512       32   
4  decode              1         1024                 32         256       32   

  avg_duration_us avg_decode_duration_us avg_prefill_duration_us  
0      750.985333                   <NA>                    <NA>  
1     1326.648667                   <NA>                    <NA>  
2     1488.378667                   <NA>                    <NA>  
3     2622.133333                   <NA>                    <NA>  
4      189.981333                   <NA>                    <NA>  


In [9]:
summary_df.tail()

Unnamed: 0,mode,prefill_batch,prefill_len,decode_batch_size,decode_len,cu_mask,avg_duration_us,avg_decode_duration_us,avg_prefill_duration_us
494,prefill_and_decode,8,512,32,512,32,,361.560667,1907.056667
495,prefill_and_decode,8,512,512,256,32,,3090.230667,1870.359333
496,prefill_and_decode,8,512,512,512,32,,5451.580333,2055.904
497,prefill_and_decode,8,512,64,256,32,,398.538667,1900.067333
498,prefill_and_decode,8,512,64,512,32,,714.715333,1903.221333


In [10]:
summary_df[summary_df['mode']=='prefill']

Unnamed: 0,mode,prefill_batch,prefill_len,decode_batch_size,decode_len,cu_mask,avg_duration_us,avg_decode_duration_us,avg_prefill_duration_us
166,prefill,1,1024,128,256,32,952.967,,
167,prefill,1,1024,128,512,32,957.19,,
168,prefill,1,1024,256,256,32,962.388667,,
169,prefill,1,1024,256,512,32,974.59,,
170,prefill,1,1024,32,256,32,953.648333,,
...,...,...,...,...,...,...,...,...,...
328,prefill,8,512,32,512,32,1905.012,,
329,prefill,8,512,512,256,32,1909.128,,
330,prefill,8,512,512,512,32,1903.261667,,
331,prefill,8,512,64,256,32,1898.289667,,


In [11]:
summary_df.to_csv('result.csv')

In [12]:
df_res=pd.read_csv('result.csv')
df_res

Unnamed: 0.1,Unnamed: 0,mode,prefill_batch,prefill_len,decode_batch_size,decode_len,cu_mask,avg_duration_us,avg_decode_duration_us,avg_prefill_duration_us
0,0,decode,1,1024,128,256,32,750.985333,,
1,1,decode,1,1024,128,512,32,1326.648667,,
2,2,decode,1,1024,256,256,32,1488.378667,,
3,3,decode,1,1024,256,512,32,2622.133333,,
4,4,decode,1,1024,32,256,32,189.981333,,
...,...,...,...,...,...,...,...,...,...,...
494,494,prefill_and_decode,8,512,32,512,32,,361.560667,1907.056667
495,495,prefill_and_decode,8,512,512,256,32,,3090.230667,1870.359333
496,496,prefill_and_decode,8,512,512,512,32,,5451.580333,2055.904000
497,497,prefill_and_decode,8,512,64,256,32,,398.538667,1900.067333


In [13]:
df_exp=pd.read_csv("../Decode Mask Experiment.csv")
df_exp

Unnamed: 0,Prefill Len,Prefill Batch,Prefill Tokens,Decode batch size,Decode len,CU mask,Prefill time in isolation,Prefill time with interference,Serial Prefill time,Slowdown in prefill (H-G),Decode time in isolation,Decode Time with interference,Serial Decode Time,Slowdown in decode(L-K),Total Serial Time,Total Time with Interference,Overlap,Unnamed: 17,Unnamed: 18,Slowdown in decode ()
0,256,1,256,32,256,32.0,,,,,,,,,,,,,,
1,256,4,1024,32,256,32.0,,,,,,,,,,,,,,
2,256,8,2048,32,256,32.0,,,,,,,,,,,,,,
3,256,16,4096,32,256,32.0,,,,,,,,,,,,,,
4,256,32,8192,32,256,32.0,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1694,2048,1,2048,512,4096,,,,,,,,,,,,,,,
1695,2048,4,8192,512,4096,,,,,,,,,,,,,,,
1696,2048,8,16384,512,4096,,,,,,,,,,,,,,,
1697,4096,1,4096,512,4096,,,,,,,,,,,,,,,


In [14]:
import pandas as pd

# ── 1. load the two CSVs ──────────────────────────────────────────────────────
df_res = pd.read_csv("result.csv")                             # has mode + avg_* cols
df_exp = pd.read_csv("../Decode Mask Experiment.csv")

# ── 2. normalize the experiment sheet’s key names ────────────────────────────
df_exp = df_exp.rename(columns={
    "Prefill Len":       "prefill_len",
    "Prefill Batch":     "prefill_batch",
    "Decode batch size": "decode_batch_size",
    "Decode len":        "decode_len",
    "CU mask":           "cu_mask",
})
comb_cols = ["prefill_batch", "prefill_len", "decode_batch_size", "decode_len", "cu_mask"]
df_exp[comb_cols] = df_exp[comb_cols].apply(pd.to_numeric, errors="ignore")

# ── 3. carve out three summaries ───────────────────────────────────────────────
# 3a) prefill-only → Prefill time in isolation
df_pref = (
    df_res[df_res["mode"] == "prefill"]
    .loc[:, comb_cols + ["avg_duration_us"]]
    .rename(columns={"avg_duration_us": "Prefill time in isolation"})
)

# 3b) decode-only → Decode time in isolation
df_dec = (
    df_res[df_res["mode"] == "decode"]
    .loc[:, comb_cols + ["avg_duration_us"]]
    .rename(columns={"avg_duration_us": "Decode time in isolation"})
)

# 3c) prefill_and_decode → interference columns
df_both = (
    df_res[df_res["mode"] == "prefill_and_decode"]
    .loc[:, comb_cols + ["avg_prefill_duration_us", "avg_decode_duration_us"]]
    .rename(columns={
        "avg_prefill_duration_us": "Prefill time with interference",
        "avg_decode_duration_us":  "Decode Time with interference"
    })
)

# ── 4. stitch them into one wide summary ──────────────────────────────────────
df_summary = (
    df_pref
    .merge(df_dec, on=comb_cols, how="outer")
    .merge(df_both, on=comb_cols, how="outer")
)

# ── 5. merge wide summary into your experiment sheet ──────────────────────────
df_filled = df_exp.merge(df_summary, on=comb_cols, how="left")

# ── 6. save the filled sheet ─────────────────────────────────────────────────
df_filled.to_csv("Decode Mask Experiment - final.csv", index=False)
print("✅ Wrote Decode Mask Experiment - final.csv")


✅ Wrote Decode Mask Experiment - final.csv


  df_exp[comb_cols] = df_exp[comb_cols].apply(pd.to_numeric, errors="ignore")


In [15]:
df_filled[:20]

Unnamed: 0,prefill_len,prefill_batch,Prefill Tokens,decode_batch_size,decode_len,cu_mask,Prefill time in isolation_x,Prefill time with interference_x,Serial Prefill time,Slowdown in prefill (H-G),...,Total Serial Time,Total Time with Interference,Overlap,Unnamed: 17,Unnamed: 18,Slowdown in decode (),Prefill time in isolation_y,Decode time in isolation_y,Prefill time with interference_y,Decode Time with interference_y
0,256,1,256,32,256,32.0,,,,,...,,,,,,,71.590333,190.035,72.071667,193.095
1,256,4,1024,32,256,32.0,,,,,...,,,,,,,271.915333,190.155333,271.447667,193.857
2,256,8,2048,32,256,32.0,,,,,...,,,,,,,526.952,190.195333,534.195333,193.536
3,256,16,4096,32,256,32.0,,,,,...,,,,,,,1089.933667,189.728,,784.448
4,256,32,8192,32,256,32.0,,,,,...,,,,,,,2155.478,190.703,,1483.394
5,512,1,512,32,256,32.0,,,,,...,,,,,,,248.809,190.101667,248.141333,194.324667
6,512,4,2048,32,256,32.0,,,,,...,,,,,,,971.329333,189.927667,947.114,193.990333
7,512,8,4096,32,256,32.0,,,,,...,,,,,,,1913.497667,189.713667,1891.300667,213.381667
8,512,16,8192,32,256,32.0,,,,,...,,,,,,,3835.468333,190.409333,,2598.692333
9,1024,1,1024,32,256,32.0,,,,,...,,,,,,,953.648333,189.981333,963.497667,205.804333


In [29]:
df_filled['Slowdown in Prefill']=df_filled['Prefill time with interference_y']-df_filled['Prefill time in isolation_y']

In [17]:
df_filled['Slowdown in Decode']=df_filled['Decode Time with interference_y']-df_filled['Decode time in isolation_y']

In [22]:
df_filled

Unnamed: 0,prefill_len,prefill_batch,Prefill Tokens,decode_batch_size,decode_len,cu_mask,Prefill time in isolation_x,Prefill time with interference_x,Serial Prefill time,Slowdown in prefill (H-G),...,Overlap,Unnamed: 17,Unnamed: 18,Slowdown in decode (),Prefill time in isolation_y,Decode time in isolation_y,Prefill time with interference_y,Decode Time with interference_y,Slowdown in Prefill,Slowdown in Decode
0,256,1,256,32,256,32.0,,,,,...,,,,,71.590333,190.035000,72.071667,193.095,0.481333,3.060000
1,256,4,1024,32,256,32.0,,,,,...,,,,,271.915333,190.155333,271.447667,193.857,-0.467667,3.701667
2,256,8,2048,32,256,32.0,,,,,...,,,,,526.952000,190.195333,534.195333,193.536,7.243333,3.340667
3,256,16,4096,32,256,32.0,,,,,...,,,,,1089.933667,189.728000,,784.448,,594.720000
4,256,32,8192,32,256,32.0,,,,,...,,,,,2155.478000,190.703000,,1483.394,,1292.691000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1694,2048,1,2048,512,4096,,,,,,...,,,,,,,,,,
1695,2048,4,8192,512,4096,,,,,,...,,,,,,,,,,
1696,2048,8,16384,512,4096,,,,,,...,,,,,,,,,,
1697,4096,1,4096,512,4096,,,,,,...,,,,,,,,,,


In [30]:
print(df_filled['Slowdown in Prefill'].mean())
print(df_filled['Prefill time in isolation_y'].mean())

69.09479166666672
7647.018798403194


In [31]:
print(df_filled['Slowdown in Decode'].mean())
print(df_filled['Decode time in isolation_y'].mean())

168.1874939759036
1510.0693373493978


In [27]:
!pip install ace_tools

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [28]:
import pandas as pd
import ace_tools
slow = df_filled.pivot_table(
    index='prefill_len',
    columns='prefill_batch',
    values='Slowdown in Prefill',
    aggfunc='mean'
)
iso = df_filled.pivot_table(
    index='prefill_len',
    columns='prefill_batch',
    values='Prefill time in isolation_y',
    aggfunc='mean'
)

# 2) Calculate the percentage slowdown matrix
pct_slowdown = slow / iso

# 3) Display the matrix
ace_tools.display_dataframe_to_user("Percentage Slowdown Matrix", pct_slowdown)

ModuleNotFoundError: No module named 'ace_tools'