Parse the profiler output and compare each job's duration against the sum of its stage durations.

In [None]:
import os

import pandas as pd
from matplotlib import pyplot as plt

# Root directory under which every profiler-output directory lives.
_PROFILE_ROOT = "/opt/data/profiles"
# Directory-name suffixes, listed in the same order as `scale_factors` below.
_SF_DIR_SUFFIXES = ("100", "3k", "5k", "10k")

# One profile directory per scale factor, for the CPU and the GPU runs.
cpu_profile_dirs = [f"{_PROFILE_ROOT}/dataproc-cpu-sf{suffix}" for suffix in _SF_DIR_SUFFIXES]
gpu_profile_dirs = [f"{_PROFILE_ROOT}/dataproc-gpu-sf{suffix}" for suffix in _SF_DIR_SUFFIXES]
# Labels used in log messages and plot titles/legends; index-aligned with the
# two directory lists above.
scale_factors = ["100", "3K", "5K", "10K"]

In [None]:
def sum_job_stage_durations(profile):
    """Return the summed stage duration of every job that has several rows.

    Reads ``{profile}/sql_to_stage_information.csv``, groups its rows by
    ``jobID`` and adds up the ``Stage Duration`` column within each group.
    Jobs that appear on only a single row are left out of the result.

    Args:
        profile: Path of the directory containing the CSV file.

    Returns:
        A dict mapping each retained ``jobID`` to its ``Stage Duration`` total.
    """
    stage_info = pd.read_csv(f"{profile}/sql_to_stage_information.csv")
    durations_by_job = stage_info.groupby('jobID')['Stage Duration']
    totals = durations_by_job.sum()
    # size() counts rows per job; only jobs with more than one row are kept.
    has_multiple_rows = durations_by_job.size() > 1
    return totals[has_multiple_rows].to_dict()


def collect_job_times(profile, stage_sums_map, job_times, stage_times):
    """Append paired job and stage-sum durations for the jobs of one profile.

    Reads ``{profile}/job_information.csv`` and, for every row whose ``jobID``
    is a key of ``stage_sums_map``, appends ``endTime - startTime`` to
    ``job_times`` and the mapped stage total to ``stage_times``. Rows for
    other jobs are skipped, so both lists grow in lockstep.

    Args:
        profile: Path of the directory containing the CSV file.
        stage_sums_map: Mapping from ``jobID`` to summed stage duration.
        job_times: List extended in place with job durations.
        stage_times: List extended in place with the matching stage totals.

    Returns:
        None; the two lists are mutated in place.
    """
    job_info = pd.read_csv(f"{profile}/job_information.csv")
    for _, job in job_info.iterrows():
        key = job['jobID']
        if key not in stage_sums_map:
            continue
        elapsed = job['endTime'] - job['startTime']
        job_times.append(elapsed)
        stage_times.append(stage_sums_map[key])


# Gather, for every scale factor, the duration of each multi-stage job together
# with the sum of its stage durations, separately for the CPU and GPU profiles.
num_scale_factors = len(scale_factors)
# For a quick debugging run, override num_scale_factors with 1 here so that only
# the first scale factor is processed.
num_cpu_stages = 0
num_gpu_stages = 0
# Four independent lists-of-lists, each with one inner list per scale factor.
cpu_job_times, cpu_stage_times, gpu_job_times, gpu_stage_times = (
    [[] for _ in range(num_scale_factors)] for _ in range(4)
)
for i in range(num_scale_factors):
    cpu_root, gpu_root = cpu_profile_dirs[i], gpu_profile_dirs[i]
    cpu_apps = sorted(os.listdir(cpu_root))
    gpu_apps = sorted(os.listdir(gpu_root))
    # CPU and GPU application directories are paired by sorted position.
    assert len(cpu_apps) == len(gpu_apps)
    for cpu_app, gpu_app in zip(cpu_apps, gpu_apps):
        print(f"Processing cpu profile {cpu_app}, gpu profile {gpu_app}, scale factor {scale_factors[i]}...")
        cpu_profile = f"{cpu_root}/{cpu_app}"
        cpu_stage_sums = sum_job_stage_durations(cpu_profile)
        collect_job_times(cpu_profile, cpu_stage_sums, cpu_job_times[i], cpu_stage_times[i])
        gpu_profile = f"{gpu_root}/{gpu_app}"
        gpu_stage_sums = sum_job_stage_durations(gpu_profile)
        collect_job_times(gpu_profile, gpu_stage_sums, gpu_job_times[i], gpu_stage_times[i])

In [None]:
# One log-log scatter per scale factor: CPU job duration against the sum of
# that job's stage durations.
for sf_label, stage_totals, job_durations in zip(scale_factors, cpu_stage_times, cpu_job_times):
    plt.scatter(stage_totals, job_durations)
    plt.xscale('log')
    plt.yscale('log')
    plt.title(f"CPU Job vs Stage Durations - SF{sf_label}")
    plt.xlabel("Sum of Stage Durations")
    plt.ylabel("Job Duration")
    plt.show()

In [None]:
# One log-log scatter per scale factor: GPU job duration against the sum of
# that job's stage durations.
for sf_label, stage_totals, job_durations in zip(scale_factors, gpu_stage_times, gpu_job_times):
    plt.scatter(stage_totals, job_durations)
    plt.xscale('log')
    plt.yscale('log')
    plt.title(f"GPU Job vs Stage Durations - SF{sf_label}")
    plt.xlabel("Sum of Stage Durations")
    plt.ylabel("Job Duration")
    plt.show()

In [None]:
# Overlay every scale factor on a single log-log scatter of CPU job duration
# against the sum of that job's stage durations; the legend identifies each
# scale factor.
for i in range(num_scale_factors):
    plt.scatter(cpu_stage_times[i], cpu_job_times[i], label=f"SF{scale_factors[i]}")
plt.xscale('log')
plt.yscale('log')
plt.title("CPU Job vs Stage Durations - Combined")
plt.xlabel("Sum of Stage Durations")
# The y values are job durations (cpu_job_times), so label the axis accordingly.
plt.ylabel("Job Duration")
plt.legend()
plt.show()

In [None]:
# Overlay every scale factor on a single log-log scatter of GPU job duration
# against the sum of that job's stage durations; the legend identifies each
# scale factor.
for i in range(num_scale_factors):
    plt.scatter(gpu_stage_times[i], gpu_job_times[i], label=f"SF{scale_factors[i]}")
plt.xscale('log')
plt.yscale('log')
plt.title("GPU Job vs Stage Durations - Combined")
plt.xlabel("Sum of Stage Durations")
# The y values are job durations (gpu_job_times), so label the axis accordingly.
plt.ylabel("Job Duration")
plt.legend()
plt.show()

In [None]:
# Cross-platform view, one log-log scatter per scale factor: GPU summed stage
# durations on the x axis against CPU job durations on the y axis.
# NOTE(review): the GPU and CPU lists are filled independently, so pairing them
# by position assumes both hold the same jobs in the same order - TODO confirm.
# plt.scatter requires x and y to be the same size and fails otherwise.
for sf_label, gpu_stage_totals, cpu_job_durations in zip(scale_factors, gpu_stage_times, cpu_job_times):
    plt.scatter(gpu_stage_totals, cpu_job_durations)
    plt.xscale('log')
    plt.yscale('log')
    plt.title(f"CPU Job vs GPU Stage Durations - SF{sf_label}")
    plt.xlabel("GPU Sum of Stage Durations")
    plt.ylabel("CPU Job Duration")
    plt.show()