Parse the raw Spark event logs, together with the per-application SQL plan metrics CSVs, to compare the scan times of CPU and GPU Parquet scan operations.

In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Root directories holding the profile output and the event logs for the
# CPU run and the GPU run.
cpu_profiles = "/opt/data/profiles/dataproc-cpu-sf3k"
gpu_profiles = "/opt/data/profiles/dataproc-gpu-sf3k"
cpu_events = "/opt/data/events/dataproc-cpu-sf3k"
gpu_events = "/opt/data/events/dataproc-gpu-sf3k"

# Directory listings in sorted order, so that entry i of each listing can be
# addressed by the same index.
cpu_profile_dirs = sorted(os.listdir(cpu_profiles))
gpu_profile_dirs = sorted(os.listdir(gpu_profiles))

cpu_event_files = sorted(os.listdir(cpu_events))
gpu_event_files = sorted(os.listdir(gpu_events))

# Every listing must contain the same number of entries as the CPU event logs.
num_apps = len(cpu_event_files)
assert all(
    len(listing) == num_apps
    for listing in (cpu_event_files, gpu_event_files, cpu_profile_dirs, gpu_profile_dirs)
)
num_apps

In [None]:
def collect(node, min_times, median_times, max_times, total_times, min_map, median_map, max_map, total_map):
    """Gather 'scan time' statistics from a plan tree into the given lists.

    The tree is walked depth-first, parent before children. For a node whose
    ``nodeName`` is exactly ``'GpuScan parquet '`` or ``'Scan parquet '``
    (note the trailing space), every metric named ``'scan time'`` contributes
    one value to each output list, looked up by the metric's
    ``accumulatorId``. An accumulator id missing from a map contributes 0.

    Args:
        node: Plan node dict with ``'nodeName'`` and ``'children'`` keys;
            matching scan nodes must also carry a ``'metrics'`` list of dicts
            with ``'name'`` and ``'accumulatorId'`` keys.
        min_times: List that receives values from ``min_map``.
        median_times: List that receives values from ``median_map``.
        max_times: List that receives values from ``max_map``.
        total_times: List that receives values from ``total_map``.
        min_map: Mapping of accumulator id to the minimum value.
        median_map: Mapping of accumulator id to the median value.
        max_map: Mapping of accumulator id to the maximum value.
        total_map: Mapping of accumulator id to the total value.

    Returns:
        None. The four output lists are appended to in place.
    """
    if node['nodeName'] in ('GpuScan parquet ', 'Scan parquet '):
        # Each output list is paired with the lookup table that feeds it.
        sinks = (
            (min_times, min_map),
            (median_times, median_map),
            (max_times, max_map),
            (total_times, total_map),
        )
        for metric in node['metrics']:
            if metric['name'] != 'scan time':
                continue
            acc_id = metric['accumulatorId']
            for values, lookup in sinks:
                values.append(lookup.get(acc_id, 0))

    for subtree in node['children']:
        collect(subtree, min_times, median_times, max_times, total_times, min_map, median_map, max_map, total_map)


def collect_scan_times(profile_dir, event_file):
    """Return sorted 'scan time' statistics for the scans of one application.

    The ``scan time`` rows of
    ``{profile_dir}/sql_plan_metrics_for_application.csv`` provide min, median,
    max and total values keyed by ``accumulatorId``. The event file is read as
    one JSON document per line. Each event that contains ``sparkPlanInfo``
    discards whatever was gathered so far and gathers afresh from that plan
    via ``collect``, so only the last such event contributes to the result.

    Args:
        profile_dir: Directory containing
            ``sql_plan_metrics_for_application.csv``.
        event_file: Path of the event log to read.

    Returns:
        A tuple ``(min_times, median_times, max_times, total_times)`` of
        lists. Each list is sorted independently, so equal positions in
        different lists do not necessarily describe the same scan.
    """
    metrics = pd.read_csv(f"{profile_dir}/sql_plan_metrics_for_application.csv")
    scan_rows = metrics[metrics["name"] == "scan time"].set_index("accumulatorId")
    min_dict, median_dict, max_dict, total_dict = (
        scan_rows[column].to_dict() for column in ("min", "median", "max", "total")
    )

    # Buckets in the order: min, median, max, total.
    collected = ([], [], [], [])
    with open(event_file, "r") as log:
        for raw_line in log:
            event = json.loads(raw_line)
            if "sparkPlanInfo" not in event:
                continue
            # A newer plan replaces everything gathered from earlier ones.
            for bucket in collected:
                bucket.clear()
            collect(event["sparkPlanInfo"], *collected, min_dict, median_dict, max_dict, total_dict)

    for bucket in collected:
        bucket.sort()
    return collected


# Per-scan statistics pooled over every application that was kept.
cpu_min_times, cpu_median_times, cpu_max_times, cpu_total_times = [], [], [], []
gpu_min_times, gpu_median_times, gpu_max_times, gpu_total_times = [], [], [], []
# One entry per kept application: the sum of that application's scan totals.
cpu_query_total_times = []
gpu_query_total_times = []

for i in range(num_apps):
    cpu_profile_dir, cpu_event_file = cpu_profile_dirs[i], cpu_event_files[i]
    print(f"Processing cpu profile {cpu_profile_dir}, event file {cpu_event_file}")
    cpu_min, cpu_median, cpu_max, cpu_total = collect_scan_times(
        f"{cpu_profiles}/{cpu_profile_dir}", f"{cpu_events}/{cpu_event_file}"
    )

    gpu_profile_dir, gpu_event_file = gpu_profile_dirs[i], gpu_event_files[i]
    print(f"Processing gpu profile {gpu_profile_dir}, event file {gpu_event_file}")
    gpu_min, gpu_median, gpu_max, gpu_total = collect_scan_times(
        f"{gpu_profiles}/{gpu_profile_dir}", f"{gpu_events}/{gpu_event_file}"
    )

    # The CPU and GPU values are later plotted against each other position by
    # position, so an application with differing scan counts is left out.
    if len(cpu_min) != len(gpu_min):
        print(f"Warning: cpu and gpu scan times have different lengths: {len(cpu_min)} vs {len(gpu_min)}")
        continue

    cpu_min_times += cpu_min
    cpu_median_times += cpu_median
    cpu_max_times += cpu_max
    cpu_total_times += cpu_total

    gpu_min_times += gpu_min
    gpu_median_times += gpu_median
    gpu_max_times += gpu_max
    gpu_total_times += gpu_total

    cpu_query_total_times.append(sum(cpu_total))
    gpu_query_total_times.append(sum(gpu_total))

In [None]:
# Scatter the pooled CPU scan totals against the GPU scan totals.
plt.scatter(cpu_total_times, gpu_total_times)

# Fit a degree-1 polynomial (straight line) of the GPU totals over the CPU totals.
slope, intercept = np.polyfit(cpu_total_times, gpu_total_times, deg=1)
print(slope, intercept)
regression_line = intercept + slope * np.asarray(cpu_total_times)

# Draw the fitted line in red on top of the scatter.
plt.plot(cpu_total_times, regression_line, color='red')

# Title and axis labels.
plt.title("Total Scan Times")
plt.xlabel("CPU")
plt.ylabel("GPU")

# Render the figure.
plt.show()

In [None]:
# Scatter the per-application CPU scan totals against the GPU scan totals.
plt.scatter(cpu_query_total_times, gpu_query_total_times)
print(len(cpu_query_total_times), len(gpu_query_total_times))

# Fit a degree-1 polynomial (straight line) of the GPU totals over the CPU totals.
slope, intercept = np.polyfit(cpu_query_total_times, gpu_query_total_times, deg=1)
print(slope, intercept)
regression_line = intercept + slope * np.asarray(cpu_query_total_times)

# Draw the fitted line in red on top of the scatter.
plt.plot(cpu_query_total_times, regression_line, color='red')

# Title and axis labels.
plt.title("Total Scan Times Per Query")
plt.xlabel("CPU")
plt.ylabel("GPU")

# Render the figure.
plt.show()