Parse the profiler output to compare input/output rows and op times for GPU filter operations.

In [None]:
%matplotlib notebook
import numpy as np
import json
import os

import matplotlib.pyplot as plt
import pandas as pd

# The three lists below are index-aligned: entry i of each one describes the
# same scale factor.

# Directories whose entries each hold a sql_plan_metrics_for_application.csv.
profile_dirs = [
    "/opt/data/profiles/dataproc-gpu-sf100",
    "/opt/data/profiles/dataproc-gpu-sf3k",
    "/opt/data/profiles/dataproc-gpu-sf5k",
    "/opt/data/profiles/dataproc-gpu-sf10k",
]

# Directories whose entries are event files read as one JSON document per line.
event_dirs = [
    "/opt/data/events/dataproc-gpu-sf100",
    "/opt/data/events/dataproc-gpu-sf3k",
    "/opt/data/events/dataproc-gpu-sf5k",
    "/opt/data/events/dataproc-gpu-sf10k",
]

# Labels used in progress messages and plot titles/legends.
scale_factors = [
    "100",
    "3K",
    "5K",
    "10K",
]

In [None]:
class Filter:
    """Row counts and op time for one filter node, read from its metric rows.

    ``group`` is walked with ``itertuples()``; every row must expose ``name``
    and ``total``, and "op time" rows must also expose ``accumulatorId``.
    ``input_rows`` is never filled in here and stays 0 until a caller sets it.
    """

    def __init__(self, group):
        self.input_rows = 0
        self.output_rows = 0
        self.op_time = 0
        self.accumulator_id = 0
        for metric in group.itertuples():
            kind = metric.name
            if kind == "op time":
                # The "op time" accumulator doubles as this filter's identity.
                self.accumulator_id = metric.accumulatorId
                # Scale the raw total down by 1e6 (presumably ns -> ms, since
                # the plots label this value as ms; confirm against the CSV).
                self.op_time = metric.total / 1e6
            elif kind == "output rows":
                self.output_rows = metric.total

    def __str__(self):
        fields = (self.input_rows, self.output_rows, self.op_time, self.accumulator_id)
        return ", ".join(f"{value}" for value in fields)


def collect_output_rows(profile_dir):
    """Map accumulator ID -> total for every "output rows" metric.

    Reads ``sql_plan_metrics_for_application.csv`` from ``profile_dir`` and
    returns a plain dict keyed by the ``accumulatorId`` column.
    """
    csv_path = f"{profile_dir}/sql_plan_metrics_for_application.csv"
    metrics = pd.read_csv(csv_path)
    output_row_metrics = metrics.loc[metrics["name"] == "output rows"]
    return output_row_metrics.set_index("accumulatorId")["total"].to_dict()


def collect_filters(profile_dir):
    """Build one Filter per GpuFilter node, keyed by its accumulator_id.

    Reads ``sql_plan_metrics_for_application.csv`` from ``profile_dir``, keeps
    the rows whose ``nodeName`` is ``GpuFilter`` and groups them by ``nodeID``.
    If two filters end up with the same accumulator_id, the later group wins.
    """
    metrics = pd.read_csv(f"{profile_dir}/sql_plan_metrics_for_application.csv")
    gpu_filter_rows = metrics.loc[metrics['nodeName'] == 'GpuFilter']
    built = (Filter(rows) for _, rows in gpu_filter_rows.groupby("nodeID"))
    return {flt.accumulator_id: flt for flt in built}


def collect(node, filter_list, output_rows_map, filter_map):
    """Walk a plan tree and gather every known GpuFilter with its input rows.

    For each node named ``GpuFilter`` the filter object is looked up in
    ``filter_map`` by the accumulator ID of the node's "op time" metric. Its
    ``input_rows`` is then taken from ``output_rows_map`` using the accumulator
    ID of the first child's "output rows" metric, and the filter is appended
    to ``filter_list``. Missing IDs are reported with ``print``.

    Args:
        node: plan node dict with ``nodeName``, ``metrics`` and ``children``.
        filter_list: list that matched filter objects are appended to.
        output_rows_map: accumulator ID -> "output rows" total.
        filter_map: "op time" accumulator ID -> filter object.

    Returns:
        None. Results are delivered by mutating ``filter_list`` and the
        filter objects themselves.
    """
    if node['nodeName'] == 'GpuFilter':
        f = None
        for m in node['metrics']:
            if m['name'] == 'op time':
                op_time_id = m['accumulatorId']
                if op_time_id in filter_map:
                    f = filter_map[op_time_id]
                else:
                    print(f"Accumulator ID {op_time_id} not found in filter map")
        if f is not None:
            # The filter's input is whatever its first child produced.
            child = node['children'][0]
            for m in child['metrics']:
                if m['name'] == 'output rows':
                    input_id = m['accumulatorId']
                    if input_id in output_rows_map:
                        f.input_rows = output_rows_map[input_id]
                    else:
                        # Report the ID that was looked up and missed, not the
                        # filter's own op-time accumulator ID.
                        print(f"Accumulator ID {input_id} not found in output rows map")
                    # Appended even on a miss; input_rows then keeps its prior value.
                    filter_list.append(f)
    for child in node['children']:
        collect(child, filter_list, output_rows_map, filter_map)


def collect_input_rows(event_file, output_rows_map, filter_map):
    """Return the filters found in the last plan recorded in ``event_file``.

    The file is read as one JSON document per line. The last document that
    contains a ``sparkPlanInfo`` key supplies the plan tree, which is handed
    to ``collect`` together with the two lookup maps. Fails an assertion when
    no line carries a plan.
    """
    plan_event = None
    with open(event_file, "r") as log:
        for raw_line in log:
            parsed = json.loads(raw_line)
            # Later plans overwrite earlier ones; only the final one is used.
            if "sparkPlanInfo" in parsed:
                plan_event = parsed
    assert plan_event is not None
    collected = []
    collect(plan_event["sparkPlanInfo"], collected, output_rows_map, filter_map)
    return collected


# Per-scale-factor samples: index i of each list belongs to scale_factors[i].
num_scale_factors = len(scale_factors)
input_rows = [[] for _ in range(num_scale_factors)]
output_rows = [[] for _ in range(num_scale_factors)]
op_times = [[] for _ in range(num_scale_factors)]
for i in range(num_scale_factors):
    profile_root = profile_dirs[i]
    event_root = event_dirs[i]
    profile_apps = sorted(os.listdir(profile_root))
    event_apps = sorted(os.listdir(event_root))
    # Profiles and event files are paired purely by their sorted position.
    assert len(profile_apps) == len(event_apps)
    for profile_app, event_app in zip(profile_apps, event_apps):
        print(f"Processing profile {profile_app}, event {event_app}, scale factor {scale_factors[i]}...")
        profile_path = f"{profile_root}/{profile_app}"
        output_rows_dict = collect_output_rows(profile_path)
        filter_dict = collect_filters(profile_path)
        filters = collect_input_rows(f"{event_root}/{event_app}", output_rows_dict, filter_dict)
        for fltr in filters:
            # Zero values are rejected; the plotting cells take np.log of every sample.
            if fltr.input_rows != 0 and fltr.output_rows != 0 and fltr.op_time != 0:
                input_rows[i].append(fltr.input_rows)
                output_rows[i].append(fltr.output_rows)
                op_times[i].append(fltr.op_time)
            else:
                print(f"Invalid filter: {fltr}")

# Number of accepted samples per scale factor.
print(f"{[len(samples) for samples in input_rows]}")

In [None]:
# One 3D scatter per scale factor, with all three axes on a natural-log scale.
for i in range(num_scale_factors):
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(projection='3d')
    log_input = np.log(input_rows[i])
    log_output = np.log(output_rows[i])
    log_op_time = np.log(op_times[i])
    ax.scatter(log_input, log_output, log_op_time)
    ax.set_title(f"GPU Filter Op Times - SF{scale_factors[i]}")
    ax.set_xlabel("Input Rows (log)")
    ax.set_ylabel("Output Rows (log)")
    ax.set_zlabel("Op Time (ms) (log)")
    plt.show()

In [None]:
# All scale factors on one 3D scatter, one labelled series per scale factor.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(projection='3d')
for i in range(num_scale_factors):
    series_label = f"SF{scale_factors[i]}"
    log_input = np.log(input_rows[i])
    log_output = np.log(output_rows[i])
    log_op_time = np.log(op_times[i])
    ax.scatter(log_input, log_output, log_op_time, label=series_label)
ax.set_title("GPU Filter Op Times - Combined")
ax.set_xlabel("Input Rows (log)")
ax.set_ylabel("Output Rows (log)")
ax.set_zlabel("Op Time (ms) (log)")
ax.legend()
plt.show()