Parse the profiler output to compare input rows, output rows, op times, and sort times for GPU top N (`GpuTopN`) operations across scale factors.

In [None]:
%matplotlib notebook
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Scale-factor label -> directory holding the profiler output for that run.
# Kept as a single ordered mapping so that labels and directories cannot drift
# out of step; the two parallel lists used by the cells below are derived from it.
_PROFILE_DIR_BY_SCALE_FACTOR = {
    "100": "/opt/data/profiles/dataproc-gpu-sf100",
    "3K": "/opt/data/profiles/dataproc-gpu-sf3k",
    "5K": "/opt/data/profiles/dataproc-gpu-sf5k",
    "10K": "/opt/data/profiles/dataproc-gpu-sf10k",
}
# profile_dirs[i] is the profile directory for scale factor scale_factors[i].
profile_dirs = list(_PROFILE_DIR_BY_SCALE_FACTOR.values())
scale_factors = list(_PROFILE_DIR_BY_SCALE_FACTOR)

In [None]:
# Collect per-node GpuTopN metrics from every application profile.
# Each result list has one inner list per scale factor; inner list i holds the
# values gathered from the applications found under profile_dirs[i].

# File name of the per-application SQL plan metrics CSV read from each app directory.
_METRICS_CSV = "sql_plan_metrics_for_application.csv"
# Divisor applied to the raw "op time" / "sort time" totals. The plots label the
# scaled values as milliseconds, so the raw totals are presumably nanoseconds
# -- TODO confirm against the profiler documentation.
_TIME_DIVISOR = 1000000.


def _gpu_topn_metrics(sql_info):
    """Yield one (input rows, output rows, op time, sort time) tuple per GpuTopN node.

    Args:
        sql_info: DataFrame loaded from the SQL plan metrics CSV. Must contain
            the columns ``nodeName``, ``nodeID``, ``name`` and ``total``.

    Yields:
        A 4-tuple of the node's "input rows" total, "output rows" total, and
        its "op time" and "sort time" totals divided by ``_TIME_DIVISOR``.
        A metric that is absent for a node is reported as 0.
    """
    topn = sql_info[sql_info['nodeName'] == 'GpuTopN']
    # Node IDs are only unique within a single SQL query, so grouping on nodeID
    # alone would merge unrelated nodes from different queries (and the last row
    # seen would silently win). Include sqlID in the key whenever it is present.
    group_keys = ["nodeID"]
    if "sqlID" in topn.columns:
        group_keys.insert(0, "sqlID")
    for _, node_metrics in topn.groupby(group_keys):
        # Metric name -> total for this node; a repeated name keeps the last row.
        totals = dict(zip(node_metrics["name"], node_metrics["total"]))
        yield (
            totals.get("input rows", 0),
            totals.get("output rows", 0),
            totals.get("op time", 0) / _TIME_DIVISOR,
            totals.get("sort time", 0) / _TIME_DIVISOR,
        )


num_scale_factors = len(profile_dirs)
input_rows = [[] for _ in range(num_scale_factors)]
output_rows = [[] for _ in range(num_scale_factors)]
op_times = [[] for _ in range(num_scale_factors)]
sort_times = [[] for _ in range(num_scale_factors)]
for i, profile_dir in enumerate(profile_dirs):
    # Sorted so that applications are processed in a reproducible order.
    for app in sorted(os.listdir(profile_dir)):
        metrics_path = f"{profile_dir}/{app}/{_METRICS_CSV}"
        # Skip stray entries (log files, partial outputs, ...) instead of
        # aborting the whole run, but say so to keep missing data visible.
        if not os.path.isfile(metrics_path):
            print(f"Skipping {profile_dir}/{app}: {_METRICS_CSV} not found")
            continue
        print(f"Processing gpu profile {profile_dir}/{app}")
        sql_info = pd.read_csv(metrics_path)
        for rows_in, rows_out, op_time, sort_time in _gpu_topn_metrics(sql_info):
            # Keep only nodes that reported all four metrics with non-zero
            # values; the plots below take logarithms of these numbers.
            if rows_in != 0 and rows_out != 0 and op_time != 0 and sort_time != 0:
                input_rows[i].append(rows_in)
                output_rows[i].append(rows_out)
                op_times[i].append(op_time)
                sort_times[i].append(sort_time)
# Sanity check: number of collected samples per scale factor, one line per metric.
for collected in (input_rows, output_rows, op_times, sort_times):
    print([len(x) for x in collected])

In [None]:
# One 3-D scatter figure per scale factor: natural log of input rows (x),
# output rows (y) and op time in ms (z) for every collected GpuTopN node.
for i, (rows_in, rows_out, times_ms) in enumerate(zip(input_rows, output_rows, op_times)):
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(projection='3d')
    log_rows_in = np.log(rows_in)
    log_rows_out = np.log(rows_out)
    log_times_ms = np.log(times_ms)
    ax.scatter(log_rows_in, log_rows_out, log_times_ms)
    ax.set_title(f"GPU Top N Op Times - SF{scale_factors[i]}")
    ax.set_xlabel("Input Rows (log)")
    ax.set_ylabel("Output Rows (log)")
    ax.set_zlabel("Op Time (ms) (log)")
    plt.show()

In [None]:
# Single 3-D scatter overlaying every scale factor (one labelled series each),
# using the natural log of input rows, output rows and op time in ms.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(projection='3d')
ax.set_title("GPU Top N Op Times - Combined")
ax.set_xlabel("Input Rows (log)")
ax.set_ylabel("Output Rows (log)")
ax.set_zlabel("Op Time (ms) (log)")
for i, (rows_in, rows_out, times_ms) in enumerate(zip(input_rows, output_rows, op_times)):
    series_label = f"SF{scale_factors[i]}"
    ax.scatter(np.log(rows_in), np.log(rows_out), np.log(times_ms), label=series_label)
plt.legend()
plt.show()

In [None]:
# 2-D scatter of sort time against op time (both in ms) on log-log axes,
# with one labelled series per scale factor.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title("GPU Top N Sort Time vs Op Time - Combined")
ax.set_xlabel("Sort Time (ms)")
ax.set_ylabel("Op Time (ms)")
for i, (sort_ms, op_ms) in enumerate(zip(sort_times, op_times)):
    series_label = f"SF{scale_factors[i]}"
    ax.scatter(sort_ms, op_ms, label=series_label)
ax.set_xscale('log')
ax.set_yscale('log')
plt.legend()
plt.show()