Parse the profiler output to compare input rows, output rows, and op times for GPU expand operations.

In [None]:
%matplotlib notebook
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Root directories of the profiler output analyzed by this notebook.
profile_dirs = [
    "/opt/data/profiles/dataproc-gpu-sf100",
    "/opt/data/profiles/dataproc-gpu-sf3k",
    "/opt/data/profiles/dataproc-gpu-sf5k",
    "/opt/data/profiles/dataproc-gpu-sf10k",
]

In [None]:
# Metrics that must all be present for a GpuExpand node to become one sample.
_EXPAND_METRICS = ("input rows", "output rows", "op time")


def _collect_expand_samples(sql_info):
    """Return one (input rows, output rows, op time) tuple per GpuExpand node.

    Args:
        sql_info: DataFrame loaded from sql_plan_metrics_for_application.csv;
            the 'nodeName', 'nodeID', 'name' and 'total' columns are read.

    Returns:
        A list of 3-tuples. The op time is the metric total divided by 1e6
        (presumably nanoseconds to milliseconds, matching the plot label --
        TODO confirm the unit). Nodes missing any of the three metrics are
        skipped so that the three values of a sample always belong together.
    """
    expand = sql_info[sql_info["nodeName"] == "GpuExpand"]
    # NOTE(review): nodeID presumably restarts for every SQL query, so group
    # by sqlID as well whenever the column is available -- confirm against the
    # profiler's CSV schema.
    keys = ["sqlID", "nodeID"] if "sqlID" in expand.columns else ["nodeID"]
    samples = []
    for _, group in expand.groupby(keys):
        totals = dict(zip(group["name"], group["total"]))
        if any(metric not in totals for metric in _EXPAND_METRICS):
            # An incomplete node would shift the three lists out of step.
            continue
        samples.append((totals["input rows"],
                        totals["output rows"],
                        totals["op time"] / 1000000.))
    return samples


# Parallel lists: index i of each list describes the same GpuExpand node.
input_rows = []
output_rows = []
op_times = []
for profile_dir in profile_dirs:
    for app in sorted(os.listdir(profile_dir)):
        metrics_csv = f"{profile_dir}/{app}/sql_plan_metrics_for_application.csv"
        if not os.path.isfile(metrics_csv):
            # Stray files or incomplete profiles should not abort the whole run.
            print(f"Skipping {app}: no sql_plan_metrics_for_application.csv found")
            continue
        print(f"Processing gpu profile {app}")
        sql_info = pd.read_csv(metrics_csv)
        for rows_in, rows_out, op_time in _collect_expand_samples(sql_info):
            input_rows.append(rows_in)
            output_rows.append(rows_out)
            op_times.append(op_time)
print(input_rows)
print(output_rows)
print(op_times)

In [None]:
# 3D scatter of the collected GpuExpand samples; every axis shows the natural
# log of the corresponding metric.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(projection='3d')

log_input_rows = np.log(input_rows)
log_output_rows = np.log(output_rows)
log_op_times = np.log(op_times)
ax.scatter(log_input_rows, log_output_rows, log_op_times)

ax.set(
    xlabel="Input Rows (log)",
    ylabel="Output Rows (log)",
    zlabel="Op Time (ms) (log)",
    title=f"GPU Expand Op Times - Combined",
)
plt.show()