Analyze the GPU Project operator (GpuProject) in Spark event logs.

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd

# Root directories of the CPU and GPU profiling output. The cells below read
# "<dir>/<entry>/<file>.csv", i.e. they expect one sub-directory per application.
cpu_dir = "/opt/data/profiles/CPU-1k"
gpu_dir = "/opt/data/profiles/GPU-1k"

# Sorted directory listings. An entry's position in these lists is what the
# later cells store in the "appIndex" column.
cpu_files = sorted(os.listdir(cpu_dir))
gpu_files = sorted(os.listdir(gpu_dir))

In [None]:
# Gather the "op time" metric rows of every GpuProject node, one frame per GPU
# application, tagging each row with the application's list position and name.
project_times = []
for app_index, app_name in enumerate(gpu_files):
    metrics = pd.read_csv(f"{gpu_dir}/{app_name}/sql_plan_metrics_for_application.csv")
    is_op_time = metrics["name"] == "op time"
    is_gpu_project = metrics["nodeName"] == "GpuProject"
    # assign() works on a copy, so the tagging never touches `metrics` itself.
    project_times.append(
        metrics[is_op_time & is_gpu_project].assign(appIndex=app_index, app=app_name)
    )

project_times_df = pd.concat(project_times)
# Scale "total" down by 1e6 into the "total milliseconds" column
# (assumes "total" is reported in nanoseconds -- TODO confirm).
project_times_df["total milliseconds"] = project_times_df["total"] / 1_000_000
project_times_df

In [None]:
# Gather the "duration" metric rows of every node whose name starts with
# "WholeStageCodegen", one frame per CPU application, tagged with the
# application's list position and name.
codegen_times = []
for app_index, app_name in enumerate(cpu_files):
    metrics = pd.read_csv(f"{cpu_dir}/{app_name}/sql_plan_metrics_for_application.csv")
    is_duration = metrics["name"] == "duration"
    is_codegen = metrics["nodeName"].str.startswith("WholeStageCodegen")
    # assign() works on a copy, so the tagging never touches `metrics` itself.
    codegen_times.append(
        metrics[is_duration & is_codegen].assign(appIndex=app_index, app=app_name)
    )

codegen_times_df = pd.concat(codegen_times)
codegen_times_df

In [None]:
# Gather, per CPU application, the whole-stage-codegen mapping rows whose
# child node is a Project, tagged with the application's list position and name.
mappings = []
for app_index, app_name in enumerate(cpu_files):
    codegen_map = pd.read_csv(f"{cpu_dir}/{app_name}/wholestagecodegen_mapping.csv")
    is_project_child = codegen_map["Child Node"] == "Project"
    # assign() works on a copy, so the tagging never touches `codegen_map` itself.
    mappings.append(
        codegen_map[is_project_child].assign(appIndex=app_index, app=app_name)
    )

mapping_df = pd.concat(mappings)
# Spot-check the mapping rows of the application at list position 101.
mapping_df.loc[mapping_df["appIndex"] == 101]

In [None]:
# Attach codegen durations to the Project mapping rows. The right join keeps
# every mapping row, including those with no matching duration metric.
join_keys = ["appIndex", "sqlID", "nodeID"]
combined_codegen = codegen_times_df.merge(mapping_df, on=join_keys, how="right")
combined_codegen

In [None]:
# Number of GpuProject metric rows per application (GPU runs).
gpu_rows_per_app = project_times_df.groupby("appIndex").size()
project_counts = gpu_rows_per_app.reset_index(name="counts gpu")

# Number of Project mapping rows per application (CPU runs).
cpu_rows_per_app = combined_codegen.groupby("appIndex").size()
codegen_counts = cpu_rows_per_app.reset_index(name="counts cpu")

# The outer join keeps applications that appear on only one side; their missing
# count is NaN and therefore also shows up as a mismatch below.
combined_counts = project_counts.merge(codegen_counts, on="appIndex", how="outer")
counts_differ = combined_counts["counts gpu"].ne(combined_counts["counts cpu"])
combined_counts.loc[counts_differ]

Applications whose GPU and CPU Project counts differ (appIndex - query: observation):
11 - query 12: 1 Project falls back to CPU
20 - query 20: 1 Project falls back to CPU
50 - query 47: missing a branch (3 Projects), plus an extra GPU Project
56 - query 53: 1 extra GPU Project
60 - query 57: missing a branch (3 Projects), plus an extra GPU Project
61 - query 58: missing branches (4 Projects), 1 Project falls back to CPU
64 - query 61: 1 Project falls back to CPU
66 - query 63: 1 extra GPU Project after the GPU window
92 - query 89: 1 extra GPU Project after the GPU window
101 - query 98: 1 Project falls back to CPU