Analyze GPU project operator in Spark event logs.

In [14]:
import os

import matplotlib.pyplot as plt
import pandas as pd

# Root folders of the CPU and GPU runs; each entry inside is later opened
# as a per-application folder of CSV files.
cpu_dir = "/home/rou/src/spark-rapids-tools/dataproc-cpu-3k"
gpu_dir = "/home/rou/src/spark-rapids-tools/dataproc-gpu-3k"

# os.listdir returns names in arbitrary order, so sort them to give every
# run a stable position (used below as "appIndex") in its list.
cpu_files = sorted(os.listdir(cpu_dir))
gpu_files = sorted(os.listdir(gpu_dir))

In [15]:
# Gather the "op time" metric rows of every GpuProject node, one frame per GPU run.
project_times = []
for i, app in enumerate(gpu_files):
    metrics = pd.read_csv(f"{gpu_dir}/{app}/sql_plan_metrics_for_application.csv")
    is_op_time = metrics["name"] == "op time"
    is_gpu_project = metrics["nodeName"] == "GpuProject"
    # Tag each row with the run's position in gpu_files and its folder name.
    project_times.append(metrics[is_op_time & is_gpu_project].assign(appIndex=i, app=app))
project_times_df = pd.concat(project_times)
# The displayed rows report metricType "nsTiming", so dividing by 1e6 yields milliseconds.
project_times_df["total milliseconds"] = project_times_df["total"].div(1000000)
# Show a single run as a sample.
project_times_df.loc[project_times_df["appIndex"] == 2]

Unnamed: 0,appIndex,sqlID,nodeID,nodeName,accumulatorId,name,min,median,max,total,metricType,stageIds,app,total milliseconds
37,2,24,6,GpuProject,5649,op time,0,151041,1692498,53696281,nsTiming,102111,application_1701368813061_0003,53.696281
123,2,24,19,GpuProject,5226,op time,0,378235,1901274,73384334,nsTiming,51,application_1701368813061_0003,73.384334
171,2,24,26,GpuProject,4867,op time,0,652898,14195946,193721527,nsTiming,44,application_1701368813061_0003,193.721527
210,2,24,31,GpuProject,4444,op time,0,638620,17227634,224007359,nsTiming,39,application_1701368813061_0003,224.007359
249,2,24,36,GpuProject,3920,op time,0,494634,4207484,113717746,nsTiming,36,application_1701368813061_0003,113.717746
258,2,24,38,GpuProject,3929,op time,0,1122963,2485654,237128527,nsTiming,36,application_1701368813061_0003,237.128527
267,2,24,40,GpuProject,3938,op time,0,583248,2264972,127696529,nsTiming,36,application_1701368813061_0003,127.696529
301,2,24,45,GpuProject,2501,op time,0,2332531,2332531,2332531,nsTiming,34,application_1701368813061_0003,2.332531
379,2,24,55,GpuProject,2595,op time,0,817510,10245974,164620703,nsTiming,31,application_1701368813061_0003,164.620703
576,2,24,81,GpuProject,7494,op time,0,454728,3004255,97171207,nsTiming,76,application_1701368813061_0003,97.171207


In [16]:
# Gather the "duration" metric rows of every WholeStageCodegen node, one frame per CPU run.
codegen_times = []
for i, app in enumerate(cpu_files):
    metrics = pd.read_csv(f"{cpu_dir}/{app}/sql_plan_metrics_for_application.csv")
    is_duration = metrics["name"] == "duration"
    # Node names carry a numeric suffix such as "WholeStageCodegen (5)", hence the prefix match.
    is_codegen = metrics["nodeName"].str.startswith("WholeStageCodegen")
    # Tag each row with the run's position in cpu_files and its folder name.
    codegen_times.append(metrics[is_duration & is_codegen].assign(appIndex=i, app=app))
codegen_times_df = pd.concat(codegen_times)
codegen_times_df

Unnamed: 0,appIndex,sqlID,nodeID,nodeName,accumulatorId,name,min,median,max,total,metricType,stageIds,app
0,0,24,1,WholeStageCodegen (5),1095,duration,0,317,317,317,timing,36,application_1701330728146_0001
16,0,24,5,WholeStageCodegen (4),1058,duration,0,570,4016,2035271,timing,34,application_1701330728146_0001
36,0,24,18,WholeStageCodegen (1),887,duration,0,651,651,651,timing,32,application_1701330728146_0001
50,0,24,24,WholeStageCodegen (2),896,duration,0,700,700,700,timing,33,application_1701330728146_0001
64,0,24,30,WholeStageCodegen (3),905,duration,0,637,637,637,timing,31,application_1701330728146_0001
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,11,24,6,WholeStageCodegen (5),1247,duration,0,801,4360,2945574,timing,35,application_1701380218879_0011
40,11,24,20,WholeStageCodegen (1),925,duration,0,634,634,634,timing,32,application_1701380218879_0011
54,11,24,25,WholeStageCodegen (2),934,duration,0,665,665,665,timing,33,application_1701380218879_0011
68,11,24,30,WholeStageCodegen (3),943,duration,0,940,940,940,timing,31,application_1701380218879_0011


In [17]:
# Gather, per CPU run, the codegen-mapping rows whose child node is a Project.
mappings = []
for i, app in enumerate(cpu_files):
    stage_children = pd.read_csv(f"{cpu_dir}/{app}/wholestagecodegen_mapping.csv")
    project_children = stage_children.loc[stage_children["Child Node"] == "Project"]
    # Tag each row with the run's position in cpu_files and its folder name.
    mappings.append(project_children.assign(appIndex=i, app=app))
mapping_df = pd.concat(mappings)
# Show a single run as a sample.
mapping_df.loc[mapping_df["appIndex"] == 5]

Unnamed: 0,appIndex,sqlID,nodeID,SQL Node,Child Node,Child NodeID,app
0,5,24,1,WholeStageCodegen (28),Project,2,application_1701380218879_0005
2,5,24,4,WholeStageCodegen (19),Project,5,application_1701380218879_0005
7,5,24,16,WholeStageCodegen (7),Project,17,application_1701380218879_0005
11,5,24,25,WholeStageCodegen (4),Project,27,application_1701380218879_0005
13,5,24,25,WholeStageCodegen (4),Project,29,application_1701380218879_0005
15,5,24,25,WholeStageCodegen (4),Project,31,application_1701380218879_0005
26,5,24,55,WholeStageCodegen (17),Project,56,application_1701380218879_0005
31,5,24,67,WholeStageCodegen (26),Project,68,application_1701380218879_0005


In [18]:
# Right join: keep every Project mapping row and attach the duration metrics of
# the WholeStageCodegen node it belongs to. A stage wrapping several Projects
# therefore appears once per Project with the same duration values repeated.
# Both inputs carry an "app" column, which comes back as app_x / app_y.
combined_codegen = codegen_times_df.merge(
    mapping_df,
    how="right",
    on=["appIndex", "sqlID", "nodeID"],
)
combined_codegen

Unnamed: 0,appIndex,sqlID,nodeID,nodeName,accumulatorId,name,min,median,max,total,metricType,stageIds,app_x,SQL Node,Child Node,Child NodeID,app_y
0,0,24,5,WholeStageCodegen (4),1058.0,duration,0.0,570.0,4016.0,2035271.0,timing,34,application_1701330728146_0001,WholeStageCodegen (4),Project,7,application_1701330728146_0001
1,0,24,5,WholeStageCodegen (4),1058.0,duration,0.0,570.0,4016.0,2035271.0,timing,34,application_1701330728146_0001,WholeStageCodegen (4),Project,9,application_1701330728146_0001
2,0,24,5,WholeStageCodegen (4),1058.0,duration,0.0,570.0,4016.0,2035271.0,timing,34,application_1701330728146_0001,WholeStageCodegen (4),Project,11,application_1701330728146_0001
3,0,24,5,WholeStageCodegen (4),1058.0,duration,0.0,570.0,4016.0,2035271.0,timing,34,application_1701330728146_0001,WholeStageCodegen (4),Project,13,application_1701330728146_0001
4,0,24,18,WholeStageCodegen (1),887.0,duration,0.0,651.0,651.0,651.0,timing,32,application_1701330728146_0001,WholeStageCodegen (1),Project,19,application_1701330728146_0001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098,11,24,6,WholeStageCodegen (5),1247.0,duration,0.0,801.0,4360.0,2945574.0,timing,35,application_1701380218879_0011,WholeStageCodegen (5),Project,8,application_1701380218879_0011
1099,11,24,6,WholeStageCodegen (5),1247.0,duration,0.0,801.0,4360.0,2945574.0,timing,35,application_1701380218879_0011,WholeStageCodegen (5),Project,10,application_1701380218879_0011
1100,11,24,6,WholeStageCodegen (5),1247.0,duration,0.0,801.0,4360.0,2945574.0,timing,35,application_1701380218879_0011,WholeStageCodegen (5),Project,12,application_1701380218879_0011
1101,11,24,6,WholeStageCodegen (5),1247.0,duration,0.0,801.0,4360.0,2945574.0,timing,35,application_1701380218879_0011,WholeStageCodegen (5),Project,14,application_1701380218879_0011


In [19]:
# Rows per run: GpuProject metric rows on the GPU side, Project mapping rows on the CPU side.
project_counts = project_times_df.groupby("appIndex").size().rename("counts gpu").reset_index()
codegen_counts = combined_codegen.groupby("appIndex").size().rename("counts cpu").reset_index()
# Outer join keeps runs that appear on only one side; their missing count is NaN,
# which compares unequal and so is listed as a mismatch too.
combined_counts = project_counts.merge(codegen_counts, how="outer", on="appIndex")
# Display only the runs whose GPU and CPU counts disagree.
combined_counts.loc[combined_counts["counts gpu"].ne(combined_counts["counts cpu"])]

Unnamed: 0,appIndex,counts gpu,counts cpu
0,0,1042,987
2,2,15,14
3,3,15,14
5,5,9,8
6,6,13,12


2 - query 24a: GPU missing 1 `ReusedExchange`
3 - query 24b: GPU missing 1 `ReusedExchange`
5 - query 47:
  CPU: 14 `Project`s - 6 `ReusedExchange`s (3+3+0+0+0+0) = 8
  GPU: 15 (extra project after window) `GpuProject`s - 3 `ReusedExchange`s (3+3+0) = 9
6 - query 56:
  CPU: 21 `Project`s - 5 out of 7 `ReusedExchange`s - 2 `ReusedSubquery` = 14
  GPU: 21 `GpuProject`s - 5 out of 7 `ReusedExchange`s - 2 `ReusedSubquery` = 14???
