Parse the raw event log to compare CPU and GPU exchange operations.

In [None]:
import json
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import HuberRegressor

# Scale-factor labels used in plot titles and legends; every directory list
# below is index-aligned with this list.
scale_factors = ["100", "3K", "5K", "10K"]

# Root directory holding both the profile output and the raw event logs.
_DATA_ROOT = "/opt/data"


def _run_dirs(kind, device):
    """Return one directory per scale factor for the given kind and device.

    ``kind`` is the sub-directory under the data root ("profiles" or
    "events"); ``device`` is "cpu" or "gpu". Directory names carry the
    scale factor in lower case (e.g. ``sf3k`` for "3K").
    """
    return [
        f"{_DATA_ROOT}/{kind}/dataproc-{device}-sf{label.lower()}"
        for label in scale_factors
    ]


cpu_profile_dirs = _run_dirs("profiles", "cpu")
gpu_profile_dirs = _run_dirs("profiles", "gpu")
cpu_event_dirs = _run_dirs("events", "cpu")
gpu_event_dirs = _run_dirs("events", "gpu")

In [None]:
class Exchange:
    """Shuffle-write metrics recorded for one exchange node.

    An instance is populated from the metric rows of a single node: each
    row is read through its ``name``, ``total`` and ``accumulatorId``
    fields. Metrics that are absent stay at 0. ``accumulator_id`` is taken
    from the "shuffle write time" row and stays 0 when that row is missing.

    Equality treats a 0 in either operand as "unknown": two exchanges are
    equal when every metric either matches or is 0 on at least one side.

    NOTE(review): ``__hash__`` is computed from the exact metric values, so
    two instances that compare equal through the 0 wildcard can hash
    differently, and the wildcard relation is not transitive. In a set or
    dict the class therefore behaves as if compared by exact values
    (barring hash collisions). Confirm which semantics are wanted before
    relying on ``==`` and hashing together.
    """

    # Metric name as found in a row -> attribute that stores its total.
    _METRIC_ATTRS = {
        "data size": "data_size",
        "shuffle bytes written": "shuffle_bytes_written",
        "shuffle records written": "shuffle_records_written",
        "shuffle write time": "shuffle_write_time",
    }

    def __init__(self, group=None):
        """Read the metrics from ``group``; with no group, all metrics are 0.

        ``group`` must offer ``itertuples()`` yielding rows with ``name``,
        ``total`` and ``accumulatorId`` attributes (e.g. a DataFrame).
        """
        self.data_size = 0
        self.shuffle_bytes_written = 0
        self.shuffle_records_written = 0
        self.shuffle_write_time = 0
        self.accumulator_id = 0
        if group is None:
            return
        for metric in group.itertuples():
            attr = self._METRIC_ATTRS.get(metric.name)
            if attr is None:
                # Not one of the metrics tracked here.
                continue
            setattr(self, attr, metric.total)
            if attr == "shuffle_write_time":
                # The write-time accumulator doubles as the exchange's id.
                self.accumulator_id = metric.accumulatorId

    def _key(self):
        """Return the four metric values in their canonical order."""
        return (
            self.data_size,
            self.shuffle_bytes_written,
            self.shuffle_records_written,
            self.shuffle_write_time,
        )

    def __eq__(self, other):
        if not isinstance(other, Exchange):
            return False
        # A pair of values is compatible when either is 0 or both agree.
        return all(
            mine == 0 or theirs == 0 or mine == theirs
            for mine, theirs in zip(self._key(), other._key())
        )

    def __hash__(self):
        return hash(self._key())

    def __str__(self):
        return "{}, {}, {}, {}".format(*self._key())


def collect(node, exchange_list, exchange_set, exchange_map):
    """Gather the known exchanges of a plan tree in pre-order.

    ``node`` is a plan-node dict with ``nodeName`` and ``children`` keys;
    exchange nodes ("Exchange" / "GpuColumnarExchange") must also carry a
    ``metrics`` list whose entries have ``name`` and ``accumulatorId``.
    For each such node, the accumulator id of its "shuffle write time"
    metric is looked up in ``exchange_map``; a hit that is not yet in
    ``exchange_set`` is added to the set and appended to ``exchange_list``.
    Both containers are mutated in place; nothing is returned.
    """
    if node['nodeName'] in ('Exchange', 'GpuColumnarExchange'):
        write_time_ids = (
            metric['accumulatorId']
            for metric in node['metrics']
            if metric['name'] == 'shuffle write time'
        )
        for acc_id in write_time_ids:
            if acc_id not in exchange_map:
                # No metrics were recorded for this exchange.
                continue
            candidate = exchange_map[acc_id]
            if candidate in exchange_set:
                # Already collected from another part of the plan.
                continue
            exchange_set.add(candidate)
            exchange_list.append(candidate)
    for sub_plan in node['children']:
        collect(sub_plan, exchange_list, exchange_set, exchange_map)


def collect_exchanges(profile_dir, event_file):
    """Return the exchanges of the last plan in ``event_file``, sorted by size.

    Metrics come from ``{profile_dir}/sql_plan_metrics_for_application.csv``:
    rows whose ``nodeName`` is "Exchange" or "GpuColumnarExchange" are
    grouped by ``nodeID`` and turned into ``Exchange`` objects, keyed by
    the accumulator id of their "shuffle write time" metric. Nodes without
    that metric (accumulator id 0) are ignored.

    ``event_file`` is read as one JSON object per line. Every event that
    carries a ``sparkPlanInfo`` tree restarts the collection, so the result
    reflects only the last such plan in the file, with each exchange
    listed once.

    Returns:
        list of Exchange, ordered by (data_size, shuffle_bytes_written,
        shuffle_records_written).
    """
    sql_info = pd.read_csv(f"{profile_dir}/sql_plan_metrics_for_application.csv")
    condition = (sql_info['nodeName'] == 'Exchange') | (sql_info['nodeName'] == 'GpuColumnarExchange')
    exchange_dict = {}
    for _, group in sql_info[condition].groupby("nodeID"):
        exchange = Exchange(group)
        if exchange.accumulator_id != 0:
            exchange_dict[exchange.accumulator_id] = exchange

    exchanges = []
    # Spark writes its event logs as UTF-8; do not depend on the locale default.
    with open(event_file, "r", encoding="utf-8") as f:
        for line in f:
            event = json.loads(line)
            if "sparkPlanInfo" in event:
                # Restart from scratch for every plan. The dedup set must be
                # reset together with the result list; otherwise exchanges
                # already seen in an earlier plan would be filtered out of
                # the final plan's result.
                exchanges = []
                collect(event["sparkPlanInfo"], exchanges, set(), exchange_dict)
    return sorted(exchanges, key=lambda e: (e.data_size, e.shuffle_bytes_written, e.shuffle_records_written))

def _sorted_entries(directory):
    """Return the entry names of ``directory`` in ascending order."""
    return sorted(os.listdir(directory))


def _downsample(exchanges, size):
    """Return ``exchanges`` as is, or a random sample of ``size`` items if longer."""
    if len(exchanges) > size:
        return random.sample(exchanges, size)
    return exchanges


num_scale_factors = len(scale_factors)
# One result list per scale factor: *_times hold the shuffle write time of
# every exchange, *_query_times hold the per-application sum of those times.
cpu_times = [[] for _ in range(num_scale_factors)]
gpu_times = [[] for _ in range(num_scale_factors)]
cpu_query_times = [[] for _ in range(num_scale_factors)]
gpu_query_times = [[] for _ in range(num_scale_factors)]
for i in range(num_scale_factors):
    # CPU/GPU profiles and event logs are paired by their position in the
    # sorted directory listings, so all four listings must be equally long.
    cpu_profiles = _sorted_entries(cpu_profile_dirs[i])
    num_apps = len(cpu_profiles)
    gpu_profiles = _sorted_entries(gpu_profile_dirs[i])
    cpu_events = _sorted_entries(cpu_event_dirs[i])
    gpu_events = _sorted_entries(gpu_event_dirs[i])
    assert num_apps == len(gpu_profiles) == len(cpu_events) == len(gpu_events)
    for j in range(num_apps):
        print(f"Processing cpu profile {cpu_profiles[j]}, event file {cpu_events[j]}")
        cpu_exchanges = collect_exchanges(f"{cpu_profile_dirs[i]}/{cpu_profiles[j]}", f"{cpu_event_dirs[i]}/{cpu_events[j]}")
        print(f"Processing gpu profile {gpu_profiles[j]}, event file {gpu_events[j]}")
        gpu_exchanges = collect_exchanges(f"{gpu_profile_dirs[i]}/{gpu_profiles[j]}", f"{gpu_event_dirs[i]}/{gpu_events[j]}")

        num_cpu, num_gpu = len(cpu_exchanges), len(gpu_exchanges)
        if num_cpu != num_gpu:
            print(f"Warning: cpu and gpu exchanges are of different length: {num_cpu} vs {num_gpu}")
        # Both sides must contribute the same number of points; the longer
        # one is cut down by random sampling.
        common_len = min(num_cpu, num_gpu)
        cpu_exchanges = _downsample(cpu_exchanges, common_len)
        gpu_exchanges = _downsample(gpu_exchanges, common_len)

        cpu_write_times = [e.shuffle_write_time for e in cpu_exchanges]
        gpu_write_times = [e.shuffle_write_time for e in gpu_exchanges]
        cpu_times[i].extend(cpu_write_times)
        gpu_times[i].extend(gpu_write_times)
        cpu_query_times[i].append(sum(cpu_write_times))
        gpu_query_times[i].append(sum(gpu_write_times))

In [None]:
# One log-log scatter plot per scale factor: CPU write time on the x axis,
# GPU write time of the same-position exchange on the y axis.
for sf_label, cpu_series, gpu_series in zip(scale_factors, cpu_times, gpu_times):
    axes = plt.gca()
    axes.scatter(cpu_series, gpu_series)
    axes.set_xscale('log')
    axes.set_yscale('log')

    axes.set_title(f"Exchange Shuffle Write Times - SF{sf_label}")
    axes.set_xlabel("CPU")
    axes.set_ylabel("GPU")

    # Render this scale factor's figure before starting the next one.
    plt.show()

In [None]:
# All scale factors on one log-log scatter plot, one labelled series each.
axes = plt.gca()
for sf_label, cpu_series, gpu_series in zip(scale_factors, cpu_times, gpu_times):
    axes.scatter(cpu_series, gpu_series, label=f"SF{sf_label}")

axes.set_xscale('log')
axes.set_yscale('log')
axes.set_title("Exchange Shuffle Write Times - Combined")
axes.set_xlabel("CPU")
axes.set_ylabel("GPU")
axes.legend()

plt.show()