In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import json,argparse
import numpy as np
import pandas as pd
# plot the data
import matplotlib.pyplot as plt
import natsort

In [None]:


# Root directory that is walked recursively for GPU idle/utilization files and
# main/worker log files. The path is relative, so it resolves against the
# notebook's current working directory.
dir_ = "../final_analysis_cloudlab_twenty"

# Function to get GPU and main log files from the specified directory
def get_gpu_and_main_log_files(dir_path: str) -> tuple:
    """Recursively collect GPU trace files and main/worker log files below ``dir_path``.

    Classification is done by substring match on the bare file name; any file
    whose name contains ``'png'`` is ignored. The four checks are independent,
    so one file can land in several categories (a name containing both
    ``"main"`` and ``"worker"`` is listed twice in its directory's log list).

    Args:
        dir_path: Directory to walk.

    Returns:
        A 3-tuple ``(gpu_idle_files, log_files, gpu_util_files)`` where
        ``gpu_idle_files`` and ``gpu_util_files`` are lists of file paths and
        ``log_files`` maps every visited directory (including those without
        any log file) to the list of its main/worker log paths.
    """
    import os

    idle_paths = []   # files whose name contains "gpu_idle"
    util_paths = []   # files whose name contains "gpu_util"
    logs_by_dir = {}  # visited directory -> main/worker log paths

    for current_dir, _subdirs, filenames in os.walk(dir_path):
        # Every visited directory gets an entry, even if it stays empty.
        dir_logs = []
        logs_by_dir[current_dir] = dir_logs

        for name in filenames:
            # Rendered plots share the naming scheme of the raw files; skip them.
            if 'png' in name:
                continue

            full_path = os.path.join(current_dir, name)
            if "gpu_util" in name:
                util_paths.append(full_path)
            if "gpu_idle" in name:
                idle_paths.append(full_path)
            if "main" in name:
                dir_logs.append(full_path)
            if "worker" in name:
                dir_logs.append(full_path)

    return idle_paths, logs_by_dir, util_paths

# Collect the GPU idle files, the per-directory main/worker log files, and the
# GPU utilization files found below dir_
gpu_files, log_files, gpu_util_files = get_gpu_and_main_log_files(dir_)

# Drop directories that contain no main/worker log file (empty lists are falsy)
log_files = {key: val for key, val in log_files.items() if val}

# Directory paths (the dict keys) in natural sort order
log_files_keys = natsort.natsorted(log_files)

# GPU idle file paths in natural sort order
gpu_files = natsort.natsorted(gpu_files)

# GPU utilization file paths in natural sort order
gpu_util_files = natsort.natsorted(gpu_util_files)


In [None]:
# Naturally sort the log files inside each directory (in place in log_files)
# and print what was discovered so the inputs can be checked by eye.
for key in log_files_keys:
    log_files[key] = natsort.natsorted(log_files[key])
    print(key,log_files[key])
print(gpu_files)
# Counts of log directories, GPU idle files and GPU utilization files
print(len(log_files_keys), len(gpu_files), len(gpu_util_files))

In [None]:
# Maps the last path component of each log directory to the merged per-batch
# DataFrame produced by get_everything (filled by the loop at the end of this cell).
df_dict_everything = {}

def get_batch_pin_memory_time_with_ts(log_file: str) -> pd.DataFrame:
    """Parse the ``SBatchPinMemory`` records of a log file.

    Each matching line is split on commas. The batch id is the integer after
    the first underscore of the first field, the last field is a duration that
    is divided by 1e9, and the second-to-last field is kept as a float
    timestamp. If a batch id occurs more than once, the last record wins.

    Args:
        log_file: Path of the log file to read.

    Returns:
        DataFrame indexed by ``batch_id`` (ascending) with the columns
        ``pin_memory_time`` and ``pin_memory_time_ts``.
    """
    ns_per_s = 1000 * 1000 * 1000
    per_batch = {}  # batch_id -> (scaled duration, timestamp)

    with open(log_file) as handle:
        for record in handle.readlines():
            if "SBatchPinMemory" not in record:
                continue
            fields = record.split(',')
            batch_id = int(fields[0].split('_')[1])
            per_batch[batch_id] = (float(fields[-1]) / ns_per_s, float(fields[-2]))

    frame = pd.DataFrame({
        'batch_id': list(per_batch),
        'pin_memory_time': [duration for duration, _ in per_batch.values()],
        'pin_memory_time_ts': [stamp for _, stamp in per_batch.values()],
    })
    return frame.set_index('batch_id').sort_index()

def get_batch_idle_times_with_ts(log_file: str) -> pd.DataFrame:
    """Parse the ``SBatchWait`` records of a log file.

    Each matching line is split on commas. The batch id is the integer after
    the first underscore of the first field, the third field is a duration
    that is divided by 1e9, and the second field is kept as a float timestamp.
    If a batch id occurs more than once, the last record wins.

    Args:
        log_file: Path of the log file to read.

    Returns:
        DataFrame indexed by ``batch_id`` (ascending) with the columns
        ``wait_time`` and ``wait_time_ts``.
    """
    ns_per_s = 1000 * 1000 * 1000
    per_batch = {}  # batch_id -> (scaled duration, timestamp)

    with open(log_file) as handle:
        for record in handle.readlines():
            if "SBatchWait" not in record:
                continue
            fields = record.split(',')
            batch_id = int(fields[0].split('_')[1])
            per_batch[batch_id] = (float(fields[2]) / ns_per_s, float(fields[1]))

    frame = pd.DataFrame({
        'batch_id': list(per_batch),
        'wait_time': [duration for duration, _ in per_batch.values()],
        'wait_time_ts': [stamp for _, stamp in per_batch.values()],
    })
    return frame.set_index('batch_id').sort_index()

def get_batch_preprocessing_times_with_ts(log_file: str) -> pd.DataFrame:
    """Parse the ``SBatchPreprocessed`` records of a log file.

    Each matching line is split on commas. The batch id is the integer after
    the first underscore of the first field, the last field is a duration that
    is divided by 1e9, and the second-to-last field is kept as a float
    timestamp. If a batch id occurs more than once, the last record wins.

    Args:
        log_file: Path of the log file to read.

    Returns:
        DataFrame indexed by ``batch_id`` (ascending) with the columns
        ``preprocessing_time`` and ``preprocessing_time_ts``.
    """
    ns_per_s = 1000 * 1000 * 1000
    per_batch = {}  # batch_id -> (scaled duration, timestamp)

    with open(log_file) as handle:
        for record in handle.readlines():
            if "SBatchPreprocessed" not in record:
                continue
            fields = record.split(',')
            batch_id = int(fields[0].split('_')[1])
            per_batch[batch_id] = (float(fields[-1]) / ns_per_s, float(fields[-2]))

    frame = pd.DataFrame({
        'batch_id': list(per_batch),
        'preprocessing_time': [duration for duration, _ in per_batch.values()],
        'preprocessing_time_ts': [stamp for _, stamp in per_batch.values()],
    })
    return frame.set_index('batch_id').sort_index()

def get_batch_consumed_times_with_ts(log_file: str) -> pd.DataFrame:
    """Parse the ``SBatchConsumed`` records of a log file.

    Each matching line is split on commas. The batch id is the integer after
    the first underscore of the first field, the last field is a duration that
    is divided by 1e9, and the second-to-last field is parsed as an integer
    timestamp. If a batch id occurs more than once, the last record wins.

    Args:
        log_file: Path of the log file to read.

    Returns:
        DataFrame indexed by ``batch_id`` (ascending) with the columns
        ``consumed_time`` and ``consumed_time_ts``.
    """
    ns_per_s = 1000 * 1000 * 1000

    with open(log_file) as handle:
        records = handle.readlines()

    per_batch = {}  # batch_id -> (scaled duration, integer timestamp)
    for record in records:
        if "SBatchConsumed" not in record:
            continue
        fields = record.split(',')
        batch_id = int(fields[0].split('_')[1])
        per_batch[batch_id] = (float(fields[-1]) / ns_per_s, int(fields[-2]))

    frame = pd.DataFrame({
        'batch_id': list(per_batch),
        'consumed_time': [duration for duration, _ in per_batch.values()],
        'consumed_time_ts': [stamp for _, stamp in per_batch.values()],
    })
    return frame.set_index('batch_id').sort_index()

def get_gpu_util_time(gpu_util_file: str) -> pd.DataFrame:
    """Read GPU utilization samples from ``gpu_util_file``.

    Every line containing ``"ms"`` contributes one sample: its first
    whitespace-separated token is parsed as a float and divided by 1000.
    Samples are numbered 1, 2, 3, ... in file order and that number is used
    as the ``batch_id``.

    Args:
        gpu_util_file: Path of the GPU utilization trace.

    Returns:
        DataFrame indexed by ``batch_id`` with a single ``util_time`` column.
    """
    with open(gpu_util_file) as handle:
        samples = [float(row.split()[0]) / 1000 for row in handle if "ms" in row]

    # Pair each sample with its 1-based position in the file.
    numbered = list(enumerate(samples, start=1))
    frame = pd.DataFrame(numbered, columns=['batch_id', 'util_time'])
    return frame.set_index('batch_id').sort_index()

def get_gpu_wait_time(gpu_file: str) -> pd.DataFrame:
    """Read GPU idle samples from ``gpu_file``.

    Every line containing ``"ms"`` contributes one sample: its first
    whitespace-separated token is parsed as a float and divided by 1000.
    Samples are numbered 1, 2, 3, ... in file order and that number is used
    as the ``batch_id``.

    Args:
        gpu_file: Path of the GPU idle trace.

    Returns:
        DataFrame indexed by ``batch_id`` with a single ``idle_time`` column.
    """
    with open(gpu_file) as handle:
        samples = [float(row.split()[0]) / 1000 for row in handle if "ms" in row]

    # Pair each sample with its 1-based position in the file.
    numbered = list(enumerate(samples, start=1))
    frame = pd.DataFrame(numbered, columns=['batch_id', 'idle_time'])
    return frame.set_index('batch_id').sort_index()

def get_everything(log_files, gpu_file, gpu_util_file):
    """Merge every per-batch measurement of one run into a single DataFrame.

    For each path in ``log_files``:

    * if the path contains ``"main"``, the wait, consumed and pin-memory
      records are parsed and merged (in that order);
    * if the path contains ``"worker"``, the preprocessing records are parsed
      and merged.

    Frames are merged on their ``batch_id`` index with ``combine_first``, so
    values already present are kept and only gaps are filled by later frames.
    Two derived columns are then added, and finally the GPU idle and GPU
    utilization samples are merged in.

    Args:
        log_files: Iterable of main/worker log file paths of one run.
        gpu_file: Path of the GPU idle trace of the run.
        gpu_util_file: Path of the GPU utilization trace of the run.

    Returns:
        DataFrame indexed by ``batch_id`` containing all parsed columns plus
        ``wait_time_preprocessing_time_ts_diff`` and
        ``consumed_time_preprocessing_time_ts_diff``.

    Raises:
        ValueError: If no path in ``log_files`` contains ``"main"`` or
            ``"worker"`` (previously this surfaced as a ``TypeError`` on
            ``None``).
        KeyError: If the merged logs lack one of the columns needed for the
            derived differences (e.g. no worker log was supplied).
    """
    ns_per_s = 1000 * 1000 * 1000

    # Parsers per kind of log file, listed in merge order.
    main_parsers = (
        get_batch_idle_times_with_ts,
        get_batch_consumed_times_with_ts,
        get_batch_pin_memory_time_with_ts,
    )
    worker_parsers = (get_batch_preprocessing_times_with_ts,)

    df_main = None
    for file in log_files:
        parsers = ()
        if "main" in file:
            parsers += main_parsers
        if "worker" in file:
            parsers += worker_parsers

        for parse in parsers:
            parsed = parse(file)
            # The first frame seeds the result; later ones only fill gaps.
            df_main = parsed if df_main is None else df_main.combine_first(parsed)

    if df_main is None:
        raise ValueError(
            "get_everything: no 'main' or 'worker' log file among " + repr(list(log_files))
        )

    # Moment preprocessing finished, in seconds: start timestamp (scaled by 1e9)
    # plus the preprocessing duration.
    preprocessing_end_s = df_main['preprocessing_time_ts'] / ns_per_s + df_main['preprocessing_time']

    # wait_time_ts - (preprocessing_time_ts + preprocessing_time), in seconds
    df_main['wait_time_preprocessing_time_ts_diff'] = (
        df_main['wait_time_ts'] / ns_per_s - preprocessing_end_s
    )
    # consumed_time_ts - (preprocessing_time_ts + preprocessing_time), in seconds
    df_main['consumed_time_preprocessing_time_ts_diff'] = (
        df_main['consumed_time_ts'] / ns_per_s - preprocessing_end_s
    )

    # Merge the GPU idle and utilization samples on batch_id.
    df_main = df_main.combine_first(get_gpu_wait_time(gpu_file))
    df_main = df_main.combine_first(get_gpu_util_time(gpu_util_file))
    return df_main

# The three lists were sorted independently, so pairing them by position is only
# valid when every log directory has exactly one GPU idle file and one GPU
# utilization file. zip() silently stops at the shortest list, so make a
# mismatch visible instead of quietly dropping or misaligning runs.
if not (len(log_files_keys) == len(gpu_files) == len(gpu_util_files)):
    print(
        "WARNING: mismatched input counts -",
        len(log_files_keys), "log directories,",
        len(gpu_files), "GPU idle files,",
        len(gpu_util_files), "GPU utilization files; surplus entries are ignored",
    )

for key, gpu_file, gpu_util_file in zip(log_files_keys, gpu_files, gpu_util_files):
    log_files_list = log_files[key]
    # keep only the text after the last '/'
    key = key.split('/')[-1]
    print(key)
    df_dict_everything[key] = get_everything(log_files_list, gpu_file, gpu_util_file)

    
    

        

In [None]:
import numpy as np

def find_bad_overlapping_batches(df):
    """Find batches whose pin-memory timestamp falls inside another batch's wait window.

    For every batch ``j`` a window is defined that opens at the later of its
    wait timestamp and its preprocessing end (``preprocessing_time_ts +
    preprocessing_time * 1e9``) and closes at ``wait_time_ts + wait_time * 1e9``.
    A pair ``(i, j)`` is reported when batch ``i``'s ``pin_memory_time_ts`` lies
    strictly inside batch ``j``'s window and ``i`` is a different row than ``j``.

    The pairs contain the *index labels* of ``df`` (its ``batch_id`` values),
    not row positions, because callers compare ``pair[0]`` against
    ``df.index``. For a frame indexed 0..n-1 the two are the same.

    Args:
        df: Per-batch DataFrame with the columns ``pin_memory_time_ts``,
            ``wait_time_ts``, ``wait_time``, ``preprocessing_time_ts`` and
            ``preprocessing_time``.

    Returns:
        List of ``(i, j)`` tuples of index labels, ordered by ``i`` and then ``j``
        in row order. Rows with NaN in a compared value never match.
    """
    pin_times = df['pin_memory_time_ts'].values
    wait_times = df['wait_time_ts'].values
    # Durations are stored in seconds; timestamps are in nanoseconds.
    preprocessing_end_times = df['preprocessing_time_ts'].values + df['preprocessing_time'].values * 1e9
    wait_end_times = wait_times + df['wait_time'].values * 1e9

    # Start of each batch's window; it does not depend on i, so compute it once.
    window_start = np.where(wait_times > preprocessing_end_times, wait_times, preprocessing_end_times)

    positions = np.arange(len(df))
    labels = df.index.values

    overlapping_pairs = []
    for i in positions:
        mask = (
            (pin_times[i] > window_start) &
            (pin_times[i] < wait_end_times) &
            (positions != i)
        )
        for j in positions[mask]:
            overlapping_pairs.append((labels[i], labels[j]))

    return overlapping_pairs



def find_ooo_batches(df):
    """Return the rows of ``df`` whose ``wait_time`` equals exactly ``1e-6``.

    The notebook treats these rows as out-of-order (OOO) batches.
    NOTE(review): 1e-6 looks like a sentinel written by the logger - confirm.
    """
    is_ooo = df['wait_time'] == 1e-6
    return df.loc[is_ooo]

def check_if_all_bad_overlapping_batches_are_ooo(df):
    """Report bad overlapping batches that are not flagged as out-of-order.

    Takes the first element of every pair returned by
    ``find_bad_overlapping_batches`` and compares that set against the index
    of ``find_ooo_batches(df)``. Nothing is returned; a message is printed
    only when some bad batch is missing from the OOO set.
    """
    bad_ids = {pair[0] for pair in find_bad_overlapping_batches(df)}
    ooo_ids = set(find_ooo_batches(df).index)

    not_ooo = bad_ids - ooo_ids
    if not_ooo:
        print("Bad overlapping batches that are not OOO:", not_ooo)


def find_good_overlapping_batches(df):
    """Return the out-of-order rows that are not bad overlapping batches.

    The result is ``find_ooo_batches(df)`` minus every row whose index value
    appears as the first element of a pair from
    ``find_bad_overlapping_batches(df)``. Bad batches that are not in the OOO
    set are simply ignored.
    """
    ooo_frame = find_ooo_batches(df)
    bad_ids = {pair[0] for pair in find_bad_overlapping_batches(df)}

    keep = ~ooo_frame.index.isin(bad_ids)
    return ooo_frame[keep]
        


# print total number of batches, number of overlapping batches, number of good overlapping batches, number of bad overlapping batches
for key, df in df_dict_everything.items():
    print(key)
    check_if_all_bad_overlapping_batches_are_ooo(df)
    print("Total number of batches:", len(df))

    # Reduce each result to the set of batch identifiers it refers to
    bad_overlapping_batches = {pair[0] for pair in find_bad_overlapping_batches(df)}
    good_overlapping_batches = set(find_good_overlapping_batches(df).index)
    ooo_batches = set(find_ooo_batches(df).index)

    print("Number of bad overlapping batches:", len(bad_overlapping_batches))
    print("Number of good overlapping batches:", len(good_overlapping_batches))
    print("Number of OOO batches:", len(ooo_batches))

    # The two groups are expected to be disjoint; say so if they are not
    if bad_overlapping_batches & good_overlapping_batches:
        print("Bad and good overlapping batches are not mutually exclusive")
    # assert len(ooo_batches) == len(good_overlapping_batches) + len(bad_overlapping_batches)




In [None]:
# Split the merged frames into one dictionary per GPU group, selected by the
# "gpu2" / "gpu3" / "gpu4" substring of the run name (insertion order is kept).
df_gpu2 = {key: frame for key, frame in df_dict_everything.items() if "gpu2" in key}
df_gpu3 = {key: frame for key, frame in df_dict_everything.items() if "gpu3" in key}
df_gpu4 = {key: frame for key, frame in df_dict_everything.items() if "gpu4" in key}


In [None]:

# bar plot

def plot_ooo_batches(df_dict_everything, title):
    """Draw a bar chart of the number of out-of-order batches per run.

    Args:
        df_dict_everything: Mapping of run name to per-batch DataFrame.
        title: Title of the figure.
    """
    ooo_counts = [len(find_ooo_batches(frame)) for frame in df_dict_everything.values()]
    positions = np.arange(len(df_dict_everything))

    plt.bar(positions, ooo_counts)
    plt.xticks(positions, df_dict_everything.keys(), rotation=90)
    plt.title(title)
    plt.show()


# One out-of-order bar chart per GPU group (df_gpu2 / df_gpu3 / df_gpu4)
plot_ooo_batches(df_gpu2, 'Out-of-order batches for GPU 2')
plot_ooo_batches(df_gpu3, 'Out-of-order batches for GPU 3')
plot_ooo_batches(df_gpu4, 'Out-of-order batches for GPU 4')

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_batch_statistics(df_dict_everything):
    """Plot, per run, the total batch count next to a stacked good/bad/non-overlap breakdown.

    For every run the function counts

    * good overlaps - rows returned by ``find_good_overlapping_batches``,
    * bad overlaps  - distinct first elements of the pairs returned by
      ``find_bad_overlapping_batches``,
    * non-overlaps  - all remaining rows.

    The total is drawn as a gray bar; the three counts are stacked in a second
    bar beside it. Note that later cells of this notebook redefine a function
    with this same name.

    Args:
        df_dict_everything: Mapping of run name to per-batch DataFrame.
    """
    keys = list(df_dict_everything)
    total_batches = []
    good_counts = []
    bad_counts = []
    non_counts = []

    # Gather the four counts for every run
    for frame in df_dict_everything.values():
        good_ids = set(find_good_overlapping_batches(frame).index)
        bad_ids = {pair[0] for pair in find_bad_overlapping_batches(frame)}

        total_batches.append(len(frame))
        good_counts.append(len(good_ids))
        bad_counts.append(len(bad_ids))
        non_counts.append(len(frame) - len(good_ids | bad_ids))

    x = np.arange(len(keys))
    bar_width = 0.35
    total_x = x - bar_width / 2      # left bar: total
    breakdown_x = x + bar_width / 2  # right bar: stacked breakdown

    fig, ax = plt.subplots(figsize=(12, 6))

    ax.bar(total_x, total_batches, bar_width, label='Total Batches', color='lightgray')

    # Stack good, then bad, then non-overlapping on top of each other
    ax.bar(breakdown_x, good_counts, bar_width, label='Good Overlaps', color='green')
    ax.bar(breakdown_x, bad_counts, bar_width, bottom=good_counts, label='Bad Overlaps', color='red')
    ax.bar(
        breakdown_x,
        non_counts,
        bar_width,
        bottom=np.add(good_counts, bad_counts),
        label='Non-Overlaps',
        color='blue'
    )

    ax.set_xlabel('Keys')
    ax.set_ylabel('Number of Batches')
    ax.set_title('Batch Statistics by Key')
    ax.set_xticks(x)
    ax.set_xticklabels(keys, rotation=45)
    ax.legend()

    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

# plot_batch_statistics(df_dict_everything)
# Stacked count chart (good / bad / non-overlapping), once per GPU group
plot_batch_statistics(df_gpu2)
plot_batch_statistics(df_gpu3)
plot_batch_statistics(df_gpu4)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_batch_statistics(df_dict_everything):
    """Plot, per run, the total batch count next to a stacked good/bad overlap bar.

    For every run the function counts

    * good overlaps - rows returned by ``find_good_overlapping_batches``,
    * bad overlaps  - distinct first elements of the pairs returned by
      ``find_bad_overlapping_batches``.

    The total is drawn as a gray bar; good and bad counts are stacked in a
    second bar beside it. Non-overlapping batches are not drawn in this
    variant. This definition replaces the earlier function of the same name
    in this notebook, and a later cell replaces it again.

    Args:
        df_dict_everything: Mapping of run name to per-batch DataFrame.
    """
    keys = list(df_dict_everything)
    total_batches = []
    good_counts = []
    bad_counts = []

    # Gather the counts for every run
    for frame in df_dict_everything.values():
        good_ids = set(find_good_overlapping_batches(frame).index)
        bad_ids = {pair[0] for pair in find_bad_overlapping_batches(frame)}

        total_batches.append(len(frame))
        good_counts.append(len(good_ids))
        bad_counts.append(len(bad_ids))

    x = np.arange(len(keys))
    bar_width = 0.35
    total_x = x - bar_width / 2      # left bar: total
    breakdown_x = x + bar_width / 2  # right bar: stacked breakdown

    fig, ax = plt.subplots(figsize=(12, 6))

    ax.bar(total_x, total_batches, bar_width, label='Total Batches', color='lightgray')

    # Stack bad overlaps on top of good overlaps
    ax.bar(breakdown_x, good_counts, bar_width, label='Good Overlaps', color='green')
    ax.bar(breakdown_x, bad_counts, bar_width, bottom=good_counts, label='Bad Overlaps', color='red')

    ax.set_xlabel('Keys')
    ax.set_ylabel('Number of Batches')
    ax.set_title('Batch Statistics by Key')
    ax.set_xticks(x)
    ax.set_xticklabels(keys, rotation=45)
    ax.legend()

    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

# plot_batch_statistics(df_dict_everything)
# Good/bad stacked count chart (no non-overlap segment), once per GPU group
plot_batch_statistics(df_gpu2)
plot_batch_statistics(df_gpu3)
plot_batch_statistics(df_gpu4)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_batch_statistics(df_dict_everything):
    """Plot good / bad / non-overlapping batches per run as percentages.

    Runs whose name contains ``"b256"`` or ``"b1024"`` are skipped. For every
    remaining run three percentages of its row count are drawn side by side:

    * good overlaps - rows returned by ``find_good_overlapping_batches``,
    * bad overlaps  - distinct first elements of the pairs returned by
      ``find_bad_overlapping_batches``,
    * non-overlaps  - rows not returned by ``find_ooo_batches``.

    This definition replaces the earlier functions of the same name in this
    notebook.

    Args:
        df_dict_everything: Mapping of run name to per-batch DataFrame.
    """
    keys = []
    good_pct = []
    bad_pct = []
    non_pct = []

    for key, frame in df_dict_everything.items():
        if "b256" in key or "b1024" in key:
            continue
        keys.append(key)

        total = len(frame)
        ooo_count = len(find_ooo_batches(frame))
        good_ids = set(find_good_overlapping_batches(frame).index)
        bad_ids = {pair[0] for pair in find_bad_overlapping_batches(frame)}

        good_pct.append(len(good_ids) / total * 100)
        bad_pct.append(len(bad_ids) / total * 100)
        non_pct.append((total - ooo_count) / total * 100)

    x = np.arange(len(keys))
    width = 0.25

    fig, ax = plt.subplots(figsize=(15, 8))
    # Three bars per run: left = good, centre = bad, right = non-overlapping
    good_bars = ax.bar(x - width, good_pct, width, label='Good Overlaps', color='green')
    bad_bars = ax.bar(x, bad_pct, width, label='Bad Overlaps', color='red')
    non_bars = ax.bar(x + width, non_pct, width, label='Non-Overlaps', color='blue')

    ax.set_ylabel('Percentage of Batches')
    ax.set_title('Batch Statistics by Key (Percentage)')
    ax.set_xticks(x)
    ax.set_xticklabels(keys, rotation=45, ha='right')
    ax.legend()

    # Print the percentage above every bar
    for bars in (good_bars, bad_bars, non_bars):
        ax.bar_label(bars, fmt='%.1f%%', padding=3)

    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.show()

# plot_batch_statistics(df_dict_everything)
# plot_batch_statistics(df_gpu2)
# plot_batch_statistics(df_gpu3)
# Percentage chart for the "gpu4" group only (the other groups are commented out above)
plot_batch_statistics(df_gpu4)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_batch_statistics_with_transition(df_dict_everything):
    """Plot overlap percentages per run with a visual break before the first ``b512`` run.

    Runs whose name contains ``"b256"`` or ``"b1024"`` are skipped. For every
    remaining run three percentages of its row count are drawn side by side:
    good overlaps, bad overlaps and non-overlapping batches (rows not returned
    by ``find_ooo_batches``).

    An empty slot and a dashed vertical line are inserted at the position of
    the first run whose name contains ``"b512"``. If no such run exists, the
    chart is drawn without the gap and the line (previously this case raised
    ``StopIteration``).

    The runs are plotted in the iteration order of ``df_dict_everything``; the
    function does not sort them, so the separator is only meaningful when the
    caller already passes the ``b512`` runs last.

    Args:
        df_dict_everything: Mapping of run name to per-batch DataFrame.
    """
    keys = []
    good_overlapping_percentages = []
    bad_overlapping_percentages = []
    non_overlapping_percentages = []

    # Collect keys and statistics
    for key, df in df_dict_everything.items():
        if "b256" in key or "b1024" in key:
            continue
        keys.append(key)
        total_batches = len(df)
        ooo_count = len(find_ooo_batches(df))
        good_ids = set(find_good_overlapping_batches(df).index)
        bad_ids = set(pair[0] for pair in find_bad_overlapping_batches(df))
        good_overlapping_percentages.append(len(good_ids) / total_batches * 100)
        bad_overlapping_percentages.append(len(bad_ids) / total_batches * 100)
        non_overlapping_percentages.append((total_batches - ooo_count) / total_batches * 100)

    # Position of the first b512 run, or None when there is none
    transition_index = next((i for i, k in enumerate(keys) if 'b512' in k), None)

    def with_gap(values, filler):
        """Return a copy of ``values`` with ``filler`` inserted at the transition, if any."""
        if transition_index is None:
            return list(values)
        return values[:transition_index] + [filler] + values[transition_index:]

    # Insert an empty slot in the x-axis; NaN heights leave the slot blank
    keys_with_gap = with_gap(keys, '')
    good_with_gap = with_gap(good_overlapping_percentages, np.nan)
    bad_with_gap = with_gap(bad_overlapping_percentages, np.nan)
    non_with_gap = with_gap(non_overlapping_percentages, np.nan)

    x = np.arange(len(keys_with_gap))
    width = 0.25

    fig, ax = plt.subplots(figsize=(15, 8))
    rects1 = ax.bar(x - width, good_with_gap, width, label='Good Overlaps', color='green')
    rects2 = ax.bar(x, bad_with_gap, width, label='Bad Overlaps', color='red')
    rects3 = ax.bar(x + width, non_with_gap, width, label='Non-Overlaps', color='blue')

    # Dashed vertical line through the empty slot
    if transition_index is not None:
        ax.axvline(x=transition_index, color='black', linestyle='--', linewidth=2, label=None)

    ax.set_ylabel('Percentage of Batches')
    ax.set_title('Batch Statistics by Key (Percentage)')
    ax.set_xticks(x)
    ax.set_xticklabels(keys_with_gap, rotation=45, ha='right')
    ax.legend()

    ax.bar_label(rects1, fmt='%.1f%%', padding=3)
    ax.bar_label(rects2, fmt='%.1f%%', padding=3)
    ax.bar_label(rects3, fmt='%.1f%%', padding=3)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.show()

# plot_batch_statistics_with_transition(df_dict_everything)
# plot_batch_statistics_with_transition(df_gpu2)
# plot_batch_statistics_with_transition(df_gpu3)
# Percentage chart with the b512 separator, for the "gpu4" group only
plot_batch_statistics_with_transition(df_gpu4)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def find_batch_size(key):
    """Return the text before the first ``'_'`` of ``key`` with its first character removed.

    For a name such as ``'b128_gpu4'`` this yields ``'128'`` (the value after
    the leading ``'b'``). The result is a string, not an int.
    """
    leading_token = key.partition('_')[0]
    return leading_token[1:]

def plot_overlapping_batches_split_by_batch_sizes(df_dict_everything):
    """Draw one bar chart per batch size showing the number of bad overlapping pairs per run.

    Runs are grouped by ``find_batch_size`` applied to the last ``'/'``-separated
    component of their key, so both plain run names (``'b128_gpu4'``) and
    path-like keys (``'dir/b128_gpu4'``) are accepted. The earlier
    ``split('/')[1]`` raised ``IndexError`` for keys without a slash, which is
    what the loader cell stores.

    The bar height is ``len(find_bad_overlapping_batches(df))``, i.e. the number
    of ``(i, j)`` pairs, computed once per run.

    Args:
        df_dict_everything: Mapping of run key to per-batch DataFrame.
    """
    # Set the Seaborn style
    sns.set_style("whitegrid")

    # Single color used for all bars
    bar_color = "#4C72B0"

    # batch size -> list of (configuration label, pair count), in input order
    runs_by_batch_size = {}
    for key, df in df_dict_everything.items():
        config_name = key.split('/')[-1]
        batch_size = find_batch_size(config_name)
        # The pair search is quadratic in the number of batches; run it once per key.
        pair_count = len(find_bad_overlapping_batches(df))
        runs_by_batch_size.setdefault(batch_size, []).append((config_name, pair_count))

    # Create a plot for each batch size
    for batch_size, runs in runs_by_batch_size.items():
        plt.figure(figsize=(15, 8))

        # Create a DataFrame for easier plotting
        plot_df = pd.DataFrame({
            'Configuration': [name for name, _ in runs],
            'Overlapping Batches': [count for _, count in runs],
        })

        # Create the bar plot
        ax = sns.barplot(x='Configuration', y='Overlapping Batches', data=plot_df, color=bar_color)

        # Customize the plot
        plt.title(f'OOO Batches Leading to Pinning Delays (Batch Size {batch_size})', fontsize=16, pad=20)
        plt.xlabel('Experiment Configuration', fontsize=12, labelpad=10)
        plt.ylabel('Number of Overlapping Batches', fontsize=12, labelpad=10)
        plt.xticks(rotation=45, ha='right')

        # Add value labels on top of each bar
        for i, v in enumerate(plot_df['Overlapping Batches']):
            ax.text(i, v, str(v), ha='center', va='bottom', fontweight='bold')

        # Remove top and right spines
        sns.despine()

        plt.tight_layout()
        plt.show()

# Usage
# Keep only the runs whose key contains 'gpu2'.
# NOTE(review): the variable is named `df_dict_everything_gpu4` and the earlier
# comment said "gpu4", but the filter selects 'gpu2' - confirm which GPU
# configuration is intended and align the name with the filter.
df_dict_everything_gpu4 = {key:val for key,val in df_dict_everything.items() if 'gpu2' in key}
plot_overlapping_batches_split_by_batch_sizes(df_dict_everything_gpu4)

In [None]:
import numpy as np

def calculate_out_of_order_wait_time(df):
    """Sum the delay attributed to batches whose pinning overlaps other batches' waits.

    For every batch ``i`` the function looks for the other batches ``j`` whose
    waiting window contains the moment batch ``i`` started pinning, i.e.

        max(wait_time_ts[j], preprocessing_end[j]) < pin_memory_time_ts[i] < wait_end[j]

    When at least one such ``j`` exists, the span from the pin start of the first
    match to the pin end of the last match is added to the total.

    Rows are ordered by ``pin_memory_time_ts`` *before* the column arrays are
    extracted, so "first" and "last" match refer to pinning order rather than to
    whatever row order the caller happened to pass in.

    Args:
        df: DataFrame with the columns ``pin_memory_time_ts``, ``pin_memory_time``,
            ``wait_time_ts``, ``wait_time``, ``preprocessing_time_ts`` and
            ``preprocessing_time``. The duration columns are multiplied by 1e9
            before being added to the ``*_ts`` columns, so presumably durations
            are in seconds and timestamps in nanoseconds — confirm against the
            code that builds the frame.

    Returns:
        The accumulated delay divided by 1e9 (seconds under the assumption above).
        ``0.0`` when the frame is empty or nothing overlaps.
    """
    # Sort first: every array below must share the same (pin-time) row order.
    ordered = df.sort_values(by='pin_memory_time_ts')

    pin_times = ordered['pin_memory_time_ts'].values
    pin_duration = ordered['pin_memory_time'].values * 1e9  # Convert to nanoseconds
    wait_times = ordered['wait_time_ts'].values
    wait_duration = ordered['wait_time'].values * 1e9  # Convert to nanoseconds
    preprocessing_times = ordered['preprocessing_time_ts'].values
    preprocessing_duration = ordered['preprocessing_time'].values * 1e9  # Convert to nanoseconds

    # End of the waiting / preprocessing phase of each batch
    wait_end_times = wait_times + wait_duration
    preprocessing_end_times = preprocessing_times + preprocessing_duration

    # Start of each batch's window: whichever is later, the wait start or the end
    # of preprocessing. It does not depend on the loop index, so compute it once.
    comparison_time = np.where(wait_times > preprocessing_end_times, wait_times, preprocessing_end_times)

    batch_indices = np.arange(len(ordered))
    delay = 0.0

    for i in batch_indices:
        # Batches (other than i) whose window contains the pin start of batch i
        mask = (
            (pin_times[i] > comparison_time) &
            (pin_times[i] < wait_end_times) &
            (batch_indices != i)
        )
        matching_indices = batch_indices[mask]
        if matching_indices.size == 0:
            continue

        first_overlap = matching_indices[0]
        last_overlap = matching_indices[-1]
        # Pin start of the first match up to the pin end of the last match
        delay += pin_times[last_overlap] + pin_duration[last_overlap] - pin_times[first_overlap]

    return delay / 1e9  # Convert to seconds
    


def calculate_out_of_order_wait_time_aggregate(df_dict_everything):
    """Apply ``calculate_out_of_order_wait_time`` to every entry of a mapping.

    Args:
        df_dict_everything: Mapping of key -> per-run object accepted by
            ``calculate_out_of_order_wait_time``.

    Returns:
        A new dict with the same keys, in the same order, whose values are the
        results returned by ``calculate_out_of_order_wait_time``.
    """
    return {
        run_key: calculate_out_of_order_wait_time(run_df)
        for run_key, run_df in df_dict_everything.items()
    }



In [None]:
# Directory that is walked for "lotustrace_log" files by the get_e2e_times call in this cell.
# NOTE(review): this rebinds the notebook-level `dir_` that the first cells set to
# "../final_analysis_cloudlab_twenty"; re-running earlier cells after this one will
# see the e2e path instead — consider a dedicated name.
dir_ = "../final_analysis_cloudlab_twenty/e2e"

def get_e2e_times(dir_path: str) -> dict:
    """Collect the summed absolute ``wall(s)`` value of every lotustrace log under a directory.

    The directory tree is walked recursively. Each file whose name contains
    ``"lotustrace_log"`` is read as CSV, and the sum of the absolute values of its
    ``wall(s)`` column is stored under a key derived from the file name.

    Args:
        dir_path: Root directory to search.

    Returns:
        Dict mapping the file name, with ``"lotustrace_log_"`` and ``".log"``
        removed, to the summed absolute ``wall(s)`` value. Files that lack a usable
        ``wall(s)`` column map to ``0``. If two files in different sub-directories
        produce the same key, the one visited last wins.
    """
    import os

    e2e_times = {}
    for root, _dirs, files in os.walk(dir_path):
        for file in files:
            if "lotustrace_log" not in file:
                continue
            # e.g. "lotustrace_log_b128_gpu1.log" -> "b128_gpu1"
            key = file.replace("lotustrace_log_", "").replace(".log", "")
            # The log file is a CSV despite its extension
            df = pd.read_csv(os.path.join(root, file))
            try:
                e2e_times[key] = df['wall(s)'].abs().sum()
            except (KeyError, TypeError):
                # Missing column, or a column that is not numeric: keep the
                # best-effort fallback of 0 but let unrelated errors surface.
                e2e_times[key] = 0
    return e2e_times

# Load the per-configuration e2e totals, then rebuild the dict with its keys in
# natural (case-insensitive) order so later plots and printouts are ordered.
e2e_times = get_e2e_times(dir_)
e2e_times = {
    name: e2e_times[name]
    for name in sorted(e2e_times, key=lambda name: natsort.natsort_key(name.lower()))
}
e2e_times
                


In [None]:

def plot_percentage_of_e2e_time_due_to_ooo(df, e2e_times):
    """Bar-plot the out-of-order wait time of each run as a percentage of its e2e time.

    Args:
        df: Despite the name, a mapping of run key -> DataFrame; it is passed
            unchanged to ``calculate_out_of_order_wait_time_aggregate``.
        e2e_times: Mapping of e2e key -> total end-to-end time. A run key is paired
            with every e2e key that contains it as a substring; when several e2e
            keys match, the value computed for the last match is the one plotted.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Entries whose e2e time is zero (for example the ``0`` fallback that
    ``get_e2e_times`` stores for unreadable logs) are skipped instead of raising
    ``ZeroDivisionError`` or plotting an infinite bar.
    """
    ooo_wait_times = calculate_out_of_order_wait_time_aggregate(df)
    ooo_wait_times = dict(sorted(ooo_wait_times.items(), key=lambda x: natsort.natsort_key(x[0].lower())))

    percentage_ooo = {}
    for key, ooo_wait_time in ooo_wait_times.items():
        prev_percentage_ooo = 0
        for key_, e2e_time in e2e_times.items():
            if key not in key_:
                continue
            if not e2e_time:
                # No usable e2e measurement for this log; a percentage is undefined.
                continue
            percentage_ooo_ = (ooo_wait_time / e2e_time) * 100
            # NOTE(review): when a later matching e2e key yields a smaller percentage
            # than the previous match, the value is replaced by twice the previous
            # one. This is not derived from the measurements — confirm the intent
            # or remove it before publishing numbers from this plot.
            if percentage_ooo_ < prev_percentage_ooo:
                percentage_ooo_ = prev_percentage_ooo * 2
            percentage_ooo[key] = percentage_ooo_
            prev_percentage_ooo = percentage_ooo_

    # plot the percentage of e2e time due to OOO
    labels = list(percentage_ooo.keys())
    values = list(percentage_ooo.values())
    x = np.arange(len(labels))
    fig, ax = plt.subplots(figsize=(15, 8))
    ax.bar(x, values)
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=90)
    ax.set_title('Percentage of e2e time due to OOO')
    ax.set_xlabel('Experiment Configuration')
    ax.set_ylabel('OOO wait time (% of e2e time)')
    # add labels on the bars
    for i, v in enumerate(values):
        ax.text(i, v + 0.1, f"{v:.2f}%", ha='center', va='bottom')
    plt.show()

# One figure per GPU configuration, in the order gpu2, gpu3, gpu4.
for _frames_by_key in (df_gpu2, df_gpu3, df_gpu4):
    plot_percentage_of_e2e_time_due_to_ooo(_frames_by_key, e2e_times)


In [None]:
# Compare the end-to-end times of two experiment directories, configuration by configuration.
dir_1 = "../final_analysis_cloudlab_twenty/e2e"
dir_2 = "../final_analysis_cloudlab_io_twenty/e2e"

e2e_times_1 = get_e2e_times(dir_1)
e2e_times_2 = get_e2e_times(dir_2)

# sort using natsort
e2e_times_1 = dict(sorted(e2e_times_1.items(), key=lambda x: natsort.natsort_key(x[0].lower())))
e2e_times_2 = dict(sorted(e2e_times_2.items(), key=lambda x: natsort.natsort_key(x[0].lower())))

# Find the percentage difference in e2e times between two experiments.
# The first directory is the baseline: a positive value means the second run took longer.
for key in e2e_times_1:
    e2e_time_1 = e2e_times_1[key]
    e2e_time_2 = e2e_times_2.get(key)
    print(key)
    if e2e_time_2 is None:
        # Configuration only present in the first experiment: nothing to compare with.
        print("No matching e2e time in", dir_2, "- skipped")
    elif not e2e_time_1:
        # get_e2e_times stores 0 for unreadable logs; a relative difference is undefined.
        print("Baseline e2e time is 0 - percentage difference undefined, skipped")
    else:
        print("Percentage difference in e2e times when using ideal batch alloc: ", (e2e_time_2 - e2e_time_1) / e2e_time_1 * 100)
    print("-"*50)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


def calculate_wait_times_for_keys(df_dict_everything):
    """Compute the total out-of-order wait time for every entry of a mapping.

    Args:
        df_dict_everything: Mapping of key -> DataFrame accepted by
            ``calculate_out_of_order_wait_time``.

    Returns:
        Dict mapping each key to the value returned by
        ``calculate_out_of_order_wait_time`` for its frame, as a Python float.

    ``calculate_out_of_order_wait_time`` already divides its total by 1e9 and
    returns seconds, so the value is stored as-is. Dividing by 1e9 a second time
    (as this function used to) made every value 1e9 times too small for the
    "(s)" axis it is plotted on.
    """
    wait_times_by_key = {}

    for key, df in df_dict_everything.items():
        wait_times_by_key[key] = float(calculate_out_of_order_wait_time(df))

    return wait_times_by_key

def plot_wait_times_by_batch_size(wait_times_by_key):
    """Draw one bar chart of aggregate wait time per experiment configuration.

    Args:
        wait_times_by_key: Mapping of key -> aggregate wait time. Each key must
            contain at least one underscore; the text after the first underscore
            is used as the bar label (e.g. "gpu4_config1" for "b512_gpu4_config1").

    Returns:
        None. The figure is shown with ``plt.show()``.

    Despite the function name, all keys are drawn in a single figure; the data is
    not split by batch size.
    """
    sns.set_style("whitegrid")

    # Bar label = the key with its leading "<token>_" prefix removed
    plot_df = pd.DataFrame({
        'Configuration': [run_key.split('_', 1)[1] for run_key in wait_times_by_key],
        'Aggregate Wait Time (s)': list(wait_times_by_key.values()),
    })

    plt.figure(figsize=(15, 8))
    sns.barplot(
        x='Configuration',
        y='Aggregate Wait Time (s)',
        data=plot_df,
        color="#4C72B0",
    )

    plt.title('Aggregate Wait Times', fontsize=16, pad=20)
    plt.xlabel('Experiment Configuration', fontsize=12, labelpad=10)
    plt.ylabel('Aggregate Wait Time (s)', fontsize=12, labelpad=10)
    plt.xticks(rotation=45, ha='right')

    sns.despine()
    plt.tight_layout()
    plt.show()




# Compute the per-key wait totals for `df_gpu2` (defined outside this cell) and
# draw them as a single bar chart.
wait_times_by_key = calculate_wait_times_for_keys(df_gpu2)
plot_wait_times_by_batch_size(wait_times_by_key)


In [None]:
# plot GPU idle time
def plot_gpu_idle_time(df_dict_everything):
    """Print and bar-plot the summed ``idle_time`` column of every run.

    Args:
        df_dict_everything: Mapping of key -> DataFrame with an ``idle_time`` column.

    Returns:
        None. The totals are printed as a dict and the figure is shown with
        ``plt.show()``.

    The column is summed as-is with no unit conversion; the bar labels append "s",
    so the values are presumably already in seconds — confirm against the loader.
    """
    totals = {
        run_key: np.sum(run_df['idle_time'])
        for run_key, run_df in df_dict_everything.items()
    }
    # Natural, case-insensitive ordering of the run keys
    gpu_idle_times = dict(
        sorted(totals.items(), key=lambda item: natsort.natsort_key(item[0].lower()))
    )
    print(gpu_idle_times)

    labels = list(gpu_idle_times.keys())
    heights = list(gpu_idle_times.values())
    positions = np.arange(len(labels))

    fig, ax = plt.subplots(figsize=(15, 8))
    ax.bar(positions, heights)
    plt.xticks(positions, labels, rotation=90)
    ax.set_title('GPU Idle Time')
    # Annotate every bar with its total, slightly above the bar top
    for idx, total in enumerate(heights):
        ax.text(idx, total + 0.1, f"{total:.2f}s", ha='center', va='bottom')
    plt.show()

# One idle-time figure per GPU configuration, in the order gpu2, gpu3, gpu4.
for _frames_by_key in (df_gpu2, df_gpu3, df_gpu4):
    plot_gpu_idle_time(_frames_by_key)