In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import json,argparse
import numpy as np
import pandas as pd
# plot the data
import matplotlib.pyplot as plt
import natsort

In [None]:
# Maps a run-directory name (last path component) to the merged per-batch
# DataFrame returned by get_everything(); populated by a later loader cell.
df_dict_everything = {}

def get_batch_pin_memory_time_with_ts(log_file: str) -> pd.DataFrame:
    """Collect per-batch pin-memory durations and timestamps from a trace log.

    Only lines containing ``SBatchPinMemory`` are used. Each such line is
    split on commas: the batch id is the integer that follows the first
    underscore of the first field, the last field is the duration (divided by
    1e9 before it is stored) and the second-to-last field is stored unchanged
    as the timestamp. If a batch id appears more than once, the last
    occurrence wins.

    Returns:
        DataFrame indexed by ``batch_id`` (ascending) with the columns
        ``pin_memory_time`` and ``pin_memory_time_ts``.
    """
    scale = 1000 * 1000 * 1000  # presumably ns -> s; TODO confirm against the logger
    durations_by_batch = {}
    timestamps_by_batch = {}

    with open(log_file) as handle:
        for record in handle:
            if "SBatchPinMemory" not in record:
                continue
            fields = record.split(',')
            batch = int(fields[0].split('_')[1])
            durations_by_batch[batch] = float(fields[-1]) / scale
            timestamps_by_batch[batch] = float(fields[-2])

    frame = pd.DataFrame({
        'batch_id': list(durations_by_batch),
        'pin_memory_time': list(durations_by_batch.values()),
        'pin_memory_time_ts': list(timestamps_by_batch.values()),
    })
    return frame.set_index('batch_id').sort_index()

def get_batch_idle_times_with_ts(log_file: str) -> pd.DataFrame:
    """Collect per-batch wait durations and timestamps from a trace log.

    Only lines containing ``SBatchWait`` are used. After splitting on commas,
    the batch id is the integer after the first underscore of field 0,
    field 2 is the wait duration (divided by 1e9 before it is stored) and
    field 1 is stored unchanged as the timestamp. A repeated batch id keeps
    the values of its last occurrence.

    Returns:
        DataFrame indexed by ``batch_id`` (ascending) with the columns
        ``wait_time`` and ``wait_time_ts``.
    """
    scale = 1000 * 1000 * 1000  # presumably ns -> s; TODO confirm against the logger
    # batch id -> (wait duration, timestamp)
    samples = {}

    with open(log_file) as handle:
        for record in handle:
            if "SBatchWait" in record:
                fields = record.split(',')
                batch = int(fields[0].split('_')[1])
                samples[batch] = (float(fields[2]) / scale, float(fields[1]))

    frame = pd.DataFrame({
        'batch_id': list(samples),
        'wait_time': [duration for duration, _ in samples.values()],
        'wait_time_ts': [stamp for _, stamp in samples.values()],
    })
    frame = frame.set_index('batch_id')
    return frame.sort_index()

def get_batch_preprocessing_times_with_ts(log_file: str) -> pd.DataFrame:
    """Collect per-batch preprocessing durations and timestamps from a log.

    Only lines containing ``SBatchPreprocessed`` are used. After splitting on
    commas, the batch id is the integer after the first underscore of the
    first field, the last field is the duration (divided by 1e9 before it is
    stored) and the second-to-last field is stored unchanged as the
    timestamp. A repeated batch id keeps the values of its last occurrence.

    Returns:
        DataFrame indexed by ``batch_id`` (ascending) with the columns
        ``preprocessing_time`` and ``preprocessing_time_ts``.
    """
    scale = 1000 * 1000 * 1000  # presumably ns -> s; TODO confirm against the logger
    durations = {}
    stamps = {}

    with open(log_file) as handle:
        matching = [record for record in handle if "SBatchPreprocessed" in record]

    for record in matching:
        *leading, stamp_field, duration_field = record.split(',')
        batch = int(leading[0].split('_')[1])
        durations[batch] = float(duration_field) / scale
        stamps[batch] = float(stamp_field)

    columns = {
        'batch_id': list(durations.keys()),
        'preprocessing_time': list(durations.values()),
        'preprocessing_time_ts': list(stamps.values()),
    }
    return pd.DataFrame(columns).set_index('batch_id').sort_index()

def get_batch_consumed_times_with_ts(log_file: str) -> pd.DataFrame:
    """Collect per-batch consumed durations and timestamps from a trace log.

    Only lines containing ``SBatchConsumed`` are used. After splitting on
    commas, the batch id is the integer after the first underscore of the
    first field, the last field is the duration (divided by 1e9 before it is
    stored) and the second-to-last field is parsed with ``int`` and stored as
    the timestamp. A repeated batch id keeps the values of its last
    occurrence.

    Returns:
        DataFrame indexed by ``batch_id`` (ascending) with the columns
        ``consumed_time`` and ``consumed_time_ts``.
    """
    scale = 1000 * 1000 * 1000  # presumably ns -> s; TODO confirm against the logger

    with open(log_file) as handle:
        records = handle.readlines()

    # batch id -> (consumed duration, integer timestamp)
    samples = {}
    for record in records:
        if "SBatchConsumed" not in record:
            continue
        fields = record.split(',')
        batch = int(fields[0].split('_')[1])
        samples[batch] = (float(fields[-1]) / scale, int(fields[-2]))

    frame = pd.DataFrame({
        'batch_id': list(samples),
        'consumed_time': [duration for duration, _ in samples.values()],
        'consumed_time_ts': [stamp for _, stamp in samples.values()],
    })
    return frame.set_index('batch_id').sort_index()

def get_gpu_util_time(gpu_util_file: str) -> pd.DataFrame:
    """Read per-batch GPU utilisation times from a text file.

    Every line containing the substring ``"ms"`` contributes one sample: its
    first whitespace-separated token is parsed as a float and divided by 1000.
    Samples are numbered 1, 2, 3, ... in file order and that number is used
    as the batch id.

    Returns:
        DataFrame indexed by ``batch_id`` with the single column ``util_time``.
    """
    with open(gpu_util_file) as handle:
        samples = [float(row.split()[0]) / 1000 for row in handle if "ms" in row]

    numbered = list(enumerate(samples, start=1))
    frame = pd.DataFrame(numbered, columns=['batch_id', 'util_time'])
    return frame.set_index('batch_id').sort_index()

def get_gpu_wait_time(gpu_file: str) -> pd.DataFrame:
    """Read per-batch GPU idle times from a text file.

    Every line containing the substring ``"ms"`` contributes one sample: its
    first whitespace-separated token is parsed as a float and divided by 1000.
    Samples are numbered from 1 in file order and that number becomes the
    batch id.

    Returns:
        DataFrame indexed by ``batch_id`` with the single column ``idle_time``.
    """
    rows = []
    with open(gpu_file) as handle:
        relevant = (row for row in handle if "ms" in row)
        for position, row in enumerate(relevant, start=1):
            rows.append((position, float(row.split()[0]) / 1000))

    frame = pd.DataFrame(rows, columns=['batch_id', 'idle_time'])
    frame = frame.set_index('batch_id')
    return frame.sort_index()

def get_everything(log_files, gpu_file, gpu_util_file):
    """Merge every per-batch metric of one run into a single DataFrame.

    Args:
        log_files: Iterable of log paths. A path containing ``"main"`` is
            parsed for wait, consumed and pin-memory metrics; a path
            containing ``"worker"`` is parsed for preprocessing metrics.
            Other paths are ignored.
        gpu_file: Path handed to ``get_gpu_wait_time``.
        gpu_util_file: Path handed to ``get_gpu_util_time``.

    Returns:
        DataFrame indexed by ``batch_id`` holding the union of all parsed
        columns plus two derived columns,
        ``wait_time_preprocessing_time_ts_diff`` and
        ``consumed_time_preprocessing_time_ts_diff``. Where two sources
        provide the same cell, the value parsed first is kept
        (``DataFrame.combine_first`` semantics).

    Raises:
        ValueError: If no path in ``log_files`` contains ``"main"`` or
            ``"worker"``, so there is nothing to merge.
        KeyError: If the merged frame lacks a column the derived columns
            need (for example when no worker log was supplied).
    """
    ns_per_s = 1000 * 1000 * 1000

    # Parse in the same order the metrics are later merged; combine_first
    # gives precedence to the frame that was merged earlier.
    frames = []
    for file in log_files:
        if "main" in file:
            frames.append(get_batch_idle_times_with_ts(file))
            frames.append(get_batch_consumed_times_with_ts(file))
            frames.append(get_batch_pin_memory_time_with_ts(file))
        if "worker" in file:
            frames.append(get_batch_preprocessing_times_with_ts(file))

    if not frames:
        raise ValueError(
            "no 'main' or 'worker' log file found in log_files; nothing to merge"
        )

    df_main = frames[0]
    for frame in frames[1:]:
        df_main = df_main.combine_first(frame)

    # Moment each batch finished preprocessing: start timestamp (scaled by
    # 1e9 like the other timestamps) plus the preprocessing duration.
    preprocessing_end = (
        df_main['preprocessing_time_ts'] / ns_per_s + df_main['preprocessing_time']
    )
    # wait_time_ts - (preprocessing_time_ts + preprocessing_time) for each batch
    df_main['wait_time_preprocessing_time_ts_diff'] = (
        df_main['wait_time_ts'] / ns_per_s - preprocessing_end
    )
    # consumed_time_ts - (preprocessing_time_ts + preprocessing_time) for each batch
    df_main['consumed_time_preprocessing_time_ts_diff'] = (
        df_main['consumed_time_ts'] / ns_per_s - preprocessing_end
    )

    # Attach the GPU idle and utilisation samples, aligned on batch_id.
    df_gpu = get_gpu_wait_time(gpu_file)
    df_gpu_util = get_gpu_util_time(gpu_util_file)
    df_main = df_main.combine_first(df_gpu)
    df_main = df_main.combine_first(df_gpu_util)
    return df_main


  

        

In [None]:


# Root directory that this cell walks recursively for main/worker logs and
# GPU idle/utilisation files.
dir_ = "../final_analysis_cloudlab_twenty"

def get_gpu_and_main_log_files(dir_path: str) -> tuple:
    """Walk ``dir_path`` and classify its files by substrings of the file name.

    File names containing ``'png'`` are ignored. Of the rest:

    * names containing ``"gpu_util"`` go to the GPU utilisation list,
    * names containing ``"gpu_idle"`` go to the GPU idle list,
    * names containing ``"main"`` and/or ``"worker"`` are appended to the log
      list of the directory they live in (once per matching substring).

    Every visited directory gets an entry in the log mapping, even when its
    list stays empty.

    Returns:
        ``(gpu_idle_paths, logs_by_directory, gpu_util_paths)``.
    """
    import os

    idle_paths = []
    util_paths = []
    logs_by_dir = {}

    for current_dir, _subdirs, names in os.walk(dir_path):
        bucket = logs_by_dir[current_dir] = []
        for name in names:
            if 'png' in name:
                continue
            full_path = os.path.join(current_dir, name)
            if "gpu_util" in name:
                util_paths.append(full_path)
            if "gpu_idle" in name:
                idle_paths.append(full_path)
            if "main" in name:
                bucket.append(full_path)
            if "worker" in name:
                bucket.append(full_path)

    return idle_paths, logs_by_dir, util_paths

# Collect the GPU idle files, the per-directory main/worker logs and the GPU
# utilisation files found under dir_.
gpu_files, log_files, gpu_util_files = get_gpu_and_main_log_files(dir_)

# Drop directories that contain no main/worker log at all
log_files = {key: val for key, val in log_files.items() if val}

# Natural-sort the remaining directory paths
log_files_keys = natsort.natsorted(log_files)

# Natural-sort the GPU idle file paths
gpu_files = natsort.natsorted(gpu_files)

# Natural-sort the GPU utilisation file paths
gpu_util_files = natsort.natsorted(gpu_util_files)


# Natural-sort the logs inside each directory and echo what was found
for key in log_files_keys:
    log_files[key] = natsort.natsorted(log_files[key])
    print(key,log_files[key])
print(gpu_files)
# The three counts are printed so they can be compared by eye before pairing
print(len(log_files_keys), len(gpu_files), len(gpu_util_files))


# NOTE(review): zip pairs the i-th directory with the i-th idle file and the
# i-th utilisation file purely by sorted position and stops at the shortest
# list, so a missing or extra file shifts or drops pairs without an error.
for key, gpu_file, gpu_util_file in zip(log_files_keys, gpu_files, gpu_util_files):
    log_files_list = log_files[key]
    # keep only the text after the last '/' (the directory's own name)
    key = key.split('/')[-1]
    print(key)
    df_dict_everything[key] = get_everything(log_files_list, gpu_file, gpu_util_file)


In [None]:
# Maps a run-directory name (last path component) to the merged per-batch
# DataFrame returned by get_everything() for the directory below.
df_dict_everything_io_only = {}

# Root directory that this cell walks recursively for main/worker logs and
# GPU idle/utilisation files.
dir_ = "../final_analysis_cloudlab_io_badalloc_twenty"

def get_gpu_and_main_log_files(dir_path: str) -> tuple:
    """Walk ``dir_path`` and classify its files by substrings of the file name.

    File names containing ``'png'`` are ignored. Of the rest:

    * names containing ``"gpu_util"`` go to the GPU utilisation list,
    * names containing ``"gpu_idle"`` go to the GPU idle list,
    * names containing ``"main"`` and/or ``"worker"`` are appended to the log
      list of the directory they live in (once per matching substring).

    Every visited directory gets an entry in the log mapping, even when its
    list stays empty.

    Returns:
        ``(gpu_idle_paths, logs_by_directory, gpu_util_paths)``.
    """
    import os

    idle_paths = []
    util_paths = []
    logs_by_dir = {}

    for current_dir, _subdirs, names in os.walk(dir_path):
        bucket = logs_by_dir[current_dir] = []
        for name in names:
            if 'png' in name:
                continue
            full_path = os.path.join(current_dir, name)
            if "gpu_util" in name:
                util_paths.append(full_path)
            if "gpu_idle" in name:
                idle_paths.append(full_path)
            if "main" in name:
                bucket.append(full_path)
            if "worker" in name:
                bucket.append(full_path)

    return idle_paths, logs_by_dir, util_paths

# Collect the GPU idle files, the per-directory main/worker logs and the GPU
# utilisation files found under dir_.
gpu_files, log_files, gpu_util_files = get_gpu_and_main_log_files(dir_)

# Drop directories that contain no main/worker log at all
log_files = {key: val for key, val in log_files.items() if val}

# Natural-sort the remaining directory paths
log_files_keys = natsort.natsorted(log_files)

# Natural-sort the GPU idle file paths
gpu_files = natsort.natsorted(gpu_files)

# Natural-sort the GPU utilisation file paths
gpu_util_files = natsort.natsorted(gpu_util_files)


# Natural-sort the logs inside each directory and echo what was found
for key in log_files_keys:
    log_files[key] = natsort.natsorted(log_files[key])
    print(key,log_files[key])
print(gpu_files)
# The three counts are printed so they can be compared by eye before pairing
print(len(log_files_keys), len(gpu_files), len(gpu_util_files))


# NOTE(review): zip pairs the i-th directory with the i-th idle file and the
# i-th utilisation file purely by sorted position and stops at the shortest
# list, so a missing or extra file shifts or drops pairs without an error.
for key, gpu_file, gpu_util_file in zip(log_files_keys, gpu_files, gpu_util_files):
    log_files_list = log_files[key]
    # keep only the text after the last '/' (the directory's own name)
    key = key.split('/')[-1]
    print(key)
    df_dict_everything_io_only[key] = get_everything(log_files_list, gpu_file, gpu_util_file)


In [None]:
# Maps a run-directory name (last path component) to the merged per-batch
# DataFrame returned by get_everything() for the directory below.
df_dict_everything_good_alloc_only = {}

# Root directory that this cell walks recursively for main/worker logs and
# GPU idle/utilisation files.
dir_ = "../final_analysis_cloudlab_good_alloc_twenty"

def get_gpu_and_main_log_files(dir_path: str) -> tuple:
    """Walk ``dir_path`` and classify its files by substrings of the file name.

    File names containing ``'png'`` are ignored. Of the rest:

    * names containing ``"gpu_util"`` go to the GPU utilisation list,
    * names containing ``"gpu_idle"`` go to the GPU idle list,
    * names containing ``"main"`` and/or ``"worker"`` are appended to the log
      list of the directory they live in (once per matching substring).

    Every visited directory gets an entry in the log mapping, even when its
    list stays empty.

    Returns:
        ``(gpu_idle_paths, logs_by_directory, gpu_util_paths)``.
    """
    import os

    idle_paths = []
    util_paths = []
    logs_by_dir = {}

    for current_dir, _subdirs, names in os.walk(dir_path):
        bucket = logs_by_dir[current_dir] = []
        for name in names:
            if 'png' in name:
                continue
            full_path = os.path.join(current_dir, name)
            if "gpu_util" in name:
                util_paths.append(full_path)
            if "gpu_idle" in name:
                idle_paths.append(full_path)
            if "main" in name:
                bucket.append(full_path)
            if "worker" in name:
                bucket.append(full_path)

    return idle_paths, logs_by_dir, util_paths

# Collect the GPU idle files, the per-directory main/worker logs and the GPU
# utilisation files found under dir_.
gpu_files, log_files, gpu_util_files = get_gpu_and_main_log_files(dir_)

# Drop directories that contain no main/worker log at all
log_files = {key: val for key, val in log_files.items() if val}

# Natural-sort the remaining directory paths
log_files_keys = natsort.natsorted(log_files)

# Natural-sort the GPU idle file paths
gpu_files = natsort.natsorted(gpu_files)

# Natural-sort the GPU utilisation file paths
gpu_util_files = natsort.natsorted(gpu_util_files)


# Natural-sort the logs inside each directory and echo what was found
for key in log_files_keys:
    log_files[key] = natsort.natsorted(log_files[key])
    print(key,log_files[key])
print(gpu_files)
# The three counts are printed so they can be compared by eye before pairing
print(len(log_files_keys), len(gpu_files), len(gpu_util_files))


# NOTE(review): zip pairs the i-th directory with the i-th idle file and the
# i-th utilisation file purely by sorted position and stops at the shortest
# list, so a missing or extra file shifts or drops pairs without an error.
for key, gpu_file, gpu_util_file in zip(log_files_keys, gpu_files, gpu_util_files):
    log_files_list = log_files[key]
    # keep only the text after the last '/' (the directory's own name)
    key = key.split('/')[-1]
    print(key)
    df_dict_everything_good_alloc_only[key] = get_everything(log_files_list, gpu_file, gpu_util_file)


In [None]:
# Split each experiment's merged frames into per-GPU-count dictionaries.
# Membership is decided by the substring "gpu2"/"gpu3"/"gpu4" in the run name.
df_gpu2 = {run: frame for run, frame in df_dict_everything.items() if "gpu2" in run}
df_gpu3 = {run: frame for run, frame in df_dict_everything.items() if "gpu3" in run}
df_gpu4 = {run: frame for run, frame in df_dict_everything.items() if "gpu4" in run}

df_gpu2_io = {run: frame for run, frame in df_dict_everything_io_only.items() if "gpu2" in run}
df_gpu3_io = {run: frame for run, frame in df_dict_everything_io_only.items() if "gpu3" in run}
df_gpu4_io = {run: frame for run, frame in df_dict_everything_io_only.items() if "gpu4" in run}

df_gpu2_good_alloc = {
    run: frame for run, frame in df_dict_everything_good_alloc_only.items() if "gpu2" in run
}
df_gpu3_good_alloc = {
    run: frame for run, frame in df_dict_everything_good_alloc_only.items() if "gpu3" in run
}
df_gpu4_good_alloc = {
    run: frame for run, frame in df_dict_everything_good_alloc_only.items() if "gpu4" in run
}

In [None]:
def get_worker_log_files(dir_path: str) -> dict:
    """Map every directory under ``dir_path`` to the worker logs it contains.

    A file counts as a worker log when its name contains ``"worker"`` and
    does not contain ``'png'``.

    Args:
        dir_path: Root directory to walk recursively.

    Returns:
        Dict whose keys are all visited directory paths (including
        directories without any worker log, which map to an empty list) and
        whose values are lists of full worker-log paths.
    """
    import os

    log_files = {}
    for root, _dirs, files in os.walk(dir_path):
        log_files[root] = [
            os.path.join(root, file)
            for file in files
            if "worker" in file and 'png' not in file
        ]
    return log_files

In [None]:
dir_ = "../final_analysis_cloudlab_twenty"

log_files = get_worker_log_files(dir_)
print(log_files)
# Parse every worker log found under dir_; the result is keyed by the log's
# full path.
df_worker = {
    worker_log: get_batch_preprocessing_times_with_ts(worker_log)
    for logs_in_dir in log_files.values()
    for worker_log in logs_in_dir
    if "worker" in worker_log
}

In [None]:
dir_ = "../final_analysis_cloudlab_io_badalloc_twenty"

log_files = get_worker_log_files(dir_)
print(log_files)
# Parse every worker log found under dir_; the result is keyed by the log's
# full path.
df_worker_io_only = {
    worker_log: get_batch_preprocessing_times_with_ts(worker_log)
    for logs_in_dir in log_files.values()
    for worker_log in logs_in_dir
    if "worker" in worker_log
}

In [None]:
dir_ = "../final_analysis_cloudlab_good_alloc_twenty"

log_files = get_worker_log_files(dir_)
print(log_files)
# Parse every worker log found under dir_; the result is keyed by the log's
# full path.
df_worker_goodalloc_only = {
    worker_log: get_batch_preprocessing_times_with_ts(worker_log)
    for logs_in_dir in log_files.values()
    for worker_log in logs_in_dir
    if "worker" in worker_log
}

In [None]:
dir_ = "../final_analysis_cloudlab_io_twenty"

log_files = get_worker_log_files(dir_)
print(log_files)
# Parse every worker log found under dir_; the result is keyed by the log's
# full path.
df_worker_io = {
    worker_log: get_batch_preprocessing_times_with_ts(worker_log)
    for logs_in_dir in log_files.values()
    for worker_log in logs_in_dir
    if "worker" in worker_log
}

In [None]:
def calculate_idle_time(df_worker, warmup_batches=4):
    """Add an ``idle_time`` column to every worker frame, in place.

    For a batch at row ``i`` the idle time is the gap between the end of its
    preprocessing and the start of the next row's preprocessing::

        idle_time[i] = preprocessing_time_ts[i + 1] / 1e9
                       - (preprocessing_time_ts[i] / 1e9 + preprocessing_time[i])

    The first ``warmup_batches`` rows and the last row (which has no
    successor) are set to 0. Frames with ``warmup_batches`` rows or fewer
    therefore get an all-zero column instead of raising.

    Frames whose key contains ``'gpu1'`` or ``'w32'`` are skipped and do not
    receive the column.

    Rows are taken in index order; this assumes each frame's index is sorted
    ascending and unique, as produced by
    ``get_batch_preprocessing_times_with_ts``.

    Args:
        df_worker: Dict mapping a key (log path) to a DataFrame with the
            columns ``preprocessing_time_ts`` and ``preprocessing_time``.
        warmup_batches: Number of leading rows whose idle time is forced
            to 0. Defaults to 4.

    Returns:
        None. The frames in ``df_worker`` are modified in place.
    """
    ns_per_s = 10**9

    for key, df in df_worker.items():
        if 'gpu1' in key or 'w32' in key:
            continue

        start = df['preprocessing_time_ts'] / ns_per_s
        finish = start + df['preprocessing_time']
        # shift(-1) lines each row up with the start time of the next row.
        idle = (start.shift(-1) - finish).to_numpy(dtype=float, copy=True)

        idle[:warmup_batches] = 0.0
        # The last row has no next batch; slicing keeps this safe when empty.
        idle[-1:] = 0.0

        df['idle_time'] = idle

In [None]:
# Add the 'idle_time' column, in place, to every loaded set of worker frames.
for _worker_frames in (df_worker, df_worker_io_only, df_worker_goodalloc_only, df_worker_io):
    calculate_idle_time(_worker_frames)

In [None]:
# Split every set of worker frames by GPU count. Membership is decided by the
# substring "gpu2"/"gpu3"/"gpu4" in the worker-log path used as the key.
df_worker_gpu2 = {path: frame for path, frame in df_worker.items() if "gpu2" in path}
df_worker_gpu3 = {path: frame for path, frame in df_worker.items() if "gpu3" in path}
df_worker_gpu4 = {path: frame for path, frame in df_worker.items() if "gpu4" in path}

df_worker_gpu2_io_only = {path: frame for path, frame in df_worker_io_only.items() if "gpu2" in path}
df_worker_gpu3_io_only = {path: frame for path, frame in df_worker_io_only.items() if "gpu3" in path}
df_worker_gpu4_io_only = {path: frame for path, frame in df_worker_io_only.items() if "gpu4" in path}

df_worker_gpu2_good_alloc = {
    path: frame for path, frame in df_worker_goodalloc_only.items() if "gpu2" in path
}
df_worker_gpu3_good_alloc = {
    path: frame for path, frame in df_worker_goodalloc_only.items() if "gpu3" in path
}
df_worker_gpu4_good_alloc = {
    path: frame for path, frame in df_worker_goodalloc_only.items() if "gpu4" in path
}

df_worker_gpu2_io = {path: frame for path, frame in df_worker_io.items() if "gpu2" in path}
df_worker_gpu3_io = {path: frame for path, frame in df_worker_io.items() if "gpu3" in path}
df_worker_gpu4_io = {path: frame for path, frame in df_worker_io.items() if "gpu4" in path}

# Report how many entries landed in each group
print("df_gpu2", len(df_gpu2))
print("df_gpu3", len(df_gpu3))
print("df_gpu4", len(df_gpu4))
print("df_gpu2_io", len(df_gpu2_io))
print("df_gpu3_io", len(df_gpu3_io))
print("df_gpu4_io", len(df_gpu4_io))
print("df_gpu2_good_alloc", len(df_gpu2_good_alloc))
print("df_gpu3_good_alloc", len(df_gpu3_good_alloc))
print("df_gpu4_good_alloc", len(df_gpu4_good_alloc))
print("df_worker_gpu2", len(df_worker_gpu2))
print("df_worker_gpu3", len(df_worker_gpu3))
print("df_worker_gpu4", len(df_worker_gpu4))
print("df_worker_gpu2_io", len(df_worker_gpu2_io))
print("df_worker_gpu3_io", len(df_worker_gpu3_io))
print("df_worker_gpu4_io", len(df_worker_gpu4_io))
print("df_worker_gpu2_good_alloc", len(df_worker_gpu2_good_alloc))
print("df_worker_gpu3_good_alloc", len(df_worker_gpu3_good_alloc))
print("df_worker_gpu4_good_alloc", len(df_worker_gpu4_good_alloc))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from natsort import natsorted

def plot_total_idle_time(df_worker):
    """Plot total worker idle time per configuration as a bar chart and heatmap.

    The configuration of a frame is the third ``'/'``-separated component of
    its key (e.g. ``b128_gpu1_w24``); batch size, GPU count and worker count
    are sliced out of that string. The ``idle_time`` columns of all frames
    sharing a configuration are summed.

    Two figures are shown and written to the working directory:
    ``total_idle_time_bar_plot.png`` and ``total_idle_time_heatmap.png``.

    Args:
        df_worker: Dict mapping a worker-log path to a DataFrame that has an
            ``idle_time`` column.

    Returns:
        None.
    """
    # Sum idle time per frame, grouped by configuration
    config_idle_times = {}
    for key, df in df_worker.items():
        config = key.split('/')[2]
        config_idle_times.setdefault(config, []).append(df['idle_time'].sum())

    # Calculate total idle time for each configuration
    total_idle_times = {config: np.sum(times) for config, times in config_idle_times.items()}

    # Create a DataFrame for plotting
    plot_df = pd.DataFrame.from_dict(total_idle_times, orient='index', columns=['Total Idle Time'])
    plot_df['Configuration'] = plot_df.index
    plot_df['Batch Size'] = plot_df['Configuration'].apply(lambda x: x.split('_')[0][1:])
    plot_df['GPUs'] = plot_df['Configuration'].apply(lambda x: x.split('_')[1][3:])
    plot_df['Num Workers'] = plot_df['Configuration'].apply(lambda x: x.split('_')[2][1:])

    # Sort the DataFrame using natural sort for configurations
    plot_df = plot_df.loc[natsorted(plot_df.index)]

    # Set up the plot style
    sns.set_style("whitegrid")
    plt.figure(figsize=(20, 12), dpi=300)

    # Create the bar plot
    ax = sns.barplot(x='Configuration', y='Total Idle Time', data=plot_df, palette='viridis')

    # Customize the plot
    plt.title('Total Worker Idle Time per Configuration', fontsize=20, pad=20)
    plt.xlabel('Experiment Configuration', fontsize=16, labelpad=10)
    plt.ylabel('Total Idle Time (seconds)', fontsize=16, labelpad=10)
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(fontsize=12)

    # Add value labels on top of each bar
    for i, v in enumerate(plot_df['Total Idle Time']):
        ax.text(i, v, f'{v:.2f}', ha='center', va='bottom', fontweight='bold', fontsize=10)

    # Remove top and right spines
    sns.despine()

    plt.tight_layout()
    plt.savefig('total_idle_time_bar_plot.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Create a heatmap
    # Group by 'Batch Size', 'GPUs', and 'Num Workers', and take the mean of 'Total Idle Time'
    grouped_df = plot_df.groupby(['Batch Size', 'GPUs', 'Num Workers'])['Total Idle Time'].mean().reset_index()

    # Make Batch Size numeric so rows sort by value rather than as text
    grouped_df['Batch Size'] = pd.to_numeric(grouped_df['Batch Size'], errors='coerce')
    grouped_df.sort_values(by=['Batch Size', 'GPUs', 'Num Workers'], inplace=True)

    pivot_df = grouped_df.pivot(index='Batch Size', columns=['GPUs', 'Num Workers'], values='Total Idle Time')

    # Reorder the columns naturally. The data must move together with the
    # labels: assigning the sorted labels to ``pivot_df.columns`` would only
    # rename the existing columns and mislabel the heatmap.
    pivot_df = pivot_df.loc[:, natsorted(pivot_df.columns)]

    plt.figure(figsize=(16, 10), dpi=300)
    sns.heatmap(pivot_df, annot=True, fmt='.2f', cmap='YlOrRd', annot_kws={'size': 10})

    plt.title('Total Worker Idle Time: Batch Size vs (GPUs, Num Workers)', fontsize=20)
    plt.xlabel('(GPUs, Num Workers)', fontsize=16)
    plt.ylabel('Batch Size', fontsize=16)

    plt.xticks(fontsize=12, rotation=45, ha='right')
    plt.yticks(fontsize=12)

    plt.tight_layout()
    plt.savefig('total_idle_time_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()

# Usage

In [None]:
# plot_total_idle_time(df_worker_gpu2)
# plot_total_idle_time(df_worker_gpu3)
# plot_total_idle_time(df_worker_gpu4)
# plot_total_idle_time(df_worker_gpu2_io)
# plot_total_idle_time(df_worker_gpu3_io)
# plot_total_idle_time(df_worker_gpu4_io)

In [None]:
def get_e2e_times(dir_path: str) -> dict:
    """Sum the absolute ``wall(s)`` values of every lotustrace CSV under a directory.

    Every file whose name contains ``"lotustrace_log"`` is read with
    ``pandas.read_csv``. Its key is the file name with ``"lotustrace_log_"``
    and ``".log"`` removed (e.g. ``lotustrace_log_b128_gpu1_w8.log`` becomes
    ``b128_gpu1_w8``). If two files yield the same key, the one visited last
    wins.

    Args:
        dir_path: Root directory to walk recursively.

    Returns:
        Dict mapping each key to the sum of the absolute values of the file's
        ``wall(s)`` column, or to 0 when that column is missing or cannot be
        summed as numbers.
    """
    import os

    e2e_times = {}
    for root, _dirs, files in os.walk(dir_path):
        for file in files:
            if "lotustrace_log" not in file:
                continue
            key = file.replace("lotustrace_log_", "").replace(".log", "")
            # The file is a CSV despite its .log extension
            df = pd.read_csv(os.path.join(root, file))
            try:
                e2e_times[key] = df['wall(s)'].abs().sum()
            except (KeyError, TypeError):
                # Best effort: a log without a usable 'wall(s)' column counts
                # as 0 instead of aborting the scan. Unrelated failures
                # (e.g. KeyboardInterrupt) are no longer swallowed.
                e2e_times[key] = 0
    return e2e_times

In [None]:
# Directories holding the lotustrace end-to-end logs of each experiment
dir_1 = "../final_analysis_cloudlab_twenty/e2e"
dir_2 = "../final_analysis_cloudlab_io_badalloc_twenty/e2e"
dir_3 = "../final_analysis_cloudlab_good_alloc_twenty/e2e"
dir_4 = "../final_analysis_cloudlab_io_twenty/e2e"

# Load each directory's {configuration: total wall time} mapping and order its
# entries naturally by lower-cased configuration name.
e2e_times_1 = dict(
    sorted(get_e2e_times(dir_1).items(), key=lambda item: natsort.natsort_key(item[0].lower()))
)
e2e_times_2 = dict(
    sorted(get_e2e_times(dir_2).items(), key=lambda item: natsort.natsort_key(item[0].lower()))
)
e2e_times_3 = dict(
    sorted(get_e2e_times(dir_3).items(), key=lambda item: natsort.natsort_key(item[0].lower()))
)
e2e_times_4 = dict(
    sorted(get_e2e_times(dir_4).items(), key=lambda item: natsort.natsort_key(item[0].lower()))
)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import natsort
from natsort import natsorted

def plot_total_idle_time_side_by_side(df_worker, df_worker_io, e2e_times=None, e2e_times_io=None):
    """Plot normalised worker idle time of two experiments as grouped bars.

    For every configuration (third ``'/'``-separated component of a frame's
    key, e.g. ``b128_gpu1_w24``) the ``idle_time`` columns of all its frames
    are summed and divided by ``e2e time * number of workers``, where the
    worker count is parsed from the configuration string. A configuration
    present in only one of the two inputs is plotted with 0 for the other.

    The figure is shown and saved as ``idle_time_side_by_side_bar_plot.png``.

    Args:
        df_worker: Dict mapping worker-log path to a DataFrame with an
            ``idle_time`` column (first bar of each pair).
        df_worker_io: Same structure, for the second bar of each pair.
        e2e_times: Dict mapping configuration to end-to-end time used to
            normalise ``df_worker``. Defaults to the module-level
            ``e2e_times_1``.
        e2e_times_io: Dict mapping configuration to end-to-end time used to
            normalise ``df_worker_io``. Defaults to the module-level
            ``e2e_times_2``.

    Raises:
        KeyError: If a configuration has no entry in the matching e2e dict.
    """
    # Keep the historical behaviour of reading the notebook globals unless
    # the caller supplies the lookups explicitly.
    # NOTE(review): confirm e2e_times_2 is the right default for the data that
    # callers pass as df_worker_io.
    if e2e_times is None:
        e2e_times = e2e_times_1
    if e2e_times_io is None:
        e2e_times_io = e2e_times_2

    def _inefficiency_per_config(frames, e2e_lookup):
        """Return {config: total idle / (e2e time * worker count)}."""
        idle_by_config = {}
        for key, df in frames.items():
            # Extract configuration string, e.g., 'b128_gpu1_w24'
            config = key.split('/')[2]
            idle_by_config.setdefault(config, []).append(df['idle_time'].sum())
        # apply the formula total worker idle/(total E2E * num of cpu)
        return {
            config: np.sum(times) / (e2e_lookup[config] * int(config.split('_')[2][1:]))
            for config, times in idle_by_config.items()
        }

    total_idle_times_worker = _inefficiency_per_config(df_worker, e2e_times)
    total_idle_times_worker_io = _inefficiency_per_config(df_worker_io, e2e_times_io)

    # Use the union of keys from both dictionaries and sort naturally
    all_configs = natsorted(set(total_idle_times_worker.keys()).union(set(total_idle_times_worker_io.keys())))

    # Create a combined DataFrame with both idle time values for each configuration
    data = []
    for config in all_configs:
        worker_idle = total_idle_times_worker.get(config, 0)
        worker_io_idle = total_idle_times_worker_io.get(config, 0)
        # Extract additional details from configuration
        batch_size = config.split('_')[0][1:]
        gpus = config.split('_')[1][3:]
        num_workers = config.split('_')[2][1:]
        data.append({
            'Configuration': config,
            'Worker Inefficieny': worker_idle,
            'Worker Optimised Inefficieny': worker_io_idle,
            'Batch Size': batch_size,
            'GPUs': gpus,
            'Num Workers': num_workers
        })

    plot_df = pd.DataFrame(data)

    # Melt the DataFrame for side-by-side bar visualization using seaborn
    plot_df_melt = plot_df.melt(id_vars=['Configuration', 'Batch Size', 'GPUs', 'Num Workers'],
                                value_vars=['Worker Inefficieny', 'Worker Optimised Inefficieny'],
                                var_name='Task Type', value_name='Idle Time')

    # ----------------------- Bar Plot -----------------------
    sns.set_style("whitegrid")
    plt.figure(figsize=(20, 12), dpi=300)
    ax = sns.barplot(x='Configuration', y='Idle Time', hue='Task Type', data=plot_df_melt, palette='viridis')

    plt.title('Inefficieny per Configuration: Original vs Optimised', fontsize=20, pad=20)
    plt.xlabel('Experiment Configuration', fontsize=16, labelpad=10)
    plt.ylabel('Inefficieny', fontsize=16, labelpad=10)
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(fontsize=12)

    # Add value labels above each bar
    for p in ax.patches:
        height = p.get_height()
        # The x coordinate is obtained from the bar's x location and width
        ax.annotate(f'{height:.2f}',
                    (p.get_x() + p.get_width() / 2, height),
                    ha='center', va='bottom', fontsize=10, fontweight='bold')

    sns.despine()
    plt.tight_layout()
    plt.savefig('idle_time_side_by_side_bar_plot.png', dpi=300, bbox_inches='tight')
    plt.show()

# Draw the side-by-side plot once per GPU count; every call overwrites the
# same output file, idle_time_side_by_side_bar_plot.png.
# NOTE(review): the function normalises its second argument with the global
# e2e_times_2, which is loaded from the io_badalloc e2e directory, while
# df_worker_gpu*_io are parsed from final_analysis_cloudlab_io_twenty (whose
# e2e times are in e2e_times_4) - confirm this pairing is intended.
plot_total_idle_time_side_by_side(df_worker_gpu2, df_worker_gpu2_io)
plot_total_idle_time_side_by_side(df_worker_gpu3, df_worker_gpu3_io)
plot_total_idle_time_side_by_side(df_worker_gpu4, df_worker_gpu4_io)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from natsort import natsorted

def _normalized_idle_times(frames, e2e_times, skip_tokens=('b256', 'b1024')):
    """Return the normalized total idle time for each configuration.

    Args:
        frames: Mapping from a '/'-separated key to a DataFrame that has an
            ``idle_time`` column. The third path component of the key
            (``key.split('/')[2]``) is used as the configuration name.
        e2e_times: Mapping from configuration name to the value the summed
            idle time is divided by (together with the worker count).
        skip_tokens: Substrings; any key containing one of them is ignored.

    Returns:
        Dict mapping configuration name to
        ``sum(idle_time) / (e2e_times[config] * num_workers)``, where
        ``num_workers`` is parsed from the third ``_``-separated field of the
        configuration name with its first character stripped.

    Raises:
        KeyError: If a configuration is missing from ``e2e_times``.
    """
    per_config_sums = {}
    for key, df in frames.items():
        if any(token in key for token in skip_tokens):
            continue
        config = key.split('/')[2]
        per_config_sums.setdefault(config, []).append(df['idle_time'].sum())

    return {
        config: np.sum(sums) / (e2e_times[config] * int(config.split('_')[2][1:]))
        for config, sums in per_config_sums.items()
    }


def plot_total_idle_time_side_by_side(df_worker, df_worker_io_only, df_worker_goodalloc, df_worker_io, e2e_times_1, e2e_times_2, e2e_times_3, e2e_times_4):
    """Plot normalized worker idle time per configuration for four pipelines.

    Each ``df_*`` argument maps a '/'-separated key to a DataFrame with an
    ``idle_time`` column; each ``e2e_times_*`` argument maps a configuration
    name to the normalizer used for the matching ``df_*`` argument (same
    positional order). Keys containing ``b256`` or ``b1024`` are skipped for
    every input, and keys containing ``w32`` are additionally skipped for
    ``df_worker_goodalloc``.

    Configuration names are split on ``_`` into three fields whose prefixes are
    stripped (1, 3 and 1 characters) to obtain batch size, GPU count and worker
    count -- presumably names look like ``b512_gpu3_w16``; verify against the
    code that builds the keys.

    A configuration missing from one pipeline is plotted with a value of 0 for
    that pipeline. The figure is shown with ``plt.show()``; nothing is returned.
    """
    total_idle_times_worker = _normalized_idle_times(df_worker, e2e_times_1)
    total_idle_times_worker_io_only = _normalized_idle_times(df_worker_io_only, e2e_times_2)
    total_idle_times_worker_goodalloc = _normalized_idle_times(
        df_worker_goodalloc, e2e_times_3, skip_tokens=('b256', 'b1024', 'w32')
    )
    total_idle_times_worker_io = _normalized_idle_times(df_worker_io, e2e_times_4)

    # Union over ALL four pipelines (the io-only results must be included so
    # that configurations present only there still get a bar group), sorted
    # naturally so numeric parts of the names order as numbers.
    all_configs = natsorted(
        set(total_idle_times_worker)
        | set(total_idle_times_worker_io_only)
        | set(total_idle_times_worker_goodalloc)
        | set(total_idle_times_worker_io)
    )

    # One row per configuration holding the value of every pipeline.
    data = []
    for config in all_configs:
        fields = config.split('_')
        data.append({
            'Configuration': config,
            'Original Pipeline': total_idle_times_worker.get(config, 0),
            'Optimised Worker Assignment': total_idle_times_worker_goodalloc.get(config, 0),
            'Optimised Batch Pinning': total_idle_times_worker_io_only.get(config, 0),
            'Optimised Worker Assignment + Batch Pinning': total_idle_times_worker_io.get(config, 0),
            'Batch Size': fields[0][1:],
            'GPUs': fields[1][3:],
            'Num Workers': fields[2][1:]
        })

    plot_df = pd.DataFrame(data)

    # Position of the first configuration whose name contains 'b512'; None when
    # there is no such configuration.
    transition_index = next((i for i, k in enumerate(all_configs) if 'b512' in k), None)

    # Melt the DataFrame for side-by-side bar visualization using seaborn
    plot_df_melt = plot_df.melt(
        id_vars=['Configuration', 'Batch Size', 'GPUs', 'Num Workers'],
        value_vars=['Original Pipeline', 'Optimised Worker Assignment', 'Optimised Batch Pinning', 'Optimised Worker Assignment + Batch Pinning'],
        var_name='Pipeline Type', value_name='Idle Time'
    )

    # ----------------------- Bar Plot -----------------------
    sns.set_style("whitegrid")
    plt.figure(figsize=(20, 12), dpi=1200)
    ax = sns.barplot(
        x='Configuration', y='Idle Time', hue='Pipeline Type',
        data=plot_df_melt, palette='viridis'
    )

    plt.title('Worker Inefficiency per Configuration: Original vs Optimised Worker Assignment vs Optimised Batch Pinning vs Optimised Worker Assignment + Batch Pinning', fontsize=20, pad=20)
    plt.xlabel('Experiment Configuration', fontsize=16, labelpad=10)
    plt.ylabel('Normalized Worker Idle Time', fontsize=16, labelpad=10)
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(fontsize=12)

    # Add value labels above each bar
    for p in ax.patches:
        height = p.get_height()
        ax.annotate(f'{height:.2f}',
                    (p.get_x() + p.get_width() / 2, height),
                    ha='center', va='bottom', fontsize=10, fontweight='bold')

    # Dashed separator just left of the first 'b512' group. Skipped when no
    # configuration matches, instead of failing on `None - 0.5`.
    if transition_index is not None:
        ax.axvline(x=transition_index - 0.5, color='black', linestyle='--', linewidth=2)

    sns.despine()
    # plt.tight_layout()
    # plt.savefig('idle_time_side_by_side_bar_plot.png', dpi=300, bbox_inches='tight')
    plt.show()
# Plot the four-pipeline comparison for the gpu3 data set. The data dictionaries
# and e2e_times_* mappings passed here are not defined in this cell; presumably
# earlier cells build them — confirm before running.
# NOTE(review): the commented-out gpu2/gpu4 calls below pass six arguments, but
# plot_total_idle_time_side_by_side as defined in this cell takes eight (four
# data dictionaries followed by four e2e-time mappings); update them before
# re-enabling.
# plot_total_idle_time_side_by_side(df_worker_gpu2, df_worker_gpu2_io, df_worker_gpu2_good_alloc, e2e_times_1, e2e_times_2, e2e_times_3)
plot_total_idle_time_side_by_side(df_worker_gpu3, df_worker_gpu3_io_only, df_worker_gpu3_good_alloc, df_worker_gpu3_io, e2e_times_1, e2e_times_2, e2e_times_3, e2e_times_4)
# plot_total_idle_time_side_by_side(df_worker_gpu4, df_worker_gpu4_io, df_worker_gpu4_good_alloc, e2e_times_1, e2e_times_2, e2e_times_3)