In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os,natsort
import argparse
import seaborn as sns

plt.rc('axes', labelsize=80)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=70)    # fontsize of the tick labels
plt.rc('ytick', labelsize=70)    # fontsize of the tick labels
plt.rc('legend', fontsize=60)    # legend fontsize


In [3]:
data_dir = '/home/mayurpl/sem_2/special_problems/ml_profiling/pace_logs/results'
fig_dir = '/home/mayurpl/sem_2/special_problems/ml_profiling/ml-pipeline-benchmark/code/image_classification/analysis/custom_logs/figs'
plot_type = 'box' # or 'box'

In [4]:
def plotter_preprocessing_time(target_dir,fig_size=(50,25), plot_type = 'box',fig_dir=''):
    
    root_to_files = {}
    for root, dirs, files in os.walk(target_dir):
        root_to_files[root] = files
    roots = sorted(root_to_files, key=lambda x: natsort.natsort_key(x.lower()))
    plot_df = pd.DataFrame()
    plot_dfs = []
    configs = []
    for root in roots:
        if 'e2e' in root:
            continue
        print(root)
        files = root_to_files[root]
        config_df = pd.DataFrame()

        # if 'b512' not in root:
        #     continue

        for file in files:
            if "worker_pid" not in file:
                continue

            df = pd.read_csv(os.path.join(root, file)
                            , header=None)

            # add header
            df.columns = ['name','start_ts','duration']

            # names that start with 'SBatchPreprocessed'
            df = df[df['name'].str.startswith('SBatchPreprocessed')]
            # map 'SBatchPreprocessed_' such that 'SBatchPreprocessed_idx' becomes 'idx' where idx is an integer
            df['batch_id'] = df['name'].map(lambda x: int(x.replace('SBatchPreprocessed_','')))


            # divide by 1000000 to convert from nanoseconds to milliseconds
            df['duration'] = df['duration']/1000000

            batch_config = root.split('/')[-1] 

            # add batch_config to configs
            if batch_config not in configs:
                configs.append(batch_config)
            config_df = pd.concat([config_df, df])
            
        # concatentate all dataframes
        if config_df.empty:
            continue

        
        # rename 'duration' to 'preprocessing_time'
        config_df = config_df.rename(columns={'duration':batch_config})

        # drop columns 'name' and 'start_ts'
        config_df = config_df.drop(columns=['name','batch_id','start_ts'])
        # reset index
        config_df = config_df.reset_index(drop=True)
        # get first quartile of config_df
        q1 = config_df.quantile(0.25)
        q3 = config_df.quantile(0.75)
        iqr = (q3 - q1) * 2

        # remove outliers less than q1 - iqr only (these are numbers from last batch which has
        #  elements less than batch size because elements in a dataset may not be a multiple of batch size)

        config_df = config_df[~(config_df < (q1 - iqr))]

        plot_dfs.append(config_df)
        
    plot_df = pd.concat(plot_dfs, axis=1, ignore_index=True)


    # reset index of plot_df
    plot_df = plot_df.reset_index(drop=True)

    # set column names to configs
    plot_df.columns = configs

    # remove points with value less than 1500
    # These points belong to the drop out type batch
    # plot_df = plot_df[plot_df > 1500]

    if plot_type == 'box':
        # box plot
        # plot_df.boxplot(figsize=fig_size)
        # make box size bigger and lines thicker
        plot_df.boxplot(figsize=fig_size, medianprops=dict(linestyle='-', linewidth=5),\
                        boxprops=dict(linestyle='-', linewidth=5),\
                        whiskerprops=dict(linestyle='-', linewidth=7),\
                        capprops=dict(linestyle='-', linewidth=7),\
                        flierprops=dict(marker='o', markersize=20,  # fill it with color no empty circles
                                        markerfacecolor='r',
                                        linestyle='none', markeredgecolor='g'))

    elif plot_type == 'violin':
        # violin plot
        sns.violinplot(data=plot_df)
    else:
        raise ValueError(f'plot_type {plot_type} not supported')

    # plot on log scale
    # plt.yscale('log')

    plt.gcf().set_size_inches(fig_size[0], fig_size[1])
    # label x axis
    plt.xlabel(f'Configurations', labelpad=40)
    # label y axis
    plt.ylabel('Preprocessing time in ms',labelpad=40)
    plt.tick_params(axis='y', which='major', pad=40) 
    plt.tick_params(axis='x', which='major', pad=40) 
    # rotate x ticks
    plt.xticks(rotation=90)
    # add legend to bottom right and increase size of legend markers
    plt.legend(loc='lower right',markerscale=4)
    plt.tight_layout() 
    fig_path = os.path.join(fig_dir, f'zzzbox_plot_batch_preprocessing_time.png') 
    plt.savefig(fig_path)
    plt.clf()

In [5]:
plotter_preprocessing_time(data_dir,fig_dir=fig_dir,plot_type=plot_type)

/home/mayurpl/sem_2/special_problems/ml_profiling/pace_logs/results
/home/mayurpl/sem_2/special_problems/ml_profiling/pace_logs/results/b128_gpu1
/home/mayurpl/sem_2/special_problems/ml_profiling/pace_logs/results/b128_gpu2
/home/mayurpl/sem_2/special_problems/ml_profiling/pace_logs/results/b128_gpu3
/home/mayurpl/sem_2/special_problems/ml_profiling/pace_logs/results/b128_gpu4
/home/mayurpl/sem_2/special_problems/ml_profiling/pace_logs/results/b256_gpu1
/home/mayurpl/sem_2/special_problems/ml_profiling/pace_logs/results/b256_gpu2
/home/mayurpl/sem_2/special_problems/ml_profiling/pace_logs/results/b256_gpu3
/home/mayurpl/sem_2/special_problems/ml_profiling/pace_logs/results/b256_gpu4
/home/mayurpl/sem_2/special_problems/ml_profiling/pace_logs/results/b512_gpu1
/home/mayurpl/sem_2/special_problems/ml_profiling/pace_logs/results/b512_gpu2
/home/mayurpl/sem_2/special_problems/ml_profiling/pace_logs/results/b512_gpu3
/home/mayurpl/sem_2/special_problems/ml_profiling/pace_logs/results/b512_g

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


<Figure size 5000x2500 with 0 Axes>