In [1]:
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np
import os
import re

def get_files(dir):
    """Return the paths of all entries in a directory, sorted by entry name.

    Args:
        dir: Path of the directory to list. The name shadows the builtin
            ``dir``; it is kept so existing keyword callers keep working.

    Returns:
        A list with ``os.path.join(dir, entry)`` for every entry reported by
        ``os.listdir(dir)``, in sorted order. An empty directory yields ``[]``.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. the directory is missing).
    """
    # os.listdir yields entries in arbitrary order; sort so that plots built
    # from this list come out the same on every run and machine.
    return [os.path.join(dir, entry) for entry in sorted(os.listdir(dir))]

def threads_from_filename(name):
    """Extract the thread count encoded in a metrics file name.

    The count is the run of digits in a ``_<digits>t`` token that is directly
    followed by ``.`` or ``_``, e.g. ``metrics_yolov8n_b8_cpu_12t.csv`` -> 12.
    Because the pattern starts with a greedy ``.*``, the last such token in
    ``name`` wins when several are present.

    Args:
        name: File name or path to inspect.

    Returns:
        The thread count as an ``int``.

    Raises:
        ValueError: If ``name`` contains no ``_<digits>t`` token.
    """
    m = re.match(r'.*_(\d+)t[\._]', name)
    if m is None:
        # Fail with a readable message instead of AttributeError on None.group
        raise ValueError(f'no thread count ("_<N>t") found in file name: {name!r}')
    return int(m.group(1))

def batchsize_from_filename(name):
    """Extract the batch size encoded in a metrics file name.

    The batch size is the run of digits in a ``_b<digits>`` token that is
    directly followed by ``.`` or ``_``, e.g.
    ``metrics_yolov8n_b16_cpu_12t.csv`` -> 16. Because the pattern starts with
    a greedy ``.*``, the last such token in ``name`` wins when several exist.

    Args:
        name: File name or path to inspect.

    Returns:
        The batch size as an ``int``.

    Raises:
        ValueError: If ``name`` contains no ``_b<digits>`` token.
    """
    m = re.match(r'.*_b(\d+)[\._]', name)
    if m is None:
        # Fail with a readable message instead of AttributeError on None.group
        raise ValueError(f'no batch size ("_b<N>") found in file name: {name!r}')
    return int(m.group(1))

def read_metrics(file_path):
    """Load a per-batch timing CSV and derive durations, per-frame times and FPS.

    The CSV must contain the columns ``inference_start``, ``model``,
    ``non_max_suppression``, ``total_time_us`` and ``batch_size``.

    Processing steps:
      1. ``total_time_us`` and ``model`` become offsets from ``inference_start``;
         ``non_max_suppression`` becomes its offset from ``model``.
      2. ``inference_start`` is made relative to the first row.
      3. The first four columns are integer-divided by 1000.
      4. Rows whose ``model`` value is at or above the 99th percentile are dropped.
      5. ``model_per_frame``, ``total_per_frame`` (integer division by
         ``batch_size``) and ``fps`` (``1e6 / total_per_frame``) are added.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        A new ``DataFrame`` with the transformed and derived columns. The
        original row labels of the surviving rows are kept (no index reset).
    """
    metrics = pd.read_csv(file_path)

    # Convert raw readings into durations. Order matters: each line only reads
    # columns that have not been overwritten yet.
    metrics['total_time_us'] = metrics['total_time_us'] - metrics['inference_start']
    metrics['non_max_suppression'] = metrics['non_max_suppression'] - metrics['model']
    metrics['model'] = metrics['model'] - metrics['inference_start']

    # Express start times relative to the first recorded batch.
    metrics['inference_start'] = metrics['inference_start'] - metrics['inference_start'].iloc[0]

    # Scale the first four columns down by a factor of 1000.
    # NOTE(review): presumably ns -> µs, and this assumes the four timing
    # columns are the first four columns of the CSV — confirm against the writer.
    metrics.iloc[:, :4] = metrics.iloc[:, :4] // 1000

    # Drop the slowest batches (model time >= 99th percentile). Take an explicit
    # copy so the column assignments below act on an independent frame instead
    # of a slice of the unfiltered one (avoids SettingWithCopyWarning).
    metrics = metrics[metrics['model'] < metrics['model'].quantile(0.99)].copy()

    metrics['model_per_frame'] = metrics['model'] // metrics['batch_size']
    metrics['total_per_frame'] = metrics['total_time_us'] // metrics['batch_size']
    # total_per_frame is treated as microseconds here, hence the 1_000_000.
    metrics['fps'] = 1 / (metrics['total_per_frame'] / 1_000_000)
    return metrics

def create_histograms(metrics_list, name_list, title, metric_name):
    """Build a stacked figure with one histogram row per metrics frame.

    Args:
        metrics_list: Sequence of frames; each must be indexable by
            ``metric_name``.
        name_list: Trace names, paired positionally with ``metrics_list``.
        title: Figure title.
        metric_name: Column to plot; also used as the bottom x-axis title.

    Returns:
        The figure created by ``make_subplots``. Every row holds a 200-bin
        histogram plus two vertical marker lines (median and average), and the
        subplot titles are the row positions ``'0'``, ``'1'``, ...
    """
    n_rows = len(metrics_list)
    fig = make_subplots(
        rows=n_rows,
        cols=1,
        shared_xaxes=True,
        vertical_spacing=0.02,
        subplot_titles=[str(position) for position in range(n_rows)],
    )

    # Annotation settings shared by the median and the average marker.
    label_style = dict(annotation_bgcolor='rgba(255,255,255,0.5)', annotation_xshift=3)

    for row, (frame, trace_name) in enumerate(zip(metrics_list, name_list), start=1):
        values = frame[metric_name]
        median = np.median(values)
        mean = np.average(values)

        fig.add_histogram(x=values, row=row, col=1, nbinsx=200, name=trace_name)
        fig.add_vline(
            x=median,
            row=row,
            col=1,
            annotation_text=f'med. {round(median, 2)}',
            line_color="grey",
            **label_style,
        )
        # The average label is shifted down so it does not cover the median label.
        fig.add_vline(
            x=mean,
            row=row,
            col=1,
            annotation_text=f'avg. {round(mean, 2)}',
            annotation_yshift=-20,
            **label_style,
        )

    fig.update_layout(height=100 * n_rows + 150, title=title)
    fig.update_xaxes(title_text=metric_name, row=n_rows)
    return fig

    

In [13]:
# Batch-size sweep: one histogram row per batch size.
METRIC_NAME = 'total_per_frame'
TITLE = 'Inference time histogram - Deriving optimal batch size'

FILES = [
    '../results/threads_batchsize/metrics_yolov8n_b1_cpu_12t.csv',
    '../results/threads_batchsize/metrics_yolov8n_b2_cpu_12t.csv',
    '../results/threads_batchsize/metrics_yolov8n_b4_cpu_12t.csv',
    '../results/threads_batchsize/metrics_yolov8n_b8_cpu_12t.csv',
    '../results/threads_batchsize/metrics_yolov8n_b16_cpu_12t.csv',
    '../results/threads_batchsize/metrics_yolov8n_b32_cpu_12t.csv',
    '../results/threads_batchsize/metrics_yolov8n_b64_cpu_12t.csv',
]
METRICS_LIST = list(map(read_metrics, FILES))

n_rows = len(METRICS_LIST)
fig = make_subplots(
    rows=n_rows,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.02,
    row_titles=[f'bsize {batchsize_from_filename(path)}' for path in FILES],
)

# Annotation settings shared by the median and the average marker.
label_style = dict(annotation_bgcolor='rgba(255,255,255,0.5)', annotation_xshift=3)

for row, (frame, path) in enumerate(zip(METRICS_LIST, FILES), start=1):
    values = frame[METRIC_NAME]
    median = np.median(values)
    mean = np.average(values)

    fig.add_histogram(x=values, row=row, col=1, nbinsx=200, name=path)
    fig.add_vline(
        x=median,
        row=row,
        col=1,
        annotation_text=f'med. {round(median, 2)}',
        line_color="grey",
        **label_style,
    )
    # The average label is shifted down so it does not cover the median label.
    fig.add_vline(
        x=mean,
        row=row,
        col=1,
        annotation_text=f'avg. {round(mean, 2)}',
        annotation_yshift=-20,
        **label_style,
    )

fig.update_layout(height=100 * n_rows + 150, width=1200, title=TITLE, showlegend=False)
fig.update_xaxes(title_text='time per frame [µs]', row=n_rows)
fig.update_yaxes(title_text='#')
fig.show()

# Export the figure as a 4x-scaled PNG and as a PDF.
fig.write_image('optimal_batch_size.png', scale=4)
fig.write_image('optimal_batch_size.pdf')

# Comment
# Environment: 12c/12t on Intel® Xeon® Platinum 8480+; 640x640px infer. size; YOLOv8n; IPEX enabled; float32; 9000 individual images
# Data: one data point represents one batch inference; n = 9000 / batch size; time per frame = inference time / batch size; inference time = model runtime + NMS; batches at or above the 99th percentile of model runtime (the slowest ~1%) are removed from the data


In [14]:
# Thread-count sweep at batch size 8: one histogram row per thread count.
METRIC_NAME = 'total_per_frame'
TITLE = 'Inference time histogram - Deriving optimal level of parallelism'

FILES = [
    '../results/threads_batchsize/metrics_yolov8n_b8_cpu_12t.csv',
    '../results/threads_batchsize/metrics_yolov8n_b8_cpu_24t.csv',
    '../results/threads_batchsize/metrics_yolov8n_b8_cpu_36t.csv',
    '../results/threads_batchsize/metrics_yolov8n_b8_cpu_48t.csv',
]
METRICS_LIST = list(map(read_metrics, FILES))

n_rows = len(METRICS_LIST)
fig = make_subplots(
    rows=n_rows,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.02,
    row_titles=[f'{threads_from_filename(path)}c/t' for path in FILES],
)

# Annotation settings shared by the median and the average marker.
label_style = dict(annotation_bgcolor='rgba(255,255,255,0.5)', annotation_xshift=3)

for row, (frame, path) in enumerate(zip(METRICS_LIST, FILES), start=1):
    values = frame[METRIC_NAME]
    median = np.median(values)
    mean = np.average(values)

    fig.add_histogram(x=values, row=row, col=1, nbinsx=200, name=path)
    fig.add_vline(
        x=median,
        row=row,
        col=1,
        annotation_text=f'med. {round(median, 2)}',
        line_color="grey",
        **label_style,
    )
    # The average label is shifted down so it does not cover the median label.
    fig.add_vline(
        x=mean,
        row=row,
        col=1,
        annotation_text=f'avg. {round(mean, 2)}',
        annotation_yshift=-20,
        **label_style,
    )

fig.update_layout(height=100 * n_rows + 150, width=1200, title=TITLE, showlegend=False)
fig.update_xaxes(title_text='time per frame [µs]', row=n_rows)
fig.update_yaxes(title_text='#')
fig.show()

# Export the figure as a 4x-scaled PNG and as a PDF.
fig.write_image('optimal_parallelism.png', scale=4)
fig.write_image('optimal_parallelism.pdf')

# Comment
# Environment: Intel® Xeon® Platinum 8480+; 640x640px infer. size; YOLOv8n; IPEX enabled; float32; 9000 individual images
# Data: one data point represents one batch inference; n = 9000 / batch size; time per frame = inference time / batch size; inference time = model runtime + NMS; batches at or above the 99th percentile of model runtime (the slowest ~1%) are removed from the data
# Conclusion: Total throughput scales with parallelism (at the PyTorch level), but far from linearly, so lower thread/core counts are more efficient (i.e. they yield higher throughput per thread).
