In [None]:
import json
import os
import sys

sys.path.append('../')
sys.path.append('../../')

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

## Read benchmarking output files
Responses with errors will be skipped.

In [None]:
def read_json_files(folder_path):
    """Load the successful request records from every results file in a folder.

    Only files whose name ends with 'individual_responses.json' are read.
    Each file is parsed as JSON and iterated as a sequence of per-request
    dicts. Files that are not valid JSON are reported on stdout and skipped.

    Args:
        folder_path: Directory containing the benchmarking output files.

    Returns:
        A list with one entry per successfully parsed results file, in sorted
        filename order. Each entry is a list of the request dicts whose
        'error_code' is None (or absent), each extended with a 'filename' key
        holding the name of the file it came from.
    """
    data = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # result reproducible across machines and runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('individual_responses.json'):
            continue
        file_path = os.path.join(folder_path, filename)

        # JSON text is UTF-8, so do not depend on the locale's default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                json_data = json.load(file)
            except json.JSONDecodeError as e:
                print(f"Error reading {file_path}: {e}")
                continue

        # Keep only the responses without an error code; .get() treats a record
        # that lacks the key as error-free instead of raising KeyError.
        data.append([
            {**request_response, 'filename': filename}
            for request_response in json_data
            if request_response.get('error_code') is None
        ])
    return data

In [None]:
# Specify the paths to the results directories of each provider.
# Each directory is scanned for '*individual_responses.json' files, and the
# last path segment (e.g. 'provider1') is used as the provider name, so it
# should match the keys of the `palette` dict used by the plotting cells.
provider_results_dir_paths = [
    '../data/results/path/provider1',
    '../data/results/path/provider2',
    '../data/results/path/provider3',
]

In [None]:
# Load the successful responses of every provider. Each entry is a dict with
# the provider name and the per-file response lists from read_json_files().
all_responses = []
for provider_path in provider_results_dir_paths:
    # normpath() drops any trailing separator, so basename() never returns an
    # empty provider name (which a plain split('/')[-1] does for 'dir/').
    provider_identifier = os.path.basename(os.path.normpath(provider_path))
    all_responses.append({
        'provider': provider_identifier,
        'responses': read_json_files(provider_path),
    })

In [None]:
# Per-request metrics that are summarised (median / std) for each results file
METRIC_COLUMNS = [
    'server_ttft_s',
    'server_output_token_per_s_per_request',
    'server_end_to_end_latency_s',
    'client_ttft_s',
    'client_output_token_per_s_per_request',
    'client_end_to_end_latency_s',
]

# Run parameters parsed from the filename -> short column name used in `results`
PARAMETER_COLUMN_NAMES = {
    'provider': 'provider',
    'model': 'model',
    'input_tokens': 'input',
    'output_tokens': 'output',
    'concurrent_requests': 'concurrent',
}


def _parse_run_parameters(filename):
    """Extract the run parameters encoded in a results filename.

    The filename is split on '_' and fields 3-6 are read as model name, input
    tokens, output tokens and concurrent requests.
    NOTE(review): a model name that itself contains '_' would shift these
    fields - confirm against the benchmark's file naming.

    Raises:
        IndexError / ValueError: if the filename does not have that layout.
    """
    parts = filename.split('_')
    return {
        'model': parts[3],
        'input_tokens': int(parts[4]),
        'output_tokens': int(parts[5]),
        'concurrent_requests': int(parts[6]),
    }


def _summarize_provider_run(run):
    """Aggregate one provider's responses into one summary row per results file.

    Args:
        run: Dict with 'provider' (name) and 'responses' (list of per-file
            lists of request dicts, each carrying a 'filename' key).

    Returns:
        A DataFrame indexed by filename with the run parameters, the number of
        requests and '<metric>_median' / '<metric>_std' columns, or None when
        the provider has no responses at all.
    """
    records = []
    for requests_from_file in run['responses']:
        for request_metrics in requests_from_file:
            filename = request_metrics['filename']
            record = {'provider': run['provider'], 'filename': filename}
            record.update(_parse_run_parameters(filename))
            # .get() leaves a metric the response does not contain as missing
            record.update({name: request_metrics.get(name) for name in METRIC_COLUMNS})
            records.append(record)

    if not records:
        return None

    df_metrics = pd.DataFrame(records)
    # A metric that is None for every request would produce an object-dtype
    # column, which median/std may be unable to aggregate; coerce to float
    # (missing -> NaN) first.
    df_metrics[METRIC_COLUMNS] = df_metrics[METRIC_COLUMNS].apply(pd.to_numeric, errors='coerce')

    per_file = df_metrics.groupby(by='filename')

    # Statistics: flatten the (metric, statistic) column MultiIndex to 'metric_statistic'
    df_metric_stats = per_file[METRIC_COLUMNS].agg(['median', 'std'])
    df_metric_stats.columns = ['_'.join(col) for col in df_metric_stats.columns]

    # Parameters are constant within a file, so the first value is representative
    df_parameters = per_file[list(PARAMETER_COLUMN_NAMES)].first().rename(columns=PARAMETER_COLUMN_NAMES)

    request_count = per_file.size().rename('request_count')

    # Merge parameters, request count and statistics side by side
    return pd.concat([df_parameters, request_count, df_metric_stats], axis=1)


run_stats = []
for run in all_responses:
    run_summary = _summarize_provider_run(run)
    if run_summary is None:
        print(f"No successful responses found for provider '{run['provider']}', skipping it.")
        continue
    run_stats.append(run_summary)

if not run_stats:
    raise ValueError('No successful responses were found for any provider; check provider_results_dir_paths.')

results = pd.concat(run_stats)
results.head()

## Filter runs based on analysis objective (Optional)

In this example, we keep only the Llama 3.1 70B runs with 100, 1,000 or 10,000 input tokens and with 1 or 10 concurrent requests.

In [None]:
# Lower-case the model names so the substring match below is case-insensitive
results['model'] = results['model'].str.lower()

# One boolean mask per filter criterion
is_llama_70b = results['model'].str.contains('llama-3-1-70b', regex=False)
has_selected_input = results['input'].isin([100, 1_000, 10_000])
has_selected_concurrency = results['concurrent'].isin([1, 10])

# .copy() makes the filtered frame independent of `results`, so adding columns
# to it later never operates on a slice of the original frame.
results_70b = results[is_llama_70b & has_selected_input & has_selected_concurrency].copy()
results_70b.head()

## Plot benchmarking charts among providers

You may change the palette so that each provider is shown in the color that best identifies it; see the [seaborn color palette tutorial](https://seaborn.pydata.org/tutorial/color_palettes.html) for reference. You will also need to update the suptitle of each figure to reflect the model being shown and any other relevant detail.

### TTFT

In [None]:
# Palette for the providers, change colors and provider names as needed
palette = {'provider1': 'orange', 'provider2': 'tab:purple', 'provider3': 'xkcd:blue'}

# Get unique concurrent values, in ascending order (one subplot row per value)
concurrent_values = results_70b['concurrent'].unique()
concurrent_values.sort()

# Set up the figure and axes.
# squeeze=False always returns a 2-D array of Axes; without it a single
# concurrency level yields a bare Axes object that cannot be iterated below.
fig, axes = plt.subplots(
    len(concurrent_values), 1,
    figsize=(10, 5 * len(concurrent_values)),
    sharex=False,
    squeeze=False,
)

# Add a supertitle, it could be the model name
fig.suptitle('<<Model_name>>', fontsize=20)

# Plot each concurrent value: median client TTFT per input size, one bar per provider
for ax, concurrent in zip(axes.ravel(), concurrent_values):
    subset = results_70b[results_70b['concurrent'] == concurrent]
    sns.barplot(data=subset, x='input', y='client_ttft_s_median', hue='provider', ax=ax, palette=palette, errorbar=None)
    ax.set_title(f'Client TTFT (s) for Concurrent Requests: {concurrent}')
    ax.set_xlabel('Input Tokens')
    ax.set_ylabel('Client TTFT (s)')
    # Footnote centred below the x axis (coordinates are fractions of the axes box)
    ax.annotate('Note: A lower TTFT is better',
                xy=(0.5, -0.2),
                xycoords='axes fraction',
                ha='center',
                va="center",
                fontsize=10)

plt.tight_layout()
plt.show()

### Latency

In [None]:
# Get unique concurrent values, in ascending order (one subplot row per value)
concurrent_values = results_70b['concurrent'].unique()
concurrent_values.sort()

# Set up the figure and axes.
# squeeze=False always returns a 2-D array of Axes; without it a single
# concurrency level yields a bare Axes object that cannot be iterated below.
fig, axes = plt.subplots(
    len(concurrent_values), 1,
    figsize=(10, 5 * len(concurrent_values)),
    sharex=False,
    squeeze=False,
)

# Add a supertitle, it could be the model name
fig.suptitle('<<Model_name>>', fontsize=20)

# Plot each concurrent value: median client end-to-end latency per input size, one bar per provider
for ax, concurrent in zip(axes.ravel(), concurrent_values):
    subset = results_70b[results_70b['concurrent'] == concurrent]
    sns.barplot(data=subset, x='input', y='client_end_to_end_latency_s_median', hue='provider', ax=ax, palette=palette, errorbar=None)
    ax.set_title(f'Client E2E Latency (s) for Concurrent Requests: {concurrent}')
    ax.set_xlabel('Input Tokens')
    ax.set_ylabel('Client E2E Latency (s)')
    # Footnote centred below the x axis (coordinates are fractions of the axes box)
    ax.annotate('Note: A lower Latency is better',
                xy=(0.5, -0.2),
                xycoords='axes fraction',
                ha='center',
                va="center",
                fontsize=10)


plt.tight_layout()
plt.show()

### Tokens / sec

In [None]:
# Get unique concurrent values, in ascending order (one subplot row per value)
concurrent_values = results_70b['concurrent'].unique()
concurrent_values.sort()

# Set up the figure and axes.
# squeeze=False always returns a 2-D array of Axes; without it a single
# concurrency level yields a bare Axes object that cannot be iterated below.
fig, axes = plt.subplots(
    len(concurrent_values), 1,
    figsize=(10, 5 * len(concurrent_values)),
    sharex=False,
    squeeze=False,
)

# Add a supertitle, it could be the model name
fig.suptitle('<<Model_name>>', fontsize=20)

# Plot each concurrent value: median client output tokens/sec per request, one bar per provider
for ax, concurrent in zip(axes.ravel(), concurrent_values):
    subset = results_70b[results_70b['concurrent'] == concurrent]
    sns.barplot(data=subset, x='input', y='client_output_token_per_s_per_request_median', hue='provider', ax=ax, palette=palette, errorbar=None)
    ax.set_title(f'Client Tokens/sec per request for Concurrent Requests: {concurrent}')
    ax.set_xlabel('Input Tokens')
    ax.set_ylabel('Client Tokens/sec per request')
    # Footnote centred below the x axis (coordinates are fractions of the axes box)
    ax.annotate('Note: higher Tokens/sec is better',
                xy=(0.5, -0.2),
                xycoords='axes fraction',
                ha='center',
                va="center",
                fontsize=10)


plt.tight_layout()
plt.show()

### Throughput

In [None]:
# Get unique concurrent values, in ascending order (one subplot row per value)
concurrent_values = results_70b['concurrent'].unique()
concurrent_values.sort()

# Set up the figure and axes.
# squeeze=False always returns a 2-D array of Axes; without it a single
# concurrency level yields a bare Axes object that cannot be iterated below.
fig, axes = plt.subplots(
    len(concurrent_values), 1,
    figsize=(10, 5 * len(concurrent_values)),
    sharex=False,
    squeeze=False,
)

# Add a supertitle, it could be the model name
fig.suptitle('<<Model_name>>', fontsize=20)

# Throughput is computed here as concurrent requests x median tokens/sec per request.
# assign() builds a new frame instead of writing a column into a filtered
# slice; rebinding the name keeps the column available on `results_70b` and
# makes re-running this cell idempotent.
results_70b = results_70b.assign(
    client_throughput_median=results_70b['concurrent'] * results_70b['client_output_token_per_s_per_request_median']
)

# Plot each concurrent value: throughput per input size, one bar per provider
for ax, concurrent in zip(axes.ravel(), concurrent_values):
    subset = results_70b[results_70b['concurrent'] == concurrent]
    sns.barplot(data=subset, x='input', y='client_throughput_median', hue='provider', ax=ax, palette=palette, errorbar=None)
    ax.set_title(f'Client Throughput (tok/s) for Concurrent Requests: {concurrent}')
    ax.set_xlabel('Input Tokens')
    ax.set_ylabel('Client Throughput (tok/s)')
    # Footnote centred below the x axis (coordinates are fractions of the axes box)
    ax.annotate('Note: A higher Throughput is better',
                xy=(0.5, -0.2),
                xycoords='axes fraction',
                ha='center',
                va="center",
                fontsize=10)


plt.tight_layout()
plt.show()