# Token Benchmark Example Analysis
This notebook shows an example analysis of the individual responses produced by `token_benchmark_ray.py`. The responses are only saved when the script is run with the `--results-dir` flag, which writes every individual response to disk.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def get_final_metrics_df(valid_df):
    """Project a responses frame onto the metric columns used for plotting.

    Args:
        valid_df: DataFrame holding the token counts, ``concurrent_user`` and
            the server/client metric columns listed in the mapping below.

    Returns:
        A new DataFrame with one column per mapping entry, in mapping order.
        Throughput columns are renamed to ``generation_throughput_server`` /
        ``generation_throughput``; every other column keeps its source name.

    Raises:
        KeyError: if ``valid_df`` lacks one of the source columns.
    """
    # output column name -> source column name in ``valid_df``
    output_to_source = {
        "number_input_tokens": "number_input_tokens",
        "number_output_tokens": "number_output_tokens",
        "number_total_tokens": "number_total_tokens",
        "concurrent_user": "concurrent_user",
        # server metrics
        "ttft_server_s": "ttft_server_s",
        "end_to_end_latency_server_s": "end_to_end_latency_server_s",
        "generation_throughput_server": "request_output_throughput_server_token_per_s",
        "batch_size_used": "batch_size_used",
        "total_tokens_per_sec_server": "total_tokens_per_sec_server",
        # client metrics
        "ttft_s": "ttft_s",
        "end_to_end_latency_s": "end_to_end_latency_s",
        "generation_throughput": "request_output_throughput_token_per_s",
    }

    final_df = pd.DataFrame()
    for output_name, source_name in output_to_source.items():
        final_df[output_name] = valid_df[source_name]
    return final_df

# Metrics across concurrent workers

In [None]:
# Concurrency levels whose result files are loaded below.
# For a full sweep use e.g.: num_current_users = [1,4,8,16]
num_current_users = [1]
# Mode tag substituted into the result file names.
mode = "stream"

In [None]:
# Load the individual-responses file for every concurrency level and stack
# them into a single DataFrame, tagging each row with the level it came from.
frames = []
for concurrent_user in num_current_users:
    # path to the individual responses json file
    df_user = pd.read_json(f'../data/results/llmperf/COE-llama-2-7b-chat-hf_1024_1024_{concurrent_user}_{mode}_individual_responses.json')
    # Alternative result files:
    # df_user = pd.read_json(f'../data/results/llmperf/COE-llama-2-7b-chat-hf_150_150_{concurrent_user}_{mode}_individual_responses.json')
    # df_user = pd.read_json(f'sambanova-Llama-2-7b-chat_150_150_{concurrent_user}_individual_responses.json')
    df_user['concurrent_user'] = concurrent_user
    frames.append(df_user)

# Concatenate once instead of growing `df` inside the loop: that avoids
# re-copying the accumulated rows on every pass and avoids concatenating with
# an empty seed frame. `pd.concat([])` raises, so keep the empty-frame result
# when no concurrency levels are listed.
df = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Keep the rows whose `error_code` is not the empty string, then reduce them
# to the metric columns used by the plots.
# NOTE(review): confirm that `error_code != ""` is the intended way to select
# successful responses for the files being analysed.
valid_df = df[df["error_code"].ne("")]
final_df = get_final_metrics_df(valid_df)

In [None]:
# Two stacked scatter plots, coloured by concurrency level.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(8, 12))

ttft_plot = sns.scatterplot(
    data=final_df,
    x="number_input_tokens",
    y="ttft_s",
    hue="concurrent_user",
    ax=ax[0],
)
ttft_plot.set_title("Number of Input Tokens vs. TTFT")

throughput_plot = sns.scatterplot(
    data=final_df,
    x="number_output_tokens",
    y="generation_throughput",
    hue="concurrent_user",
    ax=ax[1],
)
throughput_plot.set_title("Number of output Tokens vs. Throughput")

In [None]:
# Distribution of TTFT (top) and generation throughput (bottom), split by
# concurrency level.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(8, 12))
shared_box_kwargs = dict(data=final_df, hue="concurrent_user")
sns.boxplot(x="ttft_s", ax=ax[0], **shared_box_kwargs)
sns.boxplot(x="generation_throughput", ax=ax[1], **shared_box_kwargs)


# Server vs client metrics

In [None]:
# Run selected for the server-vs-client comparison: concurrency level and the
# mode tag substituted into the result file name.
concurrent_user, mode = 1, "stream"

In [None]:
# Load the individual responses for the run selected by `concurrent_user` and
# `mode`, adding the concurrency level as a column on every row.
df = pd.read_json(
    f'../data/results/llmperf/COE-llama-2-7b-chat-hf_1024_1024_{concurrent_user}_{mode}_individual_responses.json'
).assign(concurrent_user=concurrent_user)

In [None]:
# Drop rows whose `error_code` equals the empty string and keep only the
# metric columns needed for plotting.
# NOTE(review): confirm that `error_code != ""` is the intended way to select
# successful responses for the files being analysed.
valid_df = df.loc[df["error_code"] != ""]
final_df = get_final_metrics_df(valid_df)

In [None]:
# Build one long-format frame: the server-side and client-side measurements
# are renamed to shared column names (ttft, generation_throughput,
# e2e_latency) and labelled through the `type` column.
df_server = (
    final_df[
        [
            'ttft_server_s',
            'number_input_tokens',
            'number_total_tokens',
            'generation_throughput_server',
            'number_output_tokens',
            'end_to_end_latency_server_s',
        ]
    ]
    .rename(
        columns={
            'ttft_server_s': 'ttft',
            'generation_throughput_server': 'generation_throughput',
            'end_to_end_latency_server_s': 'e2e_latency',
        }
    )
    .assign(type='Server side')
)

df_client = (
    final_df[
        [
            'ttft_s',
            'number_input_tokens',
            'number_total_tokens',
            'generation_throughput',
            'number_output_tokens',
            'end_to_end_latency_s',
        ]
    ]
    .rename(columns={'ttft_s': 'ttft', 'end_to_end_latency_s': 'e2e_latency'})
    .assign(type='Client side')
)

# Server rows first, then client rows, with a fresh 0..n-1 index.
df_ttft_throughput_latency = pd.concat([df_server, df_client], ignore_index=True)

In [None]:
# Three stacked scatter plots comparing server-side and client-side values.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8, 20))

# (x column, y column, title) for each panel, top to bottom.
scatter_panels = [
    ("number_input_tokens", "ttft", "Number of Input Tokens vs. TTFT"),
    ("number_output_tokens", "generation_throughput", "Number of output Tokens vs. Throughput"),
    ("number_output_tokens", "e2e_latency", "Number of output tokens vs Latency"),
]
for panel_ax, (x_col, y_col, panel_title) in zip(ax, scatter_panels):
    sns.scatterplot(
        data=df_ttft_throughput_latency,
        x=x_col,
        y=y_col,
        hue="type",
        ax=panel_ax,
        alpha=0.5,
    ).set_title(panel_title)

In [None]:
# One boxplot per metric, with server-side and client-side as separate boxes.
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(8, 12))
for panel_ax, metric in zip(ax, ("ttft", "e2e_latency", "generation_throughput")):
    sns.boxplot(data=df_ttft_throughput_latency, x=metric, y="type", ax=panel_ax)

## Show multiple distributions

In [None]:
def _server_client_long_df(final_df, include_batch_metrics=False):
    """Stack server-side and client-side metrics into one labelled frame.

    Args:
        final_df: frame returned by ``get_final_metrics_df``.
        include_batch_metrics: when True, also carry ``batch_size_used`` and
            ``total_tokens_per_sec`` (server values; ``None`` on client rows).

    Returns:
        DataFrame with shared column names (``ttft``, ``generation_throughput``,
        ``e2e_latency``) and a ``type`` column set to ``'Server side'`` or
        ``'Client side'``; server rows come first and the index is reset.
    """
    server_columns = ['ttft_server_s', 'number_input_tokens', 'number_total_tokens', 'generation_throughput_server', 'number_output_tokens', 'end_to_end_latency_server_s']
    server_renames = {'ttft_server_s': 'ttft', 'generation_throughput_server': 'generation_throughput', 'end_to_end_latency_server_s': 'e2e_latency'}
    if include_batch_metrics:
        server_columns = server_columns + ['batch_size_used', 'total_tokens_per_sec_server']
        server_renames['total_tokens_per_sec_server'] = 'total_tokens_per_sec'

    df_server = final_df[server_columns].copy()
    df_server = df_server.rename(columns=server_renames)
    df_server['type'] = 'Server side'

    df_client = final_df[['ttft_s', 'number_input_tokens', 'number_total_tokens', 'generation_throughput', 'number_output_tokens', 'end_to_end_latency_s']].copy()
    df_client = df_client.rename(columns={'ttft_s': 'ttft', 'end_to_end_latency_s': 'e2e_latency'})
    if include_batch_metrics:
        # These two metrics are only taken from the server columns; client
        # rows get placeholders so both frames share the same columns.
        df_client['batch_size_used'] = None
        df_client['total_tokens_per_sec'] = None
    df_client['type'] = 'Client side'

    return pd.concat([df_server, df_client], ignore_index=True)


def show_distributions(models, data_path, input_tokens=1000, output_tokens=1000, num_concurrent_workers = 1, mode='stream'):
    """Plot server-vs-client metric distributions, one figure row per model.

    For each model the matching individual-responses JSON file is read from
    ``data_path`` and boxplots are drawn for input tokens, output tokens, TTFT,
    generation throughput and end-to-end latency. When ``mode == 'batch'`` a
    sixth boxplot shows total tokens per second, coloured by batch size.

    Args:
        models: sequence of model names; ``/`` and ``.`` are replaced by ``-``
            to build each file name.
        data_path: prefix prepended to the file name (include the trailing
            path separator).
        input_tokens: input-token count used in the file name.
        output_tokens: output-token count used in the file name.
        num_concurrent_workers: concurrency level used in the file name and
            stored in the ``concurrent_user`` column.
        mode: ``'batch'`` adds the batch-specific plot; any other value draws
            the five common plots.

    Raises:
        ValueError: if ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("models must contain at least one model name")

    is_batch = mode == 'batch'
    common_metrics = ["number_input_tokens", "number_output_tokens", "ttft", "generation_throughput", "e2e_latency"]
    n_plots = len(common_metrics) + (1 if is_batch else 0)

    fig = plt.figure(layout='constrained', figsize=(40, 25))
    # squeeze=False always yields a 2-D array, so a single model still gives
    # an indexable collection instead of a bare SubFigure.
    subfigs = fig.subfigures(nrows=len(models), ncols=1, squeeze=False)[:, 0]

    for subfig, model in zip(subfigs, models):
        # Each model has its own results file, so it is loaded inside the
        # loop, where `model` is actually bound.
        # NOTE(review): the file name always contains "stream", even when
        # mode == 'batch' — confirm this matches how batch runs are saved.
        file_stem = model.replace('/', '-').replace('.', '-')
        df_user = pd.read_json(f"{data_path}{file_stem}_{input_tokens}_{output_tokens}_{num_concurrent_workers}_stream_individual_responses.json")
        df_user['concurrent_user'] = num_concurrent_workers
        final_df = get_final_metrics_df(df_user)

        df_ttft_throughput_latency = _server_client_long_df(final_df, include_batch_metrics=is_batch)

        ax = subfig.subplots(1, n_plots)
        subfig.suptitle(f'{model}', fontsize='x-large')

        for metric_ax, metric in zip(ax, common_metrics):
            sns.boxplot(data=df_ttft_throughput_latency, x=metric, y="type", ax=metric_ax)
        if is_batch:
            sns.boxplot(data=df_ttft_throughput_latency, x="total_tokens_per_sec", y="type", hue="batch_size_used", ax=ax[-1])

    fig.suptitle("No Output Tokens, TTFT, Throughput, E2E Latency distributions", fontsize='xx-large')
    plt.show()

In [None]:
# Meta-Llama-3-8B-Instruct "dybs" runs v1-v3: batch mode, 50 concurrent workers.
models = [
    'COE/Meta-Llama-3-8B-Instruct-dybs_v1',
    'COE/Meta-Llama-3-8B-Instruct-dybs_v2',
    'COE/Meta-Llama-3-8B-Instruct-dybs_v3',
]
show_distributions(
    models,
    data_path='../data/results/llmperf/debug_final_test/',
    input_tokens=1000,
    output_tokens=1000,
    num_concurrent_workers=50,
    mode='batch',
)

In [None]:
# Meta-Llama-3-8B-Instruct runs v1-v3, with the function's default
# concurrency and mode.
models = [
    'COE/Meta-Llama-3-8B-Instruct_v1',
    'COE/Meta-Llama-3-8B-Instruct_v2',
    'COE/Meta-Llama-3-8B-Instruct_v3',
]
show_distributions(
    models,
    data_path='../data/results/llmperf/debug_final_test/',
    input_tokens=1000,
    output_tokens=1000,
)

In [None]:
# llama-2-7b-chat-hf runs v1-v3, with the function's default concurrency and
# mode.
models = [
    'COE/llama-2-7b-chat-hf_v1',
    'COE/llama-2-7b-chat-hf_v2',
    'COE/llama-2-7b-chat-hf_v3',
]
show_distributions(
    models,
    data_path='../data/results/llmperf/debug_final_test/',
    input_tokens=1000,
    output_tokens=1000,
)

In [None]:
# llama-2-13b-chat-hf runs v1-v3, with the function's default concurrency and
# mode.
models = [
    'COE/llama-2-13b-chat-hf_v1',
    'COE/llama-2-13b-chat-hf_v2',
    'COE/llama-2-13b-chat-hf_v3',
]
show_distributions(
    models,
    data_path='../data/results/llmperf/debug_final_test/',
    input_tokens=1000,
    output_tokens=1000,
)

In [None]:
# Mistral-7B-Instruct-v0-2 runs v1-v3, with the function's default concurrency
# and mode.
models = [
    'COE/Mistral-7B-Instruct-v0-2_v1',
    'COE/Mistral-7B-Instruct-v0-2_v2',
    'COE/Mistral-7B-Instruct-v0-2_v3',
]
show_distributions(
    models,
    data_path='../data/results/llmperf/debug_final_test/',
    input_tokens=1000,
    output_tokens=1000,
)