# Analyze responses
This notebook is an example of the analysis that can be done on the individual responses saved when `token_benchmark_ray.py` is run with the `--results-dir` flag, which enables saving all responses.

In [1]:
import pandas as pd
from typing import List
import plotly.graph_objects as go
import numpy as np

## Read the input JSON file

In [2]:
# Location of the individual-responses JSON file written by the benchmark run
responses_path = '../data/results/llmperf/COE-llama-2-7B-chat-hf_1024_1024_32_stream_individual_responses.json'
df_user = pd.read_json(responses_path)

# Keep only the rows whose error_code is not the empty string.
# NOTE(review): confirm how successful requests are encoded in this file
# (empty string vs. null) so that the rows dropped here are the intended ones.
has_non_empty_error_code = df_user["error_code"] != ""
df_user = df_user[has_non_empty_error_code]

In [3]:
# Non-batching endpoints leave batch_size_used empty for every request;
# in that case treat each request as a batch of size 1.
batch_size_never_reported = df_user["batch_size_used"].isna().all()
if batch_size_never_reported:
    df_user["batch_size_used"] = 1

## Server vs client metrics
The following charts compare server-side and client-side measurements for several performance metrics.

In [12]:
def plot_client_vs_server_barplots(df_user: pd.DataFrame, x_col: str, y_cols: List[str], legend_labels: List[str], title: str, ylabel: str, xlabel: str) -> "go.Figure":
    """
    Plots grouped bar plots comparing two metrics (e.g. client vs server) from a DataFrame.

    For every distinct value of ``x_col`` and for each of the two metrics, the
    5th, 50th and 95th percentiles are computed. Each metric is drawn as two
    traces sharing one offset group: a zero-height bar whose outline marks the
    median, and a translucent bar spanning the 5th-95th percentile range.

    Args:
        df_user (pd.DataFrame): The DataFrame containing the data to plot.
        x_col (str): The column name to be used as the x-axis.
        y_cols (List[str]): A list of column names to be used as the y-axis.
            Only the first two entries are plotted.
        legend_labels (List[str]): Human-readable labels for each grouping in y_cols.
            Only the first two entries are used.
        title (str): The title of the plot.
        ylabel (str): The label for the y-axis.
        xlabel (str): The label for the x-axis.

    Returns:
        fig (go.Figure): The plotly figure container
    """
    # (metric column, legend label, colour) for the left and right bar of each group.
    # Indexing [0] and [1] explicitly keeps the IndexError for too-short inputs.
    series = [
        (y_cols[0], legend_labels[0], "#325c8c"),
        (y_cols[1], legend_labels[1], "#ee7625"),
    ]

    # x values are rendered as strings so the axis is categorical; the sort is
    # done on the raw values first so that e.g. 2 comes before 10.
    xgroups = [str(x) for x in sorted(pd.unique(df_user[x_col]))]
    group_keys = np.array([str(x) for x in df_user[x_col]])

    fig = go.Figure()
    for offsetgroup, (metric, label, color) in enumerate(series):
        values = df_user[metric].to_numpy()
        # Per x group: array of [5th, 50th, 95th] percentiles of this metric
        pctiles = {
            group: np.percentile(values[group_keys == group], [5, 50, 95])
            for group in xgroups
        }
        p05 = [pctiles[group][0] for group in xgroups]
        p50 = [pctiles[group][1] for group in xgroups]
        p95 = [pctiles[group][2] for group in xgroups]

        # Median marker: a zero-height bar positioned at the median, so only
        # its outline is visible. Hidden from the legend to avoid duplicates.
        fig.add_trace(
            go.Bar(
                x=xgroups,
                y=[0 for _ in xgroups],
                base=p50,
                customdata=[label for _ in xgroups],
                marker={"color": color, "line": {"color": color, "width": 2}},
                offsetgroup=offsetgroup,
                legendgroup=label,
                name=label,
                showlegend=False,
                hovertemplate="<extra></extra><b>%{customdata}</b> median: %{base:.2f}",
            )
        )
        # Percentile range: starts at the 5th percentile and extends to the 95th.
        fig.add_trace(
            go.Bar(
                x=xgroups,
                y=[high - low for low, high in zip(p05, p95)],
                base=p05,
                customdata=p95,
                marker={"color": color},
                opacity=0.5,
                offsetgroup=offsetgroup,
                legendgroup=label,
                name=label,
                hovertemplate="<extra></extra>5–95 pctile range: %{base:.2f}–%{customdata:.2f}",
            )
        )

    fig.update_layout(
        title_text=title,
        xaxis_title=xlabel,
        yaxis_title=ylabel,
        barmode="group",
        template="plotly_dark",
        hovermode="x unified",
    )

    # NOTE(review): "foo" looks like a placeholder value for the x-axis hover
    # format; confirm the intended effect on the unified hover header.
    fig.update_xaxes(hoverformat="foo")
    return fig

In [None]:
# Throughput per request: server-side vs client-side, grouped by batch size
x_col = "batch_size_used"
value_vars = ['server_output_token_per_s_per_request', 'client_output_token_per_s_per_request']
legend_labels = ["Server", "Client"]
title_text = "Distribution of throughput by batch size"
xaxis_title = "Batch size"
yaxis_title = "Tokens per second, per request"
plot_client_vs_server_barplots(
    df_user=df_user,
    x_col=x_col,
    y_cols=value_vars,
    legend_labels=legend_labels,
    title=title_text,
    ylabel=yaxis_title,
    xlabel=xaxis_title,
).show()

In [None]:
# Time to first token: server-side vs client-side, grouped by batch size
x_col = "batch_size_used"
value_vars = ['server_ttft_s', 'client_ttft_s']
legend_labels = ["Server", "Client"]
title_text = "Distribution of Time to First Token (TTFT) by batch size"
xaxis_title = "Batch size"
yaxis_title = "TTFT (s), per request"
plot_client_vs_server_barplots(
    df_user=df_user,
    x_col=x_col,
    y_cols=value_vars,
    legend_labels=legend_labels,
    title=title_text,
    ylabel=yaxis_title,
    xlabel=xaxis_title,
).show()

In [None]:
# End-to-end latency: server-side vs client-side, grouped by batch size
x_col = "batch_size_used"
value_vars = ['server_end_to_end_latency_s', 'client_end_to_end_latency_s']
legend_labels = ["Server", "Client"]
title_text = "Distribution of end-to-end latency by batch size"
xaxis_title = "Batch size"
yaxis_title = "Latency (s), per request"
plot_client_vs_server_barplots(
    df_user=df_user,
    x_col=x_col,
    y_cols=value_vars,
    legend_labels=legend_labels,
    title=title_text,
    ylabel=yaxis_title,
    xlabel=xaxis_title,
).show()

## Create a summary dataframe
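Group the results by batch size and compute the total number of output tokens, the mean per-request throughput, the mean TTFT, and the number of requests. Finally, estimate the combined output tokens per second for each batch size as the mean per-request throughput multiplied by the batch size.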

In [None]:
# One row per batch size: mean per-request throughput, total output tokens,
# mean TTFT and the number of requests observed with that batch size.
df_summary = (
    df_user.groupby('batch_size_used')
    .agg(
        server_output_token_per_s_per_request=('server_output_token_per_s_per_request', 'mean'),
        server_number_output_tokens=('server_number_output_tokens', 'sum'),
        server_ttft_s=('server_ttft_s', 'mean'),
        Counts=('server_ttft_s', 'size'),
    )
    .reset_index()
)

# Combined throughput of a batch = mean per-request throughput * batch size
df_summary['server_combined_output_tokens_per_s'] = (
    df_summary['server_output_token_per_s_per_request'] * df_summary['batch_size_used']
)

# Switch to human-readable headers and index the table by batch size
display_names = {
    "batch_size_used": "Batch size",
    "server_output_token_per_s_per_request": "Avg. server tokens per sec per request",
    "server_number_output_tokens": "Total output tokens",
    "server_ttft_s": "Avg. server TTFT (s)",
    "Counts": "Total number of requests",
    "server_combined_output_tokens_per_s": "Avg. server total tokens per second",
}
df_summary = df_summary.rename(columns=display_names).set_index("Batch size")

# Row masks for the transposed table, in column order of df_summary:
# throughput, total tokens, TTFT, request count, combined throughput.
two_decimal_rows = [True, False, True, False, True]
whole_number_rows = [False, True, False, True, False]
every_batch_size = [True] * len(df_summary)

(
    df_summary.T.style
    .format("{:.2f}", subset=(two_decimal_rows, every_batch_size))
    .format("{:.0f}", subset=(whole_number_rows, every_batch_size))
)

## Time taken
- Compare the cumulative time that calls spend waiting for the first token (TTFT) with the cumulative time spent generating tokens

In [None]:
# TTFT wait per batch size: (requests / batch size) * mean TTFT.
# NOTE(review): presumably requests / batch size is the number of batches,
# each contributing one TTFT wait; confirm this is the intended model.
ttft_wait_by_batch_size = (
    df_summary["Total number of requests"] / df_summary.index * df_summary["Avg. server TTFT (s)"]
)
total_wait_time_ttft = ttft_wait_by_batch_size.sum()

# Generation time per batch size: total output tokens / mean per-request throughput
generation_time_by_batch_size = (
    df_summary["Total output tokens"] / df_summary["Avg. server tokens per sec per request"]
)
total_generation_time = generation_time_by_batch_size.sum()

total_time = total_wait_time_ttft + total_generation_time

# The values are divided by 60 for reporting in minutes
print(f'Total wait time due to TTFT (mins) = {total_wait_time_ttft/60:,.4f}')
print(f'Total generation time due (mins) = {total_generation_time/60:,.4f}')
print(f'Total time (mins) = {total_time/60:,.4f}')

## Requests Gantt Chart
- Blue bar is the end-to-end latency: the total time to get back the full response
- Orange bar is the time to first token (TTFT): how long the call waits before the first token arrives

In [10]:
def plot_requests_gantt_chart(df_user: pd.DataFrame) -> "go.Figure":
    """
    Plots a Gantt chart of response timings across all requests

    Each request gets two horizontal bars that start at its ``start_time``:
    one as long as ``client_ttft_s`` and one as long as
    ``client_end_to_end_latency_s``. Every other request row is shaded to make
    the rows easier to follow.

    Args:
        df_user (pd.DataFrame): The DataFrame containing the data to plot.
            Must provide the columns ``start_time``, ``client_ttft_s`` and
            ``client_end_to_end_latency_s`` and a numeric index.

    Returns:
        fig (go.Figure): The plotly figure container
    """
    # 1-based request numbers taken from the DataFrame index
    requests = df_user.index + 1
    start_times = [str(x) for x in df_user["start_time"]]

    fig = go.Figure()
    # Durations are multiplied by 1000 because the x-axis is a date axis.
    # NOTE(review): assumes the *_s columns are in seconds and that bar lengths
    # on a date axis are interpreted as milliseconds; confirm.
    fig.add_trace(
        go.Bar(
            y=requests,
            x=1000*df_user["client_ttft_s"],
            base=start_times,
            name="TTFT",
            orientation="h",
            marker_color="#ee7625",
        )
    )
    fig.add_trace(
        go.Bar(
            y=requests,
            x=1000*df_user["client_end_to_end_latency_s"],
            base=start_times,
            name="End-to-end latency",
            orientation="h",
            marker_color="#325c8c",
        )
    )

    # Shade every other request row. The stripes are centred on the actual
    # request numbers so they stay aligned with the bars even when the index
    # has gaps (e.g. after rows were filtered out of df_user).
    for request in requests[::2]:
        centre = float(request)
        fig.add_hrect(y0=centre-0.5, y1=centre+0.5, line_width=0, fillcolor="grey", opacity=0.1)

    fig.update_xaxes(
        type="date",
        tickformat="%H:%M:%S",
        hoverformat="%H:%M:%S.%2f",)
    fig.update_layout(
        title_text="LLM requests across time",
        xaxis_title="Time stamp",
        yaxis_title="Request index",
        template="plotly_dark",
    )
    return fig

In [None]:
# Build the Gantt chart for all loaded requests and render it
gantt_fig = plot_requests_gantt_chart(df_user)
gantt_fig.show()