In [19]:
import os
import json
import sys
sys.path.append("../")
sys.path.append("../src")
sys.path.append("../prompts")
sys.path.append("../src/llmperf")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.performance_evaluation import SyntheticPerformanceEvaluator
from src.llmperf import utils

# Run multiple models through the benchmarking process

__Note:__ This analysis will work if a COE endpoint is used. Users will be able to test and compare performance metrics for different experts.

In [None]:
# --- Benchmark configuration -------------------------------------------------

# Models to benchmark; the loop below runs them in this order.
model_names = [
    "llama3-8b",
    "llama3-70b",
    "llama3-405b",
]

# Folder passed to the evaluator as `results_dir` and later scanned for the
# per-request JSON files.
results_dir = "../data/results/multiple_models"

# API identifier forwarded to the evaluator.
llm_api = "sncloud"

# Evaluator settings (timeout is presumably in seconds; confirm in the evaluator).
num_workers = 1
timeout = 600

# Shape of the synthetic load sent to every model.
num_input_tokens = 1000
num_output_tokens = 1000
num_requests = 1

# Optional extras forwarded as-is; empty by default.
sampling_params = {}
user_metadata = {}

In [None]:
# Benchmark every configured model and collect one summary column per model.
# Per-model frames are gathered in a list and concatenated once at the end:
# growing a DataFrame with pd.concat inside the loop re-copies all previous
# columns on each iteration and starts from an empty frame.
summary_frames = []
for model_idx, model_name in enumerate(model_names):
    # Give each run its own metadata dict instead of mutating the shared
    # `user_metadata`, so re-running the cell starts from the same config.
    model_user_metadata = {**user_metadata, "model_idx": model_idx}

    # Instantiate evaluator
    evaluator = SyntheticPerformanceEvaluator(
        model_name=model_name,
        results_dir=results_dir,
        num_workers=num_workers,
        timeout=timeout,
        user_metadata=model_user_metadata,
        llm_api=llm_api,
    )

    # Run performance evaluation
    model_results_summary, model_results_per_request = evaluator.run_benchmark(
        num_input_tokens=num_input_tokens,
        num_output_tokens=num_output_tokens,
        num_requests=num_requests,
        sampling_params=sampling_params,
    )

    # Flatten the summary; the "model" entry becomes the column label and
    # every other entry becomes a row of that column.
    flatten_model_results_summary = utils.flatten_dict(model_results_summary)
    filtered_flatten_model_results_summary = {
        key: value
        for key, value in flatten_model_results_summary.items()
        if key != "model"
    }
    df_model_results_summary = pd.DataFrame.from_dict(
        filtered_flatten_model_results_summary,
        orient='index',
        columns=[flatten_model_results_summary["model"]],
    )
    summary_frames.append(df_model_results_summary)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# model was configured.
df_all_summary_results = (
    pd.concat(summary_frames, axis=1) if summary_frames else pd.DataFrame()
)

# Analyze collected results

In [None]:
df_all_summary_results

# Analyze switching time

In [None]:
# post processing individual request json files
def read_json_files_to_df(directory):
    """Load per-request timing records from benchmark result files.

    Scans ``directory`` (non-recursively) for files whose name ends with
    ``individual_responses.json`` and emits one row per entry of each file.
    The model name is taken from the first two underscore-separated tokens of
    the file name.

    Args:
        directory: Path of the folder containing the result JSON files.

    Returns:
        pandas.DataFrame with the columns ``start_time``, ``end_time``,
        ``server_ttft_s`` and ``model_name``. The columns are present even
        when no file matches, so callers can index them unconditionally.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        KeyError: If an entry lacks one of the extracted fields.
    """
    columns = ['start_time', 'end_time', 'server_ttft_s', 'model_name']
    data = []

    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('individual_responses.json'):
            continue

        model_name = '_'.join(filename.split('_')[:2])
        file_path = os.path.join(directory, filename)

        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        # Keep only the fields needed for the switching-time analysis.
        for item in json_data:
            data.append({
                'start_time': item['start_time'],
                'end_time': item['end_time'],
                'server_ttft_s': item['server_ttft_s'],
                'model_name': model_name
            })

    # Passing `columns` fixes the schema even when `data` is empty.
    return pd.DataFrame(data, columns=columns)

# Load every per-request record found under the results folder.
df = read_json_files_to_df(results_dir)

# Parse the timestamps so rows can be ordered chronologically, then order them.
df = (
    df
    .assign(start_time=pd.to_datetime(df['start_time']))
    .sort_values(by=['start_time'])
)

# Render the timestamps back to HH:MM:SS text, which is what the plot uses.
df['start_time'] = df['start_time'].dt.strftime(date_format="%H:%M:%S")
df

In [None]:
# Switching time per model: how much longer the first request's server TTFT
# took than the mean of the model's remaining requests. It is recorded on the
# model's first row only when it is an outlier; every other row stays NaN.
# `df` was sorted by start_time earlier, so each group's first row is the
# earliest request of that model.

# NaN (not None) keeps the column numeric so it can be aggregated later.
df['server_switching_time'] = np.nan

for _, group in df.groupby('model_name'):
    # A single request gives no baseline to compare the first request against.
    if len(group) < 2:
        continue

    first_ttft = group['server_ttft_s'].iloc[0]
    later_ttft = group['server_ttft_s'].iloc[1:]
    mean_ttft = later_ttft.mean()
    std_ttft = later_ttft.std()
    # With exactly one later request the sample std is NaN; use a tiny
    # positive value so the threshold below stays well defined.
    std_ttft = 1e-16 if np.isnan(std_ttft) else std_ttft

    switching_time = first_ttft - mean_ttft
    # NOTE(review): the excess over the mean is compared against
    # mean + 3*std (not 3*std alone) -- kept as-is; confirm this is intended.
    if switching_time > (mean_ttft + 3*std_ttft):
        df.loc[group.index[0], 'server_switching_time'] = switching_time

# Show every row, not pandas' truncated preview.
with pd.option_context('display.max_rows', None,):
    display(df)

In [None]:
# Scatter plot of server TTFT per request over time, coloured by model.
fig, ax = plt.subplots(figsize=(40, 12))
sns.scatterplot(x='start_time', y='server_ttft_s', hue='model_name', data=df, s=100, alpha=0.7, ax=ax)

# Highlight each model's first request in red with a bold outline.
# head(1) keeps the actual first row of every group; GroupBy.first() instead
# returns the first non-null value of each column independently, which can
# combine values coming from different requests.
first_points = df.groupby('model_name').head(1)
ax.scatter(first_points['start_time'], first_points['server_ttft_s'], color='red', s=150, edgecolor='black', linewidth=2, label='First Point')

# Customize the plot
ax.set_title('Scatter Plot of server_ttft_s Over Time by Model Name')
ax.set_xlabel('Start Time')
ax.set_ylabel('Server TTFT (s)')
ax.legend(title='Model Name', loc='upper left')
ax.tick_params(axis='x', labelrotation=90)
ax.grid(True)
fig.tight_layout()

# Show the plot
plt.show()