In [None]:
import sys
# Add the parent directory and its src, prompts and src/llmperf subfolders
# to the import search path (paths are relative, so they resolve against the
# working directory the notebook is run from).
sys.path.extend(
    [
        "../",
        "../src",
        "../prompts",
        "../src/llmperf",
    ]
)

import pandas as pd

from src.performance_evaluation import SyntheticPerformanceEvaluator
from src.llmperf import utils

# Run multiple models through the benchmarking process

__Note:__ This analysis works when a COE endpoint is used, which lets users test and compare performance metrics across different experts.

In [None]:
# Models to benchmark, evaluated one after another
model_names = [
    "llama3-8b",
    "llama3-70b",
    "llama3-405b",
]

# Settings passed to the evaluator constructor
results_dir = "../data/results/multiple_models"
llm_api = "sncloud"
num_workers = 1
timeout = 600
user_metadata = {}

# Settings passed to each benchmark run
num_input_tokens = 1000
num_output_tokens = 1000
num_requests = 1
sampling_params = {}

In [None]:
# Benchmark every model in `model_names` and build one summary table with a
# column per model and a row per flattened summary metric.
#
# The per-model frames are collected in a list and joined with a single
# pd.concat after the loop, rather than re-concatenating the growing table on
# every iteration (which re-copies all earlier columns each time).

# Start from an empty table so the name always exists and a re-run never
# shows results left over from a previous execution of this cell.
df_all_summary_results = pd.DataFrame()
summary_frames = []

for model_name in model_names:
    # Instantiate evaluator
    evaluator = SyntheticPerformanceEvaluator(
        model_name=model_name,
        results_dir=results_dir,
        num_workers=num_workers,
        timeout=timeout,
        user_metadata=user_metadata,
        llm_api=llm_api
    )

    # Run performance evaluation
    model_results_summary, model_results_per_request = evaluator.run_benchmark(
        num_input_tokens=num_input_tokens,
        num_output_tokens=num_output_tokens,
        num_requests=num_requests,
        sampling_params=sampling_params
    )

    # Flatten the summary into a single-level dict, then use its "model"
    # entry as the column label and every other entry as a row.
    flatten_model_results_summary = utils.flatten_dict(model_results_summary)
    filtered_flatten_model_results_summary = {
        key: value
        for key, value in flatten_model_results_summary.items()
        if key != "model"
    }
    df_model_results_summary = pd.DataFrame.from_dict(
        filtered_flatten_model_results_summary,
        orient='index',
        columns=[flatten_model_results_summary["model"]],
    )

    summary_frames.append(df_model_results_summary)

# pd.concat raises ValueError on an empty list, so keep the empty table when
# no model was benchmarked.
if summary_frames:
    df_all_summary_results = pd.concat(summary_frames, axis=1)

# Analyze collected results

In [None]:
# Lift pandas' row-display limit for this one output only; the option is
# restored when the `with` block exits.
show_every_row = pd.option_context("display.max_rows", None)
with show_every_row:
    display(df_all_summary_results)