In [1]:
import json
import os
import sys

sys.path.append('../')
sys.path.append('../src')
sys.path.append('../prompts')
sys.path.append('../src/llmperf')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from benchmarking.src.llmperf import llmperf_utils
from benchmarking.src.performance_evaluation import SyntheticPerformanceEvaluator

  from .autonotebook import tqdm as notebook_tqdm


# Run multiple models through the benchmarking process

__Note:__ This analysis will work if a COE endpoint is used. Users will be able to test and compare performance metrics for different experts.

# Meta-Llama-3.1-70B-Instruct

In [3]:
# # SambaNova Cloud example
model_names = ['meta-llama/Meta-Llama-3.1-70B-Instruct']
llm_api = 'sambastudio'

# additional parameters
results_dir = '../data/results/aramco_mingran/llama_3.1_70b'
# num_concurrent_requests = [1, 10, 100]
num_concurrent_requests = [16]
timeout = 60000
# num_input_tokens = [1_000, 10_000, 50_000, 100_000]
num_input_tokens = [1_000]
# num_output_tokens = [100, 1_000]
num_output_tokens = [1_000]
sampling_params = {}
user_metadata = {}

ratio = 1

In [4]:
df_all_summary_results = pd.DataFrame()
for model_idx, model_name in enumerate(model_names):
    for input_tokens in num_input_tokens:
        for output_tokens in num_output_tokens:
            for concurrent_requests in num_concurrent_requests:
                num_requests = concurrent_requests*ratio
                print(f'running model_name {model_name}, input_tokens {input_tokens}, output_tokens {output_tokens}, concurrent_requests {concurrent_requests}, num_requests {num_requests}')
                user_metadata['model_idx'] = model_idx
                # Instantiate evaluator
                evaluator = SyntheticPerformanceEvaluator(
                    model_name=model_name,
                    results_dir=results_dir,
                    num_concurrent_requests=concurrent_requests,
                    timeout=timeout,
                    user_metadata=user_metadata,
                    llm_api=llm_api,
                )

                # Run performance evaluation
                model_results_summary, model_results_per_request = evaluator.run_benchmark(
                    num_input_tokens=input_tokens,
                    num_output_tokens=output_tokens,
                    num_requests=num_requests,
                    sampling_params=sampling_params,
                )

                flatten_model_results_summary = llmperf_utils.flatten_dict(model_results_summary)
                filtered_flatten_model_results_summary = {
                    key: value for key, value in flatten_model_results_summary.items() if key not in ['model']
                }
                df_model_results_summary = pd.DataFrame.from_dict(
                    filtered_flatten_model_results_summary, orient='index', columns=[flatten_model_results_summary['model']]
                )

                df_all_summary_results = pd.concat([df_all_summary_results, df_model_results_summary], axis=1)

running model_name meta-llama/Meta-Llama-3.1-70B-Instruct, input_tokens 1000, output_tokens 1000, concurrent_requests 16, num_requests 16


2024-11-05 11:49:19.699 
  command:

    streamlit run /Users/rodrigom/ai-starter-kit/benchmarking/.venv/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]
2024-11-05 11:49:37,174 [INFO] Tasks Executed!
2024-11-05 11:49:37,174 [INFO] Results for token benchmark for meta-llama/Meta-Llama-3.1-70B-Instruct queried with the sambastudio api.
2024-11-05 11:49:37,187 [INFO] Building Metrics Summary for metric: client_ttft_s
2024-11-05 11:49:37,189 [INFO]     p25 = 0.6628
2024-11-05 11:49:37,189 [INFO]     p50 = 1.0024
2024-11-05 11:49:37,189 [INFO]     p75 = 1.0247
2024-11-05 11:49:37,189 [INFO]     p90 = 1.0874
2024-11-05 11:49:37,189 [INFO]     p95 = 1.0883
2024-11-05 11:49:37,189 [INFO]     p99 = 1.0887
2024-11-05 11:49:37,190 [INFO]     mean = 0.9033
2024-11-05 11:49:37,190 [INFO]     min = 0.4441
2024-11-05 11:49:37,190 [INFO]     max = 1.0888
2024-11-05 11:49:37,191 [INFO]     stddev = 0.2084
2024-11-05 11:49:37,191 [INFO] Building Metrics Summary for metric: client_end_to_e

# Meta-Llama-3.1-405B-Instruct

In [None]:
# SambaNova Cloud example
model_names = ['Meta-Llama-3.1-405B-Instruct']
llm_api = 'sncloud'

# additional parameters
results_dir = '../data/results/aramco_2/llama_3.1_405b'
num_concurrent_requests = [1, 10, 100]
timeout = 60000
num_input_tokens = [100, 1_000, 10_000]
num_output_tokens = [100, 1_000]
sampling_params = {}
user_metadata = {}

In [None]:
df_all_summary_results = pd.DataFrame()
for model_idx, model_name in enumerate(model_names):
    for input_tokens in num_input_tokens:
        for output_tokens in num_output_tokens:
            for concurrent_requests in num_concurrent_requests:
                num_requests = concurrent_requests*10
                print(f'running model_name {model_name}, input_tokens {input_tokens}, output_tokens {output_tokens}, concurrent_requests {concurrent_requests}, num_requests {num_requests}')
                user_metadata['model_idx'] = model_idx
                # Instantiate evaluator
                evaluator = SyntheticPerformanceEvaluator(
                    model_name=model_name,
                    results_dir=results_dir,
                    num_concurrent_requests=concurrent_requests,
                    timeout=timeout,
                    user_metadata=user_metadata,
                    llm_api=llm_api,
                )

                # Run performance evaluation
                model_results_summary, model_results_per_request = evaluator.run_benchmark(
                    num_input_tokens=input_tokens,
                    num_output_tokens=output_tokens,
                    num_requests=num_requests,
                    sampling_params=sampling_params,
                )

                flatten_model_results_summary = llmperf_utils.flatten_dict(model_results_summary)
                filtered_flatten_model_results_summary = {
                    key: value for key, value in flatten_model_results_summary.items() if key not in ['model']
                }
                df_model_results_summary = pd.DataFrame.from_dict(
                    filtered_flatten_model_results_summary, orient='index', columns=[flatten_model_results_summary['model']]
                )

                df_all_summary_results = pd.concat([df_all_summary_results, df_model_results_summary], axis=1)

# Mixtral-8x7B-Instruct-v0.1

In [None]:
# SambaNova Cloud example
model_names = ['Mixtral-8x7B-Instruct-v0.1']
llm_api = 'sncloud'

# additional parameters
results_dir = '../data/results/aramco/mixtral_8_7b'
num_concurrent_requests = [1, 10, 100]
timeout = 60000
num_input_tokens = [100, 1_000]
num_output_tokens = [100, 1_000]
sampling_params = {}
user_metadata = {}

In [None]:
df_all_summary_results = pd.DataFrame()
for model_idx, model_name in enumerate(model_names):
    for input_tokens in num_input_tokens:
        for output_tokens in num_output_tokens:
            for concurrent_requests in num_concurrent_requests:
                num_requests = concurrent_requests*10
                print(f'running model_name {model_name}, input_tokens {input_tokens}, output_tokens {output_tokens}, concurrent_requests {concurrent_requests}, num_requests {num_requests}')
                user_metadata['model_idx'] = model_idx
                # Instantiate evaluator
                evaluator = SyntheticPerformanceEvaluator(
                    model_name=model_name,
                    results_dir=results_dir,
                    num_concurrent_requests=concurrent_requests,
                    timeout=timeout,
                    user_metadata=user_metadata,
                    llm_api=llm_api,
                )

                # Run performance evaluation
                model_results_summary, model_results_per_request = evaluator.run_benchmark(
                    num_input_tokens=input_tokens,
                    num_output_tokens=output_tokens,
                    num_requests=num_requests,
                    sampling_params=sampling_params,
                )

                flatten_model_results_summary = llmperf_utils.flatten_dict(model_results_summary)
                filtered_flatten_model_results_summary = {
                    key: value for key, value in flatten_model_results_summary.items() if key not in ['model']
                }
                df_model_results_summary = pd.DataFrame.from_dict(
                    filtered_flatten_model_results_summary, orient='index', columns=[flatten_model_results_summary['model']]
                )

                df_all_summary_results = pd.concat([df_all_summary_results, df_model_results_summary], axis=1)

# Consolidate results

In [2]:

def read_json_files(folder_path):
    data = []
    
    # Iterate through all files in the folder
    for filename in os.listdir(folder_path):
        # Check if the file ends with 'individual_responses.json'
        if filename.endswith('individual_responses.json'):
            file_path = os.path.join(folder_path, filename)
            
            # Open and load the JSON file
            with open(file_path, 'r') as file:
                try:
                    json_data = json.load(file)
                    json_data = [{**request_response, 'filename': filename} for request_response in json_data]
                    data.append(json_data)
                except json.JSONDecodeError as e:
                    print(f"Error reading {file_path}: {e}")
    return data

In [3]:
results_dir = '../data/results/aramco_gpu/llama_3.1_70b'
all_responses = read_json_files(results_dir)
len(all_responses)

10

In [4]:
run_stats = []
for run in all_responses:
    metrics = {
        'filename': [],
        'input_tokens': [],
        'output_tokens': [],
        'concurrent_requests': [], 
        'server_number_input_tokens': [],
        'server_number_output_tokens': [],
        'server_ttft_s': [], 
        'server_output_token_per_s_per_request': [],
        'server_end_to_end_latency_s': [],
        'client_ttft_s': [], 
        'client_output_token_per_s_per_request': [], 
        'client_end_to_end_latency_s': []
    }
    for request_metrics in run:
        metrics['filename'].append(request_metrics['filename'])
        
        input_tokens = int(request_metrics['filename'].split('_')[2])
        output_tokens = int(request_metrics['filename'].split('_')[3])
        concurrent_requests = int(request_metrics['filename'].split('_')[4])
        metrics['input_tokens'].append(input_tokens)
        metrics['output_tokens'].append(output_tokens)
        metrics['concurrent_requests'].append(concurrent_requests)
        
        metrics['server_number_input_tokens'].append(request_metrics['server_number_input_tokens'])
        metrics['server_number_output_tokens'].append(request_metrics['server_number_output_tokens'])
        metrics['server_ttft_s'].append(request_metrics['server_ttft_s'])
        metrics['server_output_token_per_s_per_request'].append(request_metrics['server_output_token_per_s_per_request'])
        metrics['server_end_to_end_latency_s'].append(request_metrics['server_end_to_end_latency_s'])
        metrics['client_ttft_s'].append(request_metrics['client_ttft_s'])
        metrics['client_output_token_per_s_per_request'].append(request_metrics['client_output_token_per_s_per_request'])
        metrics['client_end_to_end_latency_s'].append(request_metrics['client_end_to_end_latency_s'])
    df_metrics =  pd.DataFrame(metrics)
    df_metric_stats = df_metrics.groupby(by='filename').agg(['median','std'])
    df_metric_stats['request_count'] = df_metrics.shape[0]
    run_stats.append(df_metric_stats)
results = pd.concat(run_stats)
results.columns = ['_'.join(col).strip() for col in results.columns.values]
results.style.format("{:,.3f}")

Unnamed: 0_level_0,input_tokens_median,input_tokens_std,output_tokens_median,output_tokens_std,concurrent_requests_median,concurrent_requests_std,server_number_input_tokens_median,server_number_input_tokens_std,server_number_output_tokens_median,server_number_output_tokens_std,server_ttft_s_median,server_ttft_s_std,server_output_token_per_s_per_request_median,server_output_token_per_s_per_request_std,server_end_to_end_latency_s_median,server_end_to_end_latency_s_std,client_ttft_s_median,client_ttft_s_std,client_output_token_per_s_per_request_median,client_output_token_per_s_per_request_std,client_end_to_end_latency_s_median,client_end_to_end_latency_s_std,request_count_
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0_meta-llama-Meta-Llama-3-1-70B-Instruct_100000_1000_1_stream_individual_responses.json,100000.0,0.0,1000.0,0.0,1.0,0.0,100034.0,0.0,9.0,5.718,,,,,,,9.752,0.097,42.49,10.842,9.94,0.168,5.0
0_meta-llama-Meta-Llama-3-1-70B-Instruct_1000_1000_1_stream_individual_responses.json,1000.0,0.0,1000.0,0.0,1.0,0.0,1034.0,0.0,1000.0,0.0,,,,,,,0.451,0.048,72.433,0.252,14.253,0.042,5.0
0_meta-llama-Meta-Llama-3-1-70B-Instruct_50000_100_1_stream_individual_responses.json,50000.0,0.0,100.0,0.0,1.0,0.0,50034.0,0.0,8.0,40.89,,,,,,,3.82,0.054,53.971,13.087,4.057,0.611,5.0
0_meta-llama-Meta-Llama-3-1-70B-Instruct_1000_100_1_stream_individual_responses.json,1000.0,0.0,100.0,0.0,1.0,0.0,1034.0,0.0,100.0,0.0,,,,,,,0.45,0.046,69.737,0.78,1.884,0.062,5.0
0_meta-llama-Meta-Llama-3-1-70B-Instruct_50000_1000_1_stream_individual_responses.json,50000.0,0.0,1000.0,0.0,1.0,0.0,50034.0,0.0,3.0,10.183,,,,,,,3.854,0.06,27.461,13.692,3.978,0.166,5.0
0_meta-llama-Meta-Llama-3-1-70B-Instruct_100000_100_1_stream_individual_responses.json,100000.0,0.0,100.0,0.0,1.0,0.0,100034.0,0.0,3.0,43.116,,,,,,,9.737,0.041,27.125,17.721,9.848,0.66,5.0
0_meta-llama-Meta-Llama-3-1-70B-Instruct_100_100_1_stream_individual_responses.json,100.0,0.0,100.0,0.0,1.0,0.0,134.0,0.0,100.0,0.0,,,,,,,0.423,0.037,74.391,2.117,1.768,0.005,5.0
0_meta-llama-Meta-Llama-3-1-70B-Instruct_10000_100_1_stream_individual_responses.json,10000.0,0.0,100.0,0.0,1.0,0.0,10034.0,0.0,5.0,7.906,,,,,,,0.874,0.062,45.107,15.496,0.986,0.057,5.0
0_meta-llama-Meta-Llama-3-1-70B-Instruct_10000_1000_1_stream_individual_responses.json,10000.0,0.0,1000.0,0.0,1.0,0.0,10034.0,0.0,5.0,8.792,,,,,,,0.868,0.046,44.918,14.769,0.981,0.096,5.0
0_meta-llama-Meta-Llama-3-1-70B-Instruct_100_1000_1_stream_individual_responses.json,100.0,0.0,1000.0,0.0,1.0,0.0,134.0,0.0,1000.0,0.0,,,,,,,0.426,0.05,72.524,0.269,14.219,0.083,5.0
