In [None]:
import json
import os
import sys

sys.path.append('../')
sys.path.append('../src')
sys.path.append('../prompts')
sys.path.append('../src/llmperf')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from benchmarking.src.llmperf import llmperf_utils
from benchmarking.src.performance_evaluation import SyntheticPerformanceEvaluator

# Run multiple models through the benchmarking process

__Note:__ This analysis works when a CoE (Composition of Experts) endpoint is used, which lets users test and compare performance metrics across different experts.

# Meta-Llama-3.1-70B-Instruct

In [None]:
# NOTE(review): this entire cell is commented out. It matches the active
# configuration in the next cell except for the wider `num_input_tokens` and
# `num_output_tokens` sweeps; consider deleting it or moving it to markdown.
# # SambaNova Cloud example
# model_names = ['Meta-Llama-3.1-70B-Instruct']
# llm_api = 'sncloud'

# # additional parameters
# results_dir = '../data/results/aramco_low_power/llama_3.1_70b'
# num_concurrent_requests = [1, 10, 100]
# timeout = 60000
# num_input_tokens = [100, 1_000, 10_000, 50_000, 100_000]
# num_output_tokens = [100, 1_000]
# sampling_params = {}
# user_metadata = {}

In [None]:
# Low-power configuration for Meta-Llama-3.1-70B-Instruct (SambaNova Cloud example).

# Endpoint and models to benchmark
llm_api = 'sncloud'
model_names = ['Meta-Llama-3.1-70B-Instruct']

# Directory handed to the evaluator as `results_dir`
results_dir = '../data/results/aramco_low_power/llama_3.1_70b'

# Sweep grid: the benchmark cell runs every combination of these values
num_input_tokens = [1_000]
num_output_tokens = [1_000]
num_concurrent_requests = [1, 10, 100]

# Settings shared by every run
# NOTE(review): the unit of `timeout` is not visible here — confirm against the evaluator.
timeout = 60_000
sampling_params = dict()
user_metadata = dict()

In [None]:
# Run the benchmark for every (model, input size, output size, concurrency)
# combination and gather one summary column per run.
# Expects `model_names`, `llm_api`, `results_dir`, `num_concurrent_requests`,
# `timeout`, `num_input_tokens`, `num_output_tokens`, `sampling_params` and
# `user_metadata` to be defined by the configuration cell above.

# One single-column frame per run. Collected in a list and concatenated once at
# the end: calling pd.concat inside the loop re-copies the accumulated frame on
# every iteration. If a run fails midway, the partial results are still here.
summary_frames = []
for model_idx, model_name in enumerate(model_names):
    for input_tokens in num_input_tokens:
        for output_tokens in num_output_tokens:
            for concurrent_requests in num_concurrent_requests:
                # Ten requests are issued per concurrent worker
                num_requests = concurrent_requests * 10
                print(f'running model_name {model_name}, input_tokens {input_tokens}, output_tokens {output_tokens}, concurrent_requests {concurrent_requests}, num_requests {num_requests}')
                user_metadata['model_idx'] = model_idx
                # Instantiate evaluator
                evaluator = SyntheticPerformanceEvaluator(
                    model_name=model_name,
                    results_dir=results_dir,
                    num_concurrent_requests=concurrent_requests,
                    timeout=timeout,
                    user_metadata=user_metadata,
                    llm_api=llm_api,
                )

                # Run performance evaluation
                model_results_summary, model_results_per_request = evaluator.run_benchmark(
                    num_input_tokens=input_tokens,
                    num_output_tokens=output_tokens,
                    num_requests=num_requests,
                    sampling_params=sampling_params,
                )

                # Flatten the nested summary; the 'model' entry becomes the column
                # label, every other entry becomes a row.
                # NOTE(review): every run of the same model gets the same column
                # label, so columns can only be told apart by their row values.
                flatten_model_results_summary = llmperf_utils.flatten_dict(model_results_summary)
                filtered_flatten_model_results_summary = {
                    key: value for key, value in flatten_model_results_summary.items() if key not in ['model']
                }
                df_model_results_summary = pd.DataFrame.from_dict(
                    filtered_flatten_model_results_summary, orient='index', columns=[flatten_model_results_summary['model']]
                )

                summary_frames.append(df_model_results_summary)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing ran
df_all_summary_results = pd.concat(summary_frames, axis=1) if summary_frames else pd.DataFrame()

# Meta-Llama-3.1-405B-Instruct

In [None]:
# Configuration for Meta-Llama-3.1-405B-Instruct (SambaNova Cloud example).

# Endpoint and models to benchmark
llm_api = 'sncloud'
model_names = ['Meta-Llama-3.1-405B-Instruct']

# Directory handed to the evaluator as `results_dir`
results_dir = '../data/results/aramco_2/llama_3.1_405b'

# Sweep grid: the benchmark cell runs every combination of these values
num_input_tokens = [100, 1_000, 10_000]
num_output_tokens = [100, 1_000]
num_concurrent_requests = [1, 10, 100]

# Settings shared by every run
# NOTE(review): the unit of `timeout` is not visible here — confirm against the evaluator.
timeout = 60_000
sampling_params = dict()
user_metadata = dict()

In [None]:
# Run the benchmark for every (model, input size, output size, concurrency)
# combination and gather one summary column per run.
# Expects `model_names`, `llm_api`, `results_dir`, `num_concurrent_requests`,
# `timeout`, `num_input_tokens`, `num_output_tokens`, `sampling_params` and
# `user_metadata` to be defined by the configuration cell above.

# One single-column frame per run. Collected in a list and concatenated once at
# the end: calling pd.concat inside the loop re-copies the accumulated frame on
# every iteration. If a run fails midway, the partial results are still here.
summary_frames = []
for model_idx, model_name in enumerate(model_names):
    for input_tokens in num_input_tokens:
        for output_tokens in num_output_tokens:
            for concurrent_requests in num_concurrent_requests:
                # Ten requests are issued per concurrent worker
                num_requests = concurrent_requests * 10
                print(f'running model_name {model_name}, input_tokens {input_tokens}, output_tokens {output_tokens}, concurrent_requests {concurrent_requests}, num_requests {num_requests}')
                user_metadata['model_idx'] = model_idx
                # Instantiate evaluator
                evaluator = SyntheticPerformanceEvaluator(
                    model_name=model_name,
                    results_dir=results_dir,
                    num_concurrent_requests=concurrent_requests,
                    timeout=timeout,
                    user_metadata=user_metadata,
                    llm_api=llm_api,
                )

                # Run performance evaluation
                model_results_summary, model_results_per_request = evaluator.run_benchmark(
                    num_input_tokens=input_tokens,
                    num_output_tokens=output_tokens,
                    num_requests=num_requests,
                    sampling_params=sampling_params,
                )

                # Flatten the nested summary; the 'model' entry becomes the column
                # label, every other entry becomes a row.
                # NOTE(review): every run of the same model gets the same column
                # label, so columns can only be told apart by their row values.
                flatten_model_results_summary = llmperf_utils.flatten_dict(model_results_summary)
                filtered_flatten_model_results_summary = {
                    key: value for key, value in flatten_model_results_summary.items() if key not in ['model']
                }
                df_model_results_summary = pd.DataFrame.from_dict(
                    filtered_flatten_model_results_summary, orient='index', columns=[flatten_model_results_summary['model']]
                )

                summary_frames.append(df_model_results_summary)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing ran
df_all_summary_results = pd.concat(summary_frames, axis=1) if summary_frames else pd.DataFrame()

# Mixtral-8x7B-Instruct-v0.1

In [None]:
# Configuration for Mixtral-8x7B-Instruct-v0.1 (SambaNova Cloud example).

# Endpoint and models to benchmark
llm_api = 'sncloud'
model_names = ['Mixtral-8x7B-Instruct-v0.1']

# Directory handed to the evaluator as `results_dir`
results_dir = '../data/results/aramco/mixtral_8_7b'

# Sweep grid: the benchmark cell runs every combination of these values
num_input_tokens = [100, 1_000]
num_output_tokens = [100, 1_000]
num_concurrent_requests = [1, 10, 100]

# Settings shared by every run
# NOTE(review): the unit of `timeout` is not visible here — confirm against the evaluator.
timeout = 60_000
sampling_params = dict()
user_metadata = dict()

In [None]:
# Run the benchmark for every (model, input size, output size, concurrency)
# combination and gather one summary column per run.
# Expects `model_names`, `llm_api`, `results_dir`, `num_concurrent_requests`,
# `timeout`, `num_input_tokens`, `num_output_tokens`, `sampling_params` and
# `user_metadata` to be defined by the configuration cell above.

# One single-column frame per run. Collected in a list and concatenated once at
# the end: calling pd.concat inside the loop re-copies the accumulated frame on
# every iteration. If a run fails midway, the partial results are still here.
summary_frames = []
for model_idx, model_name in enumerate(model_names):
    for input_tokens in num_input_tokens:
        for output_tokens in num_output_tokens:
            for concurrent_requests in num_concurrent_requests:
                # Ten requests are issued per concurrent worker
                num_requests = concurrent_requests * 10
                print(f'running model_name {model_name}, input_tokens {input_tokens}, output_tokens {output_tokens}, concurrent_requests {concurrent_requests}, num_requests {num_requests}')
                user_metadata['model_idx'] = model_idx
                # Instantiate evaluator
                evaluator = SyntheticPerformanceEvaluator(
                    model_name=model_name,
                    results_dir=results_dir,
                    num_concurrent_requests=concurrent_requests,
                    timeout=timeout,
                    user_metadata=user_metadata,
                    llm_api=llm_api,
                )

                # Run performance evaluation
                model_results_summary, model_results_per_request = evaluator.run_benchmark(
                    num_input_tokens=input_tokens,
                    num_output_tokens=output_tokens,
                    num_requests=num_requests,
                    sampling_params=sampling_params,
                )

                # Flatten the nested summary; the 'model' entry becomes the column
                # label, every other entry becomes a row.
                # NOTE(review): every run of the same model gets the same column
                # label, so columns can only be told apart by their row values.
                flatten_model_results_summary = llmperf_utils.flatten_dict(model_results_summary)
                filtered_flatten_model_results_summary = {
                    key: value for key, value in flatten_model_results_summary.items() if key not in ['model']
                }
                df_model_results_summary = pd.DataFrame.from_dict(
                    filtered_flatten_model_results_summary, orient='index', columns=[flatten_model_results_summary['model']]
                )

                summary_frames.append(df_model_results_summary)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing ran
df_all_summary_results = pd.concat(summary_frames, axis=1) if summary_frames else pd.DataFrame()

# Consolidate results

In [None]:

def read_json_files(folder_path):
    """Load every ``*individual_responses.json`` file found directly in a folder.

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        A list with one entry per successfully parsed file, in sorted file-name
        order. Each entry is that file's list of per-request records, with a
        ``'filename'`` key added to every record.

    Files whose content is not valid JSON are reported on stdout and skipped.
    """
    data = []

    # os.listdir returns entries in arbitrary order; sort so the result (and
    # everything derived from it) is reproducible across machines and runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('individual_responses.json'):
            continue
        file_path = os.path.join(folder_path, filename)

        # Read as UTF-8 explicitly rather than relying on the platform default
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                json_data = json.load(file)
            except json.JSONDecodeError as e:
                print(f"Error reading {file_path}: {e}")
                continue

        # Tag each record with its source file so the run settings can be
        # recovered from the file name later.
        data.append([{**request_response, 'filename': filename} for request_response in json_data])
    return data

In [None]:
# Folder whose `*individual_responses.json` files are consolidated below.
# NOTE(review): this rebinds `results_dir`, the same name the benchmark
# configuration cells assign.
results_dir = '../data/results/aramco_client_in_us'
all_responses = read_json_files(results_dir)
# Number of response files loaded (read_json_files returns one entry per file)
len(all_responses)

In [None]:
# Build one row of summary statistics (median and standard deviation of every
# metric) per response file held in `all_responses`.

# Metrics copied as-is from each per-request record
metric_keys = [
    'server_number_input_tokens',
    'server_number_output_tokens',
    'server_ttft_s',
    'server_output_token_per_s_per_request',
    'server_end_to_end_latency_s',
    'client_ttft_s',
    'client_output_token_per_s_per_request',
    'client_end_to_end_latency_s',
]

run_stats = []
for run in all_responses:
    if not run:
        # A file holding an empty list has nothing to aggregate
        continue

    records = []
    for request_metrics in run:
        filename = request_metrics['filename']
        # The run settings are parsed from the underscore-separated file name.
        # NOTE(review): assumes fields 2, 3 and 4 are input tokens, output tokens
        # and concurrency — confirm against the evaluator's file naming.
        name_parts = filename.split('_')
        record = {
            'filename': filename,
            'input_tokens': int(name_parts[2]),
            'output_tokens': int(name_parts[3]),
            'concurrent_requests': int(name_parts[4]),
        }
        record.update({key: request_metrics[key] for key in metric_keys})
        records.append(record)

    df_metrics = pd.DataFrame(records)
    df_metric_stats = df_metrics.groupby(by='filename').agg(['median', 'std'])
    # Flatten the (metric, statistic) column pairs to 'metric_statistic' BEFORE
    # adding the count. Adding it while the columns are still a MultiIndex
    # creates ('request_count', ''), which flattens to 'request_count_'.
    df_metric_stats.columns = ['_'.join(col).strip() for col in df_metric_stats.columns.values]
    df_metric_stats['request_count'] = df_metrics.shape[0]
    run_stats.append(df_metric_stats)

# pd.concat raises on an empty list, so fall back to an empty frame when no files were loaded
results = pd.concat(run_stats) if run_stats else pd.DataFrame()
results.style.format("{:,.3f}")