# Benchmark an Endpoint

Welcome to this tutorial on benchmarking an endpoint deployed on a SambaNova dedicated node!

If you don't already have an endpoint deployed, please follow one of the workflows described in the [README](./README.md) to deploy an endpoint before proceeding with this tutorial.

Also, please install the benchmarking requirements in the Python kernel used in this Jupyter notebook

`! pip install -r ../../benchmarking/requirements.txt`

## 1.  Imports

In [2]:
import sys
sys.version

'3.11.11 (main, Dec 11 2024, 10:28:39) [Clang 14.0.6 ]'

In [3]:
from IPython.display import display, HTML
display(HTML("<style>:root { --jp-notebook-max-width: 100% !important; }</style>"))
import json
import os
import pprint
import getpass
import pandas as pd
pd.set_option('display.max_columns', None)

In [None]:
benchmarking_dir = "../../benchmarking/"
sys.path.append(benchmarking_dir + "src")
sys.path.append(benchmarking_dir + "benchmarking_scripts")
sys.path.append(benchmarking_dir + "../")
from synthetic_performance_eval_script import *
from benchmarking.utils import read_perf_eval_json_files

In [None]:
class CustomPromptsPerformanceEvaluator(BasePerformanceEvaluator):
    def __init__(
        self,
        num_concurrent_requests: int,
        input_file_path: str,
        save_response_texts: bool = False,
        prompt_ids: List[str] = [],
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.num_concurrent_requests = num_concurrent_requests
        self.file_name = os.path.basename(input_file_path)
        self.dataset, self.prompt_key, self.img_path_key, self.prompt_ids = self.read_dataset(input_file_path)         
        self.save_response_texts = save_response_texts
        

    @staticmethod
    def read_dataset(input_file_path: str) -> List[Dict[str, Any]]:
        """Utility function for reading in the `.jsonl` file provided by the user for custom dataset evaluation.

        Args:
            input_file_path (str): The absolute file path of the input file provided by the user

        Returns:
            List[Dict]: A list of json objects (python dictionaries) containing the individual prompts the user wants
            to evaluate on
        """
        with open(input_file_path, 'r') as file:
            data = json.load(file)

        # check if dict keys contain 'prompt'
        prompt_ids = list(data.keys())
        img_path_key = None
        for prompt_id in prompt_ids:
            d = data[prompt_id]
            if 'prompt' not in d:
                raise ValueError(
                    'All entries in input file must contain the "prompt" key \
                    and its respective text value'
                )

            # check if dict keys contain 'img_path' if there are two keys
            if len(d) == 2:
                if 'image_path' not in data_entry:
                    raise ValueError(
                        'All entries in input file that contain two keys must contain "image_path" \
                        as a key in addition to the "prompt" key and its respective text encoding value'
                    )
                if d['image_path'].startswith('http'):
                    raise ValueError('Urls are not supported for image_path. Please provide text encodings.')                
                img_path_key = 'image_path'

            if len(d) > 2:
                raise ValueError('All entries in input file must contain at most two keys: "prompt" and "image_path".')

        return data, "prompt", img_path_key, prompt_ids     

    def create_output_filename(self, prompt_id: str, num_output_tokens: int) -> str:
        """Utility for creating a unique filename for a synthetic benchmarking experiment given user specified params.

        Args:
            prompt_id (str): The user-specified identifier for the prompt in the input file.
            num_output_tokens (int): The number of output tokens to be received.

        Returns:
            str: Filename for the synthetic benchmark run.
        """
        generation_mode = ''
        if self.is_stream_mode:
            generation_mode = 'stream'

        multimodal_suffix = ''
        if self.multimodal_image_size != 'na':
            multimodal_suffix = f'_multimodal_{self.multimodal_image_size}'

        output_file_name = (
            f'custom_{self.user_metadata["model_idx"]}_{self.model_name}{multimodal_suffix}_{prompt_id}'
            f'_{num_output_tokens}_{self.num_concurrent_requests}_{generation_mode}_{self.run_uuid}'
        )

        return self.sanitize_file_prefix(output_file_name)

    def save_results(
        self,
        filename: str,
        summary: Dict[str, Any],
        individual_responses: (
            List[LLMResponse]
            | List[Tuple[Dict[str, Any], str, RequestConfig]]
            | Tuple[Dict[str, object], List[LLMResponse]]
        ),
    ) -> None:
        """Save the performance evaluation results to a file, and completion texts if save_response_text condition is
        setup as True

        Args:
            filename (str): The base name of the file to save the results to.
            summary (Dict[str, Any]): A dictionary containing the summary of the performance evaluation.
            individual_responses (List[LLMResponse]): A list of individual responses from the performance evaluation.

        Raises:
            e: if an error happens when creating the output file related to prompts and completions, an error will be
            raised
        """

        super().save_results(filename, summary, individual_responses)

        # If specified, save the llm responses to output file
        if self.save_response_texts:
            # Create response texts file name
            response_texts_file_name = f'{filename}_response_texts'
            results_dir = Path(self.results_dir)

            # Save response texts
            try:
                self.response_texts_file_path = f'{results_dir}/{response_texts_file_name}.jsonl'
                with open(self.response_texts_file_path, 'w') as f:
                    for response in individual_responses:
                        if isinstance(response, LLMResponse):
                            output_json = {
                                'prompt': response.request_config.prompt_tuple[0],
                                'completion': str(response.response_text),
                            }
                            f.write(json.dumps(output_json))
                            f.write('\n')
            except Exception as e:
                logger.error('ERROR SAVING LLM OUTPUTS')
                raise e

    def run_benchmark(
        self, sampling_params: Dict[str, Any] = {}, *args: Any, **kwargs: Any
    ) -> Tuple[Dict[str, Any], List[LLMResponse]]:
        """Run a benchmark test for the specified LLM using synthetically generated data.

        Args:
            prompt_id (str): The user-specified identifier for the prompt in the input file.
            num_output_tokens (int): The number of output tokens to be received.
            num_requests (int): The number of requests to be made.
            sampling_params (str): The sampling parameters in JSON format.

        Raises:
            ValueError: If the number of input tokens is less than 40.

        Returns:
            summary (dict): structure with performance metrics and stats for the run
            individual_responses (tuple): list of performance metrics per request
        """
        self.run_uuid = uuid.uuid4()
        prompt_id = kwargs.get('prompt_id', "0")
        num_output_tokens = kwargs.get('num_output_tokens', 10)
        num_requests = kwargs.get('num_requests', 1)

        self.cli_progress_bar = tqdm(total=num_requests, desc='Running Requests')
        self.ui_progress_bar = kwargs.get('progress_bar', None)

        # Calculate performance metrics individually and summary
        summary, individual_responses = self.get_token_throughput_latencies(
            prompt_id=prompt_id,
            num_output_tokens=num_output_tokens,
            num_requests=num_requests,
            sampling_params=sampling_params,
        )

        if self.results_dir:
            filename = self.create_output_filename(prompt_id, num_output_tokens)
            self.save_results(filename, summary, individual_responses)

        return summary, individual_responses

    def add_metric_after_key(
        self,
        metrics_dict: Dict[str, Any],
        new_key: str,
        new_value: float,
        after_key: str,
    ) -> Dict[str, Any]:
        """Adds a new metric (dict key and value) to a dict after an specific key

        Args:
            metrics_dict (dict): dictionary to add new metric
            new_key (str): new key
            new_value (float): new value for key
            after_key (str): key for reference to add new key after

        Returns:
            dict: dictionary with new key and value added
        """

        # Create a new dictionary
        new_metrics_dict = {}

        for key, value in metrics_dict.items():
            # Copy the key-value pair to the new dictionary
            new_metrics_dict[key] = value

            # Check if this is the key after which to insert the new key-value pair
            if key == after_key:
                new_metrics_dict[new_key] = new_value

        return new_metrics_dict

    def calculate_switching_time(self, llm_responses: list[LLMResponse]) -> list[LLMResponse]:
        """Logic to calculate switching time. Based on the first request TTFT,
        if this value is significantly larger (more than 3 standard deviations) than the average TTFT
        of the rest requests, then switching time will be the difference between first TTFT
        and average of the coming TTFTs.

        Args:
            llm_responses (list[LLMResponse]): list of LLMResponse objects

        Returns:
            list[LLMResponse]: list of LLMResponse objects including switching time
        """
        # collect necessary information for switching time calculation
        responses_ttfts = []

        for llm_response in llm_responses:
            if pd.isnull(llm_response.metrics['error_code']):
                request_idx = llm_response.request_config.request_idx
                start_time = llm_response.metrics['start_time']
                server_ttft_s = llm_response.metrics['server_ttft_s']
                responses_ttfts.append(
                    {
                        'request_idx': request_idx,
                        'start_time': start_time,
                        'server_ttft_s': server_ttft_s,
                    }
                )

        df_valid_responses = pd.DataFrame(responses_ttfts)

        # transforming str to date time for sorting
        df_valid_responses['start_time'] = pd.to_datetime(df_valid_responses['start_time'])
        df_valid_responses = df_valid_responses.sort_values(by=['start_time'])

        # initialize a column for the switching time
        df_valid_responses['server_switching_time'] = None

        # check server ttft in case metric is not coming in response
        if df_valid_responses['server_ttft_s'].notna().all():
            # calculate switching time
            first_ttft = df_valid_responses['server_ttft_s'].iloc[0]
            mean_ttft = df_valid_responses['server_ttft_s'].iloc[1:].mean()
            std_ttft = df_valid_responses['server_ttft_s'].iloc[1:].std()
            std_ttft = 1e-16 if np.isnan(std_ttft) else std_ttft

            switching_time = first_ttft - mean_ttft
            outlier_switching_time = None

            if switching_time > (mean_ttft + 3 * std_ttft):
                outlier_switching_time = switching_time
                df_valid_responses['server_switching_time'].iloc[0] = outlier_switching_time

        # assign switching time back to request object
        for llm_response in llm_responses:
            metrics = llm_response.metrics

            if llm_response.request_config.request_idx == df_valid_responses.head(1)['request_idx'].values[0]:
                server_switching_time = df_valid_responses.head(1)['server_switching_time'].values[0]
            else:
                server_switching_time = None

            llm_response.metrics = self.add_metric_after_key(
                metrics,
                new_key='server_switching_time',
                new_value=server_switching_time,
                after_key=common_metrics.TTFT_SERVER,
            )

        return llm_responses

    def get_token_throughput_latencies(
        self,
        prompt_id: str,
        num_output_tokens: int,
        num_requests: int,
        sampling_params: Dict[str, Any],
    ) -> Tuple[dict[str, Any], List[LLMResponse]]:
        """This function runs a token benchmark for the given model and API,
        measuring the throughput and latencies for the specified number of input and output tokens,
        and the specified number of requests.

        Args:
            prompt_id (str): The user-specified identifier for the prompt in the input file.
            num_output_tokens (int): The user specified number of output tokens.
            num_requests (int): The user specified number of requests to run.
            sampling_params (dict): User specified sampling parameters for generation.

        Returns:
            metadata (dict): A dictionary containing the results of the benchmark,
                            including the model name, number of concurrent requests,
                            results, number of input tokens, number of output tokens,
                            and additional sampling parameters.
            completed_requests (list): A list of completed requests.

        Raises:
            Exception: If an unexpected error occurs during the execution of requests.
        """
        # random.seed(11111)

        # Build the request config objects that are to be sent to the LLM API endpoint
        request_configs = self.build_request_configs(num_requests, prompt_id, num_output_tokens, sampling_params)

        # Get the request counts in order to place them into threads to be executed in batches
        total_request_count = len(request_configs)
        request_config_batches: List[List[RequestConfig]] = []

        if self.num_concurrent_requests:
            requests_per_thread = (total_request_count) // self.num_concurrent_requests
            remainder = (total_request_count) % self.num_concurrent_requests

            idx = 0
            # Create batches of requests for each concurrent request
            for concurrent_requests in range(self.num_concurrent_requests):
                num_requests_for_thread = requests_per_thread + (1 if concurrent_requests < remainder else 0)
                request_config_batch = request_configs[idx : idx + num_requests_for_thread].copy()
                idx += num_requests_for_thread
                request_config_batches.append(request_config_batch)

        # Execute requests concurrently
        llm_responses: List[LLMResponse] = []
        progress: List[Any] = []

        start_time = time.monotonic()
        # Use ThreadPoolExecutor to handle threads
        with ThreadPoolExecutor(max_workers=self.num_concurrent_requests) as executor:
            # Store futures for the tasks
            futures = []

            for request_config_batch in request_config_batches:
                if self.stop_event.is_set():
                    logger.info('Stopping task submission due to stop signal.')
                    break
                # Submit the task to the executor
                future = executor.submit(
                    self.send_requests,
                    request_config_batch,
                    llm_responses,
                    progress,
                    start_time,
                    num_requests,
                )
                futures.append(future)
                for t in executor._threads:
                    add_script_run_ctx(t)

            # Wait for all tasks to complete
            for future in as_completed(futures):
                try:
                    # Retrieve result if needed
                    future.result()
                except Exception as e:
                    logger.error(f'Error occurred in a thread: {e}')

        if self.stop_event.is_set():
            logger.info('Benchmarking process terminated early due to stop signal.')
            return {}, []

        # Error handling
        error_codes = [llm_response.metrics['error_code'] for llm_response in llm_responses]

        if not any([pd.isnull(error_code) for error_code in error_codes]):
            unique_error_codes = list(
                set(
                    [
                        llm_response.metrics['error_code']
                        for llm_response in llm_responses
                        if not pd.isnull(llm_response.metrics['error_code'])
                    ]
                )
            )
            unique_error_msgs = list(
                set(
                    [
                        llm_response.metrics['error_msg']
                        for llm_response in llm_responses
                        if not pd.isnull(llm_response.metrics['error_code'])
                    ]
                )
            )
            nl = '\n'
            raise Exception(
                f"""Unexpected error happened when executing requests:\
                {nl}{f'{nl}'.join([f'- {error_code}' for error_code in unique_error_codes])}\
                {nl}Additional messages:{nl}{f'{nl}'.join([f'- {error_msg}' for error_msg in unique_error_msgs])}"""
            )

        # Capture end time and notify user
        end_time = time.monotonic()
        logger.info('Tasks Executed!')
        logger.info(f'Benchmarking results obtained for model {self.model_name} queried with the {self.llm_api} API.')

        # Calculate switching time
        llm_responses = self.calculate_switching_time(llm_responses)

        # Build a metrics summary for the results of the benchmarking run
        results = self.build_metrics_summary(
            metrics=[response.metrics for response in llm_responses],
            start_time=start_time,
            end_time=end_time,
        )

        # Construct metadata payload to be returned
        metadata = {
            'model': self.model_name,
            'num_concurrent_requests': self.num_concurrent_requests,
            'results': results,
            'prompt_id': prompt_id,
            'num_output_tokens': num_output_tokens,
            'additional_sampling_params': sampling_params,
        }

        return metadata, llm_responses

    def select_raw_prompts(self, raw_prompts: List[Dict[str, Any]], num_requests: int) -> List[Dict[str, Any]]:
        """Selects prompts randomly

        Args:
            num_requests (int): Number of requests to be generated

        Returns:
            List[Dict[str,Any]]: List of randomly selected prompts
        """

        random_selected_prompts = random.choices(raw_prompts, k=num_requests)
        assert len(random_selected_prompts) == num_requests, 'Number of selected prompts \
            does not match the requested count'
        return random_selected_prompts

    def build_request_configs(
        self,
        num_requests: int,
        prompt_id: str,
        output_token_count: int,
        sampling_params: Dict[str, Any],
    ) -> List[RequestConfig]:
        """Builds a list of request configuration objects used to send requests to the LLM. It iterates through the
        specified number of requests, builds an input prompt for each request, updates the sampling parameters with
        the maximum number of tokens to generate, and then creates the request configuration object. The request
        configurations are then returned as a list.

        Args:
            num_requests (int): The number of request configurations to build.
            prompt_id (str): The user-specified identifier for the prompt in the input file.
            output_token_count (int): The number of output tokens each request should return.
            sampling_params (dict): A dictionary of sampling parameters for the LLM.

        Returns:
            List[RequestConfig]: A list of request configurations, each containing the model name, prompt, sampling
            parameters, LLM API, generation mode, and number of concurrent requests.
        """
        # Empty list to be filled with valid request configs and then returned
        request_configs = []
        # Instantiate image variable
        image = None

        data_point = self.dataset[prompt_id]

        raw_prompt = {'name': 'custom_prompt', 'template': data_point[self.prompt_key]}

        # Apply prompt templating to get final prompt to send to LLM API along with tokenized prompt length
        prompt_tuple = (raw_prompt, self.get_token_length(raw_prompt["template"]))

        # Image to be sent in LLM request if exists
        image = None
        if self.img_path_key:
            image = self.get_image(data_point[self.img_path_key])

        # Iterate through data points and build a request config for each
        for request_idx in range(num_requests):
            # Add generic max tokens parameter to `sampling_params` dictionary
            updated_sampling_params = {
                'max_tokens_to_generate': output_token_count,
            }
            updated_sampling_params.update(sampling_params)

            # Create request config object
            request_config = RequestConfig(
                request_idx=request_idx,
                model=self.model_name,
                prompt_tuple=prompt_tuple,
                image=image,
                sampling_params=updated_sampling_params,
                llm_api=self.llm_api,
                api_variables=self.api_variables,
                is_stream_mode=self.is_stream_mode,
                num_concurrent_requests=self.num_concurrent_requests,
            )

            request_configs.append(request_config)

        return request_configs
    
    def build_prompt(self, *args: Any, **kwargs: Any) -> Tuple[Dict[str, Any], int]:
        pass


## 2. Get endpoint info
To benchmark the endpoint, we will need to obtain some of its information. Note that this information can be obtained from your SambaNova representative.

#### Enter the endpoint url
Run the cell below and then enter the endpoint URL. This should be in the format of `https://my.env/v1/<endpoint_id>/chat/completions` 

In [4]:
endpoint_url = input().strip()
if endpoint_url[-1] == "/":
    endpoint_url = endpoint_url[:-1]
os.environ["SAMBASTUDIO_URL"] = endpoint_url
endpoint_id = endpoint_url.split('/')[-3]
print("Benchmarking Endpoint:", endpoint_id)

Benchmarking Endpoint: 631938c4-1336-40fd-aa08-9683b4dda499


#### Enter the endpoint API key

In [5]:
endpoint_key = getpass.getpass().strip()
if len(endpoint_key) > 0:
    os.environ["SAMBASTUDIO_API_KEY"] = endpoint_key
else:
    print("Please enter a valid key")

## 3. Automatically retrieve model list from endpoint (Optional)
Run this section only if you don't the have list of models on the endpoint. Note that running this section requires you to first follow the set up instructions given in the [README](./README.md)
#### Set up environment connector
The connector connects to the remote dedicated environment using the variables defined below

In [None]:
env_url = '/'.join(endpoint_url.split('/')[:3])

'https://sjc3-e9.sambanova.net'

In [7]:
print("Enter the env access key")
env_key = getpass.getpass().strip()
if len(env_key) > 0:
    os.environ["SAMBASTUDIO_ACCESS_KEY"] = env_key
else:
    print("Please enter a valid key")

Enter the env access key


In [8]:
env_tenant = "default"

In [9]:
from snsdk import SnSdk
sn_env = SnSdk(host_url=env_url, 
                   access_key=env_key, 
                   tenant_id=env_tenant)

#### Get model names in the endpoint

In [10]:
endpoint_info = sn_env.endpoint_info_by_id(endpoint_id)
endpoint_model_id = endpoint_info['targets'][0]["model"]
model_info = sn_env.model_info(endpoint_model_id, job_type="deploy")
model_constituents = [m["name"] for m in model_info["dependencies"]]
sorted(model_constituents)

['Meta-Llama-3.3-70B-Instruct',
 'Qwen-2.5-72B-SD-Qwen-2.5-0.5B',
 'Salesforce--Llama-xLAM-2-70b-fc-r',
 'Salesforce--Llama-xLAM-2-8b-fc-r']

#### Get target model names in the endpoint
Target model names generally differ from model names when the model is a speculative decoding pair

In [11]:
target_models = []
for constituent_name in model_constituents:    
    model_name = constituent_name

    # Check for speculative decoding
    constituent_info = sn_env.model_info(constituent_name, job_type="deploy")
    if 'target_model' in constituent_info['config']:
        target_name = constituent_info['config']['target_model']        
        if len(target_name) > 0:
            model_name = target_name
    target_models.append(model_name)
sorted(target_models)

['Meta-Llama-3.3-70B-Instruct',
 'Qwen2.5-72B-Instruct',
 'Salesforce--Llama-xLAM-2-70b-fc-r',
 'Salesforce--Llama-xLAM-2-8b-fc-r']

## 4. Set up Model Configs for Benchmarking
Note that this section only currently supports a fraction of what the Benchmarking Kit is capable of. You may repurpose this section if you would like to benchmark images or run questions per second (qps).

#### Name the benchmarking run
Give the run a unique name so that the configs and results can be saved with that name. Please note that the name should be compatible with file system path naming rules.

In [5]:
run_name = "llama33_70b_sd_1b_short_studio_sfdata_20250602_1"

#### Specify target models
If not automatically set in Step 3, set Target models as a python list

In [6]:
target_models = ["Meta-Llama-3.3-70B-Instruct"] #target_models 

#### Specify custom input path

In [7]:
custom_input_path = os.path.expanduser("~/Downloads/custom_prompt_data.json")

#### Specify configs for benchmarking

In [8]:
output_path = f"{benchmarking_dir}data/benchmarking_tracking_tests/"

config = {
    'sampling_params': {},
    'max_tokens_to_generate': 1000, # treated as a special sampling param and not to be included in sampling_params  
    'output_files_dir': os.path.join(f'{output_path}logs/output_files/', run_name), # each run saved here
    'consolidated_results_dir': os.path.join(f'{output_path}consolidated_results', run_name), # consolidated xlsx saved here
    'timeout': 3600,
    'num_requests': 64,
    'concurrent_requests': [1, 2, 4, 8, 16, 32],
    'custom_input_path': custom_input_path,
    'save_response_texts': False,
    'llm_api': 'sambastudio'
}

## 5. Run Benchmarking
We will run benchmarking with the configs now

#### Benchmarking produces json files with detailed results for each configuration

In [12]:
for target_model in target_models:
    for num_concurrent_requests in config['concurrent_requests']:        
        custom_evaluator = CustomPromptsPerformanceEvaluator(
            model_name=target_model,        
            results_dir=config['output_files_dir'],
            num_concurrent_requests=num_concurrent_requests,
            timeout=config["timeout"],
            user_metadata={"model_idx": 0},
            input_file_path=config['custom_input_path'],
            save_response_texts=config['save_response_texts'],
            llm_api=config['llm_api'],
        )

        for prompt_id in custom_evaluator.prompt_ids:
            # Run performance evaluation
            custom_evaluator.run_benchmark(
                prompt_id=prompt_id,
                num_output_tokens=config["max_tokens_to_generate"],
                num_requests=config["num_requests"],
                sampling_params=config['sampling_params'])

Running Requests: 100%|██████████| 64/64 [01:55<00:00,  1.96s/it]

2025-06-02 15:13:06,741 [INFO] Tasks Executed!
2025-06-02 15:13:06,742 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:13:06,772 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:13:06,777 [INFO]     p5 = 0.1849
2025-06-02 15:13:06,777 [INFO]     p25 = 0.2005
2025-06-02 15:13:06,777 [INFO]     p50 = 0.2808
2025-06-02 15:13:06,778 [INFO]     p75 = 0.3374
2025-06-02 15:13:06,778 [INFO]     p90 = 0.4858
2025-06-02 15:13:06,779 [INFO]     p95 = 0.549
2025-06-02 15:13:06,779 [INFO]     p99 = 0.7479
2025-06-02 15:13:06,780 [INFO]     mean = 0.3037
2025-06-02 15:13:06,781 [INFO]     min = 0.1686
2025-06-02 15:13:06,781 [INFO]     max = 0.7583
2025-06-02 15:13:06,781 [INFO]     stddev = 0.1298
2025-06-02 15:13:06,782 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:13:06,783 [INFO]     p5 = 1.6367
2025-06-02 15:13:06,783 [INFO]     p25 = 1.6861
202

Running Requests: 100%|██████████| 64/64 [01:55<00:00,  1.81s/it]


2025-06-02 15:16:44,813 [INFO] Tasks Executed!
2025-06-02 15:16:44,813 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:16:44,823 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:16:44,824 [INFO]     p5 = 0.1849
2025-06-02 15:16:44,825 [INFO]     p25 = 0.2396
2025-06-02 15:16:44,825 [INFO]     p50 = 0.3272
2025-06-02 15:16:44,826 [INFO]     p75 = 0.4185
2025-06-02 15:16:44,827 [INFO]     p90 = 0.528
2025-06-02 15:16:44,827 [INFO]     p95 = 0.6694
2025-06-02 15:16:44,828 [INFO]     p99 = 0.9654
2025-06-02 15:16:44,829 [INFO]     mean = 0.355
2025-06-02 15:16:44,829 [INFO]     min = 0.1713
2025-06-02 15:16:44,830 [INFO]     max = 0.9903
2025-06-02 15:16:44,830 [INFO]     stddev = 0.1709
2025-06-02 15:16:44,830 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:16:44,832 [INFO]     p5 = 3.2136
2025-06-02 15:16:44,832 [INFO]     p25 = 3.2727
2025

Running Requests: 100%|██████████| 64/64 [03:38<00:00,  3.41s/it]
Running Requests: 100%|██████████| 64/64 [04:06<00:00,  3.89s/it]

2025-06-02 15:20:51,601 [INFO] Tasks Executed!
2025-06-02 15:20:51,602 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:20:51,609 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:20:51,610 [INFO]     p5 = 0.3471
2025-06-02 15:20:51,611 [INFO]     p25 = 0.3891
2025-06-02 15:20:51,611 [INFO]     p50 = 0.4593
2025-06-02 15:20:51,611 [INFO]     p75 = 0.5574
2025-06-02 15:20:51,611 [INFO]     p90 = 0.6377
2025-06-02 15:20:51,612 [INFO]     p95 = 0.8123
2025-06-02 15:20:51,612 [INFO]     p99 = 1.0439
2025-06-02 15:20:51,613 [INFO]     mean = 0.4956
2025-06-02 15:20:51,613 [INFO]     min = 0.32
2025-06-02 15:20:51,614 [INFO]     max = 1.0482
2025-06-02 15:20:51,614 [INFO]     stddev = 0.1505
2025-06-02 15:20:51,614 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:20:51,616 [INFO]     p5 = 3.6917
2025-06-02 15:20:51,617 [INFO]     p25 = 3.7345
2025

Running Requests: 100%|██████████| 64/64 [04:06<00:00,  3.86s/it]


2025-06-02 15:24:22,964 [INFO] Tasks Executed!
2025-06-02 15:24:22,964 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:24:22,973 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:24:22,974 [INFO]     p5 = 0.3396
2025-06-02 15:24:22,975 [INFO]     p25 = 0.3741
2025-06-02 15:24:22,975 [INFO]     p50 = 0.456
2025-06-02 15:24:22,976 [INFO]     p75 = 0.5578
2025-06-02 15:24:22,976 [INFO]     p90 = 0.8444
2025-06-02 15:24:22,976 [INFO]     p95 = 0.9999
2025-06-02 15:24:22,977 [INFO]     p99 = 1.6395
2025-06-02 15:24:22,977 [INFO]     mean = 0.5373
2025-06-02 15:24:22,977 [INFO]     min = 0.33
2025-06-02 15:24:22,978 [INFO]     max = 2.3432
2025-06-02 15:24:22,978 [INFO]     stddev = 0.3001
2025-06-02 15:24:22,978 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:24:22,979 [INFO]     p5 = 3.0747
2025-06-02 15:24:22,980 [INFO]     p25 = 3.1136
2025-

Running Requests: 100%|██████████| 64/64 [03:31<00:00,  3.30s/it]
Running Requests: 100%|██████████| 64/64 [01:59<00:00,  1.97s/it]

2025-06-02 15:26:22,036 [INFO] Tasks Executed!
2025-06-02 15:26:22,037 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:26:22,048 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:26:22,050 [INFO]     p5 = 0.5204
2025-06-02 15:26:22,050 [INFO]     p25 = 0.5576
2025-06-02 15:26:22,051 [INFO]     p50 = 0.6971
2025-06-02 15:26:22,051 [INFO]     p75 = 0.7906
2025-06-02 15:26:22,052 [INFO]     p90 = 0.9128
2025-06-02 15:26:22,052 [INFO]     p95 = 1.0958
2025-06-02 15:26:22,052 [INFO]     p99 = 1.9302
2025-06-02 15:26:22,053 [INFO]     mean = 0.7347
2025-06-02 15:26:22,053 [INFO]     min = 0.507
2025-06-02 15:26:22,053 [INFO]     max = 2.1206
2025-06-02 15:26:22,054 [INFO]     stddev = 0.2784
2025-06-02 15:26:22,054 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:26:22,056 [INFO]     p5 = 1.5679
2025-06-02 15:26:22,056 [INFO]     p25 = 1.6092
202

Running Requests: 100%|██████████| 64/64 [01:59<00:00,  1.86s/it]


2025-06-02 15:29:13,169 [INFO] Tasks Executed!
2025-06-02 15:29:13,169 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:29:13,180 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:29:13,182 [INFO]     p5 = 0.543
2025-06-02 15:29:13,183 [INFO]     p25 = 0.5937
2025-06-02 15:29:13,183 [INFO]     p50 = 0.6424
2025-06-02 15:29:13,184 [INFO]     p75 = 0.7844
2025-06-02 15:29:13,184 [INFO]     p90 = 0.8618
2025-06-02 15:29:13,185 [INFO]     p95 = 1.044
2025-06-02 15:29:13,185 [INFO]     p99 = 1.3295
2025-06-02 15:29:13,185 [INFO]     mean = 0.7065
2025-06-02 15:29:13,186 [INFO]     min = 0.5183
2025-06-02 15:29:13,186 [INFO]     max = 1.6057
2025-06-02 15:29:13,186 [INFO]     stddev = 0.1801
2025-06-02 15:29:13,187 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:29:13,188 [INFO]     p5 = 2.4093
2025-06-02 15:29:13,189 [INFO]     p25 = 2.4717
2025

Running Requests: 100%|██████████| 64/64 [02:51<00:00,  2.67s/it]
Running Requests: 100%|██████████| 64/64 [03:31<00:00,  3.54s/it]

2025-06-02 15:32:44,278 [INFO] Tasks Executed!
2025-06-02 15:32:44,279 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:32:44,290 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:32:44,292 [INFO]     p5 = 1.1041
2025-06-02 15:32:44,292 [INFO]     p25 = 1.1579
2025-06-02 15:32:44,292 [INFO]     p50 = 1.2297
2025-06-02 15:32:44,293 [INFO]     p75 = 1.3047
2025-06-02 15:32:44,293 [INFO]     p90 = 1.3252
2025-06-02 15:32:44,294 [INFO]     p95 = 1.3729
2025-06-02 15:32:44,294 [INFO]     p99 = 1.485
2025-06-02 15:32:44,295 [INFO]     mean = 1.232
2025-06-02 15:32:44,295 [INFO]     min = 1.0761
2025-06-02 15:32:44,295 [INFO]     max = 1.5019
2025-06-02 15:32:44,296 [INFO]     stddev = 0.0934
2025-06-02 15:32:44,296 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:32:44,297 [INFO]     p5 = 3.0182
2025-06-02 15:32:44,298 [INFO]     p25 = 3.0966
2025

Running Requests: 100%|██████████| 64/64 [03:32<00:00,  3.31s/it]
Running Requests: 100%|██████████| 64/64 [01:38<00:00,  1.56s/it]

2025-06-02 15:34:24,210 [INFO] Tasks Executed!
2025-06-02 15:34:24,210 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:34:24,222 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:34:24,223 [INFO]     p5 = 1.4033
2025-06-02 15:34:24,224 [INFO]     p25 = 1.5202
2025-06-02 15:34:24,224 [INFO]     p50 = 1.5426
2025-06-02 15:34:24,225 [INFO]     p75 = 1.6209
2025-06-02 15:34:24,225 [INFO]     p90 = 1.6308
2025-06-02 15:34:24,226 [INFO]     p95 = 1.6344
2025-06-02 15:34:24,226 [INFO]     p99 = 1.6961
2025-06-02 15:34:24,227 [INFO]     mean = 1.5422
2025-06-02 15:34:24,227 [INFO]     min = 0.1986
2025-06-02 15:34:24,227 [INFO]     max = 1.751
2025-06-02 15:34:24,227 [INFO]     stddev = 0.1845
2025-06-02 15:34:24,228 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:34:24,229 [INFO]     p5 = 2.9501
2025-06-02 15:34:24,229 [INFO]     p25 = 3.0347
202

Running Requests: 100%|██████████| 64/64 [01:39<00:00,  1.55s/it]


2025-06-02 15:37:43,572 [INFO] Tasks Executed!
2025-06-02 15:37:43,573 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:37:43,583 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:37:43,584 [INFO]     p5 = 3.0012
2025-06-02 15:37:43,585 [INFO]     p25 = 3.0997
2025-06-02 15:37:43,585 [INFO]     p50 = 3.1734
2025-06-02 15:37:43,586 [INFO]     p75 = 3.1967
2025-06-02 15:37:43,586 [INFO]     p90 = 3.2169
2025-06-02 15:37:43,586 [INFO]     p95 = 3.2337
2025-06-02 15:37:43,587 [INFO]     p99 = 3.2832
2025-06-02 15:37:43,587 [INFO]     mean = 3.1112
2025-06-02 15:37:43,587 [INFO]     min = 0.1907
2025-06-02 15:37:43,588 [INFO]     max = 3.286
2025-06-02 15:37:43,588 [INFO]     stddev = 0.3773
2025-06-02 15:37:43,588 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:37:43,589 [INFO]     p5 = 6.0487
2025-06-02 15:37:43,590 [INFO]     p25 = 6.1618
202

Running Requests: 100%|██████████| 64/64 [03:19<00:00,  3.12s/it]
Running Requests: 100%|██████████| 64/64 [03:48<00:00,  3.57s/it]

2025-06-02 15:41:32,132 [INFO] Tasks Executed!
2025-06-02 15:41:32,134 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:41:32,167 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:41:32,170 [INFO]     p5 = 3.6285
2025-06-02 15:41:32,170 [INFO]     p25 = 3.7655
2025-06-02 15:41:32,170 [INFO]     p50 = 3.7813
2025-06-02 15:41:32,171 [INFO]     p75 = 3.7914
2025-06-02 15:41:32,171 [INFO]     p90 = 3.801
2025-06-02 15:41:32,171 [INFO]     p95 = 3.805
2025-06-02 15:41:32,171 [INFO]     p99 = 3.8468
2025-06-02 15:41:32,172 [INFO]     mean = 3.7106
2025-06-02 15:41:32,172 [INFO]     min = 0.33
2025-06-02 15:41:32,173 [INFO]     max = 3.8905
2025-06-02 15:41:32,173 [INFO]     stddev = 0.434
2025-06-02 15:41:32,173 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:41:32,174 [INFO]     p5 = 6.9455
2025-06-02 15:41:32,175 [INFO]     p25 = 7.11
2025-06-0

Running Requests: 100%|██████████| 64/64 [03:48<00:00,  3.57s/it]


2025-06-02 15:44:40,983 [INFO] Tasks Executed!
2025-06-02 15:44:40,984 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:44:40,998 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:44:41,000 [INFO]     p5 = 2.9272
2025-06-02 15:44:41,000 [INFO]     p25 = 3.1272
2025-06-02 15:44:41,001 [INFO]     p50 = 3.1509
2025-06-02 15:44:41,001 [INFO]     p75 = 3.167
2025-06-02 15:44:41,002 [INFO]     p90 = 3.1751
2025-06-02 15:44:41,002 [INFO]     p95 = 3.1802
2025-06-02 15:44:41,002 [INFO]     p99 = 3.233
2025-06-02 15:44:41,003 [INFO]     mean = 3.0808
2025-06-02 15:44:41,004 [INFO]     min = 0.3564
2025-06-02 15:44:41,004 [INFO]     max = 3.3066
2025-06-02 15:44:41,004 [INFO]     stddev = 0.3553
2025-06-02 15:44:41,005 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:44:41,006 [INFO]     p5 = 5.6471
2025-06-02 15:44:41,007 [INFO]     p25 = 5.8326
2025

Running Requests: 100%|██████████| 64/64 [03:08<00:00,  2.95s/it]
Running Requests: 100%|██████████| 64/64 [01:33<00:00,  1.43s/it]

2025-06-02 15:46:14,338 [INFO] Tasks Executed!
2025-06-02 15:46:14,340 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:46:14,358 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:46:14,360 [INFO]     p5 = 1.5872
2025-06-02 15:46:14,360 [INFO]     p25 = 1.7823
2025-06-02 15:46:14,361 [INFO]     p50 = 1.8467
2025-06-02 15:46:14,361 [INFO]     p75 = 1.8554
2025-06-02 15:46:14,362 [INFO]     p90 = 1.8656
2025-06-02 15:46:14,362 [INFO]     p95 = 1.9259
2025-06-02 15:46:14,362 [INFO]     p99 = 1.9874
2025-06-02 15:46:14,363 [INFO]     mean = 1.7903
2025-06-02 15:46:14,364 [INFO]     min = 0.5237
2025-06-02 15:46:14,364 [INFO]     max = 1.9905
2025-06-02 15:46:14,364 [INFO]     stddev = 0.1897
2025-06-02 15:46:14,365 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:46:14,366 [INFO]     p5 = 2.6385
2025-06-02 15:46:14,367 [INFO]     p25 = 2.8521
20

Running Requests: 100%|██████████| 64/64 [01:33<00:00,  1.46s/it]


2025-06-02 15:48:39,850 [INFO] Tasks Executed!
2025-06-02 15:48:39,852 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:48:39,867 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:48:39,870 [INFO]     p5 = 2.5257
2025-06-02 15:48:39,870 [INFO]     p25 = 2.641
2025-06-02 15:48:39,871 [INFO]     p50 = 2.6581
2025-06-02 15:48:39,871 [INFO]     p75 = 2.6743
2025-06-02 15:48:39,871 [INFO]     p90 = 2.6904
2025-06-02 15:48:39,872 [INFO]     p95 = 2.7108
2025-06-02 15:48:39,872 [INFO]     p99 = 2.781
2025-06-02 15:48:39,873 [INFO]     mean = 2.6198
2025-06-02 15:48:39,874 [INFO]     min = 0.5381
2025-06-02 15:48:39,874 [INFO]     max = 2.815
2025-06-02 15:48:39,875 [INFO]     stddev = 0.2705
2025-06-02 15:48:39,875 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:48:39,877 [INFO]     p5 = 4.4398
2025-06-02 15:48:39,877 [INFO]     p25 = 4.4932
2025-

Running Requests: 100%|██████████| 64/64 [02:25<00:00,  2.27s/it]
Running Requests: 100%|██████████| 64/64 [03:03<00:00,  2.90s/it]

2025-06-02 15:51:43,939 [INFO] Tasks Executed!
2025-06-02 15:51:43,939 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:51:43,966 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:51:43,968 [INFO]     p5 = 3.3779
2025-06-02 15:51:43,969 [INFO]     p25 = 3.7406
2025-06-02 15:51:43,969 [INFO]     p50 = 3.8053
2025-06-02 15:51:43,969 [INFO]     p75 = 3.8358
2025-06-02 15:51:43,970 [INFO]     p90 = 3.8855
2025-06-02 15:51:43,971 [INFO]     p95 = 3.9013
2025-06-02 15:51:43,971 [INFO]     p99 = 3.938
2025-06-02 15:51:43,973 [INFO]     mean = 3.7192
2025-06-02 15:51:43,973 [INFO]     min = 1.0918
2025-06-02 15:51:43,973 [INFO]     max = 3.9459
2025-06-02 15:51:43,973 [INFO]     stddev = 0.3774
2025-06-02 15:51:43,974 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:51:43,975 [INFO]     p5 = 5.3133
2025-06-02 15:51:43,975 [INFO]     p25 = 5.6933
202

Running Requests: 100%|██████████| 64/64 [03:04<00:00,  2.89s/it]
Running Requests:  98%|█████████▊| 63/64 [00:53<00:01,  1.05s/it]

2025-06-02 15:52:38,573 [INFO] Tasks Executed!
2025-06-02 15:52:38,573 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:52:38,597 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:52:38,601 [INFO]     p5 = 1.6636
2025-06-02 15:52:38,601 [INFO]     p25 = 1.6759
2025-06-02 15:52:38,601 [INFO]     p50 = 1.6845
2025-06-02 15:52:38,602 [INFO]     p75 = 1.7902
2025-06-02 15:52:38,602 [INFO]     p90 = 1.8804
2025-06-02 15:52:38,602 [INFO]     p95 = 1.8845
2025-06-02 15:52:38,602 [INFO]     p99 = 1.8896
2025-06-02 15:52:38,603 [INFO]     mean = 1.705
2025-06-02 15:52:38,603 [INFO]     min = 0.2081
2025-06-02 15:52:38,603 [INFO]     max = 1.8925
2025-06-02 15:52:38,604 [INFO]     stddev = 0.2067
2025-06-02 15:52:38,604 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:52:38,605 [INFO]     p5 = 3.2545
2025-06-02 15:52:38,606 [INFO]     p25 = 3.3254
202

Running Requests: 100%|██████████| 64/64 [00:53<00:00,  1.19it/s]


2025-06-02 15:54:25,822 [INFO] Tasks Executed!
2025-06-02 15:54:25,823 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:54:25,836 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:54:25,838 [INFO]     p5 = 3.4069
2025-06-02 15:54:25,838 [INFO]     p25 = 3.4356
2025-06-02 15:54:25,839 [INFO]     p50 = 3.4444
2025-06-02 15:54:25,839 [INFO]     p75 = 3.4494
2025-06-02 15:54:25,840 [INFO]     p90 = 3.4585
2025-06-02 15:54:25,840 [INFO]     p95 = 3.4641
2025-06-02 15:54:25,840 [INFO]     p99 = 3.5693
2025-06-02 15:54:25,841 [INFO]     mean = 3.3447
2025-06-02 15:54:25,841 [INFO]     min = 0.2128
2025-06-02 15:54:25,842 [INFO]     max = 3.5704
2025-06-02 15:54:25,842 [INFO]     stddev = 0.5675
2025-06-02 15:54:25,842 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:54:25,843 [INFO]     p5 = 6.6316
2025-06-02 15:54:25,844 [INFO]     p25 = 6.671
202

Running Requests: 100%|██████████| 64/64 [01:47<00:00,  1.68s/it]
Running Requests: 100%|██████████| 64/64 [02:08<00:00,  2.09s/it]

2025-06-02 15:56:33,969 [INFO] Tasks Executed!
2025-06-02 15:56:33,969 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:56:33,979 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:56:33,980 [INFO]     p5 = 4.1499
2025-06-02 15:56:33,981 [INFO]     p25 = 4.1876
2025-06-02 15:56:33,981 [INFO]     p50 = 4.2151
2025-06-02 15:56:33,982 [INFO]     p75 = 4.3125
2025-06-02 15:56:33,982 [INFO]     p90 = 4.6403
2025-06-02 15:56:33,982 [INFO]     p95 = 4.6657
2025-06-02 15:56:33,983 [INFO]     p99 = 4.6811
2025-06-02 15:56:33,983 [INFO]     mean = 4.253
2025-06-02 15:56:33,984 [INFO]     min = 0.3201
2025-06-02 15:56:33,984 [INFO]     max = 4.682
2025-06-02 15:56:33,984 [INFO]     stddev = 0.5326
2025-06-02 15:56:33,984 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:56:33,986 [INFO]     p5 = 7.9052
2025-06-02 15:56:33,986 [INFO]     p25 = 7.9463
2025

Running Requests: 100%|██████████| 64/64 [02:08<00:00,  2.00s/it]


2025-06-02 15:58:20,914 [INFO] Tasks Executed!
2025-06-02 15:58:20,914 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:58:20,924 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:58:20,926 [INFO]     p5 = 3.5433
2025-06-02 15:58:20,926 [INFO]     p25 = 3.567
2025-06-02 15:58:20,927 [INFO]     p50 = 3.611
2025-06-02 15:58:20,927 [INFO]     p75 = 3.6584
2025-06-02 15:58:20,927 [INFO]     p90 = 3.9412
2025-06-02 15:58:20,928 [INFO]     p95 = 3.9492
2025-06-02 15:58:20,928 [INFO]     p99 = 3.9582
2025-06-02 15:58:20,929 [INFO]     mean = 3.6216
2025-06-02 15:58:20,929 [INFO]     min = 0.3189
2025-06-02 15:58:20,929 [INFO]     max = 3.9615
2025-06-02 15:58:20,929 [INFO]     stddev = 0.4456
2025-06-02 15:58:20,930 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:58:20,931 [INFO]     p5 = 6.6004
2025-06-02 15:58:20,931 [INFO]     p25 = 6.6448
2025

Running Requests: 100%|██████████| 64/64 [01:46<00:00,  1.67s/it]
Running Requests: 100%|██████████| 64/64 [01:03<00:00,  1.09s/it]

2025-06-02 15:59:24,547 [INFO] Tasks Executed!
2025-06-02 15:59:24,548 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 15:59:24,558 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 15:59:24,559 [INFO]     p5 = 2.7182
2025-06-02 15:59:24,560 [INFO]     p25 = 2.7367
2025-06-02 15:59:24,560 [INFO]     p50 = 2.794
2025-06-02 15:59:24,560 [INFO]     p75 = 2.8529
2025-06-02 15:59:24,561 [INFO]     p90 = 2.9101
2025-06-02 15:59:24,561 [INFO]     p95 = 2.934
2025-06-02 15:59:24,562 [INFO]     p99 = 2.987
2025-06-02 15:59:24,562 [INFO]     mean = 2.7741
2025-06-02 15:59:24,563 [INFO]     min = 0.5761
2025-06-02 15:59:24,563 [INFO]     max = 3.0067
2025-06-02 15:59:24,564 [INFO]     stddev = 0.2891
2025-06-02 15:59:24,564 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 15:59:24,566 [INFO]     p5 = 3.9319
2025-06-02 15:59:24,566 [INFO]     p25 = 3.9476
2025-

Running Requests: 100%|██████████| 64/64 [01:03<00:00,  1.01it/s]


2025-06-02 16:00:56,218 [INFO] Tasks Executed!
2025-06-02 16:00:56,219 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:00:56,230 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:00:56,231 [INFO]     p5 = 3.5197
2025-06-02 16:00:56,232 [INFO]     p25 = 3.5491
2025-06-02 16:00:56,232 [INFO]     p50 = 3.58
2025-06-02 16:00:56,233 [INFO]     p75 = 3.6628
2025-06-02 16:00:56,233 [INFO]     p90 = 3.8759
2025-06-02 16:00:56,233 [INFO]     p95 = 3.8931
2025-06-02 16:00:56,234 [INFO]     p99 = 3.9354
2025-06-02 16:00:56,234 [INFO]     mean = 3.5932
2025-06-02 16:00:56,235 [INFO]     min = 0.53
2025-06-02 16:00:56,235 [INFO]     max = 3.9639
2025-06-02 16:00:56,235 [INFO]     stddev = 0.4122
2025-06-02 16:00:56,236 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:00:56,237 [INFO]     p5 = 5.6447
2025-06-02 16:00:56,238 [INFO]     p25 = 5.6836
2025-0

Running Requests: 100%|██████████| 64/64 [01:31<00:00,  1.43s/it]
Running Requests: 100%|██████████| 64/64 [02:21<00:00,  2.71s/it]

2025-06-02 16:03:18,292 [INFO] Tasks Executed!
2025-06-02 16:03:18,293 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:03:18,312 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:03:18,314 [INFO]     p5 = 6.6001
2025-06-02 16:03:18,314 [INFO]     p25 = 6.6313
2025-06-02 16:03:18,314 [INFO]     p50 = 6.6555
2025-06-02 16:03:18,315 [INFO]     p75 = 6.8139
2025-06-02 16:03:18,315 [INFO]     p90 = 6.9695
2025-06-02 16:03:18,315 [INFO]     p95 = 6.9908
2025-06-02 16:03:18,316 [INFO]     p99 = 7.0228
2025-06-02 16:03:18,316 [INFO]     mean = 6.6405
2025-06-02 16:03:18,316 [INFO]     min = 1.0759
2025-06-02 16:03:18,317 [INFO]     max = 7.0519
2025-06-02 16:03:18,317 [INFO]     stddev = 0.72
2025-06-02 16:03:18,317 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:03:18,318 [INFO]     p5 = 8.781
2025-06-02 16:03:18,319 [INFO]     p25 = 8.8398
2025-

Running Requests: 100%|██████████| 64/64 [02:23<00:00,  2.23s/it]
Running Requests: 100%|██████████| 64/64 [00:29<00:00,  2.11it/s]

2025-06-02 16:03:48,616 [INFO] Tasks Executed!
2025-06-02 16:03:48,617 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:03:48,628 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:03:48,629 [INFO]     p5 = 1.7591
2025-06-02 16:03:48,630 [INFO]     p25 = 1.765
2025-06-02 16:03:48,630 [INFO]     p50 = 1.7706
2025-06-02 16:03:48,630 [INFO]     p75 = 1.7812
2025-06-02 16:03:48,631 [INFO]     p90 = 2.0513
2025-06-02 16:03:48,631 [INFO]     p95 = 2.1761
2025-06-02 16:03:48,631 [INFO]     p99 = 2.1892
2025-06-02 16:03:48,632 [INFO]     mean = 1.8029
2025-06-02 16:03:48,632 [INFO]     min = 0.2038
2025-06-02 16:03:48,633 [INFO]     max = 2.1918
2025-06-02 16:03:48,633 [INFO]     stddev = 0.2397
2025-06-02 16:03:48,633 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:03:48,634 [INFO]     p5 = 3.6071
2025-06-02 16:03:48,635 [INFO]     p25 = 3.6203
202

Running Requests: 100%|██████████| 64/64 [00:29<00:00,  2.17it/s]


2025-06-02 16:04:48,127 [INFO] Tasks Executed!
2025-06-02 16:04:48,128 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:04:48,137 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:04:48,139 [INFO]     p5 = 3.5156
2025-06-02 16:04:48,139 [INFO]     p25 = 3.5243
2025-06-02 16:04:48,140 [INFO]     p50 = 3.5542
2025-06-02 16:04:48,140 [INFO]     p75 = 3.6782
2025-06-02 16:04:48,140 [INFO]     p90 = 4.1651
2025-06-02 16:04:48,141 [INFO]     p95 = 4.1769
2025-06-02 16:04:48,141 [INFO]     p99 = 4.1827
2025-06-02 16:04:48,141 [INFO]     mean = 3.5863
2025-06-02 16:04:48,142 [INFO]     min = 0.22
2025-06-02 16:04:48,142 [INFO]     max = 4.1832
2025-06-02 16:04:48,142 [INFO]     stddev = 0.6614
2025-06-02 16:04:48,142 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:04:48,144 [INFO]     p5 = 7.3551
2025-06-02 16:04:48,144 [INFO]     p25 = 7.366
2025-

Running Requests: 100%|██████████| 64/64 [00:59<00:00,  1.08it/s]
Running Requests: 100%|██████████| 64/64 [01:11<00:00,  1.05s/it]

2025-06-02 16:06:00,049 [INFO] Tasks Executed!
2025-06-02 16:06:00,049 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:06:00,059 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:06:00,060 [INFO]     p5 = 4.7097
2025-06-02 16:06:00,061 [INFO]     p25 = 4.7359
2025-06-02 16:06:00,061 [INFO]     p50 = 4.7553
2025-06-02 16:06:00,062 [INFO]     p75 = 4.905
2025-06-02 16:06:00,062 [INFO]     p90 = 5.3921
2025-06-02 16:06:00,062 [INFO]     p95 = 5.6201
2025-06-02 16:06:00,063 [INFO]     p99 = 5.63
2025-06-02 16:06:00,063 [INFO]     mean = 4.8076
2025-06-02 16:06:00,063 [INFO]     min = 0.3366
2025-06-02 16:06:00,064 [INFO]     max = 5.6326
2025-06-02 16:06:00,064 [INFO]     stddev = 0.6293
2025-06-02 16:06:00,064 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:06:00,065 [INFO]     p5 = 8.8695
2025-06-02 16:06:00,066 [INFO]     p25 = 8.9287
2025-

Running Requests: 100%|██████████| 64/64 [01:11<00:00,  1.12s/it]


2025-06-02 16:07:00,754 [INFO] Tasks Executed!
2025-06-02 16:07:00,754 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:07:00,764 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:07:00,765 [INFO]     p5 = 4.0456
2025-06-02 16:07:00,765 [INFO]     p25 = 4.0801
2025-06-02 16:07:00,766 [INFO]     p50 = 4.0923
2025-06-02 16:07:00,766 [INFO]     p75 = 4.1731
2025-06-02 16:07:00,766 [INFO]     p90 = 4.6024
2025-06-02 16:07:00,767 [INFO]     p95 = 4.8116
2025-06-02 16:07:00,767 [INFO]     p99 = 4.8448
2025-06-02 16:07:00,768 [INFO]     mean = 4.1204
2025-06-02 16:07:00,768 [INFO]     min = 0.3355
2025-06-02 16:07:00,768 [INFO]     max = 4.8451
2025-06-02 16:07:00,769 [INFO]     stddev = 0.5319
2025-06-02 16:07:00,769 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:07:00,770 [INFO]     p5 = 7.4907
2025-06-02 16:07:00,771 [INFO]     p25 = 7.5122
20

Running Requests: 100%|██████████| 64/64 [01:00<00:00,  1.05it/s]
Running Requests: 100%|██████████| 64/64 [00:43<00:00,  1.33s/it]

2025-06-02 16:07:44,202 [INFO] Tasks Executed!
2025-06-02 16:07:44,203 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:07:44,214 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:07:44,216 [INFO]     p5 = 3.8943
2025-06-02 16:07:44,217 [INFO]     p25 = 3.9029
2025-06-02 16:07:44,217 [INFO]     p50 = 3.9223
2025-06-02 16:07:44,217 [INFO]     p75 = 3.9896
2025-06-02 16:07:44,218 [INFO]     p90 = 4.2709
2025-06-02 16:07:44,218 [INFO]     p95 = 4.3457
2025-06-02 16:07:44,218 [INFO]     p99 = 4.4386
2025-06-02 16:07:44,219 [INFO]     mean = 3.9402
2025-06-02 16:07:44,219 [INFO]     min = 0.5362
2025-06-02 16:07:44,220 [INFO]     max = 4.4508
2025-06-02 16:07:44,220 [INFO]     stddev = 0.4585
2025-06-02 16:07:44,220 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:07:44,221 [INFO]     p5 = 5.3571
2025-06-02 16:07:44,222 [INFO]     p25 = 5.3702
20

Running Requests: 100%|██████████| 64/64 [00:43<00:00,  1.47it/s]


2025-06-02 16:08:43,349 [INFO] Tasks Executed!
2025-06-02 16:08:43,350 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:08:43,359 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:08:43,361 [INFO]     p5 = 4.6603
2025-06-02 16:08:43,361 [INFO]     p25 = 4.707
2025-06-02 16:08:43,361 [INFO]     p50 = 4.7552
2025-06-02 16:08:43,362 [INFO]     p75 = 4.7714
2025-06-02 16:08:43,362 [INFO]     p90 = 5.2977
2025-06-02 16:08:43,363 [INFO]     p95 = 5.4991
2025-06-02 16:08:43,363 [INFO]     p99 = 5.5615
2025-06-02 16:08:43,364 [INFO]     mean = 4.7655
2025-06-02 16:08:43,364 [INFO]     min = 0.6235
2025-06-02 16:08:43,364 [INFO]     max = 5.5638
2025-06-02 16:08:43,365 [INFO]     stddev = 0.582
2025-06-02 16:08:43,365 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:08:43,366 [INFO]     p5 = 7.2795
2025-06-02 16:08:43,366 [INFO]     p25 = 7.3191
2025

Running Requests: 100%|██████████| 64/64 [00:59<00:00,  1.08it/s]
Running Requests: 100%|██████████| 64/64 [01:44<00:00,  1.96s/it]

2025-06-02 16:10:27,513 [INFO] Tasks Executed!
2025-06-02 16:10:27,513 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:10:27,524 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:10:27,525 [INFO]     p5 = 10.209
2025-06-02 16:10:27,525 [INFO]     p25 = 10.2231
2025-06-02 16:10:27,526 [INFO]     p50 = 10.2378
2025-06-02 16:10:27,526 [INFO]     p75 = 10.2471
2025-06-02 16:10:27,526 [INFO]     p90 = 10.7686
2025-06-02 16:10:27,527 [INFO]     p95 = 11.0219
2025-06-02 16:10:27,527 [INFO]     p99 = 11.0569
2025-06-02 16:10:27,528 [INFO]     mean = 10.1955
2025-06-02 16:10:27,528 [INFO]     min = 1.1319
2025-06-02 16:10:27,528 [INFO]     max = 11.0891
2025-06-02 16:10:27,528 [INFO]     stddev = 1.1774
2025-06-02 16:10:27,529 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:10:27,530 [INFO]     p5 = 12.9114
2025-06-02 16:10:27,530 [INFO]     p25 = 

Running Requests: 100%|██████████| 64/64 [01:44<00:00,  1.64s/it]
Running Requests: 100%|██████████| 64/64 [00:17<00:00,  3.33it/s]

2025-06-02 16:10:45,689 [INFO] Tasks Executed!
2025-06-02 16:10:45,690 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:10:45,699 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:10:45,700 [INFO]     p5 = 2.0089
2025-06-02 16:10:45,701 [INFO]     p25 = 2.0236
2025-06-02 16:10:45,701 [INFO]     p50 = 2.1232
2025-06-02 16:10:45,701 [INFO]     p75 = 2.2451
2025-06-02 16:10:45,701 [INFO]     p90 = 2.2561
2025-06-02 16:10:45,702 [INFO]     p95 = 2.751
2025-06-02 16:10:45,702 [INFO]     p99 = 2.7655
2025-06-02 16:10:45,703 [INFO]     mean = 2.1094
2025-06-02 16:10:45,703 [INFO]     min = 0.2565
2025-06-02 16:10:45,704 [INFO]     max = 2.7662
2025-06-02 16:10:45,704 [INFO]     stddev = 0.395
2025-06-02 16:10:45,705 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:10:45,706 [INFO]     p5 = 4.2733
2025-06-02 16:10:45,706 [INFO]     p25 = 4.2791
2025

Running Requests: 100%|██████████| 64/64 [00:17<00:00,  3.65it/s]


2025-06-02 16:11:18,755 [INFO] Tasks Executed!
2025-06-02 16:11:18,755 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:11:18,763 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:11:18,764 [INFO]     p5 = 3.4529
2025-06-02 16:11:18,764 [INFO]     p25 = 3.464
2025-06-02 16:11:18,764 [INFO]     p50 = 3.5006
2025-06-02 16:11:18,764 [INFO]     p75 = 3.5673
2025-06-02 16:11:18,765 [INFO]     p90 = 3.5849
2025-06-02 16:11:18,765 [INFO]     p95 = 3.5863
2025-06-02 16:11:18,765 [INFO]     p99 = 5.1548
2025-06-02 16:11:18,766 [INFO]     mean = 3.5297
2025-06-02 16:11:18,766 [INFO]     min = 0.1875
2025-06-02 16:11:18,766 [INFO]     max = 5.1737
2025-06-02 16:11:18,767 [INFO]     stddev = 0.5521
2025-06-02 16:11:18,767 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:11:18,768 [INFO]     p5 = 8.1689
2025-06-02 16:11:18,768 [INFO]     p25 = 8.1785
202

Running Requests: 100%|██████████| 64/64 [00:33<00:00,  1.94it/s]
Running Requests: 100%|██████████| 64/64 [00:44<00:00,  1.20it/s]

2025-06-02 16:12:02,983 [INFO] Tasks Executed!
2025-06-02 16:12:02,983 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:12:02,992 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:12:02,993 [INFO]     p5 = 5.7427
2025-06-02 16:12:02,993 [INFO]     p25 = 5.7634
2025-06-02 16:12:02,994 [INFO]     p50 = 5.7829
2025-06-02 16:12:02,994 [INFO]     p75 = 5.8911
2025-06-02 16:12:02,994 [INFO]     p90 = 5.9249
2025-06-02 16:12:02,994 [INFO]     p95 = 5.933
2025-06-02 16:12:02,995 [INFO]     p99 = 7.6532
2025-06-02 16:12:02,995 [INFO]     mean = 5.8064
2025-06-02 16:12:02,995 [INFO]     min = 0.3412
2025-06-02 16:12:02,996 [INFO]     max = 7.6594
2025-06-02 16:12:02,996 [INFO]     stddev = 0.7997
2025-06-02 16:12:02,996 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:12:02,997 [INFO]     p5 = 10.9489
2025-06-02 16:12:02,997 [INFO]     p25 = 10.9679
2

Running Requests: 100%|██████████| 64/64 [00:44<00:00,  1.45it/s]


2025-06-02 16:12:40,982 [INFO] Tasks Executed!
2025-06-02 16:12:40,982 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:12:40,990 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:12:40,991 [INFO]     p5 = 5.1246
2025-06-02 16:12:40,992 [INFO]     p25 = 5.1417
2025-06-02 16:12:40,992 [INFO]     p50 = 5.1546
2025-06-02 16:12:40,992 [INFO]     p75 = 5.2883
2025-06-02 16:12:40,992 [INFO]     p90 = 5.2953
2025-06-02 16:12:40,993 [INFO]     p95 = 5.299
2025-06-02 16:12:40,993 [INFO]     p99 = 6.7026
2025-06-02 16:12:40,993 [INFO]     mean = 5.1799
2025-06-02 16:12:40,994 [INFO]     min = 0.3335
2025-06-02 16:12:40,994 [INFO]     max = 6.7143
2025-06-02 16:12:40,994 [INFO]     stddev = 0.6958
2025-06-02 16:12:40,994 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:12:40,995 [INFO]     p5 = 9.3261
2025-06-02 16:12:40,996 [INFO]     p25 = 9.3661
202

Running Requests: 100%|██████████| 64/64 [00:37<00:00,  1.69it/s]
Running Requests: 100%|██████████| 64/64 [00:35<00:00,  1.80s/it]

2025-06-02 16:13:16,275 [INFO] Tasks Executed!
2025-06-02 16:13:16,275 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:13:16,285 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:13:16,287 [INFO]     p5 = 6.6444
2025-06-02 16:13:16,287 [INFO]     p25 = 6.6564
2025-06-02 16:13:16,287 [INFO]     p50 = 6.6961
2025-06-02 16:13:16,288 [INFO]     p75 = 6.7761
2025-06-02 16:13:16,288 [INFO]     p90 = 6.8306
2025-06-02 16:13:16,288 [INFO]     p95 = 7.5928
2025-06-02 16:13:16,289 [INFO]     p99 = 7.6517
2025-06-02 16:13:16,289 [INFO]     mean = 6.6079
2025-06-02 16:13:16,289 [INFO]     min = 0.8909
2025-06-02 16:13:16,290 [INFO]     max = 7.6523
2025-06-02 16:13:16,290 [INFO]     stddev = 1.0701
2025-06-02 16:13:16,290 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:13:16,292 [INFO]     p5 = 8.7176
2025-06-02 16:13:16,292 [INFO]     p25 = 8.7362
20

Running Requests: 100%|██████████| 64/64 [00:35<00:00,  1.81it/s]


2025-06-02 16:13:59,684 [INFO] Tasks Executed!
2025-06-02 16:13:59,685 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:13:59,693 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:13:59,694 [INFO]     p5 = 7.0173
2025-06-02 16:13:59,694 [INFO]     p25 = 7.0545
2025-06-02 16:13:59,694 [INFO]     p50 = 7.0901
2025-06-02 16:13:59,694 [INFO]     p75 = 7.2986
2025-06-02 16:13:59,695 [INFO]     p90 = 7.306
2025-06-02 16:13:59,695 [INFO]     p95 = 7.3098
2025-06-02 16:13:59,695 [INFO]     p99 = 8.8913
2025-06-02 16:13:59,696 [INFO]     mean = 7.1052
2025-06-02 16:13:59,696 [INFO]     min = 0.544
2025-06-02 16:13:59,696 [INFO]     max = 8.8982
2025-06-02 16:13:59,697 [INFO]     stddev = 0.9153
2025-06-02 16:13:59,697 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:13:59,698 [INFO]     p5 = 10.6945
2025-06-02 16:13:59,698 [INFO]     p25 = 10.7409
20

Running Requests: 100%|██████████| 64/64 [00:43<00:00,  1.47it/s]
Running Requests: 100%|██████████| 64/64 [01:25<00:00,  2.34s/it]

2025-06-02 16:15:25,440 [INFO] Tasks Executed!
2025-06-02 16:15:25,441 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:15:25,450 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:15:25,452 [INFO]     p5 = 17.4855
2025-06-02 16:15:25,453 [INFO]     p25 = 17.5051
2025-06-02 16:15:25,453 [INFO]     p50 = 17.5488
2025-06-02 16:15:25,453 [INFO]     p75 = 17.6818
2025-06-02 16:15:25,453 [INFO]     p90 = 17.6896
2025-06-02 16:15:25,454 [INFO]     p95 = 17.6973
2025-06-02 16:15:25,454 [INFO]     p99 = 19.4689
2025-06-02 16:15:25,454 [INFO]     mean = 17.3952
2025-06-02 16:15:25,455 [INFO]     min = 1.1027
2025-06-02 16:15:25,455 [INFO]     max = 19.4837
2025-06-02 16:15:25,455 [INFO]     stddev = 2.1091
2025-06-02 16:15:25,455 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:15:25,457 [INFO]     p5 = 21.3177
2025-06-02 16:15:25,457 [INFO]     p25 =

Running Requests: 100%|██████████| 64/64 [01:26<00:00,  1.35s/it]
Running Requests: 100%|██████████| 64/64 [00:11<00:00,  5.25it/s]

2025-06-02 16:15:37,938 [INFO] Tasks Executed!
2025-06-02 16:15:37,938 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:15:37,945 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:15:37,946 [INFO]     p5 = 2.1585
2025-06-02 16:15:37,947 [INFO]     p25 = 2.1859
2025-06-02 16:15:37,947 [INFO]     p50 = 2.3286
2025-06-02 16:15:37,947 [INFO]     p75 = 2.4194
2025-06-02 16:15:37,947 [INFO]     p90 = 2.4253
2025-06-02 16:15:37,948 [INFO]     p95 = 2.4279
2025-06-02 16:15:37,948 [INFO]     p99 = 3.1009
2025-06-02 16:15:37,949 [INFO]     mean = 2.3056
2025-06-02 16:15:37,949 [INFO]     min = 0.2059
2025-06-02 16:15:37,949 [INFO]     max = 4.2408
2025-06-02 16:15:37,949 [INFO]     stddev = 0.3767
2025-06-02 16:15:37,950 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:15:37,951 [INFO]     p5 = 5.5761
2025-06-02 16:15:37,951 [INFO]     p25 = 5.5962
20

Running Requests: 100%|██████████| 64/64 [00:11<00:00,  5.51it/s]


2025-06-02 16:16:00,475 [INFO] Tasks Executed!
2025-06-02 16:16:00,475 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:16:00,482 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:16:00,483 [INFO]     p5 = 4.0202
2025-06-02 16:16:00,483 [INFO]     p25 = 4.0372
2025-06-02 16:16:00,483 [INFO]     p50 = 4.0988
2025-06-02 16:16:00,483 [INFO]     p75 = 4.2064
2025-06-02 16:16:00,484 [INFO]     p90 = 4.2102
2025-06-02 16:16:00,484 [INFO]     p95 = 4.2156
2025-06-02 16:16:00,484 [INFO]     p99 = 7.9007
2025-06-02 16:16:00,484 [INFO]     mean = 4.112
2025-06-02 16:16:00,485 [INFO]     min = 0.239
2025-06-02 16:16:00,485 [INFO]     max = 7.905
2025-06-02 16:16:00,485 [INFO]     stddev = 0.9683
2025-06-02 16:16:00,485 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:16:00,486 [INFO]     p5 = 11.0774
2025-06-02 16:16:00,486 [INFO]     p25 = 11.088
2025

Running Requests: 100%|██████████| 64/64 [00:22<00:00,  2.83it/s]
Running Requests: 100%|██████████| 64/64 [00:31<00:00,  1.37it/s]

2025-06-02 16:16:32,232 [INFO] Tasks Executed!
2025-06-02 16:16:32,232 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:16:32,240 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:16:32,241 [INFO]     p5 = 7.8715
2025-06-02 16:16:32,241 [INFO]     p25 = 7.8832
2025-06-02 16:16:32,241 [INFO]     p50 = 7.9584
2025-06-02 16:16:32,242 [INFO]     p75 = 8.0295
2025-06-02 16:16:32,242 [INFO]     p90 = 8.0434
2025-06-02 16:16:32,242 [INFO]     p95 = 8.0467
2025-06-02 16:16:32,242 [INFO]     p99 = 9.6454
2025-06-02 16:16:32,243 [INFO]     mean = 7.9079
2025-06-02 16:16:32,243 [INFO]     min = 0.3636
2025-06-02 16:16:32,243 [INFO]     max = 12.3659
2025-06-02 16:16:32,244 [INFO]     stddev = 1.1077
2025-06-02 16:16:32,244 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:16:32,245 [INFO]     p5 = 15.7018
2025-06-02 16:16:32,245 [INFO]     p25 = 15.7076

Running Requests: 100%|██████████| 64/64 [00:31<00:00,  2.01it/s]


2025-06-02 16:17:00,613 [INFO] Tasks Executed!
2025-06-02 16:17:00,615 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:17:00,661 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:17:00,662 [INFO]     p5 = 7.5525
2025-06-02 16:17:00,663 [INFO]     p25 = 7.5691
2025-06-02 16:17:00,663 [INFO]     p50 = 7.6746
2025-06-02 16:17:00,663 [INFO]     p75 = 7.7695
2025-06-02 16:17:00,663 [INFO]     p90 = 7.7766
2025-06-02 16:17:00,664 [INFO]     p95 = 7.7805
2025-06-02 16:17:00,664 [INFO]     p99 = 11.0647
2025-06-02 16:17:00,664 [INFO]     mean = 7.5523
2025-06-02 16:17:00,665 [INFO]     min = 0.5156
2025-06-02 16:17:00,665 [INFO]     max = 11.0684
2025-06-02 16:17:00,665 [INFO]     stddev = 1.4092
2025-06-02 16:17:00,665 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:17:00,666 [INFO]     p5 = 13.9365
2025-06-02 16:17:00,666 [INFO]     p25 = 13.949

Running Requests: 100%|██████████| 64/64 [00:28<00:00,  2.26it/s]
Running Requests: 100%|██████████| 64/64 [00:29<00:00,  1.16s/it]

2025-06-02 16:17:29,795 [INFO] Tasks Executed!
2025-06-02 16:17:29,795 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:17:29,805 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:17:29,806 [INFO]     p5 = 6.3485
2025-06-02 16:17:29,807 [INFO]     p25 = 6.3609
2025-06-02 16:17:29,807 [INFO]     p50 = 11.6371
2025-06-02 16:17:29,807 [INFO]     p75 = 11.74
2025-06-02 16:17:29,808 [INFO]     p90 = 13.201
2025-06-02 16:17:29,808 [INFO]     p95 = 13.2018
2025-06-02 16:17:29,808 [INFO]     p99 = 13.2085
2025-06-02 16:17:29,809 [INFO]     mean = 10.5293
2025-06-02 16:17:29,809 [INFO]     min = 0.5215
2025-06-02 16:17:29,809 [INFO]     max = 13.214
2025-06-02 16:17:29,810 [INFO]     stddev = 2.8967
2025-06-02 16:17:29,810 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:17:29,811 [INFO]     p5 = 8.4271
2025-06-02 16:17:29,811 [INFO]     p25 = 8.4367

Running Requests: 100%|██████████| 64/64 [00:29<00:00,  2.20it/s]


2025-06-02 16:18:06,450 [INFO] Tasks Executed!
2025-06-02 16:18:06,451 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:18:06,459 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:18:06,461 [INFO]     p5 = 7.3168
2025-06-02 16:18:06,461 [INFO]     p25 = 7.3265
2025-06-02 16:18:06,461 [INFO]     p50 = 13.2671
2025-06-02 16:18:06,462 [INFO]     p75 = 13.292
2025-06-02 16:18:06,462 [INFO]     p90 = 15.7636
2025-06-02 16:18:06,462 [INFO]     p95 = 15.7667
2025-06-02 16:18:06,462 [INFO]     p99 = 15.7699
2025-06-02 16:18:06,463 [INFO]     mean = 12.1686
2025-06-02 16:18:06,463 [INFO]     min = 0.6063
2025-06-02 16:18:06,463 [INFO]     max = 15.7746
2025-06-02 16:18:06,463 [INFO]     stddev = 3.4325
2025-06-02 16:18:06,464 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:18:06,465 [INFO]     p5 = 10.989
2025-06-02 16:18:06,465 [INFO]     p25 = 11.

Running Requests: 100%|██████████| 64/64 [00:36<00:00,  1.75it/s]
Running Requests: 100%|██████████| 64/64 [01:17<00:00,  3.43s/it]

2025-06-02 16:19:23,599 [INFO] Tasks Executed!
2025-06-02 16:19:23,599 [INFO] Benchmarking results obtained for model Meta-Llama-3.3-70B-Instruct queried with the sambastudio API.
2025-06-02 16:19:23,608 [INFO] Building Client Metrics Summary for metric: client_ttft_s
2025-06-02 16:19:23,610 [INFO]     p5 = 17.8018
2025-06-02 16:19:23,610 [INFO]     p25 = 17.8217
2025-06-02 16:19:23,610 [INFO]     p50 = 33.1324
2025-06-02 16:19:23,611 [INFO]     p75 = 33.2169
2025-06-02 16:19:23,611 [INFO]     p90 = 36.2289
2025-06-02 16:19:23,611 [INFO]     p95 = 36.2388
2025-06-02 16:19:23,612 [INFO]     p99 = 36.2436
2025-06-02 16:19:23,612 [INFO]     mean = 29.5425
2025-06-02 16:19:23,612 [INFO]     min = 1.1484
2025-06-02 16:19:23,613 [INFO]     max = 36.2462
2025-06-02 16:19:23,613 [INFO]     stddev = 8.0664
2025-06-02 16:19:23,613 [INFO] Building Client Metrics Summary for metric: client_end_to_end_latency_s
2025-06-02 16:19:23,614 [INFO]     p5 = 21.5764
2025-06-02 16:19:23,614 [INFO]     p25 =

#### Consolidate all results into pandas dataframe

In [11]:
output_files_dir = config['output_files_dir']
df_summary = read_perf_eval_json_files(output_files_dir, type='summary')
sorted(df_summary.columns)

['acceptance_rate_max',
 'acceptance_rate_mean',
 'acceptance_rate_min',
 'acceptance_rate_p25',
 'acceptance_rate_p5',
 'acceptance_rate_p50',
 'acceptance_rate_p75',
 'acceptance_rate_p90',
 'acceptance_rate_p95',
 'acceptance_rate_p99',
 'acceptance_rate_stddev',
 'client_end_to_end_latency_s_max',
 'client_end_to_end_latency_s_mean',
 'client_end_to_end_latency_s_min',
 'client_end_to_end_latency_s_p25',
 'client_end_to_end_latency_s_p5',
 'client_end_to_end_latency_s_p50',
 'client_end_to_end_latency_s_p75',
 'client_end_to_end_latency_s_p90',
 'client_end_to_end_latency_s_p95',
 'client_end_to_end_latency_s_p99',
 'client_end_to_end_latency_s_stddev',
 'client_mean_output_token_per_s',
 'client_output_token_per_s_max',
 'client_output_token_per_s_mean',
 'client_output_token_per_s_min',
 'client_output_token_per_s_p25',
 'client_output_token_per_s_p5',
 'client_output_token_per_s_p50',
 'client_output_token_per_s_p75',
 'client_output_token_per_s_p90',
 'client_output_token_per_s

In [12]:
missing_columns = []

if 'num_concurrent_requests' not in df_summary.columns:
    missing_columns.append('num_concurrent_requests')

if 'multimodal_img_size' not in df_summary.columns or df_summary['multimodal_img_size'].isnull().all():
    missing_columns.append('multimodal_img_size')

# Set fields to report
selected_columns = [
    'name',
    'model',    
    'number_input_tokens_min',
    'number_input_tokens_max',
    'number_output_tokens_min',
    'number_output_tokens_max',
    'num_concurrent_requests',
    'multimodal_img_size',
    'server_ttft_s_min',
    'server_ttft_s_p50',
    'server_ttft_s_max',
    'server_end_to_end_latency_s_min',
    'server_end_to_end_latency_s_p50',
    'server_end_to_end_latency_s_max',
    'server_output_token_per_s_min',
    'server_output_token_per_s_p50',
    'server_output_token_per_s_max',
    'server_output_token_per_s_mean',                
    'acceptance_rate_min',
    'acceptance_rate_p50',
    'acceptance_rate_max',
    'server_number_input_tokens_p50',
    'server_number_output_tokens_p50',
    'client_ttft_s_min',
    'client_ttft_s_p50',
    'client_ttft_s_max',
    'client_end_to_end_latency_s_min',
    'client_end_to_end_latency_s_p50',
    'client_end_to_end_latency_s_max',
    'client_output_token_per_s_min',
    'client_output_token_per_s_p50',
    'client_output_token_per_s_max',
    'client_output_token_per_s_mean',
    'client_mean_output_token_per_s',
    'num_requests_started',
    'num_completed_requests',
    'num_completed_requests_per_min',
    'number_errors',
    'error_code_frequency',
]

selected_columns = [c for c in selected_columns if c not in missing_columns]
# Set fields to report
df_summary = df_summary[selected_columns]
df_summary['model'] = df_summary['model'].str.replace('.', '-')
df_summary['requests_grouping'] = pd.Series(None, index=df_summary.index, dtype=object)
df_summary['requests_batching'] = pd.Series(None, index=df_summary.index, dtype=object)

# Add UUID to summary and set as index
#df_summary['uuid'] = df_summary.apply(lambda x: find_uuid(x['name']), axis=1)
df_summary = df_summary.set_index('name')

# Read individual responses
df = read_perf_eval_json_files(output_files_dir, type='individual_responses')

# Process individual files and add requests batching approximation
for filename in os.listdir(output_files_dir):
    if 'individual_responses' in filename:
        df_file = df[df['filename'] == filename].copy()
        df_file = df_file[df_file['error_code'].isnull()]

        requests_grouping, requests_batching = get_grouping_and_batching_info(df_file)

        key = filename[:filename.index("_individual_responses.json")] + "_summary"
        total_input_tokens = df_file["number_input_tokens"].sum()
        total_output_tokens = df_file["number_output_tokens"].sum()

        if key in df_summary.index:
            df_summary.at[key, 'requests_grouping'] = requests_grouping
            df_summary.at[key, 'requests_batching'] = requests_batching
            df_summary.at[key, 'total_input_tokens'] = total_input_tokens
            df_summary.at[key, 'total_output_tokens'] = total_output_tokens
        else:
            raise KeyError(f'Key {key} not found in dictionary. File: {file}')
df_summary['representative_batch_size'] = df_summary['requests_batching'].apply(
    lambda x: find_median_in_batches(x)
)

# Sort and save the summary DataFrame
consolidated_results_dir = os.path.expanduser(config['consolidated_results_dir'])
if not os.path.exists(consolidated_results_dir):
    os.makedirs(consolidated_results_dir)

df_summary["number_input_tokens_min"] = df_summary["number_input_tokens_min"].astype(int)
df_summary["number_output_tokens_min"] = df_summary["number_output_tokens_min"].astype(int)

sort_columns = ['model', 'num_concurrent_requests', 'number_input_tokens_min', 'number_output_tokens_min']
if 'num_concurrent_requests' in df_summary.columns:
    sort_columns.append('num_concurrent_requests')
df_summary.sort_values(by=sort_columns, inplace=True)

df_summary["server_rated_throughput"] = df_summary["server_output_token_per_s_mean"] * df_summary["representative_batch_size"]
df_summary["total_latency"] =  df_summary["num_completed_requests"]*60/df_summary["num_completed_requests_per_min"]
df_summary["client_output_throughput"] = df_summary["total_output_tokens"]/df_summary["total_latency"]
df_summary["client_total_throughput"] = (df_summary["total_input_tokens"]+df_summary["total_output_tokens"])/df_summary["total_latency"]

df_summary.to_excel(os.path.join(consolidated_results_dir, f'{run_name}.xlsx'))

## 6. Compare with alternative benchmarking results

In [13]:
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt

In [15]:
current_name = "Llama 3.3 70B Standalone"
alt_name = "XLAM Standalone"

#### Read results from current run

In [16]:
current_file_path = os.path.join(config["consolidated_results_dir"], f'{run_name}.xlsx')
df_current = pd.read_excel(current_file_path, sheet_name='Sheet1')
df_current["provider"] = current_name
df_current.head()

Unnamed: 0,name,model,number_input_tokens_min,number_input_tokens_max,number_output_tokens_min,number_output_tokens_max,num_concurrent_requests,server_ttft_s_min,server_ttft_s_p50,server_ttft_s_max,server_end_to_end_latency_s_min,server_end_to_end_latency_s_p50,server_end_to_end_latency_s_max,server_output_token_per_s_min,server_output_token_per_s_p50,server_output_token_per_s_max,server_output_token_per_s_mean,acceptance_rate_min,acceptance_rate_p50,acceptance_rate_max,server_number_input_tokens_p50,server_number_output_tokens_p50,client_ttft_s_min,client_ttft_s_p50,client_ttft_s_max,client_end_to_end_latency_s_min,client_end_to_end_latency_s_p50,client_end_to_end_latency_s_max,client_output_token_per_s_min,client_output_token_per_s_p50,client_output_token_per_s_max,client_output_token_per_s_mean,client_mean_output_token_per_s,num_requests_started,num_completed_requests,num_completed_requests_per_min,number_errors,error_code_frequency,requests_grouping,requests_batching,total_input_tokens,total_output_tokens,representative_batch_size,server_rated_throughput,total_latency,client_output_throughput,client_total_throughput,provider
0,synthetic_0_Meta-Llama-3-3-70B-Instruct_0_1000...,Meta-Llama-3-3-70B-Instruct,39,39,496,496,1,0.0691,0.0723,0.0766,1.513,1.5266,1.5581,333.577,337.8375,342.3656,338.0911,,,,39,496,0.1686,0.2808,0.7583,1.6157,1.7567,2.4404,268.7738,338.5953,347.8019,331.8455,274.2699,64,64,33.1778,0,{},"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2496,31744,1,338.0911,115.740043,274.269813,295.835383,Llama 3.3 70B Standalone
1,synthetic_0_Meta-Llama-3-3-70B-Instruct_1_1000...,Meta-Llama-3-3-70B-Instruct,80,80,1000,1000,1,0.0694,0.0719,0.0747,3.0522,3.09,3.1353,326.1153,328.9144,333.7129,329.0694,,,,80,1000,0.1713,0.3272,0.9903,3.1948,3.3553,4.1669,306.4995,329.5512,334.8144,328.7933,293.6662,64,64,17.62,0,{},"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5120,64000,1,329.0694,217.934166,293.666667,317.16,Llama 3.3 70B Standalone
2,synthetic_0_Meta-Llama-3-3-70B-Instruct_2_1000...,Meta-Llama-3-3-70B-Instruct,1688,1688,1000,1000,1,0.2005,0.2037,0.2061,3.5109,3.5574,3.5797,295.5842,297.5359,300.8714,297.9287,,,,1688,1000,0.32,0.4593,1.0482,3.6522,3.793,4.4123,257.5206,298.8217,310.7804,298.5541,259.4201,64,64,15.5652,0,{},"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",108032,64000,1,297.9287,246.704186,259.42,697.32096,Llama 3.3 70B Standalone
3,synthetic_0_Meta-Llama-3-3-70B-Instruct_3_1000...,Meta-Llama-3-3-70B-Instruct,1955,1955,809,809,1,0.2022,0.204,0.2059,2.8917,2.9342,2.9575,293.556,295.3114,299.0403,295.3631,,,,1955,809,0.33,0.456,2.3432,3.0551,3.1778,5.0887,248.1662,296.4497,309.5396,293.9722,245.1368,64,64,18.1807,0,{},"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",125120,51776,1,295.3631,211.21299,245.136438,837.524247,Llama 3.3 70B Standalone
4,synthetic_0_Meta-Llama-3-3-70B-Instruct_4_1000...,Meta-Llama-3-3-70B-Instruct,3417,3417,402,402,1,0.3753,0.3767,0.3801,1.4128,1.4308,1.4469,369.8218,379.1978,384.9822,379.1761,,,,3417,402,0.507,0.6971,2.1206,1.5601,1.771,3.4071,220.9643,379.2784,443.5685,363.6444,216.2365,64,64,32.2741,0,{},"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",218688,25728,1,379.1761,118.980855,216.23647,2054.246465,Llama 3.3 70B Standalone


#### Read results from alternative source

In [None]:
"salesforce_xlam2_70b_short_studio_mp1_20250602_1"

alt_type = "run" # "run" or "master"

if alt_type == "run":
    alt_run_name = "salesforce_xlam2_70b_short_studio_20250523_1"
    current_file_path = os.path.join(config["consolidated_results_dir"], f'{alt_run_name}.xlsx')
    df_alt = pd.read_excel(current_file_path, sheet_name='Sheet1')
    df_alt = df_alt[df_alt["model"].isin([t.replace('.', '-') for t in target_models])]
    df_alt["source"] = alt_name
    df_alt.head()
if alt_type == "master":
    df_alt = pd.read_excel('~/Downloads/Master_Benchmarking_Sheet.xlsx', sheet_name='Sheet1',  header=0, index_col=[0, 1, 2, 3]).reset_index()
    df_alt = df_alt[df_alt["model"].isin([t.replace('.', '-') for t in target_models])]
    df_alt["source"] = alt_name
    df_alt.head()

#### Combine dataframes

In [None]:
df_current.shape, df_alt.shape

((9, 39), (30, 38))

In [None]:
common_columns = list(set(df_current.columns) & set(df_alt.columns)) # get intersection
common_columns = [c for c in df_current.columns if c in common_columns] # reorder
len(common_columns)

37

In [None]:
first_columns = ['provider','model','num_input_tokens','num_output_tokens','num_concurrent_requests']
following_columns = [column for column in common_columns if column not in first_columns]
results = pd.concat([
        df_current,
        df_alt
    ], axis=0
)[first_columns + following_columns]
results.head()

### Plot benchmarking charts among providers

You may change the pallette color based on the color that better identifies each provider. You can take the [following link](https://seaborn.pydata.org/tutorial/color_palettes.html) as reference. Also, you will need to update the suptitle to reflect the model you're showing and any other detail. 

In [None]:
%matplotlib inline

# Pallette for the providers, change colors and provider names as needed
palette = {current_name: 'orange', alt_name: 'tab:purple'}

fields = {
    "Server TTFT (s)": "server_ttft_s_p50",
    "Server E2E Latency (s)": "server_end_to_end_latency_s_p50",
    "Server Tokens/s": "server_output_token_per_s_p50",
}

# Get unique concurrent values
concurrent_values = results['num_concurrent_requests'].unique()
concurrent_values.sort()

plt.clf()
for target_model in target_models:
    # Set up the figure and axes
    fig, axes = plt.subplots(len(concurrent_values), 3, figsize=(20, 6 * len(concurrent_values)), sharex=False)
    fig.suptitle(target_model, fontsize=20)
    for i, concurrent in enumerate(concurrent_values):
        subset = results[results['num_concurrent_requests'] == concurrent]
        subset = subset[subset['model'] == target_model.replace('.', '-')]        
        for j, field in enumerate(fields.keys()):
            ax = axes[i, j]                
            sns.barplot(data=subset, x='num_input_tokens', y=fields[field], hue='provider', ax=ax, palette=palette, errorbar=None)
            ax.set_title(f'{field} for Concurrent Requests: {concurrent}')
            ax.set_xlabel('Input Tokens')
            ax.set_ylabel(field)    
            ax.legend(loc=2)
    plt.tight_layout(rect=[0, 0.03, 1, 0.96])
    plt.show()

