In [1]:
from collections import namedtuple
import itertools
import json
import numpy as np
import os
import re
import tqdm

In [2]:
# Root directory that holds the log files read by this notebook.
# Set the HELM_EFFICIENCY_LOGS environment variable to point somewhere else;
# when it is unset, the original hard-coded location is used.
LOGS_PATH = os.environ.get(
    "HELM_EFFICIENCY_LOGS", "/lfs/1/keshav2/helm-efficiency/logs"
)

In [3]:
# One query record: prompt/output token counts, a runtime measurement, and an
# `api` flag (True for records parsed from API logs, False for records parsed
# from a calibration log).
Query = namedtuple(
    "Query",
    ["num_prompt_tokens", "num_output_tokens", "runtime", "api"],
)

In [4]:
def parse_api_logs(datadir):
    """Collect one Query per instance from the stats files under `datadir`.

    Every entry of `datadir` is expected to be a directory containing a
    `per_instance_stats.json` file. That file is read as a JSON list whose
    items carry a `"stats"` list of metrics, each metric being a dict with
    `metric["name"]["name"]` and `metric["sum"]`.

    For each instance the metrics `num_prompt_tokens`, `num_output_tokens` and
    `inference_idealized_runtime` are extracted. Instances missing either
    token count are dropped; a missing idealized runtime is kept as `None`.

    Entries of `datadir` without a `per_instance_stats.json` file (stray
    files, incomplete directories) are skipped instead of raising. Entries are
    visited in sorted order so the result is reproducible across runs.

    Args:
        datadir: Path of the directory to scan.

    Returns:
        A list of `Query` tuples with `api=True`.
    """
    wanted = (
        "num_prompt_tokens",
        "num_output_tokens",
        "inference_idealized_runtime",
    )
    queries = []
    for subdir in tqdm.tqdm(sorted(os.listdir(datadir))):
        stats_path = os.path.join(datadir, subdir, "per_instance_stats.json")
        if not os.path.isfile(stats_path):
            # Not a run directory (or no stats were written): nothing to parse.
            continue
        # Load eagerly so the file handle is released before processing.
        with open(stats_path, "r") as f:
            stats = json.load(f)
        for stat in stats:
            found = {}
            for metric in stat["stats"]:
                name = metric["name"]["name"]
                # Track metrics by name rather than with a hit counter, so a
                # repeated metric name cannot end the scan before all three
                # metrics have been seen. The first occurrence wins.
                if name in wanted and name not in found:
                    found[name] = metric["sum"]
                    if len(found) == len(wanted):
                        break
            num_prompt_tokens = found.get("num_prompt_tokens")
            num_output_tokens = found.get("num_output_tokens")
            if num_prompt_tokens is None or num_output_tokens is None:
                continue
            queries.append(
                Query(
                    num_prompt_tokens,
                    num_output_tokens,
                    found.get("inference_idealized_runtime"),
                    True,
                )
            )
    return queries

In [5]:
def compute_api_cost(num_prompt_tokens, num_output_tokens, num_trials, per_token_costs):
    """Return the API cost of issuing one query `num_trials` times.

    `per_token_costs` must provide an `"input"` and an `"output"` price; token
    counts are divided by 1000 before being multiplied by these prices, i.e.
    the prices are per 1000 tokens.
    """
    prompt_rate = per_token_costs["input"]
    completion_rate = per_token_costs["output"]
    per_trial_cost = (
        prompt_rate * num_prompt_tokens / 1000
        + completion_rate * num_output_tokens / 1000
    )
    return num_trials * per_trial_cost

In [6]:
def compute_local_cost(runtime, num_trials, hardware_cost):
    """Return the hardware cost of running a workload `num_trials` times.

    The product `runtime * num_trials * hardware_cost` is divided by 60 twice,
    which treats `runtime` as seconds and `hardware_cost` as a per-hour price.
    """
    total_runtime = runtime * num_trials
    return total_runtime * hardware_cost / 60 / 60

In [7]:
def parse_calibration_log(path):
    """Parse a calibration log file into a list of Query records.

    A line is used when it contains, in order: `": <int>,"`, then
    `": <int>}"`, then `": <number>"`. The two integers are taken as the
    prompt and output token counts and the number as the runtime. Lines that
    do not match are ignored.

    The runtime may carry an optional exponent (e.g. `1.5e-05`), so values
    written in scientific notation are read in full rather than truncated at
    the mantissa.

    Args:
        path: Path of the log file to read.

    Returns:
        A list of `Query` tuples with `api=False`.
    """
    # Raw string: "\d" and "\." are invalid escapes in a normal string literal
    # and trigger a warning on recent Python versions.
    line_pattern = re.compile(
        r".*: (\d+),.*: (\d+)}.*: ([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)"
    )
    calibration_queries = []
    with open(path, "r") as f:
        for line in f:
            match = line_pattern.search(line)
            if match is None:
                continue
            num_prompt_tokens = int(match.group(1))
            num_output_tokens = int(match.group(2))
            runtime = float(match.group(3))
            calibration_queries.append(
                Query(num_prompt_tokens, num_output_tokens, runtime, False)
            )
    return calibration_queries

In [8]:
def denoised_savings(
    queries,
    calibration_queries,
    api_costs,
    hardware_cost,
    num_api_trials=10,
    num_local_trials=50,
):
    """Print the API-cost savings of the "denoised" approach over a baseline.

    Baseline: every query in `queries` is issued `num_api_trials` times.
    Denoised: every query in `calibration_queries` is issued `num_api_trials`
    times, plus every query in `queries` is issued once.

    All costs come from `compute_api_cost` with `api_costs`. The savings
    factor is `baseline / denoised`; when the denoised cost is zero (e.g. both
    query lists are empty) the factor is undefined and reported as "n/a"
    instead of raising ZeroDivisionError.

    Args:
        queries: Iterable of Query records for the workload.
        calibration_queries: Iterable of Query records used for calibration.
        api_costs: Price mapping passed through to `compute_api_cost`.
        hardware_cost: Not used by this function; kept so the signature
            matches `idealized_savings`.
        num_api_trials: Number of repetitions for baseline and calibration.
        num_local_trials: Not used by this function.

    Returns:
        None. Results are printed.
    """

    def total_api_cost(batch, num_trials):
        # Sum of per-query API costs for `batch` at the given repetition count.
        return sum(
            compute_api_cost(
                q.num_prompt_tokens, q.num_output_tokens, num_trials, api_costs
            )
            for q in batch
        )

    baseline = total_api_cost(queries, num_api_trials)
    calibration_cost = total_api_cost(calibration_queries, num_api_trials)
    api_cost = total_api_cost(queries, 1)
    denoised = calibration_cost + api_cost
    print(f"Baseline cost: ${baseline:.2f}")
    print(f"Denoised cost: ${calibration_cost:.2f} + ${api_cost:.2f} = ${denoised:.2f}")
    if denoised == 0:
        print("Savings: n/a (denoised cost is zero)")
    else:
        print(f"Savings: {baseline / denoised:.2f}x")

In [9]:
def idealized_savings(
    queries,
    calibration_queries,
    api_costs,
    hardware_cost,
    num_api_trials=10,
    num_local_trials=50,
):
    """Print the savings of the "idealized" approach over a local baseline.

    Only queries whose `runtime` is not None are considered.

    Baseline: the hardware cost of running every remaining query
    `num_local_trials` times (via `compute_local_cost`) plus the API cost of
    issuing each of them once (via `compute_api_cost`).
    Idealized: the hardware cost of running the calibration queries
    `num_local_trials` times plus the same single-pass API cost.

    The savings factor is `baseline / idealized`; when the idealized cost is
    zero (e.g. both query lists are empty) the factor is undefined and
    reported as "n/a" instead of raising ZeroDivisionError.

    Args:
        queries: Iterable of Query records for the workload.
        calibration_queries: Iterable of Query records used for calibration;
            their `runtime` values are summed, so they must be numeric.
        api_costs: Price mapping passed through to `compute_api_cost`.
        hardware_cost: Hardware price passed through to `compute_local_cost`.
        num_api_trials: Not used by this function; kept so the signature
            matches `denoised_savings`.
        num_local_trials: Number of local repetitions.

    Returns:
        None. Results are printed.
    """
    # Filter out queries which do not have an idealized runtime estimate
    queries = [q for q in queries if q.runtime is not None]

    local_cost = sum(
        compute_local_cost(q.runtime, num_local_trials, hardware_cost)
        for q in queries
    )
    api_cost = sum(
        compute_api_cost(q.num_prompt_tokens, q.num_output_tokens, 1, api_costs)
        for q in queries
    )
    baseline = local_cost + api_cost

    calibration_time = sum(q.runtime for q in calibration_queries)
    calibration_cost = compute_local_cost(
        calibration_time, num_local_trials, hardware_cost
    )
    idealized = calibration_cost + api_cost
    print(f"Baseline cost: ${local_cost:.2f} + ${api_cost:.2f} = ${baseline:.2f}")
    print(
        f"Idealized cost: ${calibration_cost:.2f} + ${api_cost:.2f} = ${idealized:.2f}"
    )
    if idealized == 0:
        print("Savings: n/a (idealized cost is zero)")
    else:
        print(f"Savings: {baseline / idealized:.2f}x")

In [10]:
# Build the workload: one Query per instance found in the per-instance stats
# files under the "v7_subset" directory of LOGS_PATH.
datadir = os.path.join(LOGS_PATH, "v7_subset")
queries = parse_api_logs(datadir)

100%|██████████| 112/112 [00:43<00:00,  2.55it/s]


In [11]:
# Build the calibration set from the log file a100/openai_davinci.log under
# LOGS_PATH; each matching log line becomes one Query with a measured runtime.
path = os.path.join(LOGS_PATH, "a100", "openai_davinci.log")
calibration_queries = parse_calibration_log(path)

In [12]:
# API prices consumed by compute_api_cost, which divides token counts by 1000
# before applying them, so these are prices per 1000 tokens.
# NOTE(review): the variable is named "davinci" but the cited prices are for
# GPT-3.5 Turbo (4K context) — confirm this is the intended price list.
davinci_costs = {
    "input": 0.0015,
    "output": 0.002,
}  # https://openai.com/pricing (GPT-3.5 Turbo 4K context)
# Hardware price consumed by compute_local_cost, which divides by 60 twice;
# presumably a per-hour instance price — TODO confirm against the linked page.
p4de24xlarge_cost = 40.96  # https://aws.amazon.com/ec2/instance-types/p4/

In [13]:
# Print the API-only comparison: all queries repeated with the default trial
# count versus calibration queries repeated plus a single pass over all queries.
denoised_savings(queries, calibration_queries, davinci_costs, p4de24xlarge_cost)

Baseline cost: $4176.34
Denoised cost: $3.11 + $417.63 = $420.74
Savings: 9.93x


In [14]:
# Print the local-hardware comparison: repeated local runs of all queries that
# have a runtime estimate versus repeated local runs of the calibration queries
# only (both plus one API pass over those queries).
idealized_savings(queries, calibration_queries, davinci_costs, p4de24xlarge_cost)

Baseline cost: $30877.35 + $250.91 = $31128.26
Idealized cost: $291.16 + $250.91 = $542.07
Savings: 57.42x
