In [223]:
import json
import os
import numpy as np
import tiktoken

In [224]:
# Tokenizer used for every token count in this notebook.
# NOTE(review): the prices further down are labelled GPT-3.5-Turbo / GPT-4 --
# confirm that "p50k_base" is the encoding those models actually use
# (tiktoken.encoding_for_model can resolve it from a model name).
enc = tiktoken.get_encoding("p50k_base")

# Sanity check: encoding followed by decoding must give back the same text.
_roundtrip_sample = "hello world"
assert enc.decode(enc.encode(_roundtrip_sample)) == _roundtrip_sample

In [225]:
# Read all examples from the examples folder
examples = []

# os.listdir returns entries in arbitrary order; sort so that `examples`
# (and therefore examples[0]) is the same on every run.
for filename in sorted(os.listdir('examples')):
    path = os.path.join('examples', filename)
    # Skip sub-directories: only regular files can be opened and parsed.
    if not os.path.isfile(path):
        continue
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(path, 'r', encoding='utf-8') as f:
        examples.append(json.load(f))

In [226]:
# Display the first loaded example to inspect its structure
examples[0]

[{'role': 'system',
  'content': '\n# Instructions:\nYou are an autocomplete system for Python. Generate or complete Python code based on the user\'s description or the provided code snippet.\n\nThe user may\n1. Describe code, either directly or in a Python comment. In this case, you should implement the described code.\n2. Write incomplete code. In this case, you should complete the code from the last cursor position.\n3. If the last character is a newline or tab, come up with one or more possible next lines of code.\n\n\n## Example 1:\n\n[INPUT]\n# Function that adds two numbers\ndef\n[/INPUT]\n\n[ANSWER]\n```python\n add_numbers(a, b):\n    return a + b\n```\n[/ANSWER]\n\n\n## Example 2:\n\n[INPUT]\n```python\ndef factorial(n):\n    # Computes the fact\n```\n[/INPUT]\n\n[ANSWER]\n```python\n    # Computes the factorial of a number\n    if n == 0:\n        return 1\n    else:\n        return n * factorial(n-1)\n```\n[/ANSWER]\n\n\n## Example 3:\n\n[INPUT]\nI need a function that chec

In [227]:
# Cost model: base cost per 1k tokens * number of tokens in the input file * number of epochs trained.
# System and user messages are counted as input tokens.
# Assistant messages are counted as output tokens.

def _contents_with_role(role):
    """Collect the 'content' of every message with the given role, across all examples."""
    collected = []
    for conversation in examples:
        for msg in conversation:
            if msg['role'] == role:
                collected.append(msg['content'])
    return collected


messages_system = _contents_with_role('system')
messages_user = _contents_with_role('user')
messages_assistant = _contents_with_role('assistant')

# One long string per side, with the individual messages separated by a single space.
text_input = ' '.join(messages_system + messages_user)
text_output = ' '.join(messages_assistant)

In [228]:
# Tokenize the concatenated input and output texts, then report how many tokens each has.
_input_token_ids = enc.encode(text_input)
_output_token_ids = enc.encode(text_output)

n_input_tokens = len(_input_token_ids)
n_output_tokens = len(_output_token_ids)

print(f'Input tokens: {n_input_tokens}')
print(f'Output tokens: {n_output_tokens}')

Input tokens: 13545
Output tokens: 3082


In [229]:
# Determine the number of tokens per example

def _count_tokens(text):
    """Return the number of tokens `enc` produces for `text`."""
    return len(enc.encode(text))


def _tokens_for_roles(example, roles):
    """Sum the token counts of all messages in `example` whose role is in `roles`."""
    return sum(_count_tokens(message['content']) for message in example if message['role'] in roles)


# Token counts of the individual messages (one entry per message of that role).
n_system_tokens_per_example = [_count_tokens(message) for message in messages_system]
n_user_tokens_per_example = [_count_tokens(message) for message in messages_user]
n_assistant_tokens_per_example = [_count_tokens(message) for message in messages_assistant]

# Aggregate per example directly from `examples` rather than zipping the
# flattened per-message lists: zipping silently misaligns or truncates as soon
# as an example does not contain exactly one system and one user message.
n_input_tokens_per_example = [_tokens_for_roles(example, ('system', 'user')) for example in examples]
n_output_tokens_per_example = [_tokens_for_roles(example, ('assistant',)) for example in examples]

input_tokens_per_example_mean = np.mean(n_input_tokens_per_example)
output_tokens_per_example_mean = np.mean(n_output_tokens_per_example)

# np.std defaults to the population standard deviation (ddof=0).
input_tokens_per_example_std = np.std(n_input_tokens_per_example)
output_tokens_per_example_std = np.std(n_output_tokens_per_example)

print(f'Input tokens per example: {input_tokens_per_example_mean:.1f} ± {input_tokens_per_example_std:.1f}')
print(f'Output tokens per example: {output_tokens_per_example_mean:.1f} ± {output_tokens_per_example_std:.1f}')

Input tokens per example: 588.2 ± 155.6
Output tokens per example: 134.0 ± 114.1


In [230]:
# Prices in USD per 1k tokens, as (input, output), keyed by model name.
PRICES_PER_1K_TOKENS = {
    'gpt-3.5-turbo': (0.0010, 0.0020),
    'gpt-4': (0.03, 0.06),
}

# Model to estimate costs for; change this key instead of (un)commenting price lines.
MODEL = 'gpt-3.5-turbo'

price_input_per_1k_tokens, price_output_per_1k_tokens = PRICES_PER_1K_TOKENS[MODEL]

In [231]:
# Determine the price per example

def _usd_for_tokens(price_per_1k_tokens, n_tokens):
    """Convert a token count into USD, given a price quoted per 1k tokens."""
    return price_per_1k_tokens * n_tokens / 1000


# Expected cost of one example, split into its input and output share.
price_input_per_example = _usd_for_tokens(price_input_per_1k_tokens, input_tokens_per_example_mean)
price_output_per_example = _usd_for_tokens(price_output_per_1k_tokens, output_tokens_per_example_mean)
price_total_per_example = price_input_per_example + price_output_per_example

# Spread of that cost: the token standard deviations converted to USD,
# with the input and output spreads added linearly.
price_input_per_example_err = _usd_for_tokens(price_input_per_1k_tokens, input_tokens_per_example_std)
price_output_per_example_err = _usd_for_tokens(price_output_per_1k_tokens, output_tokens_per_example_std)
price_total_per_example_err = price_input_per_example_err + price_output_per_example_err

print(f'Price per example: {price_total_per_example:.5f} ± {price_total_per_example_err:.5f} USD')

Price per example: 0.00086 ± 0.00038 USD


In [232]:
# Size of the run to estimate: number of examples and number of epochs over them.
n_examples, n_epochs = 100, 2

In [233]:
# Estimate the total number of input and output tokens accounting for the number of epochs and examples

def _scale_to_run(per_example_value):
    """Scale a per-example quantity by the number of epochs and the number of examples."""
    return per_example_value * n_epochs * n_examples


n_input_tokens_total = _scale_to_run(input_tokens_per_example_mean)
n_output_tokens_total = _scale_to_run(output_tokens_per_example_mean)

# NOTE(review): the spread is scaled linearly with the number of examples and
# epochs, same as the mean -- confirm this is the intended uncertainty estimate.
n_input_tokens_total_err = _scale_to_run(input_tokens_per_example_std)
n_output_tokens_total_err = _scale_to_run(output_tokens_per_example_std)

print(f'Input tokens total: {n_input_tokens_total / 1000:.1f}k ± {n_input_tokens_total_err / 1000:.1f}k')
print(f'Output tokens total: {n_output_tokens_total / 1000:.1f}k ± {n_output_tokens_total_err / 1000:.1f}k')

Input tokens total: 117.6k ± 31.1k
Output tokens total: 26.8k ± 22.8k


In [234]:
# Estimate the price for `n_examples` examples
# NOTE(review): n_epochs is not applied here, although the token totals are
# scaled by it -- confirm whether the price should also be multiplied by n_epochs.
price_total = price_total_per_example * n_examples

price_total_err = price_total_per_example_err * n_examples

# Report the configured example count instead of a hard-coded "100" so the
# label stays correct when n_examples is changed.
print(f'Price for {n_examples} examples: {price_total:.2f} ± {price_total_err:.2f} USD')

Price for 100 examples: 0.09 ± 0.04 USD
