In [2]:
from typing import Callable

import requests
import tiktoken

def read_text(url: str, timeout: float = 30.0) -> str:
    """Download ``url`` and return its body decoded as UTF-8.

    Args:
        url: Address of the resource to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The raw response body decoded as UTF-8 text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    response: requests.Response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses; otherwise an HTML error page would be
    # decoded and silently treated as the text to analyse.
    response.raise_for_status()
    # Decode the raw bytes explicitly instead of relying on the encoding
    # that requests guesses from the response headers.
    return response.content.decode("utf-8")

def faust() -> str:
    """Return a fixed line range of Project Gutenberg file ``2229-0.txt``.

    The download is split into lines and only the lines with zero-based
    indices 69 through 7183 are kept, re-joined with ``"\\n"``.
    NOTE(review): the bounds presumably strip the Project Gutenberg header
    and footer around the work itself -- confirm against the current file.
    """
    first_line, stop_line = 69, 7184
    all_lines: list[str] = read_text(
        "https://www.gutenberg.org/files/2229/2229-0.txt"
    ).splitlines()
    return "\n".join(all_lines[first_line:stop_line])

def tiny_shakespeare() -> str:
    """Return the tinyshakespeare ``input.txt`` from the karpathy/char-rnn repo.

    The file is returned in full, exactly as downloaded.
    """
    return read_text(
        "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    )

def turtle() -> str:
    """Return the source of CPython's ``Lib/turtle.py`` as a Python sample.

    The URL points at the ``main`` branch, so the downloaded text (and any
    figure derived from it) can change as the upstream file changes.
    """
    return read_text(
        "https://raw.githubusercontent.com/python/cpython/main/Lib/turtle.py"
    )

def _ratio(text) -> float:
    encoding: tiktoken.Encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens: int = len(encoding.encode(text))
    num_chars: int = len(text)
    return num_chars / num_tokens

In [4]:
# Character-to-token ratio for several languages.
# VG Wort defines a standard page as 1,500 characters:
# https://www.vgwort.de/auszahlungen/wissenschaftliche-publikationen/fach-und-sachzeitschriften.html
data: list[tuple[str, Callable]] = [
    ("German", faust),
    ("English", tiny_shakespeare),
    ("Python", turtle),
]

# Each entry pairs a display label with a zero-argument loader for its sample text.
for language, func in data:
    text = func()
    ratio = _ratio(text)
    # 1500 is the characters-per-page figure cited above.
    tokens_per_page = 1500 / ratio
    print(f"* In {language}, each token represents {ratio:.2f} characters. A standard page contains {tokens_per_page:.0f} tokens.")

* In German, each token represents 3.10 characters. A standard page contains 484 tokens.
* In English, each token represents 3.70 characters. A standard page contains 406 tokens.
* In Python, each token represents 4.21 characters. A standard page contains 356 tokens.
