In [35]:
import os
from typing import Callable

import requests
import tiktoken

def read_url(url: str, timeout: float = 30.0) -> str:
    """Download ``url`` with an HTTP GET and return the body decoded as UTF-8.

    Args:
        url: Address of the resource to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled host.

    Returns:
        The raw response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    response: requests.Response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status so that an HTML error page is never
    # mistaken for the corpus and silently tokenized.
    response.raise_for_status()
    # Decode the raw bytes explicitly instead of relying on the encoding
    # that requests guesses from the response headers.
    return response.content.decode("utf-8")

def read_file(path: str) -> str:
    """Return the entire contents of the text file at ``path``.

    The file is opened read-only and decoded as UTF-8.
    """
    with open(path, "r", encoding="utf-8") as handle:
        contents: str = handle.read()
    return contents

def faust(url: str, start_line: int = 69, end_line: int = 7184) -> str:
    """Fetch the text at ``url`` and keep only a fixed window of its lines.

    Args:
        url: Location of the plain-text file to download.
        start_line: Zero-based index of the first line to keep.
        end_line: Zero-based index one past the last line to keep.

    Returns:
        Lines ``start_line`` through ``end_line - 1``, joined with newlines.

    NOTE(review): the default bounds presumably strip the Project Gutenberg
    header and licence footer of the file used in this notebook; confirm
    against the downloaded text before reusing them for another edition.
    """
    lines: list[str] = read_url(url).splitlines()
    return "\n".join(lines[start_line:end_line])

def _ratio(text: str, encoding_name: str) -> float:
    """Return the average number of characters per token for ``text``.

    Args:
        text: Sample to tokenize.
        encoding_name: Name of the tiktoken encoding, passed to
            ``tiktoken.get_encoding``.

    Returns:
        ``len(text)`` divided by the number of tokens the encoding produces.

    Raises:
        ValueError: If ``text`` encodes to zero tokens (e.g. it is empty),
            because the ratio is undefined in that case.
    """
    encoding: tiktoken.Encoding = tiktoken.get_encoding(encoding_name)
    num_tokens: int = len(encoding.encode(text))
    if num_tokens == 0:
        # Give a descriptive error instead of a bare ZeroDivisionError when
        # a source turns out to be empty.
        raise ValueError(
            f"Cannot compute a character-to-token ratio: the text produced no tokens with encoding {encoding_name!r}."
        )
    return len(text) / num_tokens

In [37]:
# In this analysis, we calculate the character-to-token ratio for several languages.
# For the programming languages, the repositories from the GitHub project https://github.com/TheAlgorithms were used.
# Using merge_files.py, we combined the code into a single file for each programming language.
# As per the definition provided by VG Wort, a standard page consists of 1,500 characters.
# https://www.vgwort.de/auszahlungen/wissenschaftliche-publikationen/fach-und-sachzeitschriften.html

# Number of characters in one standard page according to VG Wort.
STANDARD_PAGE_CHARS: int = 1500

# Each entry is (label, loader, source). `loader(source)` returns the sample
# text for that language; `source` is a URL for the natural-language samples
# and a local file path for the programming-language samples.
data: list[tuple[str, Callable[[str], str], str]] = [
    ("German", faust, "https://www.gutenberg.org/files/2229/2229-0.txt"),
    ("English", read_url, "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"),
    ("Python", read_file, os.path.join("playground", "data", "py.txt")),
    ("C", read_file, os.path.join("playground", "data", "c.txt")),
    ("C-Plus-Plus", read_file, os.path.join("playground", "data", "cpp.txt")),
    ("JavaScript", read_file, os.path.join("playground", "data", "js.txt")),
    ("Go", read_file, os.path.join("playground", "data", "go.txt")),
    ("Java", read_file, os.path.join("playground", "data", "java.txt"))
]

ENCODING_P50K_BASE: str = "p50k_base"
ENCODING_CL100K_BASE: str = "cl100k_base"

for language, func, source in data:
    text: str = func(source)
    ratio_50: float = _ratio(text, ENCODING_P50K_BASE)
    ratio_100: float = _ratio(text, ENCODING_CL100K_BASE)
    print("---")
    # One format string for both encodings keeps the two report lines in sync.
    for encoding_name, ratio in ((ENCODING_P50K_BASE, ratio_50), (ENCODING_CL100K_BASE, ratio_100)):
        print(f"* {encoding_name}: In {language}, each token represents {ratio:.2f} characters. A standard page contains {STANDARD_PAGE_CHARS / ratio:.0f} tokens.")
    # Absolute gain in characters per token of cl100k_base over p50k_base.
    print(f"Improvement over old encoding: {(ratio_100 - ratio_50):.2f}")

---
* p50k_base: In German, each token represents 2.25 characters.
* cl100k_base: In German, each token represents 3.10 characters.
Improvement over old encoding: 0.85
---
* p50k_base: In English, each token represents 3.30 characters.
* cl100k_base: In English, each token represents 3.70 characters.
Improvement over old encoding: 0.40
---
* p50k_base: In Python, each token represents 3.07 characters.
* cl100k_base: In Python, each token represents 3.63 characters.
Improvement over old encoding: 0.56
---
* p50k_base: In C, each token represents 2.97 characters.
* cl100k_base: In C, each token represents 3.56 characters.
Improvement over old encoding: 0.59
---
* p50k_base: In C-Plus-Plus, each token represents 3.10 characters.
* cl100k_base: In C-Plus-Plus, each token represents 3.68 characters.
Improvement over old encoding: 0.57
---
* p50k_base: In JavaScript, each token represents 2.94 characters.
* cl100k_base: In JavaScript, each token represents 3.35 characters.
Improvement over o