In [1]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

text = "I love pizza"
tokens = encoding.encode(text)

print(tokens)  # Print the list of token IDs
print(f"Number of tokens: {len(tokens)}")


[40, 3021, 23317]
Number of tokens: 3


In [2]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4")

text = "I love pizza"
tokens = encoding.encode(text)

print(tokens)  # Print the list of token IDs
print(f"Number of tokens: {len(tokens)}")


[40, 3021, 23317]
Number of tokens: 3


In [3]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o")

text = "I love pizza"
tokens = encoding.encode(text)

print(tokens)  # Print the list of token IDs
print(f"Number of tokens: {len(tokens)}")


[40, 3047, 27941]
Number of tokens: 3


In [4]:
import tiktoken

encoding = tiktoken.encoding_for_model("text-davinci-003")

text = "I love pizza"
tokens = encoding.encode(text)

print(tokens)  # Print the list of token IDs
print(f"Number of tokens: {len(tokens)}")


[40, 1842, 14256]
Number of tokens: 3


In [5]:
import tiktoken

encoding = tiktoken.encoding_for_model("text-embedding-ada-002")

text = "I love pizza"
tokens = encoding.encode(text)

print(tokens)  # Print the list of token IDs
print(f"Number of tokens: {len(tokens)}")


[40, 3021, 23317]
Number of tokens: 3


In [6]:
import tiktoken

encoding = tiktoken.encoding_for_model("code-davinci-002")

text = "I love pizza"
tokens = encoding.encode(text)

print(tokens)  # Print the list of token IDs
print(f"Number of tokens: {len(tokens)}")


[40, 1842, 14256]
Number of tokens: 3


In [7]:
import tiktoken

models = ["gpt-3.5-turbo", "gpt-4", "gpt-4o"]
text = "I love pizza"

for model in models:
    enc = tiktoken.encoding_for_model(model)
    tokens = enc.encode(text)
    print(f"{model}: {tokens} -> {len(tokens)} tokens")


gpt-3.5-turbo: [40, 3021, 23317] -> 3 tokens
gpt-4: [40, 3021, 23317] -> 3 tokens
gpt-4o: [40, 3047, 27941] -> 3 tokens


🧠 Supported Models & Their Tokenizers
Here are some examples you can try:

Model Name	Description
"gpt-3.5-turbo"	ChatGPT 3.5
"gpt-4"	GPT-4
"gpt-4o"	GPT-4 Omni (faster, cheaper)
"text-davinci-003"	GPT-3 Completion model
"text-embedding-ada-002"	Embedding model (different tokens)
"code-davinci-002"	Codex model for code

In [8]:
import tiktoken
from tiktoken.model import MODEL_TO_ENCODING  # list of all models tiktoken knows

text = "I love pizza"

def encode_with_model(model: str, txt: str):
    """Return token ids for `txt` using the tokenizer tied to `model`."""
    try:
        # Works for most models
        enc = tiktoken.encoding_for_model(model)
    except KeyError:
        # Fallback: build the encoding from the raw encoding name
        enc = tiktoken.get_encoding(MODEL_TO_ENCODING[model])
    return enc.encode(txt)

for model in sorted(MODEL_TO_ENCODING.keys()):
    try:
        tokens = encode_with_model(model, text)
        print(f"{model:35} -> {len(tokens):2} tokens  {tokens}")
    except Exception as e:
        print(f"{model:35} -> ERROR: {e}")


ada                                 ->  3 tokens  [40, 1842, 14256]
babbage                             ->  3 tokens  [40, 1842, 14256]
babbage-002                         ->  3 tokens  [40, 3021, 23317]
code-cushman-001                    ->  3 tokens  [40, 1842, 14256]
code-cushman-002                    ->  3 tokens  [40, 1842, 14256]
code-davinci-001                    ->  3 tokens  [40, 1842, 14256]
code-davinci-002                    ->  3 tokens  [40, 1842, 14256]
code-davinci-edit-001               ->  3 tokens  [40, 1842, 14256]
code-search-ada-code-001            ->  3 tokens  [40, 1842, 14256]
code-search-babbage-code-001        ->  3 tokens  [40, 1842, 14256]
curie                               ->  3 tokens  [40, 1842, 14256]
cushman-codex                       ->  3 tokens  [40, 1842, 14256]
davinci                             ->  3 tokens  [40, 1842, 14256]
davinci-002                         ->  3 tokens  [40, 3021, 23317]
davinci-codex                       ->  3 tokens

In [17]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

load_dotenv()  # Load from .env file

token = os.environ.get("HF_TOKEN")
if token:
    login(token=token)
    print("✅ Hugging Face logged in")
else:
    print("❌ HF_TOKEN not found.")


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


✅ Hugging Face logged in


In [18]:
from __future__ import annotations
from typing import Dict, Any
from functools import lru_cache

# OpenAI
import tiktoken
from tiktoken.model import MODEL_TO_ENCODING

# Hugging Face
from transformers import AutoTokenizer


class TokenizationError(Exception):
    pass


def is_openai_model(model_name: str) -> bool:
    """Decide if we should use tiktoken (OpenAI) or HF (others)."""
    # Most robust check: does tiktoken know this model?
    return model_name in MODEL_TO_ENCODING or model_name.startswith(("gpt-", "o1", "o3", "text-davinci"))


@lru_cache(maxsize=None)
def get_openai_encoder(model_name: str):
    """Return a tiktoken encoder for an OpenAI model name (or fall back to base encodings)."""
    try:
        return tiktoken.encoding_for_model(model_name)
    except KeyError:
        # If the exact model isn't registered, use cl100k_base (most GPT-4/3.5 models)
        return tiktoken.get_encoding("cl100k_base")


@lru_cache(maxsize=None)
def get_hf_tokenizer(model_name: str):
    """Return a HF tokenizer (downloads if needed)."""
    return AutoTokenizer.from_pretrained(model_name, use_fast=True)


def auto_tokenize(model_name: str, text: str) -> Dict[str, Any]:
    """
    Tokenize `text` with the right library based on `model_name`.

    Returns:
        {
          "model": ...,
          "library": "tiktoken" | "transformers",
          "tokens": [int, ...],
          "n_tokens": int
        }
    """
    if is_openai_model(model_name):
        enc = get_openai_encoder(model_name)
        tokens = enc.encode(text)
        return {
            "model": model_name,
            "library": "tiktoken",
            "tokens": tokens,
            "n_tokens": len(tokens),
        }
    else:
        tok = get_hf_tokenizer(model_name)
        tokens = tok.encode(text)
        return {
            "model": model_name,
            "library": "transformers",
            "tokens": tokens,
            "n_tokens": len(tokens),
        }


if __name__ == "__main__":
    text = "I love pizza"

    models = [
        # OpenAI
        "gpt-3.5-turbo",
        "gpt-4",
        "gpt-4o",
        # Meta / Mistral / others (HF)
        "meta-llama/Meta-Llama-3-8B",
        "mistralai/Mistral-7B-Instruct",
        "tiiuae/falcon-7b-instruct",
        "google/gemma-7b-it",
    ]

    for m in models:
        try:
            out = auto_tokenize(m, text)
            print(f"{m:35} | {out['library']:12} | {out['n_tokens']:2} tokens | {out['tokens']}")
        except Exception as e:
            print(f"{m:35} | ERROR: {e}")


gpt-3.5-turbo                       | tiktoken     |  3 tokens | [40, 3021, 23317]
gpt-4                               | tiktoken     |  3 tokens | [40, 3021, 23317]
gpt-4o                              | tiktoken     |  3 tokens | [40, 3047, 27941]
meta-llama/Meta-Llama-3-8B          | ERROR: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files.
Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.
mistralai/Mistral-7B-Instruct       | ERROR: mistralai/Mistral-7B-Instruct is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

tiiuae/falcon-7b-instruct           | transformers |  3 tokens | [52, 1163, 12359]
google/gemma-7b-it                  | ERROR: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/gemma-7b-it.
403 Client Error. (Request ID: Root=1-6887f5fb-2e962880473c681461c631fb;cd46531d-c5f6-4cec-bd41-b9aeed583dfc)

Cannot access gated repo for url https://huggingface.co/google/gemma-7b-it/resolve/main/config.json.
Access to model google/gemma-7b-it is restricted and you are not in the authorized list. Visit https://huggingface.co/google/gemma-7b-it to ask for access.


In [19]:
import numpy as np
print(np.arange(10))


[0 1 2 3 4 5 6 7 8 9]
