# Tokenizer

In [1]:
import os
import openai
import tiktoken

from dotenv import load_dotenv

In [2]:
# Report the installed openai SDK version so readers can match the API style used below.
print(f"openai version = {openai.__version__}")

openai version = 0.28.0


In [3]:
# Load variables from the local azure.env file into the process environment,
# then point the openai module at the Azure endpoint using those values.
load_dotenv("azure.env")

openai.api_type = "azure"
openai.api_base = os.environ.get("OPENAI_API_BASE")
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [4]:
# Load a tiktoken encoding directly by its name.
# Note: the next cell rebinds `encoding` via a model-name lookup, which prints
# the same 'cl100k_base' encoding.
encoding = tiktoken.get_encoding("cl100k_base")

In [5]:
# Look up the encoding by model name instead of by encoding name; this rebinds
# `encoding`, and printing it shows which encoding the model maps to.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
print(encoding)

<Encoding 'cl100k_base'>


In [6]:
# Text file to tokenize. The path is relative, so it is resolved against the
# kernel's current working directory.
file_path = "meeting_notes_from_audio.txt"

In [7]:
# Read the whole notes file into `meeting_notes` for the tokenization cells below.
try:
    # Explicit encoding so decoding (and therefore the resulting token ids) does
    # not depend on the platform default.
    # Assumes the notes file is UTF-8 — adjust if it was saved otherwise.
    with open(file_path, "r", encoding="utf-8") as file:
        meeting_notes = file.read()

except FileNotFoundError:
    print(f"The file '{file_path}' was not found.")
    # Re-raise: without the file, `meeting_notes` is never assigned here, so
    # later cells would hit a NameError or silently reuse a stale value left
    # in the kernel by an earlier run.
    raise
except Exception as e:
    print(f"An error occurred: {str(e)}")
    # Same reasoning as above: stop at the cell that actually failed.
    raise

In [8]:
# Encode the meeting notes text into token ids with the current `encoding`.
tokens_integer = encoding.encode(meeting_notes)
# Bare last expression so the notebook displays the list of ids.
tokens_integer

[5159,
 6574,
 8554,
 15590,
 555,
 35219,
 5377,
 15592,
 323,
 35219,
 39841,
 3600,
 13,
 2696,
 25,
 220,
 806,
 96253,
 12,
 2366,
 18,
 220,
 806,
 25,
 2970,
 25,
 2491,
 271,
 19791,
 1473,
 37,
 396,
 4842,
 12623,
 30037,
 1047,
 264,
 6992,
 2132,
 8502,
 315,
 220,
 2366,
 18,
 449,
 264,
 13254,
 315,
 220,
 6549,
 3610,
 323,
 264,
 20547,
 11626,
 4850,
 315,
 220,
 2970,
 14697,
 11205,
 4272,
 8070,
 574,
 220,
 845,
 3610,
 11,
 264,
 5199,
 5376,
 505,
 279,
 220,
 605,
 3610,
 3970,
 304,
 279,
 1890,
 8502,
 315,
 279,
 3766,
 1060,
 13,
 23212,
 11,
 435,
 396,
 4842,
 12623,
 706,
 85957,
 872,
 9513,
 46128,
 34919,
 20136,
 323,
 29091,
 304,
 48197,
 22359,
 13166,
 27460,
 311,
 18885,
 5326,
 60684,
 291,
 4780,
 13,
 24296,
 11,
 814,
 617,
 3970,
 17808,
 1217,
 6650,
 449,
 264,
 445,
 16027,
 11547,
 1741,
 11595,
 315,
 220,
 18,
 13,
 20,
 13689,
 323,
 872,
 18057,
 369,
 279,
 5108,
 8502,
 374,
 6928,
 449,
 220,
 23,
 4,
 8502,
 29352,
 58414,
 665

In [9]:
# Report how many token ids the encoded notes contain.
print(len(tokens_integer), "is the number of tokens in my text")

535 is the number of tokens in my text


In [10]:
# Map each token id back to the raw bytes it stands for, preserving order, so
# the token boundaries in the original text become visible.
tokens_string = list(map(encoding.decode_single_token_bytes, tokens_integer))
tokens_string

[b'My',
 b' meeting',
 b' notes',
 b' processed',
 b' by',
 b' Azure',
 b' Open',
 b' AI',
 b' and',
 b' Azure',
 b' Speech',
 b' services',
 b'.',
 b' Date',
 b':',
 b' ',
 b'11',
 b'-Sep',
 b'-',
 b'202',
 b'3',
 b' ',
 b'11',
 b':',
 b'58',
 b':',
 b'49',
 b'\n\n',
 b'Summary',
 b':\n\n',
 b'F',
 b'int',
 b'ech',
 b' Plus',
 b' Sync',
 b' had',
 b' a',
 b' successful',
 b' second',
 b' quarter',
 b' of',
 b' ',
 b'202',
 b'3',
 b' with',
 b' a',
 b' revenue',
 b' of',
 b' ',
 b'125',
 b' million',
 b' and',
 b' a',
 b' gross',
 b' profit',
 b' margin',
 b' of',
 b' ',
 b'58',
 b'%.',
 b' Their',
 b' net',
 b' income',
 b' was',
 b' ',
 b'16',
 b' million',
 b',',
 b' a',
 b' significant',
 b' increase',
 b' from',
 b' the',
 b' ',
 b'10',
 b' million',
 b' seen',
 b' in',
 b' the',
 b' same',
 b' quarter',
 b' of',
 b' the',
 b' previous',
 b' year',
 b'.',
 b' Additionally',
 b',',
 b' F',
 b'int',
 b'ech',
 b' Plus',
 b' has',
 b' diversified',
 b' their',
 b' asset',
 b'-backed',

In [14]:
# Estimate how many prompt tokens a chat request will consume: a fixed
# per-message overhead, plus the encoded length of every field of every
# message, plus a fixed allowance for the reply priming.
message = [
    {
        "role": "user",
        "content": "Explain to me how tokenization is working in OpenAI models?",
    }
]

# Fixed overhead counted once per message on top of its encoded fields.
# NOTE(review): the right value depends on the model snapshot (the OpenAI
# cookbook lists 4 for gpt-3.5-turbo-0301 and 3 for later snapshots) —
# confirm against the deployed model.
tokens_per_message = 4

num_tokens = 0

# Walk every message in the list (not only the first one) so the estimate
# stays correct when more messages are added to the conversation.
for entry in message:
    num_tokens += tokens_per_message
    for key, value in entry.items():
        # Encode once and reuse the count for both the total and the report.
        value_tokens = len(encoding.encode(value))
        num_tokens += value_tokens
        print(f"{value_tokens} is the number of token included in {key}")

# every reply is primed with <|start|>assistant<|message|>
num_tokens += 3

print(f"{num_tokens} number of tokens to be sent in our request")

1 is the number of token included in role
14 is the number of token included in content
22 number of tokens to be sent in our request
