# Monolingual tokenizer analysis


## Imports and loading

In [1]:
from script_bpe.corpus.registry import MONOLINGUAL_DATASETS
from script_bpe.train import load_tokenizers_for_dataset
from script_bpe.bpe.stats import compression_curve
import pandas as pd

In [2]:
# Value forwarded as `n=` to the tokenizer loader
# (presumably the vocabulary/merge budget -- TODO confirm against script_bpe.train).
N = 64000

# Accumulators filled by the loop below.
num_undecodeable = {}   # dataset name -> {pretokenizer: 'num_undecodeable' stat}
curves = {}             # rebound on every iteration to the current dataset's curves
compression_stats = []  # one record per (dataset, pretokenizer)

for file in MONOLINGUAL_DATASETS:
    tokenizers = load_tokenizers_for_dataset(file, n=N)

    # Only entries with a truthy tokenizer object take part in the analysis.
    available = {ptok: tokenizer for ptok, tokenizer in tokenizers.items() if tokenizer}
    curves = {ptok: compression_curve(tokenizer) for ptok, tokenizer in available.items()}
    print(f"Loaded {len(curves)} tokenizers for {file}")

    # Everything below is normalised by the 'scriptenc' curve, so it is required.
    if 'scriptenc' not in curves:
        print(f"SCRIPT tokenizer for {file} is not available, skipping dataset")
        continue

    # Half of the first 'scriptenc' curve value is used as the character count,
    # which makes the 'start' ratio of 'scriptenc' itself exactly 2.0.
    num_chars = curves['scriptenc'][0] / 2

    num_undecodeable[file] = {
        ptok: tokenizer.stats()['num_undecodeable']
        for ptok, tokenizer in available.items()
    }

    # First and last point of each curve, expressed relative to num_chars.
    compression_stats.extend([
        dict(file=file, pretokenizer=ptok, start=curve[0]/num_chars, end=curve[-1]/num_chars)
        for ptok, curve in curves.items()
    ])


Loaded 8 tokenizers for eng_latn_300mb
Loaded 8 tokenizers for deu_latn_300mb
Loaded 8 tokenizers for vie_latn_300mb
Loaded 8 tokenizers for heb_hebr_300mb
Loaded 8 tokenizers for arb_arab_300mb
Loaded 8 tokenizers for rus_cyrl_300mb
Loaded 8 tokenizers for kor_hang_300mb
Loaded 8 tokenizers for hin_deva_300mb
Loaded 8 tokenizers for tha_thai_300mb
Loaded 8 tokenizers for zho_hans_300mb
Loaded 8 tokenizers for jpn_jpan_300mb
Loaded 8 tokenizers for pan_guru_300mb


In [15]:

def format_results(df, columns=None, floatfmt="{:.4f}", gradient='RdYlGn_r', relative=None):
    df = df[columns or df.columns]
    df = pd.concat([df, df.mean().rename('mean').to_frame().T])
    df_style = df.style
    if relative is not None:
        rel_val = df.div(df.median(axis=1), axis=0)
        df_style = df_style.background_gradient(cmap=gradient, gmap=rel_val, axis=None, vmin=1-relative, vmax=1+relative)
    elif gradient:
        df_style = df_style.background_gradient(cmap=gradient, axis=None)
    return df_style.format(floatfmt)


# Long-form records -> wide tables: one row per dataset, one column per pretokenizer.
# Both tables are ordered by the 'bytes_gpt4' column, largest first.
cdf = pd.DataFrame(compression_stats)
from_df = cdf.pivot(index='file', columns='pretokenizer', values='start').sort_values('bytes_gpt4', ascending=False)
to_df = cdf.pivot(index='file', columns='pretokenizer', values='end').sort_values('bytes_gpt4', ascending=False)

# Tokens per character at the first point of each compression curve.
display(format_results(from_df, columns=['bytes_gpt4', 'scriptenc', 'scriptenc_cb']).set_caption("Table 1 (Left two columns): Tokens/Char at start"))

# Tokens per character at the last point of each curve; relative=0.5 colours
# each cell by its ratio to the row median over the range [0.5, 1.5].
display(format_results(to_df, relative=0.5).set_caption("Table 1 (Compression) / Table 5 (Expanded): Results for Tokens/Char after merges"))

pretokenizer,bytes_gpt4,scriptenc,scriptenc_cb
jpn_jpan_300mb,2.7386,2.0,2.0
zho_hans_300mb,2.6927,2.0,2.0
tha_thai_300mb,2.6789,2.0,2.0
pan_guru_300mb,2.5449,2.0,2.0
hin_deva_300mb,2.5134,2.0,2.0
kor_hang_300mb,2.3325,2.0,2.0
rus_cyrl_300mb,1.813,2.0,2.0
arb_arab_300mb,1.794,2.0,2.0
heb_hebr_300mb,1.7743,2.0,2.0
vie_latn_300mb,1.3155,2.0,2.0


pretokenizer,bytes_gpt4,bytes_gpt4_cb,bytes_gpt4o,bytes_gpt4o_cb,scriptenc,scriptenc_cb,scriptenc_gpt4o,scriptenc_gpt4o_cb
zho_hans_300mb,0.5138,0.5136,0.5139,0.5137,0.5612,0.5587,0.5174,0.5134
pan_guru_300mb,0.4961,0.4961,0.2396,0.2394,0.2401,0.2398,0.2397,0.2394
hin_deva_300mb,0.4835,0.4835,0.2389,0.2387,0.239,0.2387,0.239,0.2387
jpn_jpan_300mb,0.423,0.4196,0.4232,0.4197,0.4493,0.4422,0.4283,0.4194
kor_hang_300mb,0.3945,0.3945,0.3946,0.3945,0.3974,0.3973,0.394,0.3942
tha_thai_300mb,0.3219,0.3216,0.2246,0.2048,0.255,0.2364,0.2258,0.2047
vie_latn_300mb,0.2552,0.2552,0.2553,0.2553,0.2555,0.2553,0.2554,0.2553
heb_hebr_300mb,0.2459,0.2439,0.2447,0.2426,0.2469,0.2446,0.2448,0.2427
arb_arab_300mb,0.2324,0.2325,0.2298,0.2299,0.2328,0.2298,0.2329,0.2299
rus_cyrl_300mb,0.212,0.2115,0.212,0.2115,0.2121,0.2123,0.2111,0.2115


In [16]:
# Report every pretokenizer column except the 'nosplit' variants.
nonosplit = [col for col in to_df.columns if 'nosplit' not in col]

# Rows are datasets, columns are pretokenizers; ordered by the 'bytes_gpt4' count, largest first.
num_undecodeable_df = (
    pd.DataFrame.from_dict(num_undecodeable, orient='index')
    .sort_values(by='bytes_gpt4', ascending=False)
    .loc[:, nonosplit]
)

# Kept as the final expression so the notebook renders the styled table.
format_results(num_undecodeable_df, floatfmt='{:.1f}').set_caption("Table 2 (Partial char): Number tokens including partial characters")

Unnamed: 0,bytes_gpt4,bytes_gpt4_cb,bytes_gpt4o,bytes_gpt4o_cb,scriptenc,scriptenc_cb,scriptenc_gpt4o,scriptenc_gpt4o_cb
heb_hebr_300mb,15303.0,30.0,15510.0,30.0,16085.0,0.0,16461.0,0.0
kor_hang_300mb,1021.0,508.0,1021.0,508.0,208.0,0.0,239.0,0.0
jpn_jpan_300mb,987.0,401.0,987.0,401.0,962.0,0.0,1736.0,0.0
vie_latn_300mb,711.0,402.0,715.0,402.0,1682.0,0.0,1543.0,0.0
pan_guru_300mb,670.0,317.0,137.0,65.0,190.0,0.0,205.0,0.0
zho_hans_300mb,524.0,367.0,524.0,367.0,161.0,0.0,671.0,0.0
hin_deva_300mb,310.0,119.0,162.0,64.0,255.0,0.0,215.0,0.0
eng_latn_300mb,186.0,121.0,177.0,120.0,6713.0,0.0,6510.0,0.0
tha_thai_300mb,180.0,113.0,42831.0,104.0,22499.0,0.0,45395.0,0.0
arb_arab_300mb,103.0,54.0,101.0,54.0,17991.0,0.0,18109.0,0.0
