In [1]:
import json
import os
from pathlib import Path
from tqdm import tqdm

In [2]:
# Collect the vocabulary of every tokenizer found under ./tokenizers/size_variation_*.
tokenizer_paths_folder = Path('./tokenizers')
# Sort the run folders so the load order does not depend on the order in which
# the filesystem happens to list them.
subfolders = sorted(
    f for f in tokenizer_paths_folder.iterdir()
    if f.is_dir() and f.name.startswith('size_variation_')
)

vocabularies = []
for subfolder in tqdm(subfolders):
    tokenizer_path = subfolder / 'tokenizer.json'
    # Folders without a tokenizer.json are skipped rather than treated as errors.
    if tokenizer_path.exists():
        # Decode as UTF-8 explicitly; without `encoding`, open() uses the
        # platform's locale encoding, which may not be UTF-8.
        with open(tokenizer_path, 'r', encoding='utf-8') as f:
            tokenizer_data = json.load(f)
        # NOTE(review): later cells call .keys() on each entry, so 'vocab' is
        # expected to be a mapping keyed by token string.
        tokens = tokenizer_data['model']['vocab']
        vocabularies.append(tokens)

  0%|          | 0/99 [00:00<?, ?it/s]

100%|██████████| 99/99 [00:05<00:00, 19.65it/s]


In [3]:
# Token-length samples for every loaded vocabulary, keyed by vocabulary size:
#   vocab_size_dict     -> lengths of the tokens that are at most 20 characters long
#   raw_vocab_size_dict -> lengths of all tokens
vocab_size_dict = {}
raw_vocab_size_dict = {}
vocab_sizes = []
duplicate_sizes = set()
for vocab in tqdm(vocabularies):
    vocab_size = len(vocab)
    vocab_sizes.append(vocab_size)
    # Both dicts are keyed by size, so a vocabulary whose size was already seen
    # replaces the earlier entry. Remember those sizes so the loss is reported
    # instead of passing silently.
    if vocab_size in raw_vocab_size_dict:
        duplicate_sizes.add(vocab_size)
    # Measure every token once and derive the filtered list from that result.
    raw_token_lengths = [len(token) for token in vocab.keys()]
    token_lengths = [length for length in raw_token_lengths if length <= 20]
    vocab_size_dict[vocab_size] = token_lengths
    raw_vocab_size_dict[vocab_size] = raw_token_lengths
# sort both dicts by key (vocabulary size)
vocab_size_dict = dict(sorted(vocab_size_dict.items()))
raw_vocab_size_dict = dict(sorted(raw_vocab_size_dict.items()))

if duplicate_sizes:
    print(
        f"Note: {len(vocabularies) - len(raw_vocab_size_dict)} vocabularies share a size "
        f"with another one and were overwritten (sizes: {sorted(duplicate_sizes)})"
    )


100%|██████████| 99/99 [00:00<00:00, 172.94it/s]


In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

def plot_token_length_kde(lengths_by_vocab_size, filename, vocab_sizes_to_plot=None, highlight_size=None):
    """Save a KDE plot of token lengths with one curve per vocabulary size.

    Args:
        lengths_by_vocab_size: mapping of vocabulary size -> list of token lengths.
        filename: name of the PNG file written inside the "images" directory.
        vocab_sizes_to_plot: container of vocabulary sizes to draw; ``None``
            draws every entry of ``lengths_by_vocab_size``.
        highlight_size: vocabulary size whose curve is drawn with ``linewidth=2``;
            ``None`` highlights nothing.
    """
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs("images", exist_ok=True)

    fig, ax = plt.subplots(figsize=(12, 8))
    for vocab_len, token_lengths in tqdm(lengths_by_vocab_size.items()):
        if vocab_sizes_to_plot is not None and vocab_len not in vocab_sizes_to_plot:
            continue
        line_kwargs = {'linewidth': 2} if vocab_len == highlight_size else {}
        sns.kdeplot(token_lengths, label=vocab_len, bw_method='silverman', ax=ax, **line_kwargs)

    ax.set_title("Distribution of token lengths across vocab sizes")
    ax.set_xlabel("Token length")
    ax.set_ylabel("Density")
    ax.legend(ncol=3)
    ax.set_xticks(range(0, 21, 1))
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(os.path.join("images", filename), dpi=300, bbox_inches='tight')
    # Close the figure so it is neither rendered inline nor kept in memory.
    plt.close(fig)


# Every vocabulary size, with the 1000-token vocabulary drawn thicker.
plot_token_length_kde(vocab_size_dict, "length_distribution_full.png", highlight_size=1000)

# Sizes 1000, 11000, 21000, ... below 100000.
plot_token_length_kde(
    vocab_size_dict,
    "length_distribution_every_tenth.png",
    vocab_sizes_to_plot=range(1000, 100000, 10000),
)

# Sizes 1000, 3000, 5000, ... below 50000.
plot_token_length_kde(
    vocab_size_dict,
    "length_distribution_start.png",
    vocab_sizes_to_plot=range(1000, 50000, 2000),
)

100%|██████████| 83/83 [00:14<00:00,  5.58it/s]
100%|██████████| 83/83 [00:01<00:00, 50.60it/s] 
100%|██████████| 83/83 [00:03<00:00, 26.64it/s]


In [58]:
# Calculate statistics for each vocabulary size (over the unfiltered token lengths).
# NOTE(review): the name `stats` rebinds the `scipy.stats` module imported in an
# earlier cell; the name is kept here so that anything reading `stats` still works.
stats = {
    'max': [],
    'mean': [],
    'median': [],
    '90th_percentile': []
}
vocab_sizes = []

for vocab_len, token_lengths in raw_vocab_size_dict.items():
    vocab_sizes.append(vocab_len)
    stats['max'].append(max(token_lengths))
    stats['mean'].append(np.mean(token_lengths))
    stats['median'].append(np.median(token_lengths))
    stats['90th_percentile'].append(np.percentile(token_lengths, 90))

# Create the plot: one line per statistic, vocabulary size on the x-axis
plt.figure(figsize=(12, 8))

for stat_name, values in stats.items():
    plt.plot(vocab_sizes, values, label=stat_name, marker='o', markersize=4)

plt.title("Token Length Statistics Across Vocabulary Sizes")
plt.xlabel("Vocabulary Size")
plt.ylabel("Token Length")
plt.legend()
plt.grid(True)
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("images", exist_ok=True)
plt.savefig(os.path.join("images", "token_length_statistics.png"), dpi=300, bbox_inches='tight')
plt.close()


In [53]:
import numpy as np
from collections import Counter

import matplotlib.pyplot as plt

# Plot the empirical distribution of token lengths for every vocabulary size as
# a piecewise-linear line. Only the smallest and the largest vocabulary are
# labelled and drawn thicker.
smallest_vocab_len = min(vocab_size_dict.keys())
biggest_vocab_len = max(vocab_size_dict.keys())

plt.figure(figsize=(16, 8))
for vocab_len, token_lengths in vocab_size_dict.items():
    # build (x, y): x = distinct token lengths, y = share of tokens with that length
    counts = Counter(token_lengths)
    xs = np.array(sorted(counts))
    ys = np.array([counts[x] for x in xs]) / len(token_lengths)

    # piecewise linear line
    if vocab_len in [smallest_vocab_len, biggest_vocab_len]:
        plt.plot(xs, ys, label=f"Vocab {vocab_len}", linewidth=2)
    else:
        plt.plot(xs, ys, linewidth=1)

plt.title("Distribution of token lengths across vocab sizes")
plt.xlabel("Token length")
plt.ylabel("Density")
plt.legend()
plt.xticks(range(0, 21, 1))
plt.grid(True)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("images", exist_ok=True)
plt.savefig(os.path.join("images", "length_distribution_jagged.png"), dpi=300, bbox_inches='tight')
plt.close()

In [49]:
import numpy as np
from collections import Counter

import matplotlib.pyplot as plt

# Plot the empirical distribution of token lengths as piecewise-linear lines,
# restricted to the vocabulary sizes 1000, 2000, ..., 9000.
plt.figure(figsize=(16, 8))
for vocab_len, token_lengths in vocab_size_dict.items():

    if vocab_len in range(1000, 10000, 1000):
        # build (x, y): x = distinct token lengths, y = share of tokens with that length
        counts = Counter(token_lengths)
        xs = np.array(sorted(counts))
        ys = np.array([counts[x] for x in xs]) / len(token_lengths)

        # piecewise linear line
        plt.plot(xs, ys, label=f"Vocab {vocab_len}", linewidth=1)

plt.title("Distribution of token lengths across vocab sizes")
plt.xlabel("Token length")
plt.ylabel("Density")
plt.legend(ncol=3)
plt.xticks(range(0, 21, 1))
plt.grid(True)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("images", exist_ok=True)
plt.savefig(os.path.join("images", "length_distribution_jagged_start.png"), dpi=300, bbox_inches='tight')
plt.close()

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import os
import json
from pathlib import Path
from tqdm import tqdm


# Summary statistics of the unfiltered token lengths, one value per vocabulary size.
vocab_sizes = []
stats = {name: [] for name in ('max', 'mean', 'median', '90th_percentile')}

for vocab_len, token_lengths in raw_vocab_size_dict.items():
    vocab_sizes.append(vocab_len)
    summary = {
        'max': max(token_lengths),
        'mean': np.mean(token_lengths),
        'median': np.median(token_lengths),
        '90th_percentile': np.percentile(token_lengths, 90),
    }
    for stat_name, values in stats.items():
        values.append(summary[stat_name])

# Two panels side by side, each with its own y-axis.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 8), sharey=False)

# Left panel gets every statistic; right panel gets everything except 'max'.
for stat_name, values in stats.items():
    ax1.plot(vocab_sizes, values, label=stat_name, marker='o', markersize=4)
    if stat_name == 'max':
        continue
    ax2.plot(vocab_sizes, values, label=stat_name, marker='o', markersize=4)

ax1.set(
    title="All Token Length Statistics",
    xlabel="Vocabulary Size",
    ylabel="Token Length",
)
ax1.legend()
ax1.grid(True)

ax2.set(
    title="Token Length Statistics (without max)",
    xlabel="Vocabulary Size",
)
ax2.legend()
ax2.grid(True)

# The output directory must exist before the figure is written.
os.makedirs("images", exist_ok=True)

# Write the figure to disk and release it.
fig.tight_layout()
fig.savefig(os.path.join("images", "token_length_statistics_comparison.png"), dpi=300, bbox_inches='tight')
plt.close(fig)

print("Side-by-side plots created and saved to 'images/token_length_statistics_comparison.png'")

Side-by-side plots created and saved to 'images/token_length_statistics_comparison.png'


In [7]:
# Calculate the mean token length for each vocabulary size (unfiltered lengths)
mean_values = []
vocab_sizes = []

for vocab_len, token_lengths in raw_vocab_size_dict.items():
    vocab_sizes.append(vocab_len)
    mean_values.append(np.mean(token_lengths))

# Create a single plot with a log-scaled x-axis (vocabulary size)
plt.figure(figsize=(12, 8))

plt.plot(vocab_sizes, mean_values, label='Mean Token Length', marker='o', markersize=4, color='blue')

plt.title("Mean Token Length Across Vocabulary Sizes")
plt.xlabel("Vocabulary Size (log scale)")
plt.ylabel("Mean Token Length")
plt.xscale('log')  # Log scale on the x-axis; the y-axis stays linear
plt.legend()
plt.grid(True, which="both", ls="-")  # Grid lines for both major and minor ticks

# Ensure images directory exists
os.makedirs("images", exist_ok=True)

# Save the plot
plt.tight_layout()
plt.savefig(os.path.join("images", "mean_token_length_log_scale.png"), dpi=300, bbox_inches='tight')
plt.close()
