In [1]:
from tokenizers import ByteLevelBPETokenizer
import os
import time

# === Setup paths ===
data_dir = "split_files"
all_files = sorted([os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith(".csv")])
files = all_files[:4]  # First 4 files only

print(f"âœ… Found {len(files)} files. Training tokenizer one file at a time...\n")

# === Initialize tokenizer once ===
tokenizer = ByteLevelBPETokenizer()

# === Train on each file one-by-one ===
start = time.time()
for i, file in enumerate(files):
    print(f"\nðŸ”¹ Training on file {i+1}/{len(files)}: {os.path.basename(file)}")
    tokenizer.train(
        files=[file],
        vocab_size=30000,
        min_frequency=2,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
    )
    print(f"âœ… Done with file {i+1}/{len(files)}")

# === Save tokenizer ===
output_dir = "tokenizer_gpt2_custom"
os.makedirs(output_dir, exist_ok=True)
tokenizer.save_model(output_dir)

end = time.time()
print(f"\nâœ… Tokenizer training complete in {round(end - start, 2)} seconds.")
print(f"ðŸ“‚ Saved to: {output_dir}")


âœ… Found 4 files. Training tokenizer one file at a time...


ðŸ”¹ Training on file 1/4: split_1.csv
âœ… Done with file 1/4

ðŸ”¹ Training on file 2/4: split_2.csv
âœ… Done with file 2/4

ðŸ”¹ Training on file 3/4: split_3.csv
âœ… Done with file 3/4

ðŸ”¹ Training on file 4/4: split_4.csv
âœ… Done with file 4/4

âœ… Tokenizer training complete in 2010.13 seconds.
ðŸ“‚ Saved to: tokenizer_gpt2_custom


In [4]:
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast(
    vocab_file="tokenizer_gpt2_custom/vocab.json",
    merges_file="tokenizer_gpt2_custom/merges.txt",
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
)

tokenizer.save_pretrained("tokenizer_gpt2_custom")
print("âœ… Hugging Face-compatible tokenizer saved!")


âœ… Hugging Face-compatible tokenizer saved!
