# HuggingFace Codon-Pair-Encoding Tokenizer

In [2]:
from pathlib import Path

from transformers import PreTrainedTokenizerFast
from tokenizers.processors import TemplateProcessing
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from genslm.utils import read_fasta, Sequence

In [2]:
# Define a BPE training iterator over the raw nucleotide sequences

sequence_file = Path("/lambda_stor/homes/khippe/genslm_foundation/genome_data/mdh_sc23/fasta/mdh_natural_sequences.ffn")
sequences = read_fasta(sequence_file)


def get_mdh_raw_sequences():
    """Yield the upper-cased sequence string of every record in ``sequences``."""
    yield from (record.sequence.upper() for record in sequences)

In [3]:
# Define cBPE training iterator

# Sequence pre-processing helpers
#
# CODON_CHAR maps each 3-letter codon string to one unique single character
# (A-Z, a-z, 0-9, "!" and "@" cover the 64 codons) so that a codon sequence
# can be used as character-level input to a BPE tokenizer. This implements a
# codon-pair encoding.
# NOTE(review): "XXX" -> "*" looks like a placeholder for an unknown codon,
# but no visible cell produces "XXX" -- confirm whether it is still needed.
CODON_CHAR = {
    "TCG": "A",
    "GCA": "B",
    "CTT": "C",
    "ATT": "D",
    "TTA": "E",
    "GGG": "F",
    "CGT": "G",
    "TAA": "H",
    "AAA": "I",
    "CTC": "J",
    "AGT": "K",
    "CCA": "L",
    "TGT": "M",
    "GCC": "N",
    "GTT": "O",
    "ATA": "P",
    "TAC": "Q",
    "TTT": "R",
    "TGC": "S",
    "CAC": "T",
    "ACG": "U",
    "CCC": "V",
    "ATC": "W",
    "CAT": "X",
    "AGA": "Y",
    "GAG": "Z",
    "GTG": "a",
    "GGT": "b",
    "GCT": "c",
    "TTC": "d",
    "AAC": "e",
    "TAT": "f",
    "GTA": "g",
    "CCG": "h",
    "ACA": "i",
    "CGA": "j",
    "TAG": "k",
    "CTG": "l",
    "GGA": "m",
    "ATG": "n",
    "TCT": "o",
    "CGG": "p",
    "GAT": "q",
    "ACC": "r",
    "GAC": "s",
    "GTC": "t",
    "TGG": "u",
    "CCT": "v",
    "GAA": "w",
    "TCA": "x",
    "CAA": "y",
    "AAT": "z",
    "ACT": "0",
    "GCG": "1",
    "GGC": "2",
    "CTA": "3",
    "AAG": "4",
    "AGG": "5",
    "CAG": "6",
    "AGC": "7",
    "CGC": "8",
    "TTG": "9",
    "TCC": "!",
    "TGA": "@",
    "XXX": "*"
}


def group_and_contextualize(seq: str, k: int = 3):
    """Encode a nucleotide string as one character per codon.

    The sequence is cut into consecutive chunks of ``k`` characters,
    upper-cased, and each chunk is looked up in ``CODON_CHAR``.

    Args:
        seq: Nucleotide sequence.
        k: Chunk length; defaults to 3 (one codon).

    Returns:
        The concatenated codon characters. Chunks that are not keys of
        ``CODON_CHAR`` (e.g. a trailing partial codon) contribute nothing.
    """
    chunks = [seq[start : start + k] for start in range(0, len(seq), k)]
    # Round-trip through a space-joined string, then split on whitespace
    codons = " ".join(chunks).upper().split()
    encoded = []
    for codon in codons:
        # Unknown chunks map to the empty string and are silently dropped
        encoded.append(CODON_CHAR.get(codon, ""))
    return "".join(encoded)


def decode_grouped_context(seq: str, sep: str = " "):
    """Decode a codon-character string back into codons.

    Args:
        seq: String of characters produced from the ``CODON_CHAR`` mapping.
        sep: Separator placed between the decoded codons.

    Returns:
        The codons corresponding to each character of ``seq``, joined by
        ``sep``.

    Raises:
        KeyError: If ``seq`` contains a character that is not a value of
            ``CODON_CHAR``.
    """
    # The inverse table was previously read from an undefined global
    # (CHAR_CODON), which raised NameError; derive it from CODON_CHAR instead.
    char_codon = {char: codon for codon, char in CODON_CHAR.items()}
    return sep.join(char_codon[elem] for elem in seq)


def group_by_kmer(seq: str, kmer: int) -> str:
    """Split ``seq`` into space-separated, upper-cased chunks of ``kmer`` chars.

    Args:
        seq: Input sequence string.
        kmer: Chunk length.

    Returns:
        The chunks joined by single spaces and upper-cased; the last chunk
        may be shorter than ``kmer``.
    """
    chunks = []
    for start in range(0, len(seq), kmer):
        chunks.append(seq[start : start + kmer])
    return " ".join(chunks).upper()


# Build the codon-level training corpus
grouped_sequences = []  # one space-separated string of 3-mers (codons) per kept sequence
skipped = 0
for seq in sequences:
    # Keep only sequences whose length is a whole number of codons
    if len(seq.sequence) % 3 == 0:
        grouped_sequences.append(group_by_kmer(seq.sequence, 3))
    else:
        skipped += 1

# Replace every codon by its unique character (KeyError on an unmapped codon)
codon_pairs = [
    "".join(map(CODON_CHAR.__getitem__, grouped.split()))
    for grouped in grouped_sequences
]
print(f"Skipped {skipped} sequences for non-translatable length")


def get_mdh_codon_tokens():
    """Yield each codon-character string in ``codon_pairs``, in order."""
    yield from codon_pairs

Skipped 7 sequences for non-translatable length


In [4]:
def build_tokenizer(corpus_function, vocab_size=50_257, add_bos_eos: bool = True):
    """Train a BPE tokenizer on a corpus and wrap it for HuggingFace use.

    Args:
        corpus_function: Zero-argument callable returning an iterator of
            training strings (e.g. ``get_mdh_codon_tokens``).
        vocab_size: Target vocabulary size passed to the BPE trainer.
        add_bos_eos: If True, every encoded sequence is wrapped as
            ``[BOS] ... [EOS]`` by a template post-processor.

    Returns:
        A ``PreTrainedTokenizerFast`` wrapping the trained tokenizer, with
        the special tokens registered on the wrapper.
    """
    special_tokens = {
        "unk_token": "[UNK]",
        "cls_token": "[CLS]",
        "sep_token": "[SEP]",
        "pad_token": "[PAD]",
        "mask_token": "[MASK]",
        "bos_token": "[BOS]",
        "eos_token": "[EOS]",
    }

    # Define tokenizer
    tokenizer = Tokenizer(models.BPE())

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size, special_tokens=list(special_tokens.values())
    )

    print("Training tokenizer")
    tokenizer.train_from_iterator(corpus_function(), trainer=trainer)

    # Add post-processor
    # trim_offsets=True will ignore spaces, false will leave them in
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
    if add_bos_eos:
        # Look the ids up in the trained vocabulary instead of hard-coding
        # their positions in the special-token list.
        bos_index = tokenizer.token_to_id(special_tokens["bos_token"])
        eos_index = tokenizer.token_to_id(special_tokens["eos_token"])
        # Replaces the ByteLevel post-processor set above
        tokenizer.post_processor = TemplateProcessing(
            single="[BOS] $A [EOS]",
            special_tokens=[("[BOS]", bos_index), ("[EOS]", eos_index)],
        )

    # Add a decoder
    tokenizer.decoder = decoders.ByteLevel()

    # Wrap for use with transformers. The special tokens must be passed as
    # keyword arguments (**); unpacking with a single * passed only the dict
    # keys positionally, so no special token was registered on the wrapper.
    wrapped_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer, **special_tokens
    )

    return wrapped_tokenizer

In [None]:
# Raw-nucleotide BPE tokenizer: reuse the saved copy if present, else train it
bpe_output_path = Path("mdh-bpe")
if bpe_output_path.exists():
    print(f"Loading from {bpe_output_path}")
    mdh_bpe = PreTrainedTokenizerFast.from_pretrained(bpe_output_path)
else:
    mdh_bpe = build_tokenizer(get_mdh_raw_sequences, vocab_size=50_257, add_bos_eos=True)
    mdh_bpe.save_pretrained(bpe_output_path)

In [None]:
# Codon-pair BPE tokenizer: reuse the saved copy if present, else train it
codon_bpe_output_path = Path("mdh-codon-bpe")

if codon_bpe_output_path.exists():
    print(f"Loading from {codon_bpe_output_path}")
    mdh_codon_bpe = PreTrainedTokenizerFast.from_pretrained(codon_bpe_output_path)
else:
    mdh_codon_bpe = build_tokenizer(get_mdh_codon_tokens, vocab_size=50_257, add_bos_eos=True)
    mdh_codon_bpe.save_pretrained(codon_bpe_output_path)

# Vocab Size Experiments


In [6]:
# 30k-entry vocabulary: train the codon-pair tokenizer and save it
vocab_size = 30_000
cBPE_30k_output_path = Path("mdh-codon-bpe-vs30000")
cBPE_30k = build_tokenizer(get_mdh_codon_tokens, vocab_size=vocab_size, add_bos_eos=True)
cBPE_30k.save_pretrained(cBPE_30k_output_path)

Training tokenizer





('mdh-codon-bpe-vs30000/tokenizer_config.json',
 'mdh-codon-bpe-vs30000/special_tokens_map.json',
 'mdh-codon-bpe-vs30000/tokenizer.json')

In [7]:
# 20k-entry vocabulary: train the codon-pair tokenizer and save it
cBPE_20k_output_path = Path("mdh-codon-bpe-vs20000")
vocab_size = 20_000
# Bound to its own name; this cell previously reassigned `cBPE_30k`, which
# silently replaced the 30k tokenizer with the 20k one.
cBPE_20k = build_tokenizer(
    get_mdh_codon_tokens, vocab_size=vocab_size, add_bos_eos=True
)
cBPE_20k.save_pretrained(cBPE_20k_output_path)

Training tokenizer





('mdh-codon-bpe-vs20000/tokenizer_config.json',
 'mdh-codon-bpe-vs20000/special_tokens_map.json',
 'mdh-codon-bpe-vs20000/tokenizer.json')

In [8]:
# 10k-entry vocabulary: train the codon-pair tokenizer and save it
cBPE_10k_output_path = Path("mdh-codon-bpe-vs10000")
vocab_size = 10_000
# Bound to its own name; this cell previously reassigned `cBPE_30k`, which
# silently replaced the 30k tokenizer with the 10k one.
cBPE_10k = build_tokenizer(
    get_mdh_codon_tokens, vocab_size=vocab_size, add_bos_eos=True
)
cBPE_10k.save_pretrained(cBPE_10k_output_path)

Training tokenizer





('mdh-codon-bpe-vs10000/tokenizer_config.json',
 'mdh-codon-bpe-vs10000/special_tokens_map.json',
 'mdh-codon-bpe-vs10000/tokenizer.json')

# Full Patric BPE tokenizer

In [None]:
from tqdm import tqdm

# Load the PATRIC sequences used for the full-corpus tokenizer
patric_sequence_file = Path("/lambda_stor/homes/khippe/genslm_foundation/genome_data/curriculum_datasets/homology-90-50/codon_homology/pgfam_30k_all.ffn")
patric_sequences = read_fasta(patric_sequence_file)
print(f"Read {len(patric_sequences)} sequences")

# Encode every record as a codon-character string (the BPE training corpus)
patric_codon_pairs = [
    group_and_contextualize(record.sequence) for record in patric_sequences
]
# The raw records are no longer needed once encoded; drop the reference
del patric_sequences


def get_patric_codon_tokens():
    """Yield each codon-character string in ``patric_codon_pairs``, in order."""
    yield from patric_codon_pairs

In [None]:
# Train the codon-pair tokenizer on the PATRIC corpus and save it
patric_cBPE_50k_output_path = Path("patric-codon-bpe-vs50257")
vocab_size = 50257
patric_cBPE_50k = build_tokenizer(get_patric_codon_tokens, vocab_size=vocab_size, add_bos_eos=True)
patric_cBPE_50k.save_pretrained(patric_cBPE_50k_output_path)

# Modelling

In [None]:
import os

# Expose only GPU 0 to this process and turn off Weights & Biases logging
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "WANDB_DISABLED": "true",
    }
)

In [None]:
from pathlib import Path

import accelerate
import datasets
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import PreTrainedTokenizerFast
from transformers import BertConfig, BertForMaskedLM


from genslm.utils import read_fasta, Sequence

In [None]:
# Setup Tokenizer
# Special tokens to register on the loaded tokenizer
special_tokens = dict(
    unk_token="[UNK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    mask_token="[MASK]",
    bos_token="[BOS]",
    eos_token="[EOS]",
)

# Load the codon-pair BPE tokenizer from the "mdh-codon-bpe" directory
tokenizer = PreTrainedTokenizerFast.from_pretrained("mdh-codon-bpe")
tokenizer.add_special_tokens(special_tokens)
# Trailing None keeps the cell from displaying the previous call's return value
None

In [None]:
# Sequence pre-processing helpers

# CODON_MAP assigns each of the 64 codons an integer index from 0 to 63.
# The codons appear in the same order as in CODON_CHAR.
# NOTE(review): CODON_MAP is not referenced by any visible cell -- confirm
# whether it is still needed.
CODON_MAP = {
    "TCG": 0,
    "GCA": 1,
    "CTT": 2,
    "ATT": 3,
    "TTA": 4,
    "GGG": 5,
    "CGT": 6,
    "TAA": 7,
    "AAA": 8,
    "CTC": 9,
    "AGT": 10,
    "CCA": 11,
    "TGT": 12,
    "GCC": 13,
    "GTT": 14,
    "ATA": 15,
    "TAC": 16,
    "TTT": 17,
    "TGC": 18,
    "CAC": 19,
    "ACG": 20,
    "CCC": 21,
    "ATC": 22,
    "CAT": 23,
    "AGA": 24,
    "GAG": 25,
    "GTG": 26,
    "GGT": 27,
    "GCT": 28,
    "TTC": 29,
    "AAC": 30,
    "TAT": 31,
    "GTA": 32,
    "CCG": 33,
    "ACA": 34,
    "CGA": 35,
    "TAG": 36,
    "CTG": 37,
    "GGA": 38,
    "ATG": 39,
    "TCT": 40,
    "CGG": 41,
    "GAT": 42,
    "ACC": 43,
    "GAC": 44,
    "GTC": 45,
    "TGG": 46,
    "CCT": 47,
    "GAA": 48,
    "TCA": 49,
    "CAA": 50,
    "AAT": 51,
    "ACT": 52,
    "GCG": 53,
    "GGC": 54,
    "CTA": 55,
    "AAG": 56,
    "AGG": 57,
    "CAG": 58,
    "AGC": 59,
    "CGC": 60,
    "TTG": 61,
    "TCC": 62,
    "TGA": 63,
}

# Assign a unique character to each codon so that we can use it as an
# input token to a BPE tokenizer. This implements a codon-pair encoding.
# The 64 codons map to A-Z, a-z, 0-9, "!" and "@".
# This table repeats the CODON_CHAR defined in the tokenizer-training section
# of this notebook; the two copies must stay identical so that sequences are
# encoded here the same way they were when the tokenizer was trained.
# NOTE(review): "XXX" -> "*" looks like a placeholder for an unknown codon,
# but no visible cell produces "XXX" -- confirm whether it is still needed.
CODON_CHAR = {
    "TCG": "A",
    "GCA": "B",
    "CTT": "C",
    "ATT": "D",
    "TTA": "E",
    "GGG": "F",
    "CGT": "G",
    "TAA": "H",
    "AAA": "I",
    "CTC": "J",
    "AGT": "K",
    "CCA": "L",
    "TGT": "M",
    "GCC": "N",
    "GTT": "O",
    "ATA": "P",
    "TAC": "Q",
    "TTT": "R",
    "TGC": "S",
    "CAC": "T",
    "ACG": "U",
    "CCC": "V",
    "ATC": "W",
    "CAT": "X",
    "AGA": "Y",
    "GAG": "Z",
    "GTG": "a",
    "GGT": "b",
    "GCT": "c",
    "TTC": "d",
    "AAC": "e",
    "TAT": "f",
    "GTA": "g",
    "CCG": "h",
    "ACA": "i",
    "CGA": "j",
    "TAG": "k",
    "CTG": "l",
    "GGA": "m",
    "ATG": "n",
    "TCT": "o",
    "CGG": "p",
    "GAT": "q",
    "ACC": "r",
    "GAC": "s",
    "GTC": "t",
    "TGG": "u",
    "CCT": "v",
    "GAA": "w",
    "TCA": "x",
    "CAA": "y",
    "AAT": "z",
    "ACT": "0",
    "GCG": "1",
    "GGC": "2",
    "CTA": "3",
    "AAG": "4",
    "AGG": "5",
    "CAG": "6",
    "AGC": "7",
    "CGC": "8",
    "TTG": "9",
    "TCC": "!",
    "TGA": "@",
    "XXX": "*"
}


def group_and_contextualize(seq: str, k: int = 3):
    """Encode a nucleotide string as one character per codon.

    The sequence is cut into consecutive chunks of ``k`` characters,
    upper-cased, and each chunk is looked up in ``CODON_CHAR``.

    Args:
        seq: Nucleotide sequence.
        k: Chunk length; defaults to 3 (one codon).

    Returns:
        The concatenated codon characters. Chunks that are not keys of
        ``CODON_CHAR`` (e.g. a trailing partial codon) contribute nothing.
    """
    chunks = [seq[start : start + k] for start in range(0, len(seq), k)]
    # Round-trip through a space-joined string, then split on whitespace
    codons = " ".join(chunks).upper().split()
    encoded = []
    for codon in codons:
        # Unknown chunks map to the empty string and are silently dropped
        encoded.append(CODON_CHAR.get(codon, ""))
    return "".join(encoded)

In [None]:
# Dataset
sequence_file = Path(
    "/lambda_stor/homes/khippe/genslm_foundation/genome_data/mdh_sc23/fasta/mdh_natural_sequences.ffn"
)
sequences = read_fasta(sequence_file)


# Encode each record as a codon-character string, then tokenize. Every
# sequence is padded or truncated to exactly 1024 tokens.
dataset_seqs = [group_and_contextualize(seq.sequence) for seq in sequences]
tokenized_seqs = tokenizer(
    dataset_seqs,
    max_length=1024,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)


# Dataset.from_dict is given plain Python lists rather than tensors
data = {
    "input_ids": tokenized_seqs.input_ids.tolist(),
    "attention_mask": tokenized_seqs.attention_mask.tolist(),
}

dataset = Dataset.from_dict(data)
# Hold out 5% for evaluation. The fixed seed makes the split identical on
# every run; without it the train/test membership changed on each execution.
dataset = dataset.train_test_split(test_size=0.05, seed=42)
print(dataset)

In [None]:
# Modelling

# Small BERT: 8 layers, hidden size 512, 8 attention heads.
# NOTE(review): the output directory used for this run is named "...-50m";
# the exact parameter count is printed below.
config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=1024,
    hidden_size=512,
    intermediate_size=2048,
    num_hidden_layers=8,
    num_attention_heads=8,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
model = BertForMaskedLM(config)
# Total number of elements across all parameter tensors
model_size = sum(param.numel() for param in model.parameters())
print(f"BERT size: {model_size/1000**2:.1f}M parameters")

# Collator configured for masked language modelling
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=True)

In [None]:
# Trainer

# Training settings for the small BERT run
args = TrainingArguments(
    output_dir="mdh-cBPE-BERT-50m",
    num_train_epochs=1,
    # 28 sequences per device, accumulated over 2 steps per optimizer update
    per_device_train_batch_size=28,
    per_device_eval_batch_size=28,
    gradient_accumulation_steps=2,
    # Optimizer and schedule
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=1_000,
    weight_decay=0.1,
    fp16=True,
    # Evaluation, logging and checkpoint cadence (in steps)
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=25,
    save_steps=500,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
)

In [None]:
# Run training with the Trainer configured in the preceding cell.
trainer.train()

In [None]:
# Modelling

# Larger BERT: every architecture setting not listed here is left at the
# BertConfig default.
# Original author's note: 124,690,513 params (exact count is printed below).
config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=1024,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
model = BertForMaskedLM(config)
# Total number of elements across all parameter tensors
model_size = sum(param.numel() for param in model.parameters())
print(f"BERT size: {model_size/1000**2:.1f}M parameters")

# Collator configured for masked language modelling
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=True)

# Trainer

# Training settings for the larger BERT run
args = TrainingArguments(
    output_dir="mdh-cBPE-BERT-125m",
    num_train_epochs=1,
    # 8 sequences per device, accumulated over 2 steps per optimizer update
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    # Optimizer and schedule
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=1_000,
    weight_decay=0.1,
    fp16=True,
    # Evaluation, logging and checkpoint cadence (in steps)
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=25,
    save_steps=500,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
)
# Start training immediately in this cell
trainer.train()