# Tokenize DNA using Byte-Pair Encoding

In [1]:
import sys

sys.path.append("..")

In [2]:
from collections import defaultdict

from Bio.Seq import Seq
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from tqdm import tqdm

from adna.pylib import consts
from adna.pylib import dataset_utils as du

## Tokenizer parameters

In [None]:
# Minimum number of times a pair must occur to be merged into a token; passed
# as `min_frequency` when the tokenizer is trained below.
MIN_FREQ = 2

## Build sequences

In [3]:
# Load the input sequences. The second return value is discarded
# (presumably the labels, going by the helper's name -- confirm in dataset_utils).
SEQS, _ = du.read_seqs_labels()

Data augmentation uses reverse complements, so make sure they're represented in the token set.

In [4]:
# Append the reverse complement of every sequence. The list comprehension is
# evaluated in full before `+=` runs, so only the original sequences are read.
# NOTE: re-running this cell doubles SEQS again.
SEQS += [du.rev_comp(seq) for seq in SEQS]

## What characters are used?

In [5]:
def count_bases(seqs):
    """Tally how often each character occurs across all of the sequences.

    Args:
        seqs: Iterable of sequences; each one is iterated character by character.

    Returns:
        A ``defaultdict(int)`` mapping every character seen to its total count.
    """
    tally = defaultdict(int)
    # Flatten lazily so the progress bar still advances once per sequence.
    every_base = (base for seq in tqdm(seqs) for base in seq)
    for base in every_base:
        tally[base] += 1
    return tally


# Show which characters occur in the sequences (reverse complements included).
count_bases(SEQS)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 957444/957444 [00:09<00:00, 95829.91it/s]


defaultdict(int,
            {'T': 31635990,
             'C': 25066240,
             'A': 31635990,
             'G': 25066240,
             'N': 2812})

## Train the tokenizer

In [6]:
# Start from a byte-level BPE tokenizer built with default arguments; it is
# trained in the next cell.
tokenizer = ByteLevelBPETokenizer()

In [7]:
# Train directly on the in-memory sequences (originals plus reverse
# complements). Vocabulary size and special tokens come from `consts`.
tokenizer.train_from_iterator(
    SEQS,
    vocab_size=consts.VOCAB_SIZE,
    min_frequency=MIN_FREQ,
    special_tokens=consts.SPECIAL_TOKENS,
)






In [8]:
# Wrap every encoding in the BOS/EOS special tokens. BertProcessing takes the
# separator pair first and the classifier (start) pair second, which is why
# EOS is passed before BOS. Each pair is (token string, token id).
tokenizer.post_processor = BertProcessing(
    (consts.EOS, tokenizer.token_to_id(consts.EOS)),
    (consts.BOS, tokenizer.token_to_id(consts.BOS)),
)

## Get tokenized lengths

In [10]:
# Histogram: encoded length in tokens -> number of sequences with that length.
lengths = defaultdict(int)

# Number of sequences passed to `encode_batch` per call.
STEP = 1024

In [11]:
# Encode the sequences in slices of STEP and count how many encodings have
# each length. Padding has not been enabled yet at this point, so these are
# unpadded lengths.
for i in tqdm(range(0, len(SEQS), STEP)):
    batch = tokenizer.encode_batch(SEQS[i : i + STEP])
    for tokens in batch:
        # `tokens` is one encoding; its length is the number of tokens in it.
        t_len = len(tokens)
        lengths[t_len] += 1

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 936/936 [00:11<00:00, 84.93it/s]


In [12]:
# Show the histogram as (length, count) pairs in ascending order of length.
sorted(lengths.items())

[(3, 6),
 (4, 125),
 (5, 1166),
 (6, 4887),
 (7, 11385),
 (8, 19592),
 (9, 30016),
 (10, 42025),
 (11, 54545),
 (12, 61554),
 (13, 65535),
 (14, 64280),
 (15, 61588),
 (16, 58187),
 (17, 53622),
 (18, 49117),
 (19, 45036),
 (20, 41231),
 (21, 37195),
 (22, 33417),
 (23, 29641),
 (24, 26398),
 (25, 21967),
 (26, 19725),
 (27, 16557),
 (28, 13639),
 (29, 11150),
 (30, 9131),
 (31, 7402),
 (32, 6328),
 (33, 5145),
 (34, 4538),
 (35, 4181),
 (36, 3782),
 (37, 2962),
 (38, 2490),
 (39, 2252),
 (40, 2188),
 (41, 2243),
 (42, 2002),
 (43, 1931),
 (44, 2092),
 (45, 1886),
 (46, 1612),
 (47, 1183),
 (48, 1213),
 (49, 1372),
 (50, 1598),
 (51, 1336),
 (52, 1086),
 (53, 1226),
 (54, 1406),
 (55, 1571),
 (56, 936),
 (57, 771),
 (58, 1008),
 (59, 1360),
 (60, 978),
 (61, 780),
 (62, 1009),
 (63, 897),
 (64, 609),
 (65, 531),
 (66, 411),
 (67, 215),
 (68, 117),
 (69, 66),
 (70, 8),
 (71, 5),
 (72, 1)]

Given the above (the longest encoding observed is 72 tokens), I'm going to use a maximum sequence length of 100 tokens, stored in `consts.MAX_LENGTH` and shown below.

In [14]:
# Display the configured maximum length used for padding below.
consts.MAX_LENGTH

100

## Finalize the tokenizer

In [15]:
# Pad every encoding out to MAX_LENGTH tokens using the PAD token.
# NOTE(review): only padding is configured in the cells visible here; nothing
# truncates encodings longer than MAX_LENGTH -- confirm whether truncation is
# handled elsewhere.
tokenizer.enable_padding(
    pad_token=consts.PAD,
    pad_id=tokenizer.token_to_id(consts.PAD),
    length=consts.MAX_LENGTH,
)

In [16]:
# Sanity check: encode the first sequence and inspect the resulting tokens,
# including the special tokens and padding.
encoded = tokenizer.encode(SEQS[0])
encoded.tokens

['<s>',
 'TC',
 'AACCAA',
 'TTGTG',
 'TAC',
 'TCGCC',
 'GCAC',
 'TGGAGGTGTAG',
 'AGTG',
 'ATATTG',
 'CCCAAAA',
 'ATAG',
 'AGAACC',
 'AACCGAACTACTCCATTAAAATGTCGCGATTACGAGGC',
 'AGTGAG',
 'TCC',
 'TTCCTCC',
 '</s>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad

In [17]:
# Persist the configured tokenizer as tokenizer.json under consts.MT_DIR.
tokenizer.save(str(consts.MT_DIR / "tokenizer.json"))