# Tokenize DNA using Byte-Pair Encoding

In [1]:
import sys

sys.path.append('..')

In [2]:
import sqlite3
from collections import defaultdict

import pandas as pd
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from tqdm import tqdm

from adna.pylib import consts

## What characters are used?

In [3]:
# NOTE: sqlite3's connection context manager only commits or rolls back the
# open transaction; it does not close the connection. Close it explicitly so
# the database handle is released as soon as the table has been read.
cxn = sqlite3.connect(consts.SQL)
try:
    RECS = pd.read_sql('select * from seqs', cxn)
finally:
    cxn.close()

# Values of the `seq` column as a plain Python list.
SEQS = RECS.seq.tolist()

In [4]:
# Gather every distinct character that appears in any sequence.
CHARS = set()

for sequence in tqdm(SEQS):
    CHARS.update(sequence)

CHARS

100%|█████████████████████████████████████████████████████████████████████████████| 10996536/10996536 [00:21<00:00, 514721.89it/s]


{'A', 'C', 'G', 'N', 'T'}

## Train the tokenizer

In [5]:
# Start from an untrained byte-level BPE tokenizer built with the library's
# default settings; its vocabulary is learned from SEQS in the training step.
tokenizer = ByteLevelBPETokenizer()

In [6]:
# Learn the BPE merges directly from the in-memory sequences; vocabulary
# size, minimum pair frequency and special tokens all come from consts.
tokenizer.train_from_iterator(
    SEQS,
    special_tokens=consts.SPECIAL_TOKENS,
    min_frequency=consts.MIN_FREQ,
    vocab_size=consts.VOCAB_SIZE,
    show_progress=True,
)






In [7]:
# BertProcessing takes the separator (closing) token first and the classifier
# (opening) token second, each as a (token, id) pair -- hence EOS before BOS.
sep_pair = (consts.EOS, tokenizer.token_to_id(consts.EOS))
cls_pair = (consts.BOS, tokenizer.token_to_id(consts.BOS))
tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

## Get tokenized lengths

In [8]:
# Number of sequences handed to the tokenizer per batch.
STEP = 1024

# Histogram: tokenized length -> how many sequences have that length.
lengths = defaultdict(int)

In [9]:
# Encode the sequences STEP at a time and tally how many tokens each one
# produces.
for start in tqdm(range(0, len(SEQS), STEP)):
    for encoding in tokenizer.encode_batch(SEQS[start:start + STEP]):
        lengths[len(encoding)] += 1

100%|██████████████████████████████████████████████████████████████████████████████████████| 10739/10739 [01:34<00:00, 113.78it/s]


In [10]:
# Show (length, count) pairs in ascending order of tokenized length.
[(n_tokens, lengths[n_tokens]) for n_tokens in sorted(lengths)]

[(3, 456132),
 (4, 50481),
 (5, 160323),
 (6, 496402),
 (7, 728419),
 (8, 974892),
 (9, 1006936),
 (10, 885681),
 (11, 735098),
 (12, 937537),
 (13, 658147),
 (14, 566957),
 (15, 478597),
 (16, 408926),
 (17, 342617),
 (18, 307897),
 (19, 243092),
 (20, 224337),
 (21, 188488),
 (22, 165525),
 (23, 119108),
 (24, 109520),
 (25, 92547),
 (26, 84722),
 (27, 87372),
 (28, 59555),
 (29, 61469),
 (30, 43390),
 (31, 46996),
 (32, 32887),
 (33, 35773),
 (34, 18087),
 (35, 14451),
 (36, 14450),
 (37, 4245),
 (38, 12010),
 (39, 4725),
 (40, 10594),
 (41, 3415),
 (42, 6593),
 (43, 6486),
 (44, 5224),
 (45, 5528),
 (46, 7570),
 (47, 6714),
 (48, 4085),
 (49, 3871),
 (50, 4719),
 (51, 9124),
 (52, 4623),
 (53, 4656),
 (54, 3473),
 (55, 4700),
 (56, 2148),
 (57, 6445),
 (58, 3351),
 (59, 4302),
 (60, 5204),
 (61, 2133),
 (62, 7285),
 (63, 1215),
 (64, 6416),
 (65, 1280),
 (66, 3008),
 (67, 806),
 (68, 990),
 (69, 1321),
 (70, 669),
 (71, 669),
 (72, 37),
 (73, 19),
 (74, 64),
 (75, 4),
 (76, 3),
 (7

Given the length distribution above, I'm going to use a maximum sequence length of 80 tokens (`consts.MAX_LENGTH`) below.

In [11]:
# Display the configured maximum sequence length; it is the padding length
# applied to the tokenizer below.
consts.MAX_LENGTH

80

## Finalize the tokenizer

In [12]:
# Pad every encoding out to consts.MAX_LENGTH tokens.
tokenizer.enable_padding(
    pad_token=consts.PAD,
    pad_id=tokenizer.token_to_id(consts.PAD),
    length=consts.MAX_LENGTH,
)

# Padding never shortens an encoding, so without truncation any sequence that
# tokenizes to more than consts.MAX_LENGTH tokens would come out longer than
# the padded ones. Cap encodings at the same length so every output has
# exactly consts.MAX_LENGTH tokens.
tokenizer.enable_truncation(max_length=consts.MAX_LENGTH)

In [13]:
# Sanity check: encode the first sequence and show its tokens, including the
# special and padding tokens the tokenizer adds.
first_seq = SEQS[0]
encoded = tokenizer.encode(first_seq)
encoded.tokens

['<s>',
 'GCCTT',
 'CAAGGATGAATTAATGATACGGTTTCGGGTGTAAGCCGGGCGTTCATTTTAACACTGATGCACTTGTAATTACATTTGGTT',
 'ATGGTATATCC',
 'ACCC',
 '</s>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>']

In [14]:
# Write the configured tokenizer to a single JSON file under consts.SUB_DIR.
tokenizer_path = consts.SUB_DIR / 'tokenizer.json'
tokenizer.save(str(tokenizer_path))