# Tokenize DNA using Byte-Pair Encoding

In [1]:
import sys

# Put the parent directory on the import path (presumably so the `adna`
# package imported in the next cell resolves -- confirm against repo layout).
# The membership check keeps the cell idempotent: re-running it no longer
# piles up duplicate '..' entries in sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
import sqlite3
from collections import defaultdict

import pandas as pd
from sklearn.model_selection import train_test_split
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer
from tqdm import tqdm

from adna.pylib import consts, utils

In [3]:
# Input: SQLite database whose `seqs` table is read when loading the sequences.
SQL = consts.DATA_DIR / 'UF46992.sqlite'
# Output: file the trained tokenizer is saved to at the end of the notebook.
JSON = consts.DATA_DIR / 'UF46992.json'

## What characters are used?

In [4]:
# Read the whole `seqs` table into a DataFrame.
# NOTE: sqlite3's connection context manager only commits or rolls back the
# open transaction; it does not close the connection. Close it explicitly so
# the database handle is released as soon as the read finishes.
cxn = sqlite3.connect(SQL)
try:
    RECS = pd.read_sql('select * from seqs', cxn)
finally:
    cxn.close()

# Plain Python list of the sequence strings, used by every later cell.
SEQS = RECS.seq.tolist()

In [5]:
# Collect the distinct characters that occur anywhere in the sequences.
CHARS = set()

for seq in tqdm(SEQS):
    CHARS.update(seq)

CHARS

100%|██████████████████████████████████████████████████████████████████████████████████████| 10996536/10996536 [00:20<00:00, 525776.69it/s]


{'A', 'C', 'G', 'N', 'T'}

## Train the tokenizer

In [6]:
# Untrained BPE model; anything it cannot encode maps to "[UNK]".
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Trainer settings: the special tokens are reserved in the vocabulary and the
# vocabulary size is capped at 5000 entries.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
trainer = BpeTrainer(vocab_size=5000, special_tokens=special_tokens)

In [7]:
# Train the BPE model on the in-memory sequences. Passing the number of
# sequences as `length` gives the trainer a total for its progress tracking;
# it does not affect the learned vocabulary.
tokenizer.train_from_iterator(SEQS, trainer=trainer, length=len(SEQS))






In [8]:
# Wrap every encoding in [CLS] ... [SEP]; for a pair, the second segment and
# its closing [SEP] get type id 1. The ids are looked up from the trained
# vocabulary so the template stays in sync with it.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        (token, tokenizer.token_to_id(token)) for token in ("[CLS]", "[SEP]")
    ],
)

## Get tokenized lengths

In [9]:
# Histogram: encoded length -> number of sequences with that length.
lengths = defaultdict(int)

# Number of sequences handed to encode_batch() per call.
step = 1024

for start in tqdm(range(0, len(SEQS), step)):
    for encoding in tokenizer.encode_batch(SEQS[start:start + step]):
        lengths[len(encoding)] += 1

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10739/10739 [01:30<00:00, 119.00it/s]


In [10]:
# (encoded length, count) pairs in ascending order of length.
[(n_tokens, lengths[n_tokens]) for n_tokens in sorted(lengths)]

[(3, 844547),
 (4, 65580),
 (5, 239649),
 (6, 718210),
 (7, 847881),
 (8, 1066093),
 (9, 931242),
 (10, 919286),
 (11, 840813),
 (12, 730912),
 (13, 649932),
 (14, 544110),
 (15, 458988),
 (16, 380684),
 (17, 281347),
 (18, 238359),
 (19, 202386),
 (20, 155055),
 (21, 145031),
 (22, 102106),
 (23, 93519),
 (24, 70161),
 (25, 64585),
 (26, 57145),
 (27, 40983),
 (28, 32919),
 (29, 21221),
 (30, 26665),
 (31, 25962),
 (32, 15130),
 (33, 16515),
 (34, 16059),
 (35, 5741),
 (36, 5074),
 (37, 8417),
 (38, 6731),
 (39, 13284),
 (40, 4703),
 (41, 3560),
 (42, 6248),
 (43, 7718),
 (44, 4595),
 (45, 4591),
 (46, 3962),
 (47, 4510),
 (48, 5362),
 (49, 4394),
 (50, 5716),
 (51, 9894),
 (52, 5188),
 (53, 3180),
 (54, 3074),
 (55, 1726),
 (56, 2779),
 (57, 7124),
 (58, 1530),
 (59, 1259),
 (60, 3425),
 (61, 4432),
 (62, 5918),
 (63, 3422),
 (64, 955),
 (65, 866),
 (66, 1846),
 (67, 891),
 (68, 531),
 (69, 70),
 (70, 63),
 (71, 594),
 (72, 79),
 (73, 6),
 (74, 2),
 (75, 1)]

## Finalize tokenizer length

In [11]:
# Fixed width every encoding is padded to.
MAX_LENGTH = 80

# Padding never truncates, so an encoding longer than MAX_LENGTH would come
# out longer than the fixed width. Fail loudly here if the lengths measured
# above do not fit, instead of discovering ragged inputs downstream.
longest = max(lengths, default=0)
assert longest <= MAX_LENGTH, (
    f'longest encoding ({longest}) exceeds MAX_LENGTH ({MAX_LENGTH})'
)

# Pass the pad token explicitly so it is visibly the same token whose id is
# looked up, rather than relying on the library default.
pad_id = tokenizer.token_to_id("[PAD]")
tokenizer.enable_padding(pad_id=pad_id, pad_token="[PAD]", length=MAX_LENGTH)

In [14]:
# Spot check: encode the first sequence and display its tokens. The recorded
# output shows the [CLS]/[SEP] wrapping followed by [PAD] fill.
output = tokenizer.encode(SEQS[0])
output.tokens

['[CLS]',
 'GGG',
 'TG',
 'CACTAATAACTAGCTCAGTGTG',
 'TCTACGCCAAATTGACCTAAAATCACTCATCGCC',
 'TACTCC',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [13]:
# Write the tokenizer to the JSON path defined at the top of the notebook.
# The path is converted with str() before being handed to save().
tokenizer.save(str(JSON))