# Tokenize DNA using Byte-Pair Encoding

In [1]:
import sys

# Add the parent directory to the module search path. Presumably this is what
# lets the project-local `adna` package (imported in the next cell) resolve
# when the notebook is run from its own directory -- confirm.
sys.path.append('..')

In [2]:
import json
import sqlite3
from collections import defaultdict
from contextlib import closing

import pandas as pd
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from tqdm import tqdm
from transformers import RobertaTokenizerFast

from adna.pylib import consts

## What characters are used?

In [3]:
# Read every row of the `seqs` table into a DataFrame.
#
# sqlite3's own connection context manager only commits or rolls back the
# pending transaction; it does NOT close the connection. Wrapping the
# connection in `closing()` guarantees the database handle is released as soon
# as the read finishes, even if `read_sql` raises.
with closing(sqlite3.connect(consts.SQL)) as cxn:
    RECS = pd.read_sql('select * from seqs', cxn)

# Plain Python list of the `seq` column; this is what the tokenizer cells
# below train on and encode.
SEQS = RECS.seq.tolist()

In [4]:
# Disabled exploratory check. When uncommented, the code below builds the set
# of distinct characters that occur across all sequences in SEQS and displays
# it. It is left commented out, so this cell currently does nothing.
# CHARS = set()

# for seq in tqdm(SEQS):
#     CHARS |= set(seq)

# CHARS

## Train the tokenizer

In [5]:
# Create an untrained byte-level BPE tokenizer. `lowercase=True` is passed so
# that input is lowercased by the tokenizer (per the `tokenizers` API --
# confirm against the installed version).
tokenizer = ByteLevelBPETokenizer(lowercase=True)

In [6]:
# Train the BPE vocabulary directly on the in-memory list of sequences.
# The vocabulary size and minimum frequency both come from the project's
# `consts` module; the five special tokens are registered during training so
# that they can be looked up by `token_to_id` afterwards.
tokenizer.train_from_iterator(
    SEQS,
    vocab_size=consts.VOCAB_SIZE,
    min_frequency=consts.MIN_FREQ,
    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'],
    show_progress=True,
)






In [7]:
# Attach a BERT-style post-processor to the trained tokenizer. Each argument
# is a (token, id) pair, with the id looked up from the trained vocabulary:
# the first is `</s>` and the second is `<s>`.
# NOTE(review): presumably these are the separator and classifier tokens
# wrapped around every encoded sequence -- confirm against the
# `BertProcessing` documentation.
tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

## Get tokenized lengths

In [8]:
# Number of sequences handed to the tokenizer per batch.
STEP = 1024

# Histogram: tokenized length -> number of sequences with that length.
# Missing keys start at zero so counts can be incremented directly.
lengths = defaultdict(int)

In [9]:
# Start from an empty histogram every time this cell runs. Without this reset
# the counts would accumulate on top of a previous run of the cell, silently
# inflating every bucket.
lengths = defaultdict(int)

# Encode the sequences in batches of STEP and tally how many sequences end up
# with each tokenized length. `len()` of each item returned by `encode_batch`
# is used as that sequence's token count.
for i in tqdm(range(0, len(SEQS), STEP)):
    batch = tokenizer.encode_batch(SEQS[i:i+STEP])
    for tokens in batch:
        t_len = len(tokens)
        lengths[t_len] += 1

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10739/10739 [01:35<00:00, 112.78it/s]


In [10]:
sorted(lengths.items())

[(3, 741832),
 (4, 32058),
 (5, 259014),
 (6, 695258),
 (7, 889423),
 (8, 1030881),
 (9, 893056),
 (10, 906854),
 (11, 843972),
 (12, 771778),
 (13, 633199),
 (14, 564991),
 (15, 445771),
 (16, 384250),
 (17, 322374),
 (18, 254745),
 (19, 193521),
 (20, 175467),
 (21, 144669),
 (22, 114311),
 (23, 99520),
 (24, 76424),
 (25, 75970),
 (26, 59788),
 (27, 53674),
 (28, 28032),
 (29, 39348),
 (30, 22613),
 (31, 29175),
 (32, 23045),
 (33, 19974),
 (34, 10776),
 (35, 9443),
 (36, 5430),
 (37, 7278),
 (38, 9772),
 (39, 9696),
 (40, 8498),
 (41, 3524),
 (42, 5417),
 (43, 7490),
 (44, 5788),
 (45, 5238),
 (46, 4137),
 (47, 4136),
 (48, 2821),
 (49, 6887),
 (50, 4492),
 (51, 6758),
 (52, 9027),
 (53, 2973),
 (54, 2930),
 (55, 1781),
 (56, 4059),
 (57, 5665),
 (58, 3075),
 (59, 1182),
 (60, 3496),
 (61, 4378),
 (62, 5777),
 (63, 3526),
 (64, 1074),
 (65, 832),
 (66, 1035),
 (67, 1494),
 (68, 835),
 (69, 80),
 (70, 61),
 (71, 598),
 (72, 78),
 (73, 9),
 (74, 2),
 (75, 1)]

Given the distribution above (the longest tokenized sequence is 75 tokens), I'm going to use a sequence length of 80 tokens (`consts.SEQ_LENGTH`), shown below.

In [11]:
# Display the sequence length configured in the project's `consts` module so
# it can be compared with the tokenized-length histogram.
consts.SEQ_LENGTH

80

## Save the tokenizer

In [12]:
# Write the trained tokenizer's model files into the directory given by
# `consts.SUB_DIR`. The call's return value (the list of written file paths)
# is displayed as the cell output.
tokenizer.save_model(str(consts.SUB_DIR))

['../data/UF46992/vocab.json', '../data/UF46992/merges.txt']