## Train BERT WordPiece Tokenizer (from scratch)

In [1]:
import os
import tempfile
import shutil

import pandas as pd
from tokenizers import BertWordPieceTokenizer

### Perpare the 16S Dataset

In [2]:
dna_corpus = '/Users/shaharazulay/Downloads/SILVA_parsed_V2.tsv'

In [3]:
dna_corpus_df = pd.read_csv(dna_corpus, sep='\t')
dna_corpus_df.shape

  interactivity=interactivity, compiler=compiler, result=result)


(432033, 13)

In [4]:
def sequence_into_file(seq, file_):
    """
    Insert all 16S genes from the dataset into one file, white-space separated
    """
    max_len_word = 100 # limits of the BPE algorithm
    with open(file_, 'a') as f_out:
        idx = 0
        while idx * max_len_word < len(seq):
            f_out.write(seq[idx * max_len_word: (idx + 1) * max_len_word])
            f_out.write(' ') # seperator
            idx += 1
    return len(seq)      

In [5]:
tmp_corpus_file = tempfile.NamedTemporaryFile(delete=False).name
tmp_corpus_file

'/var/folders/wg/nhqr3m4j5tqct6hlw4d7kpq00000gn/T/tmp2y1segwt'

In [6]:
_ = dna_corpus_df.seq.apply(lambda seq: sequence_into_file(seq, file_=tmp_corpus_file)).values

In [7]:
ls -lh {tmp_corpus_file}

-rw-------  1 shaharazulay  staff   598M Jul 18 07:54 /var/folders/wg/nhqr3m4j5tqct6hlw4d7kpq00000gn/T/tmp2y1segwt


### Create a BERT Tokenizer class

In [8]:
tokenizer = BertWordPieceTokenizer(
    lowercase=False, 
    handle_chinese_chars=False,
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]",
    clean_text=False
)

### Train the Tokenizer with the 16S corpus

In [9]:
vocab_size = 15621  # parallel to k=6 in classic k-mers (for this corpus)

In [10]:
# Customize training
tokenizer.train(
    files=[tmp_corpus_file], 
    vocab_size=vocab_size,
    min_frequency=2
)

In [11]:
# Save the tokenizer 
tokenizer.save('.')

['./vocab.txt']

### Get some vocab statistics

In [12]:
def token_length_stats(vocab):
    lengths = []
    for token in vocab.keys():
        lengths.append(len(token))
    lengths = pd.Series(lengths)
    return lengths.describe()

In [13]:
token_length_stats(tokenizer.get_vocab())

count    15621.000000
mean        19.426157
std         17.122938
min          1.000000
25%          9.000000
50%         13.000000
75%         22.000000
max        100.000000
dtype: float64

### Load the Tokenizer and test it

In [14]:
tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False, handle_chinese_chars=False, clean_text=False)
tokenizer.enable_truncation(512)
tokenizer.enable_padding(max_length=512)

In [15]:
tokenizer.token_to_id('[SEP]')

3

In [16]:
tokenizer.get_vocab()

{'##GGGGAAUCUUGCGCAAUGG': 7842,
 '##GGGUCU': 9808,
 '##CCUAUGU': 3202,
 'GCCCGUCACACCACGAGAGUUUGUAACACCCGAAGUCGGUAGGGUAA': 12864,
 '##AUCCCGCCUGGGGAGU': 10470,
 '##CCUGAA': 4032,
 '##GACUGAGACAC': 7305,
 '##GAAAGCGUGGGGAGCGAACAGGAUUAGAUACCCUGGUAGUCCACGCUGUAAACGAU': 11593,
 '##GUACGCAGGCGG': 13958,
 '##UAAGUAAUCCACCUGGGGAGU': 2917,
 '##AGUUGCUAGUAAUCGCGAAUCAGAAUGU': 10816,
 '##CGCGUCGG': 4863,
 '##CGGGGCUCAACCCCGGAACU': 3762,
 '##GGCCGAUUAGCUAGUUGG': 13458,
 'UACCC': 6025,
 '##AGCUUGCUAC': 4588,
 '##GAGGAAUAUUGG': 15568,
 '##AGGUUUU': 14883,
 '##ACAGAGAGC': 3632,
 'GACCU': 5780,
 '##GGAAUUCCCAGUGUAGCGGUGAAAUGCGUAGAUAUUGGGAGGAACAC': 13735,
 '##GUGUAC': 6601,
 '##GAAGGCAAGC': 12792,
 '##CCAGACUCCUACGGGAGGCAGCAGU': 500,
 '##CAN': 6121,
 '##GCUUCACACAUGCUACAAUGG': 5640,
 '##CAACAGAA': 3861,
 '##UGUCGG': 679,
 '##AGAUGCAA': 14492,
 '##GCCGUGAGG': 576,
 '##GGCUUGAGU': 8105,
 '##AUAGG': 1278,
 '##AGUCCGG': 3669,
 '##ACGGUAACUGAC': 4235,
 '##UGCGCUAA': 10071,
 '##UCUCUGGGCUGU': 8967,
 'GGAGC': 

In [17]:
output = tokenizer.encode("AGUCGAGCGGGCGCAGCAAUGCGUCAGCGGCAG", add_special_tokens=True)
output.tokens, output.ids

(['[CLS]',
  'AGUC',
  '##GAGCG',
  '##GGCGC',
  '##AGCAAUGC',
  '##GUCAGC',
  '##GGCA',
  '##G',
  '[SEP]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
