## Train k-mer "tokenizer"

In [48]:
import os
import tempfile
import shutil
import textwrap
import json

import pandas as pd
from tqdm import tqdm

### Set the wanted  K

In [2]:
k = 6

### Perpare the 16S Dataset

In [7]:
dna_corpus = 'SILVA_parsed_V2.tsv'

In [8]:
dna_corpus_df = pd.read_csv(dna_corpus, sep='\t')
dna_corpus_df.shape

  interactivity=interactivity, compiler=compiler, result=result)


(432033, 13)

In [29]:
def split_seq_into_kmers(seq, k):
    """
    Spilt a 16S gene seqeunce into k-mers
    """
    kmers = textwrap.wrap(seq, k)
    if len(kmers[-1]) < k:
        return kmers[:-1]
    return kmers

### Define a K-mer "tokenizer" vocab

In [56]:
class KmerVocab(object):
    
    def __init__(self):
        self.token_idx = 0
        self.vocab = {}
    
        self.update('[CLS]')
        self.update('[SEP]')
        self.update('[MASK]')
        self.update('[UNK]')
        self.update('[PAD]')
        
    def update(self, kmer):
        if kmer not in self.vocab:
            self.vocab[kmer] = self.token_idx
            self.token_idx += 1
        
    def save(self, path):
        with open(path, 'w') as f_out:
            json.dump(self.vocab, f_out)
    
    def __len__(self):
        return self.token_idx + 1

In [57]:
vocab = KmerVocab()

### "Train" the tokenizer using a simple hash table

In [58]:
def train_on_seq(seq):
    kmers = split_seq_into_kmers(seq, k=k)
    [vocab.update(kmer) for kmer in kmers] 

In [59]:
tqdm.pandas(position=0, leave=True)

In [60]:
dna_corpus_df.seq.progress_apply(train_on_seq)

100%|██████████| 432033/432033 [08:05<00:00, 889.13it/s] 


0         None
1         None
2         None
3         None
4         None
          ... 
432028    None
432029    None
432030    None
432031    None
432032    None
Name: seq, Length: 432033, dtype: object

In [61]:
vocab.save('kmer_vocab.txt')

In [62]:
len(vocab)

14990

In [64]:
vocab.vocab

{'[CLS]': 0,
 '[SEP]': 1,
 '[MASK]': 2,
 '[UNK]': 3,
 '[PAD]': 4,
 'AGUCGA': 5,
 'GCGGGC': 6,
 'GCAGCA': 7,
 'AUGCGU': 8,
 'CAGCGG': 9,
 'CAGACG': 10,
 'GGUGAG': 11,
 'UAACAC': 12,
 'GUGGGA': 13,
 'ACGUAC': 14,
 'CCUUCG': 15,
 'GUUCGG': 16,
 'AACAAC': 17,
 'CCAGGG': 18,
 'AAACUU': 19,
 'GGGCUA': 20,
 'AUACCG': 21,
 'GAUACG': 22,
 'UCCGUA': 23,
 'AGGAGA': 24,
 'AAGAUU': 25,
 'UAUCGC': 26,
 'CGAAGG': 27,
 'AUCGGC': 28,
 'CCGCGU': 29,
 'CUGAUU': 30,
 'AGCUAG': 31,
 'UUGGUG': 32,
 'UGGUAA': 33,
 'CGGCGC': 34,
 'ACCAAG': 35,
 'GCGACG': 36,
 'AUCAGU': 37,
 'AGCUGG': 38,
 'UCUGAG': 39,
 'AGGAUG': 40,
 'AUCAGC': 41,
 'CACACU': 42,
 'GGGACU': 43,
 'GAGACA': 44,
 'CGGCCC': 45,
 'AGACUC': 46,
 'CUACGG': 47,
 'GAGGCA': 48,
 'GCAGUG': 49,
 'GGGAAU': 50,
 'AUUGGA': 51,
 'CAAUGG': 52,
 'GGGCAA': 53,
 'CCCUGA': 54,
 'UCCAGC': 55,
 'CAUGCC': 56,
 'GCGUGA': 57,
 'GUGAUG': 58,
 'AAGGCC': 59,
 'CUAGGG': 60,
 'UUGUAA': 61,
 'AGCUCU': 62,
 'UUCGGU': 63,
 'GGGGAA': 64,
 'GAUAAU': 65,
 'GACGGU': 66,
 'ACCCAC'