In [1]:
import pandas as pd
from helpers.tokenization_dna import DNATokenizer
import pysam
from tqdm import tqdm

In [2]:
# Directory prefix for the FASTA input used below (string-concatenated with the
# file name, so the trailing slash matters).
# NOTE(review): absolute, cluster-specific path — adjust when running elsewhere.
workdir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/fasta/'

In [4]:
# Tokenizer built from the DNABERT `bert-config-6` vocabulary file
# (path is relative to the notebook's working directory).
# NOTE(review): max_len=510 presumably leaves room for two special tokens in a
# 512-token model window — confirm against DNATokenizer.
transformer_tokenizer = DNATokenizer(vocab_file='../DNABERT/src/transformers/dnabert-config/bert-config-6/vocab.txt',
                        max_len=510)

In [5]:
def kmers_stride1(seq, k=6):
    """Return every length-``k`` window of ``seq``, sliding one position at a time.

    Args:
        seq: Sequence to split (anything supporting ``len`` and slicing).
        k: Window length; defaults to 6.

    Returns:
        List of overlapping k-mers in left-to-right order. A sequence shorter
        than ``k`` produces an empty list.
    """
    kmers = []
    last_start = len(seq) - k
    for start in range(last_start + 1):
        kmers.append(seq[start:start + k])
    return kmers

In [5]:
# Chunking parameters consumed by seq_generator; all three are measured in
# sequence characters (bases), not tokens.
chunk_len = 510      # length of each slice taken from a sequence
overlap_bp = 128     # consecutive slices start (chunk_len - overlap_bp) apart
max_seq_len = 5000   # sequences are truncated to this length before chunking

In [None]:
fa = workdir + '241_mammals.shuffled.fa'

# Sequence names are taken from the first column of the FASTA index (.fai).
# - dtype=str / keep_default_na=False keep every name as the literal string in
#   the index, so names such as "1" or "NA" are not turned into ints or NaN
#   before being passed to fasta.fetch().
# - No .squeeze(): selecting column 0 already yields a Series, and squeezing a
#   one-row Series returns a scalar, which would make `.values` fail for a
#   single-sequence FASTA.
seq_names = pd.read_csv(
    fa + '.fai',
    sep='\t',
    header=None,
    usecols=[0],
    dtype=str,
    keep_default_na=False,
)[0].values

# Indexed random access to the sequences by name.
fasta = pysam.FastaFile(fa)

# Accumulates one (seq_name, seq_len, tok_len) tuple per tokenized chunk.
data = []

def seq_generator():
    """Yield tokenized, overlapping chunks for every sequence in ``seq_names``.

    Each sequence is fetched from ``fasta``, upper-cased, stripped of ``-``
    characters and truncated to ``max_seq_len``. It is then cut into slices of
    ``chunk_len`` characters whose start positions are
    ``chunk_len - overlap_bp`` apart. Every slice is split into overlapping
    k-mers with ``kmers_stride1`` and encoded with ``transformer_tokenizer``.

    Yields:
        Tuples ``(seq_name, seq, input_ids)`` where ``seq`` is the whole
        processed sequence (not just the current slice) and ``input_ids`` is
        the tokenizer output for the slice, special tokens included.
    """
    stride = chunk_len - overlap_bp
    for seq_name in seq_names:
        seq = fasta.fetch(seq_name).upper().replace('-','')[:max_seq_len]
        for start_idx in range(0, len(seq), stride):
            chunk = seq[start_idx:start_idx+chunk_len]
            # A slice shorter than the default k-mer size (6) has no k-mers.
            if len(chunk)<6:
                continue
            # Build the k-mer list once and reuse it for the tokenizer call.
            k_merized_chunk = kmers_stride1(chunk)
            tok = transformer_tokenizer.encode_plus(k_merized_chunk,
                                            add_special_tokens=True,)
            yield seq_name,seq,tok['input_ids']

# Drain the generator, recording one (name, sequence length, token count) row
# per chunk. The progress bar advances once per sequence, detected by a change
# in seq_name between consecutive chunks; a sequence that yields no chunk is
# therefore never counted.
last_seq_name = ''

# The context manager closes the bar when the loop ends, including on an
# exception or a kernel interrupt, so no stale bar is left behind.
with tqdm(total=len(seq_names)) as pbar:
    for seq_name,seq,tokenized_seq in seq_generator():
        data.append((seq_name,len(seq),len(tokenized_seq)))
        if seq_name!=last_seq_name:
            last_seq_name = seq_name
            pbar.update(1)

In [7]:
# One row per tokenized chunk: source sequence name, length of the processed
# sequence, and number of token ids produced for the chunk.
len_df = pd.DataFrame(
    data,
    columns=['seq_name', 'seq_len', 'tok_len'],
)

# Largest token count over all chunks (displayed as the cell output).
len_df['tok_len'].max()

507