In [23]:
import pandas as pd
import textwrap
from tokenizers import BertWordPieceTokenizer
import numpy as np
import datetime
from sklearn.feature_extraction.text import CountVectorizer

### Read database

In [5]:
# Load the parsed SILVA table (tab-separated). The file is large, so expect
# this cell to take about a minute.
silva_db = pd.read_table(
    'C:/Users/efrat/Downloads/silva_138_release/Exports/SILVA_parsed_V2.tsv',
    index_col=0,  # the first column of the file becomes the row index
    dtype={
        # All textual columns are read as plain strings ...
        **dict.fromkeys(
            ['raw_id', "full_taxonomy", "seq",
             "kingdom", "phylum", "class", "order",
             "family", "genus", "species", "strain"],
            str,
        ),
        # ... and the sequence length is the only integer column.
        "seq_length": int,
    },
)
silva_db.head()

Unnamed: 0,raw_id,full_taxonomy,seq_length,seq,kingdom,phylum,class,order,family,genus,species,strain
0,HG531388.1.1375,Bacteria;Proteobacteria;Alphaproteobacteria;Rh...,1375,AGUCGAGCGGGCGCAGCAAUGCGUCAGCGGCAGACGGGUGAGUAAC...,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Xanthobacteraceae,Rhodoplanes,Rhodoplanes oryzae,
1,HL281785.3.1301,Bacteria;Bacteroidota;Bacteroidia;Bacteroidale...,1299,AUUCCGGGAUAGCCUUUCGAAAGAAAGAUUAAUACUGGAUAGCAUA...,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,unidentified,
2,AB002644.1.1485,Bacteria;Firmicutes;Bacilli;Bacillales;Bacilla...,1485,GGCUAAUACAUGCAAGUCGAGCGAGUGAACAAACAGAAGCCUUCGG...,Bacteria,Firmicutes,Bacilli,Bacillales,Bacillaceae,Bacillus,low G+C Gram-positive bacterium HTA454,
3,AB002648.1.1383,Bacteria;Firmicutes;Bacilli;Thermoactinomyceta...,1383,AGCGGCGAACGGGUGAGUAACACGNGGGUAACCUGCCCUCAAGACC...,Bacteria,Firmicutes,Bacilli,Thermoactinomycetales,Thermoactinomycetaceae,Thermoflavimicrobium,low G+C Gram-positive bacterium HTA1422,
4,JN049459.1.1443,Bacteria;Actinobacteriota;Actinobacteria;Strep...,1443,GACAUGGCGCCUCUACCAUGCAGUCGACGAUGACCACCUUCGGGGU...,Bacteria,Actinobacteriota,Actinobacteria,Streptomycetales,Streptomycetaceae,Streptomyces,actinobacterium ZXY010,


In [41]:
# Pull every sequence string out of the table. The case of the sequences is
# left untouched (nothing is lower-cased here).
seqs = silva_db.seq.values
print(seqs[0])
# Build a second copy in which a space is inserted after every 100 characters,
# i.e. each sequence becomes whitespace-separated chunks of at most 100 chars.
# NOTE(review): presumably done so that no single "word" handed to the
# WordPiece tokenizer is longer than 100 characters - confirm.
seqs_100cut = [
    ' '.join(sequence[start:start + 100] for start in range(0, len(sequence), 100))
    for sequence in seqs
]
print(seqs_100cut[0])
print(len(seqs), "sequences total in db")

AGUCGAGCGGGCGCAGCAAUGCGUCAGCGGCAGACGGGUGAGUAACACGUGGGAACGUACCCUUCGGUUCGGAACAACCCAGGGAAACUUGGGCUAAUACCGGAUACGUCCGUAAGGAGAAAGAUUUAUCGCCGAAGGAUCGGCCCGCGUCUGAUUAGCUAGUUGGUGUGGUAACGGCGCACCAAGGCGACGAUCAGUAGCUGGUCUGAGAGGAUGAUCAGCCACACUGGGACUGAGACACGGCCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGGACAAUGGGGGCAACCCUGAUCCAGCCAUGCCGCGUGAGUGAUGAAGGCCCUAGGGUUGUAAAGCUCUUUCGGUGGGGAAGAUAAUGACGGUACCCACAGAAGAAGCCCCGGCUAACUUCGUGCCAGCAGCCGCGGUAAUACGAAGGGGGCUAGCGUUGCUCGGAAUCACUGGGCGUAAAGCGCACGUAGGCGGCUUUCUAAGUCAGGGGUGAAAUCCCGGAGCUCAACUCCGGAACUGCCUUUGAUACUGGGAGGCUCGAGUCCGGGAGAGGUGAGUGGAACUGCGAGUGUAGAGGUGAAAUUCGUAGAUAUUCGCAAGAACACCAGUGGCGAAGGCGGCUCACUGGCCCGGUACUGACGCUGAGGUGCGAAAGCGUGGGGAGCAAACAGGAUUAGAUACCCUGGUAGUCCACGCCGUAAACGAUGGAUGCUAGCCGUUGGGCAGCUUGCUGCUCAGUGGCGCAGCUAACGCCUUAAGCAUCCCGCCUGGGGAGUACGGUCGCAAGAUUAAAACUCAAAGGAAUUGACGGGGGCCCGCACAAGCGGUGGAGCAUGUGGUUUAAUUUCGAAGCAACGCGCAGAACCUUACCAGCUCUUGACAUGCCACGACGGUUUCCGGAGACGGACUCCACCCCGCAAGGGGCGUGGACACAGGUGCCUGCAUGGCUGUCGUCAGCUCGUGUCGUGAGAUGUUGGGUUAAGUCCCGCAACGAGCGCAA

### Read tokens & set encoders

In [1]:
# Load the BPE vocabulary of the final BERT model (one token per line).
with open('C:/Users/efrat/Documents/DNA_BERT_Data/bert_embeddings/vocab_bpe_final_06082020.txt') as vocab_handle:
    # strip() removes the trailing `\n` (and any other surrounding whitespace)
    # from each line as it is read.
    bpe_tokens = [line.strip() for line in vocab_handle]
print(bpe_tokens)

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'A', 'C', 'G', 'N', 'U', '##G', '##A', '##U', '##C', '##N', '##GG', '##AA', '##GC', '##GU', '##AC', '##AU', '##CC', '##UU', '##CU', '##GAA', '##AGC', '##AGG', '##UGG', '##AGU', '##GAC', '##CGG', '##GAU', '##CGC', '##UAA', '##UAC', '##GUC', '##UGC', '##CUU', '##GAGG', '##GA', '##CAA', '##CAC', '##GGU', '##GCAA', '##CGU', '##GGGG', '##CAU', '##UGU', '##CAGC', '##UCC', '##GAGU', '##GAGC', '##ACAC', '##AAAC', '##AAU', '##UAGU', '##AGAU', '##CCUU', '##GCGU', '##AACU', '##UCU', '##GGAU', '##CGAA', '##GUGC', '##AGGAA', '##ACGG', '##AGAA', '##GCAC', '##GGUU', '##AUU', '##GGUGG', '##CAGU', '##GCU', '##GGCU', '##CGAC', '##CAGG', '##GGUGAA', '##GCCC', '##AGAC', '##AUGU', '##AAGC', '##GAGAU', '##CCUGG', '##UCGG', '##AAUU', '##GGUAA', '##AAUGG', '##ACU', '##GUCC', '##GGGU', '##CGCAA', '##GAGAC', '##GAAAGC', '##CGCGU', '##GUCGU', '##AUCC', '##CGAU', '##UGAC', '##GAAGAA', '##GUUGG', '##ACUGG', '##GCUU', '##GCCU', '##GGCC', '##AGAGG', '##AGCU', '##UACGG',

In [22]:
# BPE encoder: a BERT WordPiece tokenizer driven by the BPE vocabulary file.
bpe_tokenizer = BertWordPieceTokenizer(
    vocab_file='C:/Users/efrat/Documents/DNA_BERT_Data/bert_embeddings/vocab_bpe_final_06082020.txt',
    handle_chinese_chars=False,
    lowercase=False,  # keep the case of the input; do not lower-case it
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]")
print(bpe_tokenizer)

# CURRENTLY NOT USED
# Lookup table of (token_id, token) pairs. The vocabulary is fetched with a
# single get_vocab() call so ids and tokens are guaranteed to come from the
# same mapping, and the frame is built column-wise so that `token_id` keeps an
# integer dtype (building two rows and transposing turns both columns into
# `object`).
bpe_stats_df = pd.DataFrame(list(bpe_tokenizer.get_vocab().items()),
                            columns=['token', 'token_id'])[['token_id', 'token']]
#bpe_stats_df['token_clean'] = [t.replace('#', '') for t in bpe_stats_df['token'].values]
bpe_stats_df.head()

Tokenizer(vocabulary_size=15621, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=False, strip_accents=True, lowercase=False, wordpieces_prefix=##)


Unnamed: 0,token_id,token
0,3339,##GGAUUCAU
1,7995,##GAUUCAUGACUGG
2,15470,##AAUGAGG
3,3019,##AGGAACACCAGU
4,13567,##GGGGCAU


In [38]:
# Split a sequence into k-mer tokens
def split_seq_into_kmers(seq, k = 6):
    """Split ``seq`` into consecutive, non-overlapping k-mers.

    The sequence is cut every ``k`` characters starting at position 0. A
    trailing remainder shorter than ``k`` is dropped, so every returned token
    has exactly ``k`` characters.

    Parameters
    ----------
    seq : str
        The sequence to split. An empty string yields an empty list.
    k : int, default 6
        Length of each k-mer; must be positive.

    Returns
    -------
    list of str
        The k-mers in order of appearance.

    Raises
    ------
    ValueError
        If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("invalid width %r (must be > 0)" % (k,))
    # Plain slicing instead of textwrap.wrap: textwrap is a whitespace-aware
    # text formatter, and indexing its result with [-1] fails on empty input.
    usable_length = len(seq) - len(seq) % k
    return [seq[start:start + k] for start in range(0, usable_length, k)]

### Encode sequences

In [36]:
# BPE
# Example of tokenizing a single sequence:
# bpe_tokenizer.encode("CAGAUUGAUACGAGCUUGCCUACAAUUAUUCCUGUGAGUCGCGAACGGGUGAGUAACG", add_special_tokens=False).tokens
print(datetime.datetime.now())
# Count BPE tokens per sequence. The custom tokenizer replaces the default
# regex tokenization, so token_pattern is set to None to make that explicit
# (and to avoid the "token_pattern will not be used" warning).
cv = CountVectorizer(tokenizer = lambda x: bpe_tokenizer.encode(x, add_special_tokens=False).tokens,
                     lowercase = False,
                     token_pattern = None)
all_encodings_cv_bpe = cv.fit_transform(seqs_100cut)
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use whichever this installation provides and
# always end up with a plain list of token strings.
token_strings_bpe = (list(cv.get_feature_names_out())
                     if hasattr(cv, 'get_feature_names_out')
                     else cv.get_feature_names())
print(all_encodings_cv_bpe.shape)
print(datetime.datetime.now())

2020-08-06 15:35:44.657442
(432033, 15608)
2020-08-06 15:58:54.962223


In [42]:
#K-mers
print(datetime.datetime.now())
# Count non-overlapping 6-mers per sequence (split_seq_into_kmers uses its
# default k). The function is passed directly; no lambda wrapper is needed.
# token_pattern is set to None because the custom tokenizer replaces the
# default regex tokenization.
cv_6mer = CountVectorizer(tokenizer = split_seq_into_kmers,
                          lowercase = False,
                          token_pattern = None)
all_encodings_cv_6mer = cv_6mer.fit_transform(seqs)
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use whichever this installation provides and
# always end up with a plain list of token strings.
token_strings_6mer = (list(cv_6mer.get_feature_names_out())
                      if hasattr(cv_6mer, 'get_feature_names_out')
                      else cv_6mer.get_feature_names())
print(all_encodings_cv_6mer.shape)
print(datetime.datetime.now())

2020-08-07 09:26:02.303201
(432033, 14984)
2020-08-07 09:38:41.045869


In [53]:
# Summing every entry of a count matrix gives the number of tokens in the
# whole corpus under that tokenization.
total_tokens_bpe = all_encodings_cv_bpe.sum()
total_tokens_kmer = all_encodings_cv_6mer.sum()

# Show the first ten vocabulary entries of each encoding, then the totals.
print(token_strings_bpe[:10])
print(token_strings_6mer[:10])
print("Total number of tokens in entire corpus - BPE:", total_tokens_bpe)
print("Total number of tokens in entire corpus - k-mer:", total_tokens_kmer)

['##A', '##AA', '##AAA', '##AAAA', '##AAAAAC', '##AAAAACCC', '##AAAAAGC', '##AAAAAGG', '##AAAAAU', '##AAAAAUGACGGUAC']
['AAAAAA', 'AAAAAC', 'AAAAAG', 'AAAAAN', 'AAAAAU', 'AAAACA', 'AAAACC', 'AAAACG', 'AAAACN', 'AAAACU']
Total number of tokens in entire corpus - BPE: 60351823
Total number of tokens in entire corpus - k-mer: 103259831


### Collect token stats

In [45]:
# For each token, get its total count of appearances (a token may appear multiple times in a single sequence)
def get_token_appearances(encodings_cv):
    """Return the column sums of a count matrix as a flat list.

    Parameters
    ----------
    encodings_cv : 2-D count matrix
        One row per sequence and one column per token. Both SciPy sparse
        matrices and dense NumPy arrays are accepted.

    Returns
    -------
    list of int
        Total number of appearances of each token, in column order.
    """
    column_totals = encodings_cv.sum(0)
    # The sum is a 1 x n matrix for sparse input and a 1-D array for dense
    # input; asarray + ravel flattens either shape the same way.
    return np.asarray(column_totals).ravel().tolist()

# For each token, get the number of unique sequences it appears in 
def get_token_appearances_uniq(encodings_cv):
    """Return, per column, the number of rows with a non-zero entry.

    Parameters
    ----------
    encodings_cv : 2-D count matrix
        One row per sequence and one column per token. Both SciPy sparse
        matrices and dense NumPy arrays are accepted.

    Returns
    -------
    list of int
        Number of sequences in which each token appears at least once, in
        column order.
    """
    present_per_column = (encodings_cv != 0).sum(0)
    # The sum is a 1 x n matrix for sparse input and a 1-D array for dense
    # input; asarray + ravel flattens either shape the same way.
    return np.asarray(present_per_column).ravel().tolist()

In [57]:
# Smoothing constants for the token weights computed in the stats cells as
# a / (a + frequency): a smaller constant pushes frequent tokens closer to 0.
a, a2, a3 = 10 ** (-3), 10 ** (-4), 10 ** (-5)

In [60]:
# Analyze bpe tokens (appearances stats)
bpe_token_appearances = get_token_appearances(all_encodings_cv_bpe)
bpe_token_appearances_uniq = get_token_appearances_uniq(all_encodings_cv_bpe)
weights1 = [a / (a + f/total_tokens_bpe) for f in bpe_token_appearances_uniq]
weights2 = [a2 / (a2 + f/total_tokens_bpe) for f in bpe_token_appearances_uniq]
weights3 = [a3 / (a3 + f/total_tokens_bpe) for f in bpe_token_appearances_uniq]
# Organize stats into a single data frame - BPE
token_lengths = [len(t) for t in token_strings_bpe]
token_stats_bpe = pd.DataFrame(list(zip(token_strings_bpe, 
                                        token_lengths, 
                                        bpe_token_appearances, 
                                        bpe_token_appearances_uniq,
                                        weights1, 
                                        weights2, 
                                        weights3)), 
                           columns =['token', 'token_length', 'total_appearances_mult_in_seq', 'total_seqs_in', 'weight1', 'weight2', 'weight3']) 
token_stats_bpe.head(10)

Unnamed: 0,token,token_length,total_appearances_mult_in_seq,total_seqs_in,weight1,weight2,weight3
0,##A,3,364066,244017,0.198285,0.024136,0.002467
1,##AA,4,61853,56890,0.514764,0.09591,0.010497
2,##AAA,5,14571,14409,0.807265,0.295203,0.040201
3,##AAAA,6,20408,19021,0.760359,0.240866,0.030753
4,##AAAAAC,8,6159,6087,0.908382,0.497863,0.090205
5,##AAAAACCC,10,684,683,0.98881,0.898336,0.46911
6,##AAAAAGC,9,4949,4944,0.924283,0.549693,0.108791
7,##AAAAAGG,9,1582,1582,0.974457,0.792312,0.276144
8,##AAAAAU,8,4821,4708,0.927636,0.561769,0.113624
9,##AAAAAUGACGGUAC,16,1793,1793,0.971148,0.770956,0.251831


In [61]:
# Per-token statistics for the k-mer encoding.
kmer_token_appearances = get_token_appearances(all_encodings_cv_6mer)
kmer_token_appearances_uniq = get_token_appearances_uniq(all_encodings_cv_6mer)

# Weight of a token = smoothing / (smoothing + frequency), for three smoothing
# constants.
# NOTE(review): "frequency" here is (number of sequences containing the token)
# divided by (total number of tokens in the corpus) - confirm that mixing a
# sequence count with a token count is intended.
weights1 = [a / (a + n_seqs/total_tokens_kmer) for n_seqs in kmer_token_appearances_uniq]
weights2 = [a2 / (a2 + n_seqs/total_tokens_kmer) for n_seqs in kmer_token_appearances_uniq]
weights3 = [a3 / (a3 + n_seqs/total_tokens_kmer) for n_seqs in kmer_token_appearances_uniq]

# Gather everything into one table, one row per k-mer.
token_stats_6mer = pd.DataFrame({
    'token': token_strings_6mer,
    'total_appearances_mult_in_seq': kmer_token_appearances,
    'total_seqs_in': kmer_token_appearances_uniq,
    'weight1': weights1,
    'weight2': weights2,
    'weight3': weights3,
})
token_stats_6mer.head(10)

Unnamed: 0,token,total_appearances_mult_in_seq,total_seqs_in,weight1,weight2,weight3
0,AAAAAA,8410,8122,0.92708,0.559735,0.112796
1,AAAAAC,15310,15088,0.872511,0.406311,0.064055
2,AAAAAG,18782,18304,0.849429,0.36067,0.053401
3,AAAAAN,14,14,0.999864,0.998646,0.986623
4,AAAAAU,7693,7539,0.931958,0.578001,0.120467
5,AAAACA,5618,5556,0.948941,0.65017,0.156725
6,AAAACC,27868,27171,0.791683,0.275382,0.036612
7,AAAACG,7411,7208,0.93475,0.588913,0.125306
8,AAAACN,8,8,0.999923,0.999226,0.992312
9,AAAACU,53891,52237,0.664064,0.165049,0.019384


In [64]:
# Write both token-statistics tables to disk as tab-separated files
# (the row index is written as the first column).
token_stats_bpe.to_csv(
    "C:/Users/efrat/Documents/DNA_BERT_Data/bert_embeddings/bpe_token_weights.tsv",
    sep='\t',
)
token_stats_6mer.to_csv(
    "C:/Users/efrat/Documents/DNA_BERT_Data/bert_embeddings/kmer_token_weights.tsv",
    sep='\t',
)