In [1]:
import numpy as np
import pandas as pd
import itertools
from tqdm import tqdm_notebook as tqdm

In [5]:
def get_kmer_dict(seq, k=3):
    """Count every overlapping k-mer of ``seq`` over the alphabet A/C/G/T.

    Parameters
    ----------
    seq : str
        Nucleotide sequence to scan.
    k : int, default 3
        Length of the k-mers to count.

    Returns
    -------
    dict
        Maps each of the ``4**k`` possible k-mers (in ``itertools.product``
        order over A, C, G, T) to the number of times it occurs in ``seq``.
        k-mers that never occur are present with a count of 0, so the key
        set is the same for every sequence.

    Notes
    -----
    Windows containing any character other than uppercase A, C, G or T
    (for example an ambiguous base such as ``N``) are skipped rather than
    raising ``KeyError``.
    """
    bases = ['A', 'C', 'G', 'T']
    kmer_counts = {''.join(p): 0 for p in itertools.product(bases, repeat=k)}

    # Slide a window of width k over the sequence; an empty range (sequence
    # shorter than k) simply leaves every count at zero.
    for start in range(len(seq) - k + 1):
        kgram = seq[start:start + k]
        # Only the 4**k canonical k-mers are features; ignore anything else.
        if kgram in kmer_counts:
            kmer_counts[kgram] += 1

    return kmer_counts

def kmerize_data(fpath_csv, fpath_kmer_csv, k=3, chunk_size=5000):
    """Convert a CSV of sequences into a CSV of max-normalised k-mer counts.

    Reads ``fpath_csv`` (which must have ``id`` and ``sequence`` columns),
    computes the k-mer counts of every sequence with ``get_kmer_dict`` and
    divides each count by the largest count in that row. The result is
    written to ``fpath_kmer_csv`` with the columns ``id``, one column per
    k-mer, and then whichever of ``label``, ``phylum``, ``class`` and
    ``order`` exist in the input.

    Parameters
    ----------
    fpath_csv : str
        Path of the input CSV.
    fpath_kmer_csv : str
        Path of the output CSV. It is overwritten by the first write and
        appended to afterwards.
    k : int, default 3
        k-mer length passed to ``get_kmer_dict``.
    chunk_size : int, default 5000
        Number of rows buffered in memory before they are flushed to disk.
        Expected to be a positive integer.

    Returns
    -------
    None
        The function only writes ``fpath_kmer_csv``. If the input has no
        rows, nothing is written.
    """
    df_data = pd.read_csv(fpath_csv)

    # Taxonomy/label columns are optional; copy through whichever are present.
    passthrough_cols = [
        col for col in ('label', 'phylum', 'class', 'order')
        if col in df_data.columns
    ]

    features = []
    fwrite_started = False
    for _, row in tqdm(df_data.iterrows(), total=df_data.shape[0]):
        kmers = get_kmer_dict(row['sequence'], k)

        maxval = max(kmers.values())
        # A sequence that yields no k-mers has every count equal to 0;
        # dividing by 1 keeps the row all-zero instead of raising
        # ZeroDivisionError.
        scale = maxval if maxval > 0 else 1

        fdict = {'id': row['id']}
        fdict.update((kmer, count / scale) for kmer, count in kmers.items())
        for col in passthrough_cols:
            fdict[col] = row[col]

        features.append(fdict)

        # Flush on buffer size rather than on the DataFrame index label, so
        # chunking does not depend on the index being a default RangeIndex.
        if len(features) >= chunk_size:
            pd.DataFrame(features).to_csv(
                fpath_kmer_csv,
                index=None,
                mode='a' if fwrite_started else 'w',
                header=not fwrite_started,  # header only on the first write
            )
            fwrite_started = True
            features = []

    # Write whatever is left over after the last full chunk.
    if len(features) > 0:
        pd.DataFrame(features).to_csv(
            fpath_kmer_csv,
            index=None,
            mode='a' if fwrite_started else 'w',
            header=not fwrite_started,
        )

In [37]:
def generate_kmers(level):
    """Build the train and val k-mer CSVs for one taxonomy level.

    For each k in 3..6, the train and val splits under
    ``../data/hierarchy/<level>/`` are passed to ``kmerize_data`` and the
    results are written to ``../data/kmer/<level>/<split>_<k>mer.csv``.

    Parameters
    ----------
    level : str
        Name of the sub-directory to process (the notebook calls this with
        'phylum', 'class' and 'order').
    """
    print('Generating K-Mers for {}'.format(level))

    # (input template, output template) for each split, processed in order.
    path_templates = (
        ('../data/hierarchy/{level}/train.csv', '../data/kmer/{level}/train_{k}mer.csv'),
        ('../data/hierarchy/{level}/val.csv', '../data/kmer/{level}/val_{k}mer.csv'),
    )

    for kmer_size in range(3, 7):
        print('K={}'.format(kmer_size))
        for src_template, dst_template in path_templates:
            kmerize_data(
                src_template.format(level=level),
                dst_template.format(level=level, k=kmer_size),
                kmer_size,
            )

In [36]:
# Generate the k-mer feature files for every taxonomy level, coarsest first.
for level in ('phylum', 'class', 'order'):
    generate_kmers(level)

Generating K-Mers for phylum
K=3


HBox(children=(IntProgress(value=0, max=5504), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1835), HTML(value='')))


K=4


HBox(children=(IntProgress(value=0, max=5504), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1835), HTML(value='')))


K=5


HBox(children=(IntProgress(value=0, max=5504), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1835), HTML(value='')))


K=6


HBox(children=(IntProgress(value=0, max=5504), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1835), HTML(value='')))


K=7


HBox(children=(IntProgress(value=0, max=5504), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1835), HTML(value='')))


Generating K-Mers for class
K=3


HBox(children=(IntProgress(value=0, max=5504), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1835), HTML(value='')))


K=4


HBox(children=(IntProgress(value=0, max=5504), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1835), HTML(value='')))


K=5


HBox(children=(IntProgress(value=0, max=5504), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1835), HTML(value='')))


K=6


HBox(children=(IntProgress(value=0, max=5504), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1835), HTML(value='')))


K=7


HBox(children=(IntProgress(value=0, max=5504), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1835), HTML(value='')))


Generating K-Mers for order
K=3


HBox(children=(IntProgress(value=0, max=5504), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1835), HTML(value='')))


K=4


HBox(children=(IntProgress(value=0, max=5504), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1835), HTML(value='')))


K=5


HBox(children=(IntProgress(value=0, max=5504), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1835), HTML(value='')))


K=6


HBox(children=(IntProgress(value=0, max=5504), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1835), HTML(value='')))


K=7


HBox(children=(IntProgress(value=0, max=5504), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1835), HTML(value='')))




In [8]:
# Loaded into the notebook namespace; the loop below does not use it, because
# kmerize_data() is handed the CSV path rather than this frame.
df_data = pd.read_csv('../data/order_full.csv')

# One output file per k-mer length, k = 3..6.
for k in (3, 4, 5, 6):
    print('Generating {}-mers'.format(k))
    kmerize_data(
        '../data/order_full.csv',
        '../data/order_full_{}mer.csv'.format(k),
        k=k,
    )

Generating 3-mers


HBox(children=(IntProgress(value=0, max=32177), HTML(value='')))


Generating 4-mers


HBox(children=(IntProgress(value=0, max=32177), HTML(value='')))


Generating 5-mers


HBox(children=(IntProgress(value=0, max=32177), HTML(value='')))


Generating 6-mers


HBox(children=(IntProgress(value=0, max=32177), HTML(value='')))


