In [None]:
"""test marker representation on sample sequences"""

In [170]:
import os
import sys
import mmh3
import pickle
import random
import multiprocessing as mp
import numpy as np
import matplotlib.pyplot as plt
from itertools import product

In [156]:
# define data paths
data_dir = '/ccb/salz4-4/markus/markraken/data'

seq_pickle_path = os.path.join(data_dir, 'DB.pkl')
HPC_pickle_path = os.path.join(data_dir, 'DB_HPC.pkl')
hashtable_path = os.path.join(data_dir, 'index.pkl')

In [157]:
# load data
with open(seq_pickle_path, 'rb') as f:
    seq_list = pickle.load(f)

with open(HPC_pickle_path, 'rb') as f:
    HPC_list = pickle.load(f)

In [80]:
n_markers = 8000 # number of markers to use for representation of sequence as markers, Shasta default = 8000
marker_length = 10 # length of marker sequence, Shasta default = 10

# randomly select marker set without replacement, surprisingly complex to vectorize, works fast enough like this
random.seed(2019)
marker_set = set()
while len(marker_set) < n_markers:
    marker = [random.sample('ATCG', 1)[0]]
    for i in range(marker_length-1):
        nucs = {'A', 'T', 'C', 'G'}
        nucs.remove(marker[-1])
        marker.append(random.sample(nucs, 1)[0])
    marker_set.add(''.join(marker))

marker_list = list(marker_set)

In [81]:
# TODO sample while considering reverse complement to ensure easy searches regardless of strand

In [86]:
# extract markers from homopolymer compressed sequence
# TODO temprorary code here, marker finding can definitely be improved a lot
def markerize(HPC_seq):
    marker_seq = []
    for i in range(len(HPC_seq)):
        subseq = HPC_seq[i:i+marker_length]
        try:
            idx = marker_list.index(subseq)
            marker_seq.append(idx)
        except:
            pass
        
    return marker_seq

foo = markerize(HPC_list[0])
print(len(HPC_list[0]))
print(len(foo))

5976
558


In [91]:
n_threads = 20

# markerize all HPC seqs
p = mp.Pool(n_threads)
markerized_list = p.map(markerize, HPC_list)
p.close()
p.join()

In [132]:
# convert list to tuple to allow hashing
markerized_chrlist = [''.join([chr(s) for s in x]) for x in markerized_list]

In [151]:
# %%time
# # extract marKmers (short kmers with much larger alphabet) from marker sequences
# # marKmer uniqueness is roughly equivalent to a normal kmer of length = marKmer_length*marker_length
# # e.g. with marker length 10, marKmer 3 ~ kmer 30
# marKmer_length = 4
# hash_seed = 2019

# marKmer_hashtable = dict()
# for i, m in enumerate(markerized_chrlist): # i is stand-in for true taxid, TODO handle taxid with LCA
#     marKmer_set = set()
#     for i in range(len(m)):
#         subseq = m[i:i+marKmer_length] # this might be slow, TODO check if this slices or copies
#         marKmer_set.add(subseq) # adding to set is O(1)
        
#     hashlist = [mmh3.hash(x, hash_seed) for x in list(marKmer_set)]
#     tmp_dict = dict(zip(hashlist, [i]*len(hashlist)))
#     marKmer_hashtable.update(tmp_dict)
    
###### slower in 1 thread, but easier to parallelize in the future

CPU times: user 11.8 s, sys: 460 ms, total: 12.2 s
Wall time: 12.2 s


In [153]:
%%time
# extract marKmers (short kmers with much larger alphabet) from marker sequences
# marKmer uniqueness is roughly equivalent to a normal kmer of length = marKmer_length*marker_length
# e.g. with marker length 10, marKmer 3 ~ kmer 30
marKmer_length = 4
hash_seed = 2019

marKmer_hashtable = dict()
for i, m in enumerate(markerized_chrlist): # i is stand-in for true taxid, TODO handle taxid with LCA
    marKmer_set = set()
    for i in range(len(m)):
        subseq = m[i:i+marKmer_length] # this might be slow, TODO check if this slices or copies
        marKmer_set.add(subseq) # adding to set is O(1)
    for m in marKmer_set:
        marKmer_hashtable.update({m:i}) # TODO check if adding to dict is O(1)

CPU times: user 8.73 s, sys: 485 ms, total: 9.22 s
Wall time: 9.2 s


In [155]:
# save index
with open(hashtable_path, 'wb') as f:
    pickle.dump(marKmer_hashtable, f)

In [169]:
# examine compression ratio
singleseq = ''.join(seq_list)
print(len(pickle.dumps(marKmer_hashtable))/1e9, 'gigabytes for index') # guaranteed conservative estimate of object size
print(len(pickle.dumps(singleseq))/1e9, 'Gbp of sequence')

0.209241084 gigabytes for index
0.116762843 Gbp of sequence


In [None]:
#TODO examine accuracy in perfect reads
#TODO examine accuracy as a function of sequencing error

In [None]:
lenlist = [len(x) for x in markerized_list]
plt.hist(lenlist)
plt.show()
print(np.sum(np.asarray(lenlist)>10000))