In [45]:
from packages.metagenomics import sampling2, encoding2
from Bio import SeqIO
import numpy as np
from Bio.Seq import Seq

### Test sampling on toy dataset

In [72]:
%%time

# Toy-run inputs: the sequence file (FASTA) and its companion taxid file.
seq_file = 'data/train_small-db_toy-2000.fasta'
taxid_file = 'data/train_small-db_toy-2000.taxid'
# Directory handed to the sampler; the next cell reads 'fragments*.npy' back from it.
output_dir = 'data/sampling-toy-2000'
# Sampling parameters, passed through unchanged to generate_fragment_data.
sample_length = 20
coverage = 1
seed = 42

# NOTE(review): presumably writes the sampled fragments as .npy files under output_dir
# (the next cell loads 'fragments*.npy' from there) — confirm against sampling2.
sampling2.generate_fragment_data(seq_file, taxid_file, output_dir, sample_length, coverage, seed)

CPU times: user 22 ms, sys: 4.86 ms, total: 26.8 ms
Wall time: 25.2 ms


In [73]:
# Load the fragment files from the directory the toy sampling cell was pointed at.
input_dir = 'data/sampling-toy-2000'
pattern = 'fragments*.npy'

fragments_toy = sampling2.read_fragments(input_dir, pattern)
# Displayed shape is (409, 21): presumably sample_length (20) base columns followed by
# one taxid column, as the printed rows further down suggest — TODO confirm.
fragments_toy.shape

(409, 21)

In [74]:
# Toy-set reference (sequence ids, then the taxid paired with each id):
# ['NC_013451', 'NC_006375', 'NC_019565', 'NC_015407', 'NC_016841']
# [1280, 1590, 210, 2102, 573]

# Convert the whole array to plain strings once, then print one row per line.
rows_as_text = fragments_toy.astype('str').tolist()
for fragment_row in rows_as_text:
    print(fragment_row)

['g', 'c', 'a', 'g', 't', 't', 'a', 'a', 'a', 'c', 'c', 'c', 'g', 'a', 'c', 'g', 'c', 'a', 't', 'a', '1280']
['t', 't', 'g', 'a', 't', 't', 'g', 'a', 't', 'a', 'g', 't', 'g', 'g', 'c', 'a', 'a', 'g', 'a', 'c', '1280']
['g', 'c', 'c', 't', 'g', 'a', 'c', 't', 'g', 't', 't', 't', 'a', 't', 'a', 't', 't', 't', 'g', 'c', '1280']
['t', 't', 'a', 'a', 'a', 'c', 'c', 'c', 'g', 'a', 'c', 'g', 'c', 'a', 't', 'a', 'g', 'c', 'a', 'a', '1280']
['c', 't', 'g', 'a', 't', 'g', 't', 't', 't', 't', 'g', 'c', 'c', 'g', 'c', 't', 'a', 't', 'a', 'c', '1280']
['g', 'c', 'a', 'a', 'g', 'g', 'g', 'c', 't', 't', 'a', 'g', 'c', 'a', 't', 't', 'g', 'c', 'c', 'c', '1280']
['a', 'g', 'c', 't', 'a', 'a', 'a', 'a', 't', 'a', 'a', 'a', 't', 'a', 'a', 'a', 'a', 'a', 'a', 'g', '1280']
['a', 'g', 't', 't', 't', 'g', 't', 't', 'g', 'g', 't', 'a', 'c', 'a', 'a', 'a', 'a', 'a', 't', 't', '1280']
['g', 'c', 'a', 'a', 'g', 'c', 'a', 'c', 'a', 't', 'a', 't', 'a', 'c', 'g', 'c', 'g', 'c', 'g', 'a', '1280']
['t', 't', 't', 't'

In [55]:
def concatenate_columns(row):
    """Split one fragment row into ``(sequence, taxid)``.

    Every element except the last is joined, in order, into a single
    sequence string; the last element is returned separately as the taxid.

    Parameters
    ----------
    row : numpy.ndarray
        1-D array whose elements are ``str`` or ``bytes``.

    Returns
    -------
    tuple
        ``(sequence, taxid)``. ``bytes`` elements are decoded as UTF-8
        first, so rows from a bytes-typed array yield ``str`` values
        instead of failing inside ``str.join``.

    Raises
    ------
    IndexError
        If ``row`` is empty (there is no taxid element to return).
    """
    # tolist() yields native Python objects; a bytes-typed array gives
    # ``bytes`` items, which ``''.join`` rejects, so normalise them here.
    values = [
        value.decode('utf-8') if isinstance(value, bytes) else value
        for value in row.tolist()
    ]
    return ''.join(values[:-1]), values[-1]

    
# Inline check for concatenate_columns: the last element must come back as the
# taxid and the preceding elements joined into one string.
test_a = np.array(['1','2','3','4','5'])
test_s, test_t = concatenate_columns(test_a)
assert test_s == '1234'
assert test_t == '5'
# Report success only after the assertions have actually run; printing first
# would show PASS even when one of them fails.
print('test concatenate_columns(): PASS')

test concatenate_columns(): PASS


In [5]:
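# NOTE: this whole check is commented out and kept for reference only. It indexes
# fragments_toy as two bytes columns (fragment at [0], taxid at [:, 1]), not the
# (409, 21) one-base-per-column layout loaded above, so it must be adapted before
# being re-enabled. The output displayed below this cell comes from an earlier run.
# # confirm all fragments are in the expected data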
# mapping = {'NC_013451':b'1280',
#            'NC_006375':b'1590',
#            'NC_019565':b'210',
#            'NC_015407':b'2102',
#            'NC_016841':b'573'}

# # read in sequences
# input_file = 'data/train_small-db_toy-2000.fasta'
# for seq_record in SeqIO.parse(input_file, "fasta"):
    
#     key = mapping[seq_record.id]
#     print('seq length:', len(seq_record.seq))

#     # get only those fragments for the species
#     fragments_to_consider = fragments_toy[np.where(fragments_toy[:,1] == key)]
#     print(fragments_to_consider.shape)

#     # check whether all fragments for this species can be found in the sequence
#     for each in fragments_to_consider:
#         frag = each[0].decode('utf-8')
#         assert frag in seq_record.seq, frag
#     print('All fragments found in this sequence.')
#     print()


seq length: 1442
(72, 2)
All fragments found in this sequence.

seq length: 1917
(96, 2)
All fragments found in this sequence.

seq length: 1634
(82, 2)
All fragments found in this sequence.

seq length: 1840
(92, 2)
All fragments found in this sequence.

seq length: 1308
(65, 2)
All fragments found in this sequence.



### Test sampling on small-db (coverage=0.05)

In [75]:
%%time
# 2 min with sampling2

# NOTE(review): seq_file is an absolute, machine-specific path, so this cell only runs
# on the original author's machine as written — consider a configurable data directory.
seq_file = '/Users/ryanqnelson/Downloads/large-scale-metagenomics-1.0/data/train-dataset/train_small-db.fasta'
taxid_file = 'data/train_small-db.species-level.taxid'
# Directory handed to the sampler; the next cell reads 'fragments*.npy' back from it.
output_dir = 'data/sampling-small-05'
# Longer fragments and much lower coverage than the toy run (which used 20 and 1).
sample_length = 200
coverage = 0.05
seed = 42

# Same sampler call as the toy run, with the small-db inputs defined above.
sampling2.generate_fragment_data(seq_file, taxid_file, output_dir, sample_length, coverage, seed)

CPU times: user 1min 49s, sys: 4.32 s, total: 1min 53s
Wall time: 1min 54s


In [76]:
# Load the fragment files from the directory the small-db sampling cell was pointed at.
input_dir = 'data/sampling-small-05'
pattern = 'fragments*.npy'

fragments_small = sampling2.read_fragments(input_dir, pattern)
# Displayed shape is (690539, 201): presumably sample_length (200) base columns plus
# one taxid column, mirroring the toy layout — TODO confirm.
fragments_small.shape

(690539, 201)

### Test encoding on toy set

In [77]:
# k is handed to the encoder; presumably the k-mer length — TODO confirm against encoding2.
k=4
X_toy_encoded, y_toy = encoding2.encode_fragment_dataset(fragments_toy,k)
# Print the feature-matrix shape, then the label shape (409 labels for the 409 fragment rows).
print(X_toy_encoded.shape)
print(y_toy.shape)

(409, 921)
(409,)


In [78]:
# Expand to a dense array for a quick visual check. Presumably X_toy_encoded is a
# sparse matrix (it exposes .toarray()) — TODO confirm; keep this to the toy-sized data.
X_toy_encoded.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [79]:
# Show the labels returned by encode_fragment_dataset for the toy set. They are bound
# to `y_toy` by the encoding cell; `y_toy_encoded` is never assigned in this notebook
# and would raise NameError on a fresh-kernel run.
y_toy

array(['1280', '1280', '1280', '1280', '1280', '1280', '1280', '1280',
       '1280', '1280', '1280', '1280', '1280', '1280', '1280', '1280',
       '1280', '1280', '1280', '1280', '1280', '1280', '1280', '1280',
       '1280', '1280', '1280', '1280', '1280', '1280', '1280', '1280',
       '1280', '1280', '1280', '1280', '1280', '1280', '1280', '1280',
       '1280', '1280', '1280', '1280', '1280', '1280', '1280', '1280',
       '1280', '1280', '1280', '1280', '1280', '1280', '1280', '1280',
       '1280', '1280', '1280', '1280', '1280', '1280', '1280', '1280',
       '1280', '1280', '1280', '1280', '1280', '1280', '1280', '1280',
       '1280', '1590', '1590', '1590', '1590', '1590', '1590', '1590',
       '1590', '1590', '1590', '1590', '1590', '1590', '1590', '1590',
       '1590', '1590', '1590', '1590', '1590', '1590', '1590', '1590',
       '1590', '1590', '1590', '1590', '1590', '1590', '1590', '1590',
       '1590', '1590', '1590', '1590', '1590', '1590', '1590', '1590',
      

### Test encoding on small-db (coverage=0.05)

In [80]:
%%time 
# 40 sec, (689790, 135168) with sampling,encoding
# 1 min , (690539, 135168) with sampling2,encoding2

# Encode the small-db fragments with k=6. X_small_encoded / y_small (and k) are rebound
# by each of the k=6, k=10 and k=20 cells, so only the most recently run result is kept.
k=6
X_small_encoded, y_small = encoding2.encode_fragment_dataset(fragments_small,k)
print(X_small_encoded.shape)

(690539, 135168)
CPU times: user 60 s, sys: 1.02 s, total: 1min 1s
Wall time: 1min 1s


In [81]:
%%time 
# 1.5 min, (689790, 8853492) with sampling,encoding
# 54 s, (690539, 81920) with sampling2,encoding2

# Encode the small-db fragments with k=10. This rebinds X_small_encoded / y_small,
# replacing whatever an earlier encoding cell left in those names.
k=10
X_small_encoded, y_small = encoding2.encode_fragment_dataset(fragments_small,k)
print(X_small_encoded.shape)

(690539, 81920)
CPU times: user 54.2 s, sys: 1.07 s, total: 55.3 s
Wall time: 55.8 s


In [82]:
%%time 
# 1 min, (689790, 6771300) with sampling,encoding
# 50s, (690539, 40960) with sampling2,encoding2

# Encode the small-db fragments with k=20. This rebinds X_small_encoded / y_small,
# replacing whatever an earlier encoding cell left in those names.
k=20
X_small_encoded, y_small = encoding2.encode_fragment_dataset(fragments_small,k)
print(X_small_encoded.shape)

(690539, 40960)
CPU times: user 47.4 s, sys: 1.05 s, total: 48.5 s
Wall time: 48.7 s
