In [5]:

from packages.metagenomics import sampling2, encoding2, naive_bayes
from Bio import SeqIO
import numpy as np
from Bio.Seq import Seq
import os

## Sampling

In [3]:
def concat_cols(row):
    l = row.tolist()
    fragment = l[:-1]
    taxid = l[-1]
    sequence = ''.join(fragment)
    return [sequence, taxid]

print('test concat_cols(): PASS')
test_a = np.array(['1','2','3','4','5'])
test_row = concat_cols(test_a)
assert test_row[0] == '1234'
assert test_row[1] == '5'


test concat_cols(): PASS


In [16]:
%%time

seq_file ='data/train_small-db_toy-2000.fasta'
taxid_file = 'data/train_small-db_toy-2000.taxid'
output_dir = 'data/sampling/sampling-toy-2000-2'
sample_len = 20
coverage = 1
seed = 42

sampling2.generate_fragment_data(seq_file, taxid_file, output_dir, sample_len, coverage, seed)

Wall time: 116 ms


In [17]:
input_dir ='data/sampling/sampling-toy-2000-2'
pattern = 'fragments*.npy'

fragments_toy = sampling2.read_fragments(input_dir, pattern)
fragments_toy.shape

(409, 21)

In [18]:
for each in fragments_toy.astype("str").tolist()[:5]:
    print(each)

['g', 'c', 'a', 'g', 't', 't', 'a', 'a', 'a', 'c', 'c', 'c', 'g', 'a', 'c', 'g', 'c', 'a', 't', 'a', '1280']
['t', 't', 'g', 'a', 't', 't', 'g', 'a', 't', 'a', 'g', 't', 'g', 'g', 'c', 'a', 'a', 'g', 'a', 'c', '1280']
['g', 'c', 'c', 't', 'g', 'a', 'c', 't', 'g', 't', 't', 't', 'a', 't', 'a', 't', 't', 't', 'g', 'c', '1280']
['t', 't', 'a', 'a', 'a', 'c', 'c', 'c', 'g', 'a', 'c', 'g', 'c', 'a', 't', 'a', 'g', 'c', 'a', 'a', '1280']
['c', 't', 'g', 'a', 't', 'g', 't', 't', 't', 't', 'g', 'c', 'c', 'g', 'c', 't', 'a', 't', 'a', 'c', '1280']


In [19]:
fragments_toy_rows = []
for each in fragments_toy.astype("str"):
    row = concat_cols(each)
    fragments_toy_rows.append(row)
len(fragments_toy_rows)

409

In [21]:
mapping = {"NC_013451":b'1280',
          'NC_006375':b'1590',
           'NC_019565':b'210',
           'NC_015407':b'2102',
           'NC_016841':b'573'}

input_file = "data/train_small-db_toy-2000.fasta"
for seq_record in SeqIO.parse(input_file, "fasta"):
    key = mapping[seq_record.id]
    print("seq length:", len(seq_record.seq))

    fragments_to_consider = fragments_toy[np.where(fragments_toy[:,-1] == key)].astype("str")
    print(fragments_to_consider.shape)

    for each in fragments_to_consider:
        row = concat_cols(each)
        assert row[0] in seq_record.seq, row[0]
        assert row[1] == key.decode("utf-8")
    print("All fragments found in this sequence")
    print()

seq length: 1442
(73, 21)
All fragments found in this sequence

seq length: 1917
(96, 21)
All fragments found in this sequence

seq length: 1634
(82, 21)
All fragments found in this sequence

seq length: 1840
(92, 21)
All fragments found in this sequence

seq length: 1308
(66, 21)
All fragments found in this sequence



In [1]:
%%time
# 1min 50sec

seq_file = "large-scale-metagenomics-1.0/data/train-dataset/train_small-db.fasta"
taxid_file = "data/train_small-db.species-level.taxid"
output_dir = "data/sampling/sampling-small-05-2"
sample_len = 200
coverage = .05
seed = 42

sampling2.generate_fragment_data(seq_file, taxid_file, output_dir, sample_len, coverage, seed)

NameError: name 'sampling2' is not defined

## Encoding


#### Toy Dataset

In [23]:
input_dir = "data/sampling/sampling-toy-2000-2"
pattern = "fragments*.npy"

fragments_toy2 = sampling2.read_fragments(input_dir, pattern)
fragments_toy2.shape

k=4
X_toy_encoded2, y_toy2 = encoding2.encode_fragment_dataset(fragments_toy2, k)
print(X_toy_encoded2.shape)
print(y_toy2.shape)

(409, 921)
(409,)


In [None]:
taxid_prob = naive_bayes.taxid_probability(y_toy2)
print(taxid_prob)

#### Small Dataset

In [24]:
%%time
# 1min 30sec

input_dir = 'data/sampling/sampling-small-05-2'
pattern = 'fragments*.npy'

fragments_small2 = sampling2.read_fragments(input_dir, pattern)
fragments_small2.shape

k=6
X_small_encoded2, y_small2 = encoding2.encode_fragment_dataset(fragments_small2, k)
print(X_small_encoded2.shape)




(690539, 135168)
Wall time: 1min 30s
