In [1]:
from packages.metagenomics import sampling2, encoding2, sampling, encoding
from Bio import SeqIO
import numpy as np
from Bio.Seq import Seq

# 1. Test sampling process

### Functions

In [2]:
def concatenate_columns(row):
    l = row.tolist()
    fragment = l[:-1]
    taxid = l[-1]
    
    sequence = ''.join(fragment)
    return [sequence, taxid]

    
print('test concatenate_columns(): PASS')
test_a = np.array(['1','2','3','4','5'])
test_row = concatenate_columns(test_a)
assert test_row[0] == '1234'
assert test_row[1] == '5'

test concatenate_columns(): PASS


### Test sampling on toy dataset

In [3]:
%%time

seq_file = 'data/train_small-db_toy-2000.fasta'
taxid_file = 'data/train_small-db_toy-2000.taxid'
output_dir = 'data/sampling/sampling-toy-2000'
sample_length = 20
coverage = 1
seed = 42

sampling.generate_fragment_data(seq_file, taxid_file, output_dir, sample_length, coverage, seed)

ValueError: ('Output directory already exists:', 'data/sampling/sampling-toy-2000')

In [4]:
input_dir = 'data/sampling/sampling-toy-2000'
pattern = 'fragments*.npy'

fragments_toy = sampling.read_fragments(input_dir, pattern)
fragments_toy.shape

(409, 2)

In [5]:
# ['NC_013451', 'NC_006375', 'NC_019565', 'NC_015407', 'NC_016841']
# [1280, 1590, 210, 2102, 573]

for each in fragments_toy.astype('str').tolist()[:5]:
    print(each)

['gcagttaaacccgacgcata', '1280']
['ttgattgatagtggcaagac', '1280']
['gcctgactgtttatatttgc', '1280']
['ttaaacccgacgcatagcaa', '1280']
['ctgatgttttgccgctatac', '1280']


In [6]:
# confirm all fragments are in the expected data
mapping = {'NC_013451':b'1280',
           'NC_006375':b'1590',
           'NC_019565':b'210',
           'NC_015407':b'2102',
           'NC_016841':b'573'}

# read in sequences
input_file = 'data/train_small-db_toy-2000.fasta'
for seq_record in SeqIO.parse(input_file, "fasta"):
    
    key = mapping[seq_record.id]
    print('seq length:', len(seq_record.seq))

    # get only those fragments for the species
    fragments_to_consider = fragments_toy[np.where(fragments_toy[:,1] == key)]
    print(fragments_to_consider.shape)

    # check whether all fragments for this species can be found in the sequence
    for each in fragments_to_consider:
        frag = each[0].decode('utf-8')
        assert frag in seq_record.seq, frag
    print('All fragments found in this sequence.')
    print()


seq length: 1442
(73, 2)
All fragments found in this sequence.

seq length: 1917
(96, 2)
All fragments found in this sequence.

seq length: 1634
(82, 2)
All fragments found in this sequence.

seq length: 1840
(92, 2)
All fragments found in this sequence.

seq length: 1308
(66, 2)
All fragments found in this sequence.



### Test sampling2 on toy dataset

In [7]:
%%time

seq_file = 'data/train_small-db_toy-2000.fasta'
taxid_file = 'data/train_small-db_toy-2000.taxid'
output_dir = 'data/sampling/sampling-toy-2000-2'
sample_length = 20
coverage = 1
seed = 42

sampling2.generate_fragment_data(seq_file, taxid_file, output_dir, sample_length, coverage, seed)

ValueError: ('Output directory already exists:', 'data/sampling/sampling-toy-2000-2')

In [8]:
input_dir = 'data/sampling/sampling-toy-2000-2'
pattern = 'fragments*.npy'

fragments_toy2 = sampling2.read_fragments(input_dir, pattern)
fragments_toy2.shape

(409, 21)

In [9]:
# ['NC_013451', 'NC_006375', 'NC_019565', 'NC_015407', 'NC_016841']
# [1280, 1590, 210, 2102, 573]

for each in fragments_toy2.astype('str').tolist()[:5]:
    print(each)

['g', 'c', 'a', 'g', 't', 't', 'a', 'a', 'a', 'c', 'c', 'c', 'g', 'a', 'c', 'g', 'c', 'a', 't', 'a', '1280']
['t', 't', 'g', 'a', 't', 't', 'g', 'a', 't', 'a', 'g', 't', 'g', 'g', 'c', 'a', 'a', 'g', 'a', 'c', '1280']
['g', 'c', 'c', 't', 'g', 'a', 'c', 't', 'g', 't', 't', 't', 'a', 't', 'a', 't', 't', 't', 'g', 'c', '1280']
['t', 't', 'a', 'a', 'a', 'c', 'c', 'c', 'g', 'a', 'c', 'g', 'c', 'a', 't', 'a', 'g', 'c', 'a', 'a', '1280']
['c', 't', 'g', 'a', 't', 'g', 't', 't', 't', 't', 'g', 'c', 'c', 'g', 'c', 't', 'a', 't', 'a', 'c', '1280']


In [10]:
fragments_toy2_rows = []
for each in fragments_toy2.astype('str'):
    row = concatenate_columns(each)
    fragments_toy2_rows.append(row)
len(fragments_toy2_rows)

409

In [11]:
# compare results from two sampling modules
assert fragments_toy.astype('str').tolist() == fragments_toy2_rows

In [12]:
# confirm all fragments are in the expected data
mapping = {'NC_013451':b'1280',
           'NC_006375':b'1590',
           'NC_019565':b'210',
           'NC_015407':b'2102',
           'NC_016841':b'573'}

# read in sequences
input_file = 'data/train_small-db_toy-2000.fasta'
for seq_record in SeqIO.parse(input_file, "fasta"):
    
    key = mapping[seq_record.id]

    print('seq length:', len(seq_record.seq))

    # get only those fragments for the species
    fragments_to_consider = fragments_toy2[np.where(fragments_toy2[:,-1] == key)].astype('str')
    print(fragments_to_consider.shape)

    # check whether all fragments for this species can be found in the sequence
    for each in fragments_to_consider:
        row = concatenate_columns(each)
        assert row[0] in seq_record.seq, row[0]
        assert row[1] == key.decode('utf-8')
    print('All fragments found in this sequence.')
    print()


seq length: 1442
(73, 21)
All fragments found in this sequence.

seq length: 1917
(96, 21)
All fragments found in this sequence.

seq length: 1634
(82, 21)
All fragments found in this sequence.

seq length: 1840
(92, 21)
All fragments found in this sequence.

seq length: 1308
(66, 21)
All fragments found in this sequence.



### Test sampling on small-db (coverage=0.05)

In [13]:
%%time
# 34s

seq_file = '/Users/ryanqnelson/Downloads/large-scale-metagenomics-1.0/data/train-dataset/train_small-db.fasta'
taxid_file = 'data/train_small-db.species-level.taxid'
output_dir = 'data/sampling/sampling-small-05'
sample_length = 200
coverage = 0.05
seed = 42

sampling.generate_fragment_data(seq_file, taxid_file, output_dir, sample_length, coverage, seed)

ValueError: ('Output directory already exists:', 'data/sampling/sampling-small-05')

In [14]:
input_dir = 'data/sampling/sampling-small-05'
pattern = 'fragments*.npy'

fragments_small = sampling.read_fragments(input_dir, pattern)
fragments_small.shape

(690539, 2)

### Test sampling on small-db (coverage=0.05)

In [15]:
%%time
# 2 min with sampling2

seq_file = '/Users/ryanqnelson/Downloads/large-scale-metagenomics-1.0/data/train-dataset/train_small-db.fasta'
taxid_file = 'data/train_small-db.species-level.taxid'
output_dir = 'data/sampling/sampling-small-05-2'
sample_length = 200
coverage = 0.05
seed = 42

sampling2.generate_fragment_data(seq_file, taxid_file, output_dir, sample_length, coverage, seed)

ValueError: ('Output directory already exists:', 'data/sampling/sampling-small-05-2')

In [16]:
input_dir = 'data/sampling/sampling-small-05-2'
pattern = 'fragments*.npy'

fragments_small2 = sampling2.read_fragments(input_dir, pattern)
fragments_small2.shape

(690539, 201)

In [17]:
fragments_small2_rows = []
for each in fragments_small2.astype('str'):
    row = concatenate_columns(each)
    fragments_small2_rows.append(row)
len(fragments_small2_rows)

690539

In [18]:
# compare results from two sampling modules
assert fragments_small.astype('str').tolist() == fragments_small2_rows

# 2. Test encoding process

### Test encoding on toy set

In [35]:
input_dir = 'data/sampling/sampling-toy-2000'
pattern = 'fragments*.npy'

fragments_toy = sampling.read_fragments(input_dir, pattern)
fragments_toy.shape

k=4
X_toy = np.delete(fragments_toy,1,axis=1).reshape(-1,)
y_toy = fragments_toy[:,-1]
print(X_toy.shape)
print(y_toy.shape)

X_toy_encoded = encoding.encode_fragment_dataset(X_toy,k)
print(X_toy_encoded.shape)

(409,)
(409,)
(409, 921)


### Test encoding2 on toy set

In [20]:
input_dir = 'data/sampling/sampling-toy-2000-2'
pattern = 'fragments*.npy'

fragments_toy2 = sampling2.read_fragments(input_dir, pattern)
fragments_toy2.shape

k=4
X_toy_encoded2, y_toy2 = encoding2.encode_fragment_dataset(fragments_toy2,k)
print(X_toy_encoded2.shape)
print(y_toy2.shape)

(409, 921)
(409,)


In [21]:
# compare two encoding techniques
np.testing.assert_array_equal(X_toy_encoded.toarray(), X_toy_encoded2.toarray())
np.testing.assert_array_equal(y_toy.astype('str'), y_toy2)

### Test encoding on small-db (coverage=0.05)

In [22]:
%%time 
# 40 sec, (689790, 135168) with sampling,encoding

input_dir = 'data/sampling/sampling-small-05'
pattern = 'fragments*.npy'

fragments_small = sampling.read_fragments(input_dir, pattern)
fragments_small.shape


k=6
X_small = np.delete(fragments_small,1,axis=1).reshape(-1,)
y_small = fragments_small[:,-1]
print(X_small.shape)
print(y_small.shape)

X_small_encoded = encoding.encode_fragment_dataset(X_small,k)
print(X_small_encoded.shape)

(690539,)
(690539,)
(690539, 135168)
CPU times: user 35.4 s, sys: 928 ms, total: 36.3 s
Wall time: 36.4 s


### Test encoding2 on small-db (coverage=0.05)

In [23]:
%%time 
# 1 min , (690539, 135168) with sampling2,encoding2

input_dir = 'data/sampling/sampling-small-05-2'
pattern = 'fragments*.npy'

fragments_small2 = sampling2.read_fragments(input_dir, pattern)
fragments_small2.shape


k=6
X_small_encoded2, y_small2 = encoding2.encode_fragment_dataset(fragments_small2,k)
print(X_small_encoded2.shape)

(690539, 135168)
CPU times: user 1min 2s, sys: 1.35 s, total: 1min 3s
Wall time: 1min 3s


In [24]:
%%time
# 2 min

# compare two encoding techniques
np.testing.assert_array_equal(X_small_encoded[:10000].toarray(), X_small_encoded2[:10000].toarray())
np.testing.assert_array_equal(y_small.astype('str'),y_small2)


CPU times: user 37 s, sys: 54.6 s, total: 1min 31s
Wall time: 1min 53s


### Test larger k values

In [25]:
%%time 
# 1.5 min with sampling,encoding

input_dir = 'data/sampling/sampling-small-05'
pattern = 'fragments*.npy'

fragments_small = sampling.read_fragments(input_dir, pattern)
fragments_small.shape

k=10

X_small = np.delete(fragments_small,1,axis=1).reshape(-1,)
y_small = fragments_small[:,-1]
print(X_small.shape)
print(y_small.shape)

X_small_encoded = encoding.encode_fragment_dataset(X_small,k)
print(X_small_encoded.shape)

(690539,)
(690539,)
(690539, 8859697)
CPU times: user 1min 29s, sys: 2.93 s, total: 1min 32s
Wall time: 1min 33s


In [26]:
%%time 
# 54 s, (690539, 81920) with sampling2,encoding2

input_dir = 'data/sampling/sampling-small-05-2'
pattern = 'fragments*.npy'

fragments_small2 = sampling2.read_fragments(input_dir, pattern)
fragments_small2.shape

k=10

X_small_encoded2, y_small2 = encoding2.encode_fragment_dataset(fragments_small2,k)
print(X_small_encoded2.shape)

(690539, 8859697)
CPU times: user 1min 58s, sys: 3.04 s, total: 2min 1s
Wall time: 2min 2s


In [32]:
# ensure kmers are the same

input_dir = 'data/sampling/sampling-small-05'
pattern = 'fragments*.npy'
k = 10
fragments_small = sampling.read_fragments(input_dir, pattern)
fragments_small.shape
X_small = np.delete(fragments_small,1,axis=1).reshape(-1,)
y_small = fragments_small[:,-1]
print(X_small.shape)
print(y_small.shape)


X_small_kmers = encoding._generate_kmers(X_small,k)
print(X_small_kmers.shape)

(690539,)
(690539,)
(690539, 20)


In [33]:
input_dir = 'data/sampling/sampling-small-05-2'
pattern = 'fragments*.npy'

k = 10

fragments_small2 = sampling2.read_fragments(input_dir, pattern)
fragments_small2.shape

X_small_kmers2, y_small2 = encoding2._group_kmers(fragments_small2,k)
print(X_small_kmers2.shape)

(690539, 20)


In [34]:
np.testing.assert_array_equal(X_small_kmers, X_small_kmers2)