In [7]:
import crispr_assembler as ca
import numpy as np

from multiprocessing import Pool

from tqdm import tqdm_notebook
# %load_ext autoreload
# %autoreload 2

In [2]:
# Toy inputs: the repeat unit, its reverse complement, and a read in which
# two full copies of the repeat separate three numbered placeholder spacers.
pattern = 'AAACCC'
pattern_rc = 'GGGTTT'
read = 'CCC_111111_AAACCC_2222222_AAACCC_3333333_AAA'

In [3]:
# Repeat descriptor for the toy example, built from four string arguments.
# NOTE(review): the arguments look like the repeat with its 'GT' overlap in
# parentheses, the two overlapping halves ('GGGT', 'GTTT'), and the full
# 'GGGTTT' sequence — confirm against ca.Repeat's signature.
repeat = ca.Repeat('GG(GT)TT', 'GGGT', 'GTTT', 'GGGTTT')

In [7]:
# Check `read` against the reverse-complement repeat with e=0 and unpack the
# three returned values.
# NOTE(review): `e` is presumably the allowed mismatch count, and `id2`
# (a list of slices, per the saved output of the next cell) presumably
# locates the repeat in the reverse-complemented read — confirm in
# crispr_assembler.
r, id1, id2 = ca.check_reverse_complementarity(read, pattern_rc, e=0)

In [14]:
id2

[slice(12, 18, None), slice(27, 33, None)]

In [17]:
# Split the output of ca.rc(read, r=1) around the repeat, passing the
# already-computed slices `id2` as the third argument.
# NOTE(review): ca.rc(..., r=1) presumably returns the reverse complement,
# and supplying `id2` presumably lets the function skip its own repeat
# search — confirm in crispr_assembler.
ca.split_read_by_repeat(ca.rc(read,r=1), repeat, id2, e=0)

[slice(4, 12, None), slice(18, 27, None), slice(33, 40, None)]

In [20]:
# Transform the read once instead of once per loop iteration, then print
# the part of it selected by each item that split_read_by_repeat returns.
read_rc = ca.rc(read, r=1)
for spacer_slice in ca.split_read_by_repeat(read_rc, repeat, e=0):
    print(read_rc[spacer_slice])

3333333_
_2222222_
_111111


In [22]:
%timeit ca.split_read_by_repeat(ca.rc(read,r=1), repeat, id2, e=0)

102 µs ± 1.33 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [23]:
%timeit ca.split_read_by_repeat(ca.rc(read,r=1), repeat, e=0)

133 µs ± 188 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [25]:
# Smoke-test split_function on the toy read with e=0.
# The positional arguments are '', read, '+', read and the toy repeat, so
# `read` is supplied twice.
# NOTE(review): judging by how process_func forwards its parameters, these
# slots are (position, read, plus, quality), i.e. the read doubles as its
# own quality string here — confirm.
ca.split_function('',read,'+',read, repeat,e=0)

(1,
 ['3333333_', '_2222222_', '_111111'],
 [['3333333_', '_2222222_', '_111111']])

In [24]:
# Read the whole FASTQ file into memory, one list entry per line
# (each entry keeps its trailing newline).
with open("../article/data/fastq/es3_250.fastq") as f:
    lines = list(f)

In [25]:
lines[:10]

['@M02459:39:000000000-AHGUU:1:1101:15529:1333 1:N:0:14\n',
 'CGCTGGCGCGGGGAACTCAGTGCTGGACCATTTCAAATTCACAATAGATTCGGTTTATCCCCGCTGGTGCGGGGAACACGGAATGATATTTCAATAAATAATTATAACAAT\n',
 '+\n',
 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII\n',
 '@M02459:39:000000000-AHGUU:1:1101:15932:1476 1:N:0:14\n',
 'GCGCCAGCGGGATAAACCGAGCACAAATATCATCGCTCAAACCACTTACGGGTGTTCCCCGCGCCAGCGGGGATAAACCGCCTCGCTGTAAATTCCAAAAACGATCTCTATAGTGTTCCCCGCGCCAGCGGGGATAAACCGAC\n',
 '+\n',
 'AIIIIIIIIIIIGIIGIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGIIIIIIIIIIHIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII\n',
 '@M02459:39:000000000-AHGUU:1:1101:17965:1514 1:N:0:14\n',
 'GCGCCAGCGTGATAAACCGGCGTCCGTCAAAGAGGAAGCCAAAGCCATGCAGTGTTCCCCGCGCCAGCG\n']

In [26]:
def process_func(position, read, plus, quality):
    """Forward one record to ``ca.split_function`` with fixed settings.

    The four arguments are passed through unchanged, in order, followed by
    the repeat object ``ca.ecoli_r_2`` and the keyword ``e=3``.

    Args:
        position: First field of the record, forwarded as-is.
        read: Second field of the record, forwarded as-is.
        plus: Third field of the record, forwarded as-is.
        quality: Fourth field of the record, forwarded as-is.

    Returns:
        Whatever ``ca.split_function`` returns for these arguments.
    """
    repeat = ca.ecoli_r_2
    return ca.split_function(position, read, plus, quality, repeat, e=3)

In [27]:
# Processor built over the first 256 entries of `lines`, with process_func
# as its process_function.
# NOTE(review): 256 lines is 64 four-line FASTQ records, yet the saved
# outputs further down report 250 results — those outputs look stale
# relative to this slice; confirm whether `source=lines` was intended.
fastq_p = ca.FastqProcessor(process_function=process_func, source=lines[:256])

In [28]:
# Pool of 6 worker processes.
# NOTE(review): nothing in the visible cells calls p.close()/p.join(), so
# the workers are never shut down explicitly.
p = Pool(6)

In [29]:
%%time
# Gather the results of every batch into one flat list. The index that
# enumerate() supplied was never read, so the batches are iterated directly.
res_batch = []
for batch in fastq_p.process_by_batch(p, len(lines)):
    res_batch.extend(batch)

CPU times: user 4.17 ms, sys: 2.11 ms, total: 6.28 ms
Wall time: 17.7 ms


In [30]:
%%time
# Run the processor without the pool and keep every result. The index from
# enumerate() was never read, so tqdm_notebook wraps the iterable itself.
res_no_batch = []
for res in tqdm_notebook(fastq_p.process()):
    res_no_batch.append(res)

A Jupyter Widget


CPU times: user 86.6 ms, sys: 1.44 ms, total: 88 ms
Wall time: 84.4 ms


In [32]:
res_no_batch[2]

(-1,
 'GCGCCAGCGTGATAAACCGGCGTCCGTCAAAGAGGAAGCCAAAGCCATGCAGTGTTCCCCGCGCCAGCG\n',
 ['GCGCCAGCGTGATAAACCGGCGTCCGTCAAAGAGGAAGCCAAAGCCATGCAGTGTTCCCCGCGCCAGCG\n'],
 ['?I??III?I2BIIIIII>@I@E>IIAI@IIIIII>II@>III?I?IIAAIIII?IIIAI@???IIIIII\n'])

In [34]:
['\t'.join(x[2]) for x in res_batch]

['AGTGCTGGACCATTTCAAATTCACAATAGA\tGGAATGATATTTCAATAAATAATTATAACAAT\n',
 '\n\tTATAGAGATCGTTTTTGGAATTTACAGCG\tCCGTAAGTGGTTTGAGCGATGATATTTGTGC',
 'GCGCCAGCGTGATAAACCGGCGTCCGTCAAAGAGGAAGCCAAAGCCATGCAGTGTTCCCCGCGCCAGCG\n',
 'GACAGAACGGCCTCAGTAGTCTCGTCAGG\tACACGCCCTTATCAAATAGCTGTGATTTAC\tAT\n',
 'CAAGTGATATCCATCATCGCATCCAGTGCG\tAGTGCTGGACCATTTCAAATTCACAATAGA\tGGAATGATATTTCAATAAATAATTATAAC\tCCCTCACACCGATTCGCCAAACGGTGGAG\tAGCTGGGCGAAATTTTGATTCATCGTGATGACCGGTTTATCCCGCTG\n',
 'GACAGAACGGCCTCAGTAGTCTCGTCAGG\tACACGCCCTTATCAAATAGCTGTGATTTAC\tAT',
 '\nGGGAACACTATAGAGATCGTTTTTGGAATTTACAGCGA\tCCGTAAGTGGTTTGAGCGATGATATTTGTGCT',
 'TACCGTACAGACTGCCGGATATTATTTTTT\tCTGCTGCTCGAGCTGGTGGAGTGCTGCTATAGCGGTTTATC\n',
 'CCCTCACACCCATTCGCCAAACGGTGGAG\tAGCTGGGCGAAATTTTGATTCATCGTGATGA',
 'TACCGTACAGACTGCCGGATATTATTTTT\tCTGCTGCTCGAGCTGGTGGAGTGCTGCTA\tAACGTCAGGTTGTCGCCGCTCTGCGTGGT\tTGCCTGGCTTTGGCTTCCT',
 'GGGGAACACTGCATGGCTTTGGCTTCCTCTTTGACGGAC\tATATATCAGA\n',
 'ATAGACCCCGAACAACAATACGCGCAAAC\tTACCGTACAGACTGCCGGATATTATT

In [8]:
len(res_batch)

250

In [9]:
len(res_no_batch)

250

In [67]:
res_batch[0]

(0,
 'CGCTGGCGCGGGGAACTCAGTGCTGGACCATTTCAAATTCACAATAGATTCGGTTTATCCCCGCTGGTGCGGGGAACACGGAATGATATTTCAATAAATAATTATAACAAT\n',
 ['AGTGCTGGACCATTTCAAATTCACAATAGA', 'GGAATGATATTTCAATAAATAATTATAACAAT\n'],
 [['IIIIIIIIIIIIIIIIIIIIIIIIIIIIII', 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII\n']])

In [68]:
res_no_batch[0]

(0,
 'CGCTGGCGCGGGGAACTCAGTGCTGGACCATTTCAAATTCACAATAGATTCGGTTTATCCCCGCTGGTGCGGGGAACACGGAATGATATTTCAATAAATAATTATAACAAT\n',
 ['AGTGCTGGACCATTTCAAATTCACAATAGA', 'GGAATGATATTTCAATAAATAATTATAACAAT\n'],
 [['IIIIIIIIIIIIIIIIIIIIIIIIIIIIII', 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII\n']])

In [10]:
# Compare the two result lists with plain list equality: it recurses into
# the nested tuples/lists and always yields a single bool. Converting these
# ragged records to NumPy arrays first is fragile — recent NumPy refuses to
# build ragged arrays without dtype=object, and builtin all() cannot reduce
# a multi-dimensional comparison result.
res_batch == res_no_batch

True