Skip to content

Commit

Permalink
Merge pull request #25 from martinghunt/master
Browse files Browse the repository at this point in the history
sanity check input; contig filtering
  • Loading branch information
nds committed Jul 29, 2015
2 parents 554de8c + 8caf563 commit 0fa4a78
Show file tree
Hide file tree
Showing 20 changed files with 353 additions and 20 deletions.
47 changes: 43 additions & 4 deletions circlator/bamfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ def __init__(
bam,
outprefix,
length_cutoff=100000,
min_read_length=250,
contigs_to_use=None,
discard_unmapped=False,
log_prefix='[bamfilter]',
):
self.bam = os.path.abspath(bam)
Expand All @@ -22,6 +25,9 @@ def __init__(
self.reads_fa = os.path.abspath(outprefix + '.fasta')
self.log = os.path.abspath(outprefix + '.log')
self.log_prefix = log_prefix
self.contigs_to_use = self._get_contigs_to_use(contigs_to_use)
self.discard_unmapped = discard_unmapped
self.min_read_length = min_read_length


def _get_ref_lengths(self):
Expand All @@ -30,6 +36,33 @@ def _get_ref_lengths(self):
return dict(zip(sam_reader.references, sam_reader.lengths))


def _get_contigs_to_use(self, contigs_to_use):
'''If contigs_to_use is a set, returns that set. If it's None, returns an empty set.
Otherwise, assumes it's a file name, and gets names from the file'''
if type(contigs_to_use) == set:
return contigs_to_use
elif contigs_to_use is None:
return set()
else:
f = pyfastaq.utils.open_file_read(contigs_to_use)
contigs_to_use = set([line.rstrip() for line in f])
pyfastaq.utils.close(f)
return contigs_to_use


def _check_contigs_to_use(self, ref_dict):
'''Checks that the set of contigs to use are all in the reference
fasta lengths dict made by self._get_ref_lengths()'''
if self.contigs_to_use is None:
return True

for contig in self.contigs_to_use:
if contig not in ref_dict:
raise Error('Requested to use contig "' + contig + '", but not found in input BAM file "' + self.bam + '"')

return True


def _all_reads_from_contig(self, contig, fout):
'''Gets all reads from contig called "contig" and writes to fout'''
sam_reader = pysam.Samfile(self.bam, "rb")
Expand Down Expand Up @@ -97,7 +130,7 @@ def _get_region(self, contig, start, end, fout, min_length=250):

if read.is_reverse:
seq.revcomp()

if len(seq) >= min_length:
print(seq, file=fout)

Expand All @@ -110,15 +143,19 @@ def run(self):
print(self.log_prefix, '#contig', 'length', 'reads_kept', sep='\t', file=f_log)

for contig in sorted(ref_lengths):
if len(self.contigs_to_use) > 0 and contig not in self.contigs_to_use:
print(self.log_prefix, contig, ref_lengths[contig], 'skipping', sep='\t', file=f_log)
continue

if ref_lengths[contig] <= self.length_cutoff:
self._all_reads_from_contig(contig, f_fa)
print(self.log_prefix, contig, ref_lengths[contig], 'all', sep='\t', file=f_log)
else:
end_bases_keep = int(0.5 * self.length_cutoff)
start = end_bases_keep - 1
end = max(end_bases_keep - 1, ref_lengths[contig] - end_bases_keep)
self._get_region(contig, 0, start, f_fa)
self._get_region(contig, end, ref_lengths[contig], f_fa)
self._get_region(contig, 0, start, f_fa, min_length=self.min_read_length)
self._get_region(contig, end, ref_lengths[contig], f_fa, min_length=self.min_read_length)
print(
self.log_prefix,
contig,
Expand All @@ -128,6 +165,8 @@ def run(self):
file=f_log
)

self._get_all_unmapped_reads(f_fa)
if not self.discard_unmapped:
self._get_all_unmapped_reads(f_fa)

pyfastaq.utils.close(f_fa)
pyfastaq.utils.close(f_log)
14 changes: 13 additions & 1 deletion circlator/common.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import sys
import os
import subprocess

version = '0.14.1'
class Error (Exception): pass

version = '0.15.0'

def syscall(cmd, allow_fail=False, verbose=False):
if verbose:
Expand Down Expand Up @@ -36,3 +39,12 @@ def decode(x):
except:
return x
return s


def check_files_exist(filenames):
'''Dies if any files in the list of filenames does not exist'''
files_not_found = [x for x in filenames if not os.path.exists(x)]
if len(files_not_found):
for filename in files_not_found:
print('File not found: "', filename, '"', sep='', file=sys.stderr)
raise Error('File(s) not found. Cannot continue')
39 changes: 36 additions & 3 deletions circlator/tasks/all.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ def run():
mapreads_group.add_argument('--bwa_opts', help='BWA options, in quotes [%(default)s]', default='-x pacbio', metavar='STRING')

bam2reads_group = parser.add_argument_group('bam2reads options')
bam2reads_group.add_argument('--b2r_discard_unmapped', action='store_true', help='Use this to not keep unmapped reads')
bam2reads_group.add_argument('--b2r_only_contigs', help='File of contig names (one per line). Only reads that map to these contigs are kept (and unmapped reads, unless --b2r_discard_unmapped is used). Note: the whole assembly is still used as a reference when mapping', metavar='FILENAME')
bam2reads_group.add_argument('--b2r_length_cutoff', type=int, help='All reads mapped to contigs shorter than this will be kept [%(default)s]', default=100000, metavar='INT')
bam2reads_group.add_argument('--b2r_min_read_length', type=int, help='Minimum length of read to output [%(default)s]', default=250, metavar='INT')

assemble_group = parser.add_argument_group('assemble options')
assemble_group.add_argument('--assemble_spades_k', help='Comma separated list of kmers to use when running SPAdes. Max kmer is 127 and each kmer should be an odd integer [%(default)s]', default='127,121,111,101,95,91,85,81,75,71', metavar='k1,k2,k3,...')
Expand Down Expand Up @@ -58,9 +61,20 @@ def run():
print_message('{:_^79}'.format(' Checking external programs '), options)
circlator.external_progs.check_all_progs(verbose=options.verbose)

files_to_check = [options.assembly, options.reads]
if options.b2r_only_contigs:
files_to_check.append(options.b2r_only_contigs)
options.b2r_only_contigs = os.path.abspath(options.b2r_only_contigs)

if options.genes_fa:
files_to_check.append(options.genes_fa)

circlator.common.check_files_exist(files_to_check)

original_assembly = os.path.abspath(options.assembly)
original_reads = os.path.abspath(options.reads)


try:
os.mkdir(options.outdir)
except:
Expand All @@ -69,6 +83,7 @@ def run():

os.chdir(options.outdir)

original_assembly_renamed = '00.input_assembly.fasta'
bam = '01.mapreads.bam'
filtered_reads_prefix = '02.bam2reads'
filtered_reads = filtered_reads_prefix + '.fasta'
Expand All @@ -81,11 +96,17 @@ def run():
fixstart_prefix = '06.fixstart'
fixstart_fasta = fixstart_prefix + '.fasta'

pyfastaq.tasks.to_fasta(
original_assembly,
original_assembly_renamed,
strip_after_first_whitespace=True,
check_unique=True
)

#-------------------------------- mapreads -------------------------------
print_message('{:_^79}'.format(' Running mapreads '), options)
circlator.mapping.bwa_mem(
original_assembly,
original_assembly_renamed,
original_reads,
bam,
threads=options.threads,
Expand All @@ -99,7 +120,10 @@ def run():
bam_filter = circlator.bamfilter.BamFilter(
bam,
filtered_reads_prefix,
length_cutoff=options.b2r_length_cutoff
length_cutoff=options.b2r_length_cutoff,
min_read_length=options.b2r_min_read_length,
contigs_to_use=options.b2r_only_contigs,
discard_unmapped=options.b2r_discard_unmapped,
)
bam_filter.run()

Expand All @@ -116,6 +140,15 @@ def run():
a.run()


#------------------------------ filter original assembly -----------------
if options.b2r_only_contigs:
print_message('{:_^79}'.format(' --b2r_only_contigs used - filering contigs '), options)
assembly_to_use = merge_prefix + '.00.filtered_assembly.fa'
pyfastaq.tasks.filter(original_assembly_renamed, assembly_to_use, ids_file=options.b2r_only_contigs)
else:
assembly_to_use = original_assembly_renamed


#-------------------------------- merge ----------------------------------
print_message('{:_^79}'.format(' Running merge '), options)
if not options.no_pair_merge:
Expand All @@ -125,7 +158,7 @@ def run():
options.merge_opts.extend(['--reads', filtered_reads])

m = circlator.merge.Merger(
original_assembly,
assembly_to_use,
reassembly,
merge_prefix,
nucmer_diagdiff=options.merge_diagdiff,
Expand Down
8 changes: 7 additions & 1 deletion circlator/tasks/bam2reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,21 @@ def run():
parser = argparse.ArgumentParser(
description = 'Make reads from mapping to be reassembled',
usage = 'circlator bam2reads [options] <in.bam> <outprefix>')
parser.add_argument('--discard_unmapped', action='store_true', help='Use this to not keep unmapped reads')
parser.add_argument('--only_contigs', help='File of contig names (one per line). Only reads that map to these contigs are kept (and unmapped reads, unless --discard_unmapped is used).', metavar='FILENAME')
parser.add_argument('--length_cutoff', type=int, help='All reads mapped to contigs shorter than this will be kept [%(default)s]', default=100000, metavar='INT')
parser.add_argument('--min_read_length', type=int, help='Minimum length of read to output [%(default)s]', default=250, metavar='INT')
parser.add_argument('bam', help='Name of input bam file', metavar='in.bam')
parser.add_argument('outprefix', help='Prefix of output filenames')
options = parser.parse_args()

bam_filter = circlator.bamfilter.BamFilter(
options.bam,
options.outprefix,
length_cutoff=options.length_cutoff
length_cutoff=options.length_cutoff,
min_read_length=options.min_read_length,
contigs_to_use=options.only_contigs,
discard_unmapped=options.discard_unmapped,
)
bam_filter.run()

81 changes: 72 additions & 9 deletions circlator/tests/bamfilter_test.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import unittest
import filecmp
import os
import pyfastaq
import pyfastaq
from circlator import bamfilter

modules_dir = os.path.dirname(os.path.abspath(bamfilter.__file__))
Expand All @@ -20,11 +20,38 @@ def test_get_ref_lengths(self):
self.assertEqual(expected, b._get_ref_lengths())


def test_get_contigs_to_use(self):
'''test _get_contigs_to_use'''
b = bamfilter.BamFilter(os.path.join(data_dir, 'bamfilter_test_get_contigs_to_use.bam'), 'out')
test_file = os.path.join(data_dir, 'bamfilter_test_get_contigs_to_use.infile')
self.assertEqual(b._get_contigs_to_use(test_file), {'contig42', 'contig4444244'})
self.assertEqual(b._get_contigs_to_use(None), set())
self.assertEqual(b._get_contigs_to_use({'42', '43'}), {'42', '43'})


def test_check_contigs_to_use(self):
'''test _check_contigs_to_use'''
input_bam = os.path.join(data_dir, 'bamfilter_test_check_contigs_to_use.bam')
b = bamfilter.BamFilter(input_bam, 'out')
ref_lengths = b._get_ref_lengths()
self.assertTrue(b._check_contigs_to_use(ref_lengths))

b = bamfilter.BamFilter(input_bam, 'out', contigs_to_use={'1'})
self.assertTrue(b._check_contigs_to_use(ref_lengths))

b = bamfilter.BamFilter(input_bam, 'out', contigs_to_use={'1', '2'})
self.assertTrue(b._check_contigs_to_use(ref_lengths))

with self.assertRaises(bamfilter.Error):
b = bamfilter.BamFilter(input_bam, 'out', contigs_to_use={'42'})
self.assertTrue(b._check_contigs_to_use(ref_lengths))


def test_all_reads_from_contig(self):
'''test _all_reads_from_contig'''
b = bamfilter.BamFilter(os.path.join(data_dir, 'bamfilter_test_all_reads_from_contig.bam'), 'out')
tmp = 'tmp.test_all_reads_from_contig.out.fa'
f = pyfastaq.utils.open_file_write(tmp)
f = pyfastaq.utils.open_file_write(tmp)
expected = os.path.join(data_dir, 'bamfilter_test_all_reads_from_contig.reads.fa')
b._all_reads_from_contig('1', f)
pyfastaq.utils.close(f)
Expand All @@ -37,7 +64,7 @@ def test_get_all_unmapped_reads(self):
b = bamfilter.BamFilter(os.path.join(data_dir, 'bamfilter_test_get_all_unmapped_reads.bam'), 'out')
expected = os.path.join(data_dir, 'bamfilter_test_get_all_unmapped_reads.reads.fa')
tmp = 'tmp.test_get_all_unmapped_reads.out.fa'
f = pyfastaq.utils.open_file_write(tmp)
f = pyfastaq.utils.open_file_write(tmp)
b._get_all_unmapped_reads(f)
pyfastaq.utils.close(f)
self.assertTrue(filecmp.cmp(expected, tmp, shallow=False))
Expand All @@ -49,10 +76,10 @@ def test_break_reads(self):
b = bamfilter.BamFilter(os.path.join(data_dir, 'bamfilter_test_break_reads.bam'), 'out')
expected = os.path.join(data_dir, 'bamfilter_test_break_reads.broken_reads.fa')
tmp = 'tmp.test_break_reads.out.fa'
f = pyfastaq.utils.open_file_write(tmp)
f = pyfastaq.utils.open_file_write(tmp)
b._break_reads('contig1', 390, f, min_read_length=5)
pyfastaq.utils.close(f)
self.assertTrue(filecmp.cmp(expected, tmp))
self.assertTrue(filecmp.cmp(expected, tmp, shallow=False))
os.unlink(tmp)


Expand All @@ -61,10 +88,10 @@ def test_exclude_region(self):
b = bamfilter.BamFilter(os.path.join(data_dir, 'bamfilter_test_exclude_region.bam'), 'out')
expected = os.path.join(data_dir, 'bamfilter_test_exclude_region.reads.fa')
tmp = 'tmp.test_exclude_reads.out.fa'
f = pyfastaq.utils.open_file_write(tmp)
f = pyfastaq.utils.open_file_write(tmp)
b._exclude_region('1', 500, 700, f)
pyfastaq.utils.close(f)
self.assertTrue(filecmp.cmp(expected, tmp))
self.assertTrue(filecmp.cmp(expected, tmp, shallow=False))
os.unlink(tmp)


Expand All @@ -76,7 +103,7 @@ def test_get_region_start(self):
f = pyfastaq.utils.open_file_write(tmp)
b._get_region('1', 0, 64, f, min_length=20)
pyfastaq.utils.close(f)
self.assertTrue(filecmp.cmp(expected, tmp))
self.assertTrue(filecmp.cmp(expected, tmp, shallow=False))
os.unlink(tmp)


Expand All @@ -88,5 +115,41 @@ def test_get_region_end(self):
f = pyfastaq.utils.open_file_write(tmp)
b._get_region('2', 379, 499, f, min_length=20)
pyfastaq.utils.close(f)
self.assertTrue(filecmp.cmp(expected, tmp))
self.assertTrue(filecmp.cmp(expected, tmp, shallow=False))
os.unlink(tmp)


def test_run_keep_unmapped(self):
'''test run keep unmapped'''
outprefix = 'tmp.bamfilter_run'
b = bamfilter.BamFilter(
os.path.join(data_dir, 'bamfilter_test_run.bam'),
outprefix,
length_cutoff=600,
min_read_length=100,
contigs_to_use={'contig1', 'contig3', 'contig4'}
)
b.run()
expected = os.path.join(data_dir, 'bamfilter_test_run_keep_unmapped.out.reads.fa')
self.assertTrue(filecmp.cmp(expected, outprefix + '.fasta', shallow=False))
os.unlink(outprefix + '.fasta')
os.unlink(outprefix + '.log')


def test_run_discard_unmapped(self):
'''test run keep unmapped'''
outprefix = 'tmp.bamfilter_run'
b = bamfilter.BamFilter(
os.path.join(data_dir, 'bamfilter_test_run.bam'),
outprefix,
length_cutoff=600,
min_read_length=100,
contigs_to_use={'contig1', 'contig3', 'contig4'},
discard_unmapped=True
)
b.run()
expected = os.path.join(data_dir, 'bamfilter_test_run_discard_unmapped.out.reads.fa')
self.assertTrue(filecmp.cmp(expected, outprefix + '.fasta', shallow=False))
os.unlink(outprefix + '.fasta')
os.unlink(outprefix + '.log')

15 changes: 15 additions & 0 deletions circlator/tests/common_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import unittest
import os
from circlator import common

modules_dir = os.path.dirname(os.path.abspath(common.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')


class TestCommon(unittest.TestCase):
def test_check_files_exist(self):
'''test check_files_exist'''
file_exists = os.path.join(data_dir, 'common_test_file_exists')
common.check_files_exist([file_exists])
with self.assertRaises(common.Error):
common.check_files_exist([file_exists, 'thisisnotafileandshouldcauseanerror'])
Binary file not shown.
Binary file not shown.
Empty file.
2 changes: 2 additions & 0 deletions circlator/tests/data/bamfilter_test_get_contigs_to_use.infile
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
contig42
contig4444244
Binary file added circlator/tests/data/bamfilter_test_run.bam
Binary file not shown.
Binary file added circlator/tests/data/bamfilter_test_run.bam.bai
Binary file not shown.

0 comments on commit 0fa4a78

Please sign in to comment.