
Commit

added uts
sadkat committed Sep 26, 2018
1 parent ffe4b12 commit 1c5aaed
Showing 2 changed files with 96 additions and 12 deletions.
51 changes: 39 additions & 12 deletions magellan/magellan/_mapper.py
@@ -83,7 +83,7 @@ def _sort_and_index(in_file, out_file):

def __init__(self, in_file):
'''
in_file could be bam or sam format. an extension of si (for sorted,indexed) is added
in_file can be in bam or sam format. An extension of si is added to the sorted, indexed bam file that is generated.
'''
self._base_path, fname = os.path.split(in_file)
f_base, f_ext = os.path.splitext(fname)
@@ -93,16 +93,27 @@ def __init__(self, in_file):

self._alignment_file = pysam.AlignmentFile(self._bam_file, 'rb')

#check indices (0 baseD?)
def count_by_region(self,
ref_name,
start,
stop,
read_filter_callback='nofilter'):
'''
Count reads mapped to a region specified by reference name, start and stop positions.
A callback function for filtering reads can be provided with read_filter_callback (default 'nofilter' for no filtering).
Reads for which the callback returns True will be retained.
Returns an integer representing the total number of reads mapped to the region.
'''
return self._alignment_file.count(contig=ref_name, start=start, stop=stop, read_callback=read_filter_callback)

def count_all_references(self,
read_filter_callback='nofilter'):
'''
Counts reads for all references appearing in the alignment file, across their entire length. Internally calls count_by_region.
A callback function for filtering reads can be provided with read_filter_callback (default 'nofilter' for no filtering).
Reads for which the callback returns True will be retained.
Returns a pandas DataFrame with the following fields: name, length, count.
'''
counts = []
for r, l in zip(self._alignment_file.references, self._alignment_file.lengths):
count_entry = {
@@ -117,34 +128,50 @@ def filter(self,
start=None,
stop=None,
read_filter_callback='nofilter'):
'''
Creates an iterator over all reads, either in a region or in the entire alignment file, that meet the criteria specified by read_filter_callback
(default 'nofilter' for no filtering).
'''
reads_iterator = self._alignment_file.fetch(contig, start, stop)
for r in reads_iterator:
if read_filter_callback == 'nofilter' or read_filter_callback(r):
yield r

def write_reads(self,
out_file,
write_sam=False,
contig=None,
start=None,
stop=None,
read_filter_callback='nofilter'):

filtered_reads = pysam.AlignmentFile(out_file, 'wb', template=self._alignment_file)
'''
Write reads to file, optionally filtering reads with read_filter_callback (default 'nofilter' for no filtering).
Can also be used to convert bam to sam format.
'''
if write_sam:
filtered_reads = pysam.AlignmentFile(out_file, 'w', template=self._alignment_file)
else:
filtered_reads = pysam.AlignmentFile(out_file, 'wb', template=self._alignment_file)
reads_iterator = self._alignment_file.fetch(contig, start, stop)
for r in reads_iterator:
if read_filter_callback != 'nofilter' and read_filter_callback(r):
if read_filter_callback == 'nofilter' or read_filter_callback(r):
filtered_reads.write(r)
filtered_reads.close()


def is_high_map_quality(read, map_qual_threshold=5):
return read.mapping_quality > map_qual_threshold
def is_high_map_quality(read, min_qual_threshold=5):
'''
Example filter to be used as read_filter_callback when calling AlignmentParser functions.
Returns True if the MAPQ of the read is >= min_qual_threshold (default 5), False otherwise.
'''
return read.mapping_quality >= min_qual_threshold


def has_low_mismatches(read, max_num_mismatches=5):
if not read.has_tag('NM'):
'''
Example filter to be used as read_filter_callback when calling AlignmentParser functions.
Returns True if the number of mismatches for the read is <= max_num_mismatches (default 5), False otherwise.
'''
if not read.has_tag('NM'): # unaligned
return False
return read.get_tag('NM') <= max_num_mismatches
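
Taken together, the documented methods form a small read-counting and filtering API on top of pysam. A minimal usage sketch, assuming AlignmentParser and is_high_map_quality are importable from the magellan package (as the tests below use them); the input path, reference name and coordinates are hypothetical:

import magellan

# Opening a sam/bam file creates a sorted, indexed "_si" bam next to it.
ap = magellan.AlignmentParser('reads.sam')  # hypothetical input file

# Per-reference read counts as a pandas DataFrame with name, length and count columns.
counts_df = ap.count_all_references()

# Count reads in a single region, keeping only reads with MAPQ >= 10;
# 'chr1' and the coordinates are placeholders for a real reference and interval.
n = ap.count_by_region('chr1', 0, 5000,
                       read_filter_callback=lambda r: magellan.is_high_map_quality(r, 10))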




57 changes: 57 additions & 0 deletions magellan/test/_test.py
@@ -143,6 +143,63 @@ def test_missing_inputs(self):
if os.path.isfile(l):
os.remove(l)


class _TestAlignmentParser(unittest.TestCase):
@classmethod
def setUpClass(cls):

magellan.bowtie2_build_index(
os.path.join(_sample_data_dir, 'sample_bac_genome_short.fa'),
os.path.join(_sample_data_dir, 'test_index'))

magellan.bowtie2_map(
os.path.join(_sample_data_dir, 'sample_reads_for_mapper.fastq'),
os.path.join(_sample_data_dir, 'test_index'),
os.path.join(_sample_data_dir, 'sample_reads_for_mapper.sam'),
os.path.join(_sample_data_dir, 'sample_reads_for_mapper.log'))

@classmethod
def tearDownClass(cls):

index_files = glob.glob(os.path.join(_sample_data_dir, 'test_index*'))  # match all generated index files (e.g. test_index.1.bt2)

for i in index_files:
os.remove(i)

os.remove(os.path.join(_sample_data_dir, 'sample_reads_for_mapper.sam'))
os.remove(os.path.join(_sample_data_dir, 'sample_reads_for_mapper_si.bam'))
os.remove(os.path.join(_sample_data_dir, 'sample_reads_for_mapper_si.bam.bai'))
os.remove(os.path.join(_sample_data_dir, 'sample_reads_for_mapper.log'))

def test_basic(self):
ap = magellan.AlignmentParser(os.path.join(_sample_data_dir, 'sample_reads_for_mapper.sam'))

self.assertTrue(ap._alignment_file.check_index())  # check_index() must be called; the bare method object is always truthy

ap.write_reads(os.path.join(_sample_data_dir, 'sample_reads_for_mapper_copy.sam'), write_sam=True)
os.remove(os.path.join(_sample_data_dir, 'sample_reads_for_mapper_copy.sam'))

def test_count(self):
ap = magellan.AlignmentParser(os.path.join(_sample_data_dir, 'sample_reads_for_mapper.sam'))
read_df = ap.count_all_references()
read_df.set_index('name', inplace=True)
read_dict = read_df['count'].to_dict()
sam_df = pd.read_csv(os.path.join(_sample_data_dir, 'sample_reads_for_mapper.sam'), sep='\t', skiprows=5, header=None)
sam_dict = sam_df.groupby(2)[0].count().to_dict()
self.assertDictEqual(read_dict, sam_dict)

def test_filter(self):
ap = magellan.AlignmentParser(os.path.join(_sample_data_dir, 'sample_reads_for_mapper.sam'))

it = ap.filter(read_filter_callback=lambda read: magellan.is_high_map_quality(read, 4))
filtered_reads = []
for r in it:
filtered_reads.append(r.query_name)

sam_df = pd.read_csv(os.path.join(_sample_data_dir, 'sample_reads_for_mapper.sam'), sep='\t', skiprows=5, header=None)
filtered_sam_reads = sam_df[sam_df[4] >= 4][0].values.tolist()  # >= matches is_high_map_quality's MAPQ >= threshold check
self.assertListEqual(sorted(filtered_reads), sorted(filtered_sam_reads))


if __name__ == '__main__':
unittest.main()
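
For completeness, a short sketch of the write_reads path exercised by test_basic above; the file names are hypothetical, and has_low_mismatches is assumed to be exported from magellan in the same way is_high_map_quality is:

import magellan

ap = magellan.AlignmentParser('reads.sam')  # hypothetical input file

# Keep only reads with at most 2 mismatches and write them out in sam format;
# write_sam=True also makes this a simple bam-to-sam conversion.
ap.write_reads('filtered.sam',
               write_sam=True,
               read_filter_callback=lambda r: magellan.has_low_mismatches(r, 2))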
