Skip to content

Commit

Permalink
Removed a legacy caution warning
Browse files Browse the repository at this point in the history
  • Loading branch information
scikal committed Jul 8, 2021
1 parent c9df3b9 commit 92dde7e
Showing 1 changed file with 51 additions and 53 deletions.
104 changes: 51 additions & 53 deletions MAKE_OBS_TAB.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,56 +12,55 @@
Daniel Ariad (daniel@ariad.org)
Jan 3rd, 2021
"""
import sys, os, time, random, warnings, argparse, re, pickle, gzip, bz2

import sys, os, time, random, argparse, re, pickle, gzip, bz2

try:
import pysam
except ModuleNotFoundError:
print('caution: the module pysam is missing.')

warnings.formatwarning = lambda message, category, filename, lineno, file=None, line=None: 'Caution: %s\n' % message
print('Caution: The module pysam is missing.')

def read_impute2(filename,**kwargs):
""" Reads an IMPUTE2 file format (LEGEND/HAPLOTYPE/SAMPLE) and builds a list
of lists, containing the dataset. """

filetype = kwargs.get('filetype', None)

def leg_format(line):
rs_id, pos, ref, alt = line.strip().split()
return ('chr'+rs_id[:2].rstrip(':'), int(pos), ref, alt)
return ('chr'+rs_id[:2].rstrip(':'), int(pos), ref, alt)

def sam_format(line):
sample_id, group1, group2, sex = line.strip().split(' ')
return (sample_id, group1, group2, int(sex))
return (sample_id, group1, group2, int(sex))

with (gzip.open(filename,'rt') if filename[-3:]=='.gz' else open(filename, 'r')) as impute2_in:
if filetype == 'leg':
if filetype == 'leg':
impute2_in.readline() # Bite off the header
result = tuple(map(leg_format,impute2_in))

elif filetype == 'hap':
firstline = impute2_in.readline() # Get first line
a0 = int(firstline.replace(' ', ''), 2)
a1 = (int(line.replace(' ', ''), 2) for line in impute2_in)
hap_tab = (a0, *a1)
number_of_haplotypes = len(firstline.strip().split())
result = hap_tab, number_of_haplotypes
elif filetype == 'sam':

elif filetype == 'sam':
impute2_in.readline() # Bite off the header
result = tuple(map(sam_format,impute2_in))

else:
result = tuple(line.strip().split() for line in impute2_in)
return result

return result

def save_obs(obs_tab,info,compress,bam_filename,output_filename,output_dir):
""" Saves the observations table together with information about
the chromosome number, depth of coverage, and flags that were used.
Also, data compression is supported in gzip and bzip2 formats. """

Open = {'bz2': bz2.open, 'gz': gzip.open}.get(compress, open)
ext = ('.'+compress) * (compress in ('bz2','gz'))
default_output_filename = re.sub('.bam$',f".{info['chr_id']:s}.obs.p{ext:s}",bam_filename.strip().split('/')[-1])
Expand All @@ -70,22 +69,22 @@ def save_obs(obs_tab,info,compress,bam_filename,output_filename,output_dir):
if output_dir!='' and not os.path.exists(output_dir): os.makedirs(output_dir)
with Open(output_dir + output_filename, "wb") as f:
pickle.dump(obs_tab, f, protocol=4)
pickle.dump(info, f, protocol=4)
pickle.dump(info, f, protocol=4)
return output_dir + output_filename

def retrive_bases(bam_filename,legend_filename,fasta_filename,handle_multiple_observations,min_bq,min_mq,max_depth,output_filename,compress,**kwargs):
""" Retrives observed bases from known SNPs position. """
time0 = time.time()
random.seed(a=None, version=2) #I should set a=None after finishing to debug the code.

UNMAP, SECONDARY, QCFAIL, DUP = 0x4,0x100,0x200,0x400 #From http://www.htslib.org/doc/samtools-flags.html

if not os.path.isfile(bam_filename): raise Exception('Error: BAM file does not exist.')
if not os.path.isfile(legend_filename): raise Exception('Error: LEGEND file does not exist.')
if fasta_filename!='' and not os.path.isfile(fasta_filename): raise Exception('Error: FASTA file does not exist.')

obs_tab = list()

try:
genome_reference = pysam.FastaFile(fasta_filename) if fasta_filename!='' else None
samfile = pysam.AlignmentFile(bam_filename, 'rb' )
Expand All @@ -95,7 +94,7 @@ def retrive_bases(bam_filename,legend_filename,fasta_filename,handle_multiple_ob
raise Exception('Error: Unsuitable legend file. All SNP positions should refer to the same chr_id.')
else:
chr_id = leg_tab[0][0]

arg = {'contig': chr_id, # The chr_id of the considered chromosome.
'start': leg_tab[0][1]-1, # The first snp in chr_id (start is inclusive).
'end': leg_tab[-1][1], # The last snp in chr_id (end is exclusive).
Expand All @@ -107,25 +106,24 @@ def retrive_bases(bam_filename,legend_filename,fasta_filename,handle_multiple_ob
'ignore_overlaps': True, # If set to True, detect if read pairs overlap and only take the higher quality base.
'flag_filter': UNMAP | SECONDARY | QCFAIL | DUP, # Ignore reads where any of the bits in the flag are set. The default of samtools is UNMAP, SECONDARY, QCFAIL and DUP.
'ignore_orphans': True, # Ignore orphans (paired reads that are not in a proper pair).
'fastafile': genome_reference, # FastaFile object of a reference sequence.
'fastafile': genome_reference, # FastaFile object of a reference sequence.
'compute_baq': True} # By default, performs re-alignment computing per-Base Alignment Qualities (BAQ), if a reference sequence is given.'
leg_tab_iterator = iter(leg_tab)
pos = 0

leg_tab_iterator = iter(leg_tab)
pos = 0
for pileupcolumn in samfile.pileup(**arg):

while pileupcolumn.pos > pos-1: ### Chromosomal position starts from 1 in bam files, while it starts from 0 in obs files.
chr_id,pos,ref,alt = next(leg_tab_iterator)
chr_id,pos,ref,alt = next(leg_tab_iterator)

if pileupcolumn.pos == pos-1:
rows = [(pos, pileupread.alignment.query_name, pileupread.alignment.query_sequence[pileupread.query_position])
for pileupread in pileupcolumn.pileups if pileupread.query_position!=None] # query_position is None if the base on the padded read is a deletion or a skip (e.g. spliced alignment).

rows = [(pos, pileupread.alignment.query_name, pileupread.alignment.query_sequence[pileupread.query_position])
for pileupread in pileupcolumn.pileups if pileupread.query_position!=None] # query_position is None if the base on the padded read is a deletion or a skip (e.g. spliced alignment).

if pileupcolumn.get_num_aligned()==1:
obs_tab.extend(rows) #Each tuple in the list has the form (chromosomal posistion, associated line number in the legend file, reads id, observed allele)
else:
warnings.warn('Multiple reads were found to overlap at one or more SNP positions.')
if handle_multiple_observations=='all':
obs_tab.extend(rows)
elif handle_multiple_observations=='first':
Expand All @@ -135,57 +133,57 @@ def retrive_bases(bam_filename,legend_filename,fasta_filename,handle_multiple_ob
elif handle_multiple_observations=='skip':
pass
else:
raise Exception('error: handle_multiple_observations only supports the options \"skip\", \"all\", \"first\" and \"random\".')
raise Exception('Error: handle_multiple_observations only supports the options \"skip\", \"all\", \"first\" and \"random\".')

info = {'redo-BAQ': fasta_filename!='',
'handle-multiple-observations' : handle_multiple_observations,
'min-bq': min_bq,
'min-mq' : min_mq,
'max-depth' : max_depth,
'chr_id': chr_id,
'depth': len(obs_tab)/len(leg_tab)}
'depth': len(obs_tab)/len(leg_tab)}

if output_filename!=None:
save_obs(obs_tab,info,compress,bam_filename,output_filename,kwargs.get('output_dir', 'results'))

finally:
if genome_reference!=None: genome_reference.close()
samfile.close()

time1 = time.time()
print('Done building the observations table in %.2f sec.' % (time1-time0))
return tuple(obs_tab), info

if __name__ == "__main__":
if __name__ == "__main__":

parser = argparse.ArgumentParser(
description='Builds a table of single base observations at known SNP positions.')
parser.add_argument('bam_filename', metavar='BAM_FILENAME', type=str,
parser.add_argument('bam_filename', metavar='BAM_FILENAME', type=str,
help='BAM file')
parser.add_argument('legend_filename', metavar='LEG_FILENAME', type=str,
parser.add_argument('legend_filename', metavar='LEG_FILENAME', type=str,
help='IMPUTE2 legend file')
parser.add_argument('-f','--fasta_filename', type=str,metavar='FASTA_FILENAME', default='',
help='The faidx-indexed reference file in the FASTA format. '
help='The faidx-indexed reference file in the FASTA format. '
'Supplying a reference file will reduce false SNPs caused by misalignments using the Base Alignment Quality (BAQ) method described in the paper “Improving SNP discovery by base alignment quality”, Heng Li, Bioinformatics, Volume 27, Issue 8.')
parser.add_argument('-u', '--handle-multiple-observations', type=str,
metavar='all/first/random/skip', default='all',
parser.add_argument('-u', '--handle-multiple-observations', type=str,
metavar='all/first/random/skip', default='all',
help='We expect to observe at most a single base per SNP. When encountering an exception the default behavior is to keep all the observations.'
'However, a few alternative options to handle multiple observations are available: (a) take the first observed base, (b) pick randomly an observed base and (c) skip the observed bases.')
parser.add_argument('-b', '--min-bq', type=int,
metavar='INT', default=30,
parser.add_argument('-b', '--min-bq', type=int,
metavar='INT', default=30,
help='Minimum base quaility for observations. Default value is 30.')
parser.add_argument('-m', '--min-mq', type=int,
parser.add_argument('-m', '--min-mq', type=int,
metavar='INT', default=30,
help='Minimum mapping quaility for observations. Default value 30.')
parser.add_argument('-d', '--max-depth', type=int,
parser.add_argument('-d', '--max-depth', type=int,
metavar='INT', default=0,
help='Maximum depth coverage to be considered (inclusive). Default value is 0, effectively removing the depth limit.')
parser.add_argument('-o', '--output-filename', metavar='OUTPUT_FILENAME', type=str, default='',
help='Output filename. The default filename is the same as the BAM filename, but with an extension of .chr_id.obs.p')
parser.add_argument('-c', '--compress', metavar='gz/bz2/unc', type=str, default='unc',
help='Output compressed via gzip, bzip2 or uncompressed. Default is uncompressed.')

retrive_bases(**vars(parser.parse_args()))
sys.exit(0)
else:
print('The module MAKE_OBS_TAB was imported.')
print('The module MAKE_OBS_TAB was imported.')

0 comments on commit 92dde7e

Please sign in to comment.