Skip to content

Commit

Permalink
Added breakpoint coverage calculation function.
Browse files Browse the repository at this point in the history
  • Loading branch information
ryanabo committed May 26, 2016
1 parent 1570026 commit 0ee8de2
Showing 1 changed file with 64 additions and 143 deletions.
207 changes: 64 additions & 143 deletions breakmer/results/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
BreaKmer results module
'''

import sys

import pysam
import breakmer.utils as utils


Expand All @@ -20,15 +21,18 @@ class SVResult(object):
'''
'''

def __init__(self, sv_event, contig, target_region_values, disc_reads):
def __init__(self, sv_event, sample_bam_file, contig, target_region_values, disc_reads):
self.logging_name = 'breakmer.results.results'
self.sample_bam = sample_bam_file
self.sv_event = sv_event
self.contig = contig
self.disc_reads = disc_reads
self.filter = False
# self.query_region = target_region_values
self.query_region = target_region_values
self.sv_type = None
self.rearr_desc = None
self.query_cov = []
self.breakpoint_values = None
self.values = {'anno_genes': None,
'target_breakpoints': None,
'align_cigar': '',
Expand All @@ -55,6 +59,12 @@ def set_values(self):
return

self.sv_type = self.sv_event.get_event_type()
self.query_cov = [0] * len(self.contig.seq.value)
for realigned_segment in self.sv_event.realignments:
for i in range(realigned_segment.get_coords('query')[0], realigned_segment.get_coords('query')[1]):
i = min(i, len(self.query_cov)-1)
self.query_cov[i] += 1

if self.sv_type == 'indel':
self.set_indel_values()
else:
Expand All @@ -74,6 +84,7 @@ def set_indel_values(self):
self.values['align_cigar'] = realigned_segment.cigar
self.values['sv_type'] = 'indel'
self.values['split_read_count'] = ",".join([str(self.contig.get_contig_counts().get_counts(x, x, 'indel')) for x in realigned_segment.query_brkpts])
self.values['breakpoint_coverages'] = self.get_brkpt_coverages()

def set_rearrangement_values(self):

Expand Down Expand Up @@ -127,8 +138,8 @@ def set_rearrangement_values(self):
# self.br_sorted = sorted(self.br_sorted, key=lambda br: br[1])

if not self.multiple_genes(brkpts['chrs'], brkpts['r'], res_values['anno_genes']):
brkpt_counts, brkpt_kmers, brkpt_rep_filt = self.get_brkpt_counts_filt(brkpts, 'rearr')

brkpt_counts, brkpt_kmers, brkpt_rep_filter = self.get_brkpt_counts_filt(brkpts, 'rearr')
self.breakpoint_values = {'kmers': brkpt_kmers, 'counts': brkpt_counts, 'rep_filter': brkpt_rep_filter, 'ref_pos': brkpts['r']}
sv_type, sv_subtype, disc_read_support = self.define_rearr(self.disc_reads, brkpts['r'], res_values['strands'], brkpts['tcoords'], brkpts['qcoords'])
# if not self.filter_rearr(query_region, params, brkpts['r'], brkpt_counts, brkpt_kmers, rearr_type, disc_read_support):
res_values['sv_type'] = sv_type
Expand All @@ -139,7 +150,8 @@ def set_rearrangement_values(self):
res_values['split_read_count'] = brkpt_counts['b']
# result = self.format_result(res_values)
else:
brkpt_counts, brkpt_kmers, brkpt_rep_filt = self.get_brkpt_counts_filt(brkpts, 'trl')
brkpt_counts, brkpt_kmers, brkpt_rep_filter = self.get_brkpt_counts_filt(brkpts, 'trl')
self.breakpoint_values = {'kmers': brkpt_kmers, 'counts': brkpt_counts, 'rep_filter': brkpt_rep_filter, 'ref_pos': brkpts['r']}
# if not self.filter_trl(valid_rearrangement, query_region, params, brkpt_counts, brkpt_kmers, disc_read_count, res_values['anno_genes'], max_repeat, brkpt_rep_filt):
res_values['disc_read_count'] = self.check_disc_reads(brkpts['t'], self.disc_reads['disc'])
res_values['sv_type'] = ['rearrangement']
Expand All @@ -150,6 +162,8 @@ def set_rearrangement_values(self):
for key in res_values.keys():
if key in self.values:
self.values[key] = res_values[key]
self.values['breakpoint_coverages'] = self.get_brkpt_coverages()
self.valid_rearrangement = valid_rearrangement

def get_brkpt_info(self, br, brkpt_d, i, last_iter):

Expand Down Expand Up @@ -197,6 +211,49 @@ def get_brkpt_info(self, br, brkpt_d, i, last_iter):
brkpt_d['formatted'].append(str(br.get_name('hit')) + ":" + "-".join([str(x) for x in tbrkpt]))
return brkpt_d

def get_brkpt_coverages(self):

    '''Calculate sample BAM read coverage at each SV breakpoint.

    The breakpoint locations stored in self.values['target_breakpoints']
    (either a list of strings or a single string, each formatted as
    'chr:pos' or 'chr:pos1-pos2', optionally followed by a parenthesized
    annotation) are parsed into genomic positions. For each position the
    overlapping reads in the sample BAM file are counted, excluding
    duplicates, QC failures, unmapped reads, and reads with mapping
    quality below 10.

    Returns:
        str: Comma-delimited read counts, one value per breakpoint
            position, in the order the breakpoints were parsed.
    '''

    brkpts = []
    tbp = self.values['target_breakpoints']
    if not isinstance(tbp, list):
        # Normalize a single breakpoint string to a one-element list,
        # keeping only the location token before any trailing annotation.
        # (The previous code left a bare string unwrapped when it held no
        # '(', which then iterated over individual characters below.)
        tbp = [tbp.split()[0]]

    for bp in tbp:
        chrom, locs = bp.split(':')
        chrom = chrom.replace('chr', '')
        positions = locs.split('-')
        # Record a half-open (pos, pos + 1) window per breakpoint position
        # so the fetch below returns reads overlapping that exact base.
        for loc in positions[:2]:
            brkpts.append((chrom, int(loc), int(loc) + 1))

    bamfile = pysam.Samfile(self.sample_bam, 'rb')
    covs = [0] * len(brkpts)
    try:
        # enumerate keeps the output index aligned with the breakpoint even
        # when a fetch fails. (The previous manual counter was not advanced
        # on the exception path, shifting all later coverages one slot left.)
        for bp_index, (chrom, start, end) in enumerate(brkpts):
            try:
                areads = bamfile.fetch(str(chrom), start, end)
            except ValueError:
                # Chromosome missing from the BAM header or invalid region;
                # leave this breakpoint's coverage at 0.
                continue
            cov = 0
            for aread in areads:
                # Skip duplicates, QC failures, unmapped and low-quality
                # (MAPQ < 10) alignments.
                if aread.is_duplicate or aread.is_qcfail or aread.is_unmapped or aread.mapq < 10:
                    continue
                cov += 1
            covs[bp_index] = cov
    finally:
        # Always release the BAM file handle (previously leaked).
        bamfile.close()
    return ",".join([str(x) for x in covs])

def multiple_genes(self, chrs, brkpts, anno_genes):

'''
Expand Down Expand Up @@ -405,7 +462,7 @@ def get_brkpt_counts_filt(self, brkpts, sv_type):
brkpt_counts['n'].append(min(bc))
brkpt_counts['d'].append(min(self.contig.get_contig_counts().get_counts((qb[0]-1), (qb[0]+1), sv_type)))
brkpt_counts['b'].append(self.contig.get_contig_counts().get_counts(qb[0], qb[0], sv_type))
# brkpt_kmers.append(self.contig_kmer_locs[qb[0]])
brkpt_kmers.append(self.contig.get_kmer_locs()[qb[0]])
brkpt_rep_filt = brkpt_rep_filt or (comp_vec[qb[0]] < (avg_comp/2))
utils.log(self.logging_name, 'debug', 'Read count around breakpoint %d: %s'%(qb[0],",".join([str(x) for x in bc])))
utils.log(self.logging_name, 'debug', 'Kmer count around breakpoints %s'%(",".join([str(x) for x in brkpt_kmers])))
Expand Down Expand Up @@ -788,139 +845,3 @@ def set_event_type(self, event_type):
# rs = max(rs)
# return type, rs

# def filter_rearr(self, query_region, params, brkpts, brkpt_counts, brkpt_kmers, rearr_type, disc_read_count):
# print 'Breakpoint counts', brkpt_counts
# print params.opts
# in_ff, span_ff = utils.filter_by_feature(brkpts, query_region, params.get_param('keep_intron_vars'))
# filter = (min(brkpt_counts['n']) < params.get_param('rearr_sr_thresh')) or self.br_sorted[0][1] < params.get_param('rearr_minseg_len') or (in_ff and span_ff) or (disc_read_count < 1) or (rearr_type == 'rearrangement') or (min(brkpt_kmers) == 0)
# utils.log(self.logging_name, 'info' ,'Check filter for rearrangement')
# utils.log(self.logging_name, 'info', 'Filter by feature for being in exon (%r) or spanning exon (%r)'%(in_ff, span_ff))
# utils.log(self.logging_name, 'info', 'Split read threshold %d, breakpoint read counts %d'%(min(brkpt_counts['n']), params.get_param('rearr_minseg_len')))
# utils.log(self.logging_name, 'info', 'Minimum segment length observed (%d) meets threshold (%d)'%(self.br_sorted[0][1], params.get_param('rearr_minseg_len')))
# utils.log(self.logging_name, 'info', 'Minimum discordant read pairs for rearrangement (%d)'%(disc_read_count))
# return filter

# def filter_trl(self, br_valid, query_region, params, brkpt_counts, brkpt_kmers, disc_read_count, anno_genes, max_repeat, rep_filt):
# filter = br_valid[1] or (max(brkpt_counts['d']) < params.get_param('trl_sr_thresh')) #or not br_valid[0]
# utils.log(self.logging_name, 'debug', 'Check translocation filter')
# utils.log(self.logging_name, 'debug', 'All blat result segments are within annotated or pre-specified regions %r' % br_valid[0])
# utils.log(self.logging_name, 'debug', 'All blat result segments are within simple repeat regions that cover > 75.0 percent of the segment %r' % br_valid[1])
# utils.log(self.logging_name, 'debug', 'The maximum read count support around breakpoints %d meets split read threshold %d' % (max(brkpt_counts['d']), params.get_param('trl_sr_thresh')))
# utils.log(self.logging_name, 'debug', 'The minimum number of kmers at breakpoints %d' % min(brkpt_kmers))
# utils.log(self.logging_name, 'debug', 'The maximum repeat overlap by a blat result: %f' % max_repeat)
# if not filter:
# utils.log(self.logging_name, 'debug', 'Filter %r, checking discordant read counts %d'%(filter, disc_read_count))
# if disc_read_count < 2:
# # print 'Filter due to repeat', rep_filt
# if (self.br_sorted[0][1] < params.get_param('trl_min_seg_len')) or (min(brkpt_counts['n']) < params.get_param('trl_sr_thresh')) or (min(brkpt_kmers)==0) or rep_filt:
# utils.log(self.logging_name, 'debug', 'Shortest segment is < %d bp with %d discordant reads. Filtering.'%(params.get_param('trl_minseg_len'), disc_read_count))
# utils.log(self.logging_name, 'debug', 'The minimum read count support for breakpoints %d meets split read threshold %d'%(min(brkpt_counts['n']),params.get_param('trl_sr_thresh')))
# utils.log(self.logging_name, 'debug', 'The minimum number of kmers at breakpoints %d' % min(brkpt_kmers))
# filter = True
# elif disc_read_count == 0:
# # Check a number of metrics for shortest blat segment
# br_qs = self.br_sorted[0][0].qstart()
# br_qe = self.br_sorted[0][0].qend()
# low_complexity = self.minseq_complexity(self.contig_seq[br_qs:br_qe],3) < 25.0 # Complexity of blat segment
# missing_qcov = self.missing_query_coverage() > 5.0
# short = self.br_sorted[0][1] <= round(float(len(self.contig_seq))/float(4.0))
# utils.log(self.logging_name, 'debug', 'Checking length of shortest sequence, considered too short %r, %d, %f'%(short, self.br_sorted[0][1], round(float(len(self.contig_seq))/float(4.0))) )
# overlap = max(self.br_sorted[0][0].seg_overlap) > 5
# gaps_exist = max(self.br_sorted[0][0].gaps['query'][0], self.br_sorted[0][0].gaps['hit'][0]) > 0
# low_uniqueness = self.check_uniqueness()
# intergenic_regions = 'intergenic' in anno_genes
# read_strand_bias = self.check_read_strands()
# check_values = [low_complexity, missing_qcov, short, overlap, gaps_exist, low_uniqueness, read_strand_bias, intergenic_regions]
# utils.log(self.logging_name, 'debug', 'Discordant read count of 0 checks %s' % (",".join([str(x) for x in check_values])))
# num_checks = 0
# for check in check_values:
# if check:
# num_checks += 1
# if num_checks > 1:
# utils.log(self.logging_name, 'info', 'Two or more filter checks, setting filtering to true for contig')
# filter = True
# return filter

# def check_uniqueness(self):
# low_unique = False
# for br_vals in self.br_sorted:
# if not br_vals[0].in_target:
# if br_vals[0].mean_cov > 4:
# low_unique = True
# else:
# if br_vals[0].mean_cov > 10:
# low_unique = True
# return low_unique

# def check_read_strands(self):
# same_strand = False
# strands = []
# for read in self.contig_reads:
# strand = read.id.split("/")[1]
# strands.append(strand)
# if len(set(strands)) == 1:
# same_strand = True
# utils.log(self.logging_name, 'debug', 'Checking read strands for contig reads %s' % (",".join([read.id for read in self.contig_reads])))
# utils.log(self.logging_name, 'debug', 'Reads are on same strand: %r' % same_strand)
# return same_strand

# def minseq_complexity(self, seq, N):
# utils.log(self.logging_name, 'debug', 'Checking sequence complexity of blat result segment %s using %d-mers' % (seq, N))
# nmers = {}
# total_possible = len(seq) - 2
# for i in range(len(seq) - (N - 1)):
# nmers[str(seq[i:i+N]).upper()] = True
# complexity = round((float(len(nmers))/float(total_possible))*100,4)
# utils.log(self.logging_name, 'debug', 'Complexity measure %f, based on %d unique %d-mers observed out of a total of %d %d-mers possible' % (complexity, len(nmers), N, total_possible, N))
# return complexity

# def missing_query_coverage(self):
# missing_cov = 0
# for i in self.query_cov:
# if i == 0:
# missing_cov += 1
# else:
# break

# for i in reversed(self.query_cov):
# if i == 0:
# missing_cov += 1
# else:
# break

# perc_missing = round((float(missing_cov)/float(len(self.contig_seq)))*100, 4)
# utils.log(self.logging_name, 'debug', 'Calculated %f missing coverage of blat query sequence at beginning and end' % perc_missing)
# return perc_missing

# def multiple_genes(self, chrs, brkpts, anno_genes):

# '''
# '''

# mult_genes = True
# if len(set(anno_genes)) == 1:
# utils.log(self.logging_name, 'debug', 'One annotated gene among SVs breakpoints: %s' % ",".join(anno_genes))
# mult_genes = False
# elif self.dup_gene_names(anno_genes) and len(set(chrs)) == 1 and ((max(brkpts) - min(brkpts)) < 10000 ):
# utils.log(self.logging_name, 'debug', 'Anno genes are not the same, but similar and breakpoints are less than 10Kb from each other %s' % ",".join(anno_genes))
# mult_genes = False
# utils.log(self.logging_name, 'debug', 'Test whether SVs breakpoints are in multiple genes %r' % mult_genes)
# return mult_genes

# def dup_gene_names(self, anno_genes):
# found_dup = False
# for i in range(len(anno_genes)-1):
# g1 = anno_genes[i]
# for g2 in anno_genes[(i+1):]:
# if (g1.find(g2) > -1) or (g2.find(g1) > -1):
# found_dup = True
# return found_dup

# def check_disc_reads(self, brkpts, query_region, disc_reads ):
# disc_read_count = 0
# if brkpts['other'][0] in disc_reads:
# for p1, p2 in disc_reads[brkpts['other'][0]]:
# d1 = abs(p1 - brkpts['in_target'][1])
# d2 = abs(p2 - brkpts['other'][1])
# if d1 <= 1000 and d2 <= 1000: disc_read_count += 1
# return disc_read_count

0 comments on commit 0ee8de2

Please sign in to comment.