## dump-phasing-from-BAM

Dumps read info, including the haplotype taken from the special phasing tag (`ZH`), into a space-delimited text file.

In [52]:
%run ipy_setup.py

In [53]:
def get_refmap(bamread):
    """Map reference positions to positions within the read.

    Args:
        bamread: an object exposing ``get_aligned_pairs(matches_only=True)``
            that returns pairs of (position in read, position in reference).

    Returns:
        dict keyed by reference position whose value is the read position
        aligned to it.
    """
    aligned = bamread.get_aligned_pairs(matches_only=True)
    # pair[0] is the position in the read, pair[1] the position in the reference.
    return {pair[1]: pair[0] for pair in aligned}

def get_matched_bases_in_read(bamread, in_pos, refmap):
    """Return the read bases found at the given reference positions.

    Args:
        bamread: read object whose ``seq`` attribute is indexable by read position.
        in_pos: iterable of reference positions to look up.
        refmap: dict of reference position -> read position (see ``get_refmap``).

    Returns:
        A string with one character per entry of ``in_pos``; positions that
        are absent from ``refmap`` are reported as 'N'.
    """
    bases = []
    for ref_pos in in_pos:
        if ref_pos in refmap:
            bases.append(bamread.seq[refmap[ref_pos]])
        else:
            # No aligned read base for this reference position.
            bases.append('N')
    return ''.join(bases)

def get_position_in_read(bamread, in_pos, refmap):
    """Translate one reference position into a position within the read.

    Args:
        bamread: unused; kept so the signature parallels the other helpers.
        in_pos: reference position to translate.
        refmap: dict of reference position -> read position.

    Returns:
        The read position stored in ``refmap`` for ``in_pos``, or None when
        ``in_pos`` is not a key of ``refmap``.
    """
    return refmap[in_pos] if in_pos in refmap else None

def reverse_compl(seq):
    """Return the reverse complement of a DNA sequence.

    The previous implementation complemented each base but never reversed
    the sequence, so it returned the plain complement despite its name.

    Args:
        seq: sequence (e.g. a string) of upper-case bases from A, C, G, T, N.

    Returns:
        A string holding the complemented bases in reverse order.

    Raises:
        KeyError: if ``seq`` contains a character other than A, C, G, T or N.
    """
    complement = {'A': 'T',
                  'T': 'A',
                  'C': 'G',
                  'G': 'C',
                  'N': 'N'}
    # Walk the sequence back-to-front so the result is on the opposite strand.
    return ''.join(complement[base] for base in reversed(seq))

class BlockVariant:
    """One variant line belonging to a HapCut block.

    The line is tab-separated with exactly nine columns:
    variant_id, haplotype_1, haplotype_2, chromosome, position, refallele,
    variantallele, genotype,
    allele_counts:genotype_likelihoods:delta:MEC_variant
    """

    def __init__ (self, variantline):
        columns = variantline.strip().split("\t")
        # Unpacking enforces the nine-column layout (ValueError otherwise).
        (var_id, hap1, hap2, chrom, pos,
         r_allele, v_allele, genotype, info_str) = columns

        self.var_id = int(var_id)
        self.pos = int(pos)
        self.hap1 = hap1  # haplotype calls are kept as the raw strings
        self.hap2 = hap2
        self.chrom = chrom
        self.r_allele = r_allele
        self.v_allele = v_allele
        self.info_str = info_str

        # Only the first four colon-separated info fields are interpreted.
        info_fields = info_str.split(":")[:4]
        allele_counts, genotype_likelihoods, delta, MEC_variant = info_fields

        counts = [int(c) for c in allele_counts.split(",")]
        self.ref_count, self.alt_count = counts

        likelihoods = [float(g) for g in genotype_likelihoods.split(",")]
        gen_00, gen_01, gen_11 = likelihoods
        self.gen_like = {"0/0": gen_00, "0/1": gen_01, "1/1": gen_11}

        self.delta = float(delta)
        self.MEC_variant = MEC_variant  # left as the raw string

    def __repr__ (self):
        return "<BlockVariant, var_id: %s>" % str(self.var_id)


class Block: # part of block reader
    """A phased block parsed from a HapCut ``BLOCK:`` header line.

    Variants are attached afterwards with ``addVariant``; the block's
    ``chrom``/``start``/``end`` span is derived from the variants added.
    The read-related attributes (``informative_reads``, ``unphased_reads``,
    ``unphased_read_set``, ``read_count``, ``read_set``) are only initialised
    here; nothing in this class fills them.
    """

    def __init__ (self, blockline):
        # "BLOCK: offset:" first_variant_block "len:" length_of_block "phased": phased_variants_block SPAN:
        # lengthspanned MECscore score fragments #fragments

        ll               = blockline.strip().split()
        self.offset      = int(ll[2])
        self.total_len   = int(ll[4])
        self.phased      = int(ll[6])
        self.span        = int(ll[8])
        self.MECscore    = float(ll[10])
        self.fragments   = int(ll[12])
        self.variants    = dict() # var_id -> BlockVariant; default to empty
        self.variant_ids = set()
        self.chrom = None
        self.start = None
        self.end = None
        self.informative_reads = []
        self.unphased_reads = []
        self.unphased_read_set = set()
        self.read_count = 0
        self.read_set = set()

    def __repr__ (self):
        return "<Block, offset_id: %s>" % str(self.offset)

    def addVariant(self, variantline):
        """Parse ``variantline`` into a BlockVariant and register it on this block."""
        variant = BlockVariant(variantline)
        self.variants[variant.var_id] = variant
        self.variant_ids.add(variant.var_id)
        # NOTE(review): the variant is stored even when updatePosition rejects
        # its contig; kept as-is because callers may rely on it.
        self.updatePosition(variant)

    def updatePosition(self, variant): # we need to do this because sometimes the variant isn't associated with a block
        """Grow the block's span so that it includes ``variant``.

        Args:
            variant: object with ``chrom`` and ``pos`` attributes.

        Returns:
            1 if the variant lies on a different contig than the block (the
            span is left untouched and a warning is written to stderr),
            otherwise None.
        """
        if self.chrom is None or self.start is None or self.end is None:
            # First variant seen: it defines the initial span.
            self.chrom = variant.chrom
            self.start = variant.pos
            self.end = variant.pos
            return None

        # Check the contig *before* touching start/end, so a variant from
        # another contig cannot corrupt the span.
        if self.chrom != variant.chrom:
            sys.stderr.write("WARNING: Cannot add variants from other contigs to current block.\n")
            return 1

        if variant.pos < self.start:
            self.start = variant.pos
        elif variant.pos > self.end:
            self.end = variant.pos
        return None

    def concordance(self, input_read):
        '''Count how many read alleles agree with this block's phasing.

        Args:
            input_read: iterable of read objects. Each read is expected to
                expose ``positions`` (variant ids it covers), ``alleles``
                (parallel to ``positions``) and ``haplotypes`` (indexable by
                block offset; the value 2 selects haplotype 2, anything else
                haplotype 1).

        Returns:
            A dict with a single entry keyed by this block's offset:
            ``{"hap1": (support, against), "hap2": (support, against)}``,
            with the counts summed over all variants of the block.
        '''
        variant_concord = dict()
        support_reads_hap2 = 0
        against_reads_hap2 = 0
        support_reads_hap1 = 0
        against_reads_hap1 = 0
        # The parameter is named ``input_read`` but holds a collection of
        # reads; the original looped over an undefined ``input_reads``.
        for variant in self.variants.values():
            for read in input_read:
                if variant.var_id in read.positions:
                    read_allele = read.alleles[read.positions.index(variant.var_id)]
                    hapstate = read.haplotypes[self.offset]
                    if hapstate == 2:
                        if variant.hap2 != read_allele:
                            against_reads_hap2 += 1
                        else:
                            support_reads_hap2 += 1
                    else:
                        if variant.hap1 != read_allele:
                            against_reads_hap1 += 1
                        else:
                            support_reads_hap1 += 1
        variant_concord[self.offset] = {"hap1": (support_reads_hap1, against_reads_hap1),
                                        "hap2": (support_reads_hap2, against_reads_hap2)}
        return variant_concord

    def variant(self, var_id):
        """Return the variant stored under ``var_id``, or None if there is none."""
        try:
            return self.variants[var_id]
        except (KeyError, TypeError):
            # TypeError covers unhashable ids, which the old bare except also swallowed.
            return None

class HapCutReader: # hapcut file reader
    """Load every block of a HapCut output file into memory.

    Attributes:
        fn: path of the file that was read.
        blocks: dict of block offset -> Block.
        translate: dict of variant id -> offset of the block containing it.
    """

    def __init__ ( self, fn ):
        self.fn = fn
        self.blocks = dict()
        self.translate = dict()
        for block in self.read_file_to_blocks(fn):
            self.blocks[block.offset] = block
            for v in block.variant_ids: # v is an id
                self.translate[v] = block.offset

    def loc(self, block_id):
        """Return the Block containing the given id, or None if it is unknown.

        ``block_id`` is looked up in ``self.translate``, i.e. it is treated as
        a variant id and resolved to the block that holds that variant.
        """
        try:
            return self.blocks[self.translate[block_id]]
        except (KeyError, TypeError):
            # TypeError covers unhashable ids, which the old bare except also swallowed.
            return None

    def read_file_to_blocks(self, fn):
        """Yield one Block per block section found in the file ``fn``.

        A line starting with "B" opens a new block, a line starting with "*"
        closes the current block, and any other non-blank line is parsed as a
        variant of the current block.
        """
        with open(fn) as f:
            currBlock = None
            pending = False  # True while currBlock has not been yielded yet
            for l in f:
                if not l.strip():
                    # Blank lines carry no data; parsing them as variants would crash.
                    continue
                if l[0] == "B": # starting a new block
                    currBlock = Block(l)
                    pending = True
                elif l[0] == "*": # ending a block
                    # Guard against a stray terminator yielding None or the same block twice.
                    if pending:
                        yield currBlock
                        pending = False
                else:
                    currBlock.addVariant(l)
            if pending:
                # The file ended without a closing "*" line: still report the last block.
                yield currBlock

    def __repr__ (self):
        return "<HapCutReader, filename: %s>" % self.fn



In [82]:
# Dump, for every read assigned to a phased block, how many of the block's
# variants the read supports on each haplotype.
# Output line: <block id> <read haplotype (0/1)> <support hap 0> <support hap 1> <read name>#<block_count>
basedir = "/hpc/users/neffr01/jason_new/hapcut_outputs/hg002_re_000000F"
bam_file = basedir + "/hapcut_qv13_mq10/hg002.000000F.qv13mq10.redo.merged.ann.bam"
hapcut_file = basedir + "/hapcut_qv13_mq10/hg002_hapcut_000000F.hapcut"

bam_reader = pysam.AlignmentFile(bam_file)
block_reader = HapCutReader(hapcut_file)
pbar = ProgressBar(len(block_reader.blocks))

# Open the output only once the inputs have loaded, and close it even if the
# loop below raises.
out_file = open("/hpc/users/neffr01/2work/hg002_dump_read_phasing.txt", 'w')
try:
    for bid, block in block_reader.blocks.items():
        pbar.animate()
        # (pos, ref allele, alt allele, hap1 call, hap2 call) per variant of the block
        block_variants = [(v.pos, v.r_allele, v.v_allele, int(v.hap1), int(v.hap2))
                          for v in block.variants.values()]
        # Variant positions are 1-based (they are looked up as pos-1 in the
        # 0-based refmap below), whereas fetch() takes a 0-based, half-open
        # interval; shift the start so reads ending on the first variant, and
        # single-variant blocks, are not missed.
        fetch_start = None if block.start is None else max(block.start - 1, 0)
        for read in bam_reader.fetch(region="000000F", start=fetch_start, end=block.end):
            block_count = 0
            # The ZH tag is parsed as "block,hap;block,hap;..."; skip reads
            # that lack it, carry a malformed value, or are not assigned to
            # this block.
            try:
                zhtag = read.get_tag("ZH").split(";")
                read_blocks = [int(a.split(",")[0]) for a in zhtag]
                read_haps   = [int(a.split(",")[1]) for a in zhtag]
                block_index = read_blocks.index(bid)
            except (KeyError, ValueError, IndexError):
                continue
            # Build the (comparatively expensive) alignment map only for reads we keep.
            refmap = get_refmap(read)
            start, end = read.reference_start, read.reference_end
            block_hap = read_haps[block_index] - 1  # 1/2 in the tag -> 0/1
            read_variants = [x for x in block_variants if start <= x[0] <= end]
            supp = {0: 0, 1: 0}
            for var in read_variants:
                readbase = get_matched_bases_in_read(read, [var[0] - 1], refmap)
                base_to_hap = {var[1]: var[3],
                               var[2]: var[4]}
                hap = base_to_hap.get(readbase)
                # Ignore bases matching neither allele ('N', other bases) and
                # haplotype calls other than 0/1.
                if hap in supp:
                    supp[hap] += 1
            out_file.write("%s %s %s %s %s\n" % (bid, block_hap, supp[0], supp[1],
                                                 read.qname + "#" + str(block_count)))
finally:
    out_file.close()

[****************100%******************]  292 of 293 complete Elapsed: 91.5 minutes	 Remaining: 0.3 minutes
