In [265]:
!hostname
try:
    import readline
except ImportError:
    print("Module readline not available.")
else:
    import rlcompleter
    readline.parse_and_bind("tab: complete")

minerva4


In [6]:
# Imports / style (run this first always)

%matplotlib inline
from IPython.display import FileLink, FileLinks
from IPython.core import display
from collections import defaultdict
import json
import sys
import time

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

class AwesomeError(Exception):
     def __init__(self, value):
         self.value = value
         pass
     def __str__(self):
         return repr(self.value)
         pass

#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks
    
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
        
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

import Bio as bp
from Bio.Sequencing.Applications import BwaAlignCommandline as bwa_aln
from Bio.Sequencing.Applications import BwaSamseCommandline as bwa_samse
from Bio.Sequencing.Applications import BwaSampeCommandline as bwa_sampe
from Bio.Sequencing.Applications import BwaIndexCommandline as bwa_index
from Bio.Sequencing.Applications import BwaBwaswCommandline as bwa_bwasw
import HTSeq as ht
import subprocess

In [493]:
# %load /hpc/users/neffr01/2work/documents/scripts/greedy_partitioner.py
#!/usr/bin/env python

# imports
import os, sys, getopt
import pysam
from itertools import groupby
import pandas as pd
import numpy as np
global count
count = 0

# init main
def main(argv):
    hairs_file = ''
    hapcut_file = ''
    bam_file = ''
    out_file = ''
    help = 'greedy_partitioner.py -h <input.hairs> -c <input.hapcut> -i <input.bam> -o <output.ann.bam>'
    try:
        opts, args = getopt.getopt(argv,"h:c:i:o:",["hairs=","hapcut=", "input=", "output="])
    except getopt.GetoptError:
        print help
        sys.exit(2)
    for opt, arg in opts:
        if opt == '--help':
            print help
            sys.exit()
        elif opt in ("-h", "--hairs"):
            hairs_file = arg
        elif opt in ("-c", "--hapcut"):
            hapcut_file = arg
        elif opt in ("-i", "--input"):
            bam_file = arg
        elif opt in ("-o", "--output"):
            out_file = arg
        else:
            assert False, "unhandled option"

    assert pysam.Samfile(bam_file, 'rb'), 'ERROR: Cannot open bam file for reading.'
    assert open(bam_file + '.bai', 'rb'), 'ERROR: bam file is not indexed!'
    bam_fp = pysam.Samfile(bam_file, 'rb')

    if out_file==None:
        out_file = bam_file + ".ann_haplotypes_" + time.strftime("%m%d%y_%H%M%S") + '.bam'

    assert pysam.AlignmentFile(out_file, "wb", template=bam_fp), 'ERROR: Cannot open output file for writing.'
    out_fp = pysam.AlignmentFile(out_file, "wb", template=bam_fp)

    assert open(hairs_file), 'ERROR: Cannot access hairs file.'
    assert open(hapcut_file), 'ERROR: Cannot open hapcut file.'

    hair_reader = HairReader(hairs_file)
    block_reader = HapCutReader(hapcut_file)
    stats_file = hairs_file + ".interblock_stats.tsv"
    sys.stdout.write("Loaded greedy_partitoner.py, beginning execution. \n")
    sys.stdout.flush()
    tag_reads(bam_fp, hair_reader, block_reader, out_fp) #begin tagging reads
    interblock_stats(hair_reader, block_reader, stats_file) #generate interblock stats
    bam_fp.close()
    out_fp.close()

# end of main

### CLASSES ###

class BlockVariant:
    def __init__ (self, variantline):
        # variant_id haplotype_1 haplotype_2 chromosome position refallele variantallele genotype allele_counts:genotype_likelihoods:delta:MEC_variant
        ll = variantline.strip().split("\t")
        var_id, hap1, hap2, chrom, pos, r_allele, v_allele, genotype, info_str = ll
        self.chrom, self.r_allele, self.v_allele, self.info_str = chrom, r_allele, v_allele, info_str
        self.var_id, self.hap1, self.hap2, self.pos = int(var_id), hap1, hap2, int(pos)
        allele_counts, genotype_likelihoods, delta, MEC_variant = info_str.split(":")[0:4]
        self.ref_count, self.alt_count = map(int, allele_counts.split(","))
        gen_00, gen_01, gen_11 = map(float, genotype_likelihoods.split(","))
        self.gen_like = {"0/0":gen_00, "0/1":gen_01, "1/1":gen_11}
        self.delta = float(delta)
        self.MEC_variant = MEC_variant
    def __repr__ (self):
        return "<BlockVariant, var_id: %s>" % str(self.var_id)


class Block:
    def __init__ (self, blockline):
        # "BLOCK: offset:" first_variant_block "len:" length_of_block "phased": phased_variants_block SPAN: 
        # lengthspanned MECscore score fragments #fragments

        ll               = blockline.strip().split()
        self.offset      = int(ll[2])
        self.total_len   = int(ll[4])
        self.phased      = int(ll[6])
        self.span        = int(ll[8])
        self.MECscore    = float(ll[10])
        self.fragments   = int(ll[12])
        self.variants 	 = [] # default to empty
        self.variant_ids = []
        self.chrom = 0
        self.start = 0
        self.end = 0
        self.informative_reads = []
        self.read_count = 0
        self.read_set = set()

    def __repr__ (self):
        return "<Block, offset_id: %s>" % str(self.offset)

    def addVariant(self, variantline):
        variant = BlockVariant(variantline)
        self.variants.append(variant)
        self.variant_ids.append(variant.var_id)
        self.updatePosition()

    def updatePosition(self): # we need to do this because sometimes the variant isn't associated with a block
        positions = []
        chrom = None
        for variant in self.variants:
            if chrom == None:
                chrom = variant.chrom
            positions.append(variant.pos)
        self.chrom = chrom
        self.start = np.min(positions)
        self.end = np.max(positions)

    def addReadsToBlock(self, read_array):
        self.informative_reads = []
        for read in read_array:
            read_ids = [var[0] for block in read.blocks for var in block]
            if len(set(read_ids).intersection(set(self.variant_ids))) > 0:
                self.informative_reads.append(read)
        self.read_count = len(self.informative_reads)
        self.read_set = frozenset([x.read_id for x in self.informative_reads])

    def concordance(self, input_reads):
        ''' this should return a dict of (#T,#F) tuples per variant
         each element is a variant's concordance with the reads
         using the read's haplotype information, we can establish whether the read's phasing
         is consistent with how the variant was phased '''
        variant_concord = dict()
        for variant in self.variants:
            support_reads_hap2 = 0
            against_reads_hap2 = 0
            support_reads_hap1 = 0
            against_reads_hap1 = 0
            for read in input_reads:
                if variant.var_id in read.positions:
                    read_allele = read.alleles[read.positions.index(variant.var_id)]
                    hapstate = read.haplotypes[self.offset]
                    if hapstate == 2:
                        if variant.hap2 != read_allele:
                            against_reads_hap2 += 1
                        else:
                            support_reads_hap2 += 1
                    else:
                        if variant.hap1 != read_allele:
                            against_reads_hap1 += 1
                        else:
                            support_reads_hap1 += 1
            variant_concord[variant.var_id] = {"hap1": (support_reads_hap1, against_reads_hap1), 
                                               "hap2": (support_reads_hap2, against_reads_hap2)}
        return variant_concord

    def variant(self, var_id):
        return next((x for x in self.variants if var_id == x.var_id), None)

    def interblock_reads(self, input_reads):
        out_reads = []
        for read in input_reads:
            if read.read_id not in self.read_set:
                if read.chrom == self.chrom:
                    if ((read.start < self.end) & (read.end > self.end)) | \
                        ((read.end > self.start) & (read.start < self.start)) | \
                        ((read.end <= self.end) & (read.start >= self.start)):
                        out_reads.append(read)
        return out_reads

class HapCutReader:

    def __init__ ( self, fn ):
        self.fn = fn
        self.blocks = list(self.read_file_to_blocks(fn))

    def loc(self, block_id):
        return next((x for x in self.blocks if block_id in x.variant_ids), None)

    def read_file_to_blocks(self, fn):
        with open(fn) as f:
            currBlock = None
            for l in f:
                if l[0] == "B": # starting a new block
                    currBlock = Block(l)
                elif l[0] == "*": # ending a block
                    yield currBlock
                else:
                    currBlock.addVariant(l)

    def __repr__ (self):
        return "<HapCutReader, filename: %s>" % self.fn                    

class HapCutRead:

    def __init__ (self, hairline):
        #Column 1 is the number of consecutive set of SNPs covered by the fragment, NOT haplotype blocks.
        #Column 2 is the fragment id. 
        #Column 3 is the offset of the first block of SNPs covered by the fragment followed by the alleles at the SNPs in this block.
        #Column 5 is the offset of the second block of SNPs covered by the fragment followed by the alleles at the SNPs in this block.
        #...
        #The last column is a string with the quality values (Sanger fastq format) for all the alleles covered by the fragment (concatenated for all blocks). 
        #For example, if a read/fragment covers SNPs 2,3 and 5 with the alleles 0, 1 and 0 respectively, then the input will be:
        #2 read_id 2 01 5 0 AAC
        #Here AAC is the string corresponding to the quality values at the three alleles. The encoding of 0/1 is arbitrary but following the VCF format, 0 is reference and 1 is alternate. 
        hairlist = hairline.strip().split()
        self.blockcount = 0                # this information must be determined afterwards 
        self.read_id    = hairlist[1]      # read_id
        self.blocks     = []		       # an array of tuples corresponding to blocks
        self.positions  = []
        self.alleles    = []
        self.chrom      = None
        self.start      = None
        self.end        = None
        self.haplotypes = dict()             # an array of {"block_offset":"haplotype"} 
                                           # after partitioning
        for i in range(2, len(hairlist)-1, 2):
            position = int(hairlist[i])
            allele = hairlist[i+1]
            block = zip(range(position, position+len(allele)), allele)
            self.blocks.append(block)
            self.positions.extend(range(position, position+len(allele)))
            self.alleles.extend(allele)
            self.qualities  = hairlist[-1]         # a matched arary of the qualities of allele calls

    def __repr__(self):
        return "<HapCutRead, read_id: %s>" % str(self.read_id)

    def haplotype_fields(self):
        haps = ";".join([','.join([str(key), str(self.haplotypes[key])]) for key in self.haplotypes])
        haptag = [("ZH", haps), ("ZB", int(self.blockcount))]
        return haptag
    
    def addGenomicPositions(self, block_reader):
        arr = []
        chrom = None
        for position in self.positions:
            b = block_reader.loc(position)
            if b == None:
                continue
            if chrom == None:
                chrom = b.chrom
            arr.append(b.variant(position).pos)
        if len(arr) > 0:
            self.chrom = chrom
            self.start = np.min(arr)
            self.end = np.max(arr)
        else:
            self.chrom = '*'
            self.start = None
            self.end = None
    
class HairReader:

    def __init__ (self, fn):
        self.fn = fn
        self.reads = []
        with open (fn) as f:
            for l in f:
                self.reads.append(HapCutRead(l))
        self.read_set = frozenset([x.read_id for x in self.reads])

    def __repr__ (self):
        return "<HairReader, filename: %s>" % self.fn

    def loc(self, read_id):
        return next((x for x in self.reads if read_id == x.read_id))

### FUNCTIONS ###

'''

tag_reads()

Usage: Tags reads from a bam file corresponding to a particular haplotype, with haplotype
definitions from HapCut, under the optional tag "ZH".

Inputs:
    bam_fp
    A pysam.Samfile object pointing to the input file
    hair_reader
    A HairReader object pointing to the hairs file.
    block_reader
    A HapcutReader object pointing to the hapcut file.
    out_fp
    A pysam.AlignmentFile pointing to the output bam.
Outputs:
    (none - writes to out_fp)

'''

def tag_reads(bam_fp, hair_reader, block_reader, out_fp):
    ''' tag_reads(bam_fp, hair_reader, block_reader, out_fp)'''
    global count
    for bamread in bam_fp.fetch():
        count += 1
    if (count % 100) == 0:
        sys.stdout.write("\rWritten %s lines to output." % str(count))
        sys.stdout.flush()
        if bamread.query_name in hair_reader.read_set:
            read = hair_reader.loc(bamread.query_name)
            read = greedy_partition(read, block_reader)
            bamread.tags += read.haplotype_fields()      # add the haplotype information
        out_fp.write(bamread)

'''
greedy_partition()
Ryan Neff

inputs:
read
    a HapCutRead object
block_reader
    of the type HapCutReader

outputs:
    the original read, now with haplotype information.

translate hairfile alleles into blockvar IDs
get alleles in each read spanning blockvars
determine alleles for the two blocks from blockvar
partition read based on locally most probable alignment

'''

def greedy_partition(read, block_reader):
    # it turns out that the blocks provided in a hapcut file don't actually correspond to real blocks
    # just contiguous alleles?
    read.blocks = [] # because they are useless
    lastBlock = -1
    for ix, pos in enumerate(read.positions):
        currBlock = block_reader.loc(pos)
        if currBlock == None:
            continue
        currBlock = currBlock.offset
        if currBlock != lastBlock:
            read.blocks.append([])
        read.blocks[-1].append((pos, read.alleles[ix]))
        lastBlock = currBlock
    read.blockcount = len(read.blocks)
    for readblock in read.blocks:
        positions = [x[0] for x in readblock]
        alleles = [x[1] for x in readblock]
        allele_state = []
        offset = positions[0]
        hap = 0
        block = block_reader.loc(offset) #retrieve block the read is in
        offset = block.offset
        if block == None:
            continue
        for ix, varpos in enumerate(positions):
            blockvar = block.variant(varpos)
            if blockvar.hap1 == alleles[ix]:
                allele_state.append(-1)
            elif blockvar.hap2 == alleles[ix]:
                allele_state.append(1)
            else:
                sys.stderr.write("\nERROR: read allele matched no haplotypes.\n")
                sys.stderr.write("\nHair read: %s" % read.read_id)
                sys.stderr.write("\nAlleles: %s" % str(alleles[ix]))
                sys.stderr.write("\n Hap 1: %s, Hap 2: %s\n" % (blockvar.hap1, blockvar.hap2))
                sys.stderr.flush()
                continue
        if len(allele_state) < 1:
            sys.stderr.write("Warning: no haplotype information in read.\n")
            hap = -1
        if sum(allele_state) < 0:
            hap = 1
        elif sum(allele_state) > 0:
            hap = 2
        else:
            hap = 0
        read.haplotypes[offset] = hap
    return read

'''
interblock_stats()

Usage: Creates a tab-separated values file with statistics about reads overlapping
between nearby blocks, and finds the concordance of these interblock reads
with haplotypes in other blocks. 

inputs:
    hair_reader
        A HairReader object
    block_reader
        A HapcutReader object
    out_stats
        A string where the .tsv should be written. Defaults
        to the hairs filename given in the input + 'interblock_stats.tsv'
outputs:
    none-writes to file directly

'''

def interblock_stats(hair_reader, block_reader, out_stats=hairs_file + ".interblock_stats.tsv"):
    blockdist = []
    lastChr = None
    lastPos = None
    lastBlock = None
    lastReads = set()
    for read in hair_reader.reads:
        if read.haplotypes == dict():
            read = greedy_partition(read, block_reader)
        read.addGenomicPositions(block_reader)
    for block in block_reader.blocks:
        if block.read_set == set():
            block.addReadsToBlock(hair_reader.reads)
        currBlock = block.offset
        currChr = block.chrom
        currPos = block.start
        if lastBlock != None:
            if lastChr == currChr:
                interblock_reads = block.interblock_reads(lastBlock_obj.informative_reads)
                row=[lastBlock, currBlock, currChr, lastPos, currPos, currPos-lastPos, 
                     len(lastBlock_obj.variant_ids), len(block.variant_ids), 
                     len(lastBlock_obj.informative_reads), len(block.informative_reads),
                     len(interblock_reads), 
                     lastBlock_obj.concordance(lastBlock_obj.informative_reads), 
                     block.concordance(block.informative_reads)]
                blockdist.append(row)
            else:
                continue
        lastBlock = currBlock
        lastBlock_obj = block
        lastChr = currChr
        lastPos = block.end
    header = ['block1', 'block2', 'chrom', 'block1_end', 'block2_start', 'distance', 'block1_variants', 'block2_variants', 
              'block1_reads', 'block2_reads', 'interblock_reads', 'block1_concordance', 'block2_concordance']
    info = pd.DataFrame(blockdist, columns=header)
    info.to_csv(out_stats, sep="\t")

# run the program if called from the command line
#if __name__ == "__main__":
#   main(sys.argv[1:])


**Interblock statistics**

* length of block
* variant count in block
* total reads inside block
* Informative reads inside block
* Distance between blocks
* Which blocks overlap
* Interblock reads
    * Read count between junctions
    * 2x2 matrix with support for linking blocks if at junction
* Total coverage between blocks


In [510]:
#interblock distance
import pandas as pd
import numpy as np

def interblock_stats(hair_reader, block_reader, out_stats=hairs_file + ".interblock_stats.tsv"):
    blockdist = []
    lastChr = None
    lastPos = None
    lastBlock = None
    lastReads = set()
    for read in hair_reader.reads:
        if read.haplotypes == dict():
            read = greedy_partition(read, block_reader)
        read.addGenomicPositions(block_reader)
    for block in block_reader.blocks:
        if block.read_set == set():
            block.addReadsToBlock(hair_reader.reads)
        currBlock = block.offset
        currChr = block.chrom
        currPos = block.start
        if lastBlock != None:
            if lastChr == currChr:
                interblock_reads = block.interblock_reads(lastBlock_obj.informative_reads)
                row=[lastBlock, currBlock, currChr, lastPos, currPos, currPos-lastPos, 
                     len(lastBlock_obj.variant_ids), len(block.variant_ids), 
                     len(lastBlock_obj.informative_reads), len(block.informative_reads),
                     len(interblock_reads), 
                     lastBlock_obj.concordance(lastBlock_obj.informative_reads), 
                     block.concordance(block.informative_reads)]
                blockdist.append(row)
            else:
                continue
        lastBlock = currBlock
        lastBlock_obj = block
        lastChr = currChr
        lastPos = block.end
    header = ['block1', 'block2', 'chrom', 'block1_end', 'block2_start', 'distance', 'block1_variants', 'block2_variants', 
              'block1_reads', 'block2_reads', 'interblock_reads', 'block1_concordance', 'block2_concordance']
    info = pd.DataFrame(blockdist, columns=header)
    info.to_csv(out_stats, sep="\t")

# debug

In [494]:
bam_fp = pysam.Samfile('/sc/orga/scratch/bashia02/collaborations/hardik_shah/jason_new/hapcut_outputs/hg002_000000F/hg002_000000F.new.merged.bam', 'rb')
out_fp = pysam.AlignmentFile('/sc/orga/scratch/bashia02/collaborations/hardik_shah/jason_new/hapcut_outputs/hg002_000000F/hg002.000000F.debug.bam', 'wb', template=bam_fp)
hairs_file = '/sc/orga/scratch/bashia02/collaborations/hardik_shah/jason_new/hapcut_outputs/hg002_000000F/hg002_hapcut.000000F.hairs'
hapcut_file = '/sc/orga/scratch/bashia02/collaborations/hardik_shah/jason_new/hapcut_outputs/hg002_000000F/hg002_hapcut.000000F.hapcut'

hair_reader = HairReader(hairs_file)
block_reader = HapCutReader(hapcut_file)

In [422]:
print block_reader.loc(506).variant_ids
for i in block_reader.loc(506).informative_reads:
    print i
    print i.positions
    
print ""
print block_reader.loc(510).variant_ids
for i in block_reader.loc(510).informative_reads:
    print i
    print i.positions

[506, 507, 518]

[510, 511, 512, 514, 516, 517, 519, 520]


In [508]:
for x,block in enumerate(block_reader.blocks):
    lastBlock_obj = block_reader.blocks[x-1]
    interblock_reads = block.interblock_reads(hair_reader.reads)
    #print interblock_reads
    print lastBlock_obj.start
    print lastBlock_obj.end
    print [x.start for x in block.informative_reads]
    print interblock_reads

32976279
32980826
[5369, 5369, 9894]
[]
5369
17923
[29393, 29393, 29393, 29393, 29393, 30672, 29393, 29393, 29393, 29393, 30672, 30672]
[]
29393
36129
[78238, 75950, 75950, 78238, 75950, 75950, 75950, 75950, 78787, 78238, 75950, 75950, 75950, 78787, 78808, 78787, 78808, 75950, 78238, 75950, 78808, 75950, 75950, 75950, 79280, 75950, 78238, 75950, 75950, 78238, 78808, 75950, 75950, 81918, 78808, 78238, 78808, 78238, 78238, 78787, 78787, 78238, 78238, 78238, 78808, 79608, 81918, 78787, 78787, 78787, 81918, 78808, 79994, 79608, 81918, 79608, 84342, 84342, 81918, 81918, 81732, 81918, 89737, 84342, 84342, 84342, 88691, 93044, 91657, 84342, 84342, 84342, 84342, 94103, 84342, 91238, 84342, 84535, 91238, 84342, 86714, 91657, 86714, 86714, 88691, 89737, 88691, 86714, 86714, 94011, 86714, 86737, 89737, 86714, 86737, 86714, 86714, 86737, 86737, 91657, 91657, 89737, 88691, 89737, 89737, 88691, 94011, 89737, 89737, 89737, 89737, 89737, 89737, 89737, 91657, 91238, 94103, 91238, 91657, 91238, 91238, 9

In [514]:
ir = []
count = 0
for block in block_reader.blocks:
    ir.extend(block.informative_reads)
    count += len(block.informative_reads)
print len(ir)
print count

97551
97551


In [511]:
interblock_stats(hair_reader, block_reader, hairs_file + ".interblock_stats.tsv")

In [438]:
%debug

> [1;32m<ipython-input-430-eca5d994879a>[0m(148)[0;36mconcordance[1;34m()[0m
[1;32m    147 [1;33m                    [0mread_allele[0m [1;33m=[0m [0mread[0m[1;33m.[0m[0malleles[0m[1;33m[[0m[0mread[0m[1;33m.[0m[0mpositions[0m[1;33m.[0m[0mindex[0m[1;33m([0m[0mvariant[0m[1;33m.[0m[0mvar_id[0m[1;33m)[0m[1;33m][0m[1;33m[0m[0m
[0m[1;32m--> 148 [1;33m                    [0mhapstate[0m [1;33m=[0m [0mread[0m[1;33m.[0m[0mhaplotypes[0m[1;33m[[0m[0mself[0m[1;33m.[0m[0moffset[0m[1;33m][0m[1;33m[0m[0m
[0m[1;32m    149 [1;33m                    [1;32mif[0m [0mhapstate[0m [1;33m==[0m [1;36m2[0m[1;33m:[0m[1;33m[0m[0m
[0m
ipdb> read
<HapCutRead, read_id: m150222_081617_42156_c100777432550000001823160908051597_s1_p0/145946/0_12481>
ipdb> read.haplotypes
{55: 2}
ipdb> self
<Block, offset_id: 51>
ipdb> read.positions
[55, 57, 58]
ipdb> block.variant_ids
[510, 511, 512, 514, 516, 517, 519, 520]
ipdb> self.variant_ids
[51,

In [345]:
read = None
for read in hair_reader.reads:
    read2 = greedy_partition(read, block_reader)
    if read2.blockcount > 1:
        print read2.blocks
        print read2.haplotypes, read2.blockcount


ERROR: read allele matched no haplotypes.

Hair read: m150223_174943_42177R_c100788362550000001823173308251553_s1_p0/136032/3792_26206
Alleles: ['0', '0', '1']
 Block 1: 2, Block 2: 1

ERROR: read allele matched no haplotypes.

Hair read: m150224_191026_42177R_c100788362550000001823173308251557_s1_p0/89671/0_17273
Alleles: ['0', '0', '1', '0', '0', '1', '0', '1', '0', '0', '1', '1', '0', '0', '1', '0', '1', '1', '1', '1', '0', '0', '1', '0', '1', '1', '1', '0']
 Block 1: 2, Block 2: 0

ERROR: read allele matched no haplotypes.

Hair read: m150222_081617_42156_c100777432550000001823160908051597_s1_p0/22778/15931_29708
Alleles: ['0', '1', '1', '1']
 Block 1: 2, Block 2: 1

ERROR: read allele matched no haplotypes.

Hair read: m150225_013529_42156_c100779742550000001823166508251511_s1_p0/28885/0_3804
Alleles: ['0', '1']
 Block 1: 2, Block 2: 1

ERROR: read allele matched no haplotypes.

Hair read: m150222_081617_42156_c100777432550000001823160908051597_s1_p0/125623/13816_26172
Alleles: [