In [1]:
!hostname

minerva4


In [None]:
# Imports / style (run this first always)

%matplotlib inline
from IPython.display import FileLink, FileLinks
from IPython.core import display
from collections import defaultdict
import json
import sys
import time

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

class AwesomeError(Exception):
    """Notebook-local exception that carries an arbitrary payload in ``value``.

    Converting the exception to a string gives the ``repr()`` of that payload.
    """

    def __init__(self, value):
        # Keep the payload on the instance so handlers can inspect it.
        self.value = value

    def __str__(self):
        payload = self.value
        return repr(payload)

# ColorBrewer2 "Dark2" qualitative colour table: seven RGB tuples with
# components in the 0..1 range, used below as the default line-colour cycle.
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

# Global matplotlib defaults; they affect every figure drawn after this cell runs.
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# Newer matplotlib releases replaced the 'axes.color_cycle' key with
# 'axes.prop_cycle' (a Cycler object) and assigning the old key raises KeyError
# there. Prefer the new key when it exists and fall back to the legacy one so
# the cell runs on both old and new installations.
if 'axes.prop_cycle' in rcParams:
    rcParams['axes.prop_cycle'] = mpl.cycler(color=dark2_colors)
else:
    rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Reduce chartjunk by hiding plot borders (spines) and their tick marks.

    axes
        The axes object to modify. When omitted (or falsy) the current axes
        from ``plt.gca()`` is used.
    top, right, left, bottom
        Whether the border on that side stays visible. Ticks are re-enabled
        only on the sides whose flag is true.
    """
    target = axes if axes else plt.gca()

    # Spine visibility, applied in top / right / left / bottom order.
    for side, shown in (('top', top), ('right', right),
                        ('left', left), ('bottom', bottom)):
        target.spines[side].set_visible(shown)

    # Start from a clean slate: no ticks on either axis.
    for axis in (target.yaxis, target.xaxis):
        axis.set_ticks_position('none')

    # Then switch ticks back on for every side that is still drawn. The axis
    # and method are looked up lazily so nothing is touched for hidden sides.
    restore_plan = ((top, 'xaxis', 'tick_top'),
                    (bottom, 'xaxis', 'tick_bottom'),
                    (left, 'yaxis', 'tick_left'),
                    (right, 'yaxis', 'tick_right'))
    for enabled, axis_name, method_name in restore_plan:
        if enabled:
            getattr(getattr(target, axis_name), method_name)()
        
# Let pandas use wide console output and show up to 100 columns before
# it starts truncating DataFrame reprs.
pd.options.display.width = 500
pd.options.display.max_columns = 100

import Bio as bp
from Bio.Sequencing.Applications import BwaAlignCommandline as bwa_aln
from Bio.Sequencing.Applications import BwaSamseCommandline as bwa_samse
from Bio.Sequencing.Applications import BwaSampeCommandline as bwa_sampe
from Bio.Sequencing.Applications import BwaIndexCommandline as bwa_index
from Bio.Sequencing.Applications import BwaBwaswCommandline as bwa_bwasw
import HTSeq as ht
import subprocess

In [3]:
import os, sys


class BlockVariant(object):
    """One variant row belonging to a HapCut phased block.

    The row is tab-separated with nine columns:

        variant_id haplotype_1 haplotype_2 chromosome position refallele
        variantallele genotype allele_counts:genotype_likelihoods:delta:MEC_variant

    Attributes set on the instance:
        var_id, hap1, hap2, pos   -- parsed as int
        chrom                     -- chromosome name (str)
        ref_allele, alt_allele    -- reference / variant allele strings
        genotype                  -- genotype column, kept as text
        ref_count, alt_count      -- the two comma-separated allele counts (int)
        gen_like                  -- dict of the three genotype likelihoods (float),
                                     keyed "0/0", "0/1" and "1/1"
        delta                     -- float
        MEC_variant               -- kept as the raw string from the file

    Raises ValueError when the row does not have the expected layout or a
    numeric field cannot be parsed.
    """

    def __init__(self, variantline):
        fields = variantline.strip().split("\t")
        if len(fields) != 9:
            raise ValueError(
                "Expected 9 tab-separated columns in variant line, got %d: %r"
                % (len(fields), variantline))
        var_id, hap1, hap2, chrom, pos, r_allele, v_allele, genotype, info_str = fields

        self.var_id, self.hap1, self.hap2, self.pos = int(var_id), int(hap1), int(hap2), int(pos)

        # These columns used to be parsed and then thrown away, although code
        # elsewhere in the notebook needs the chromosome to query the BAM file.
        self.chrom = chrom
        self.ref_allele = r_allele
        self.alt_allele = v_allele
        self.genotype = genotype

        allele_counts, genotype_likelihoods, delta, MEC_variant = info_str.split(":")
        self.ref_count, self.alt_count = map(int, allele_counts.split(","))
        gen_00, gen_01, gen_11 = map(float, genotype_likelihoods.split(","))
        self.gen_like = {"0/0": gen_00, "0/1": gen_01, "1/1": gen_11}
        self.delta = float(delta)
        self.MEC_variant = MEC_variant


class Block(object):
    """Header of one HapCut phased block plus the variants that belong to it.

    The header line is whitespace-separated and laid out as

        BLOCK: offset: <first_variant> len: <length> phased: <n_phased>
        SPAN: <length_spanned> MECscore <score> fragments <n_fragments>

    so the values sit at the even token indices 2..12.

    Attributes:
        offset, total_len, phased, span, fragments -- int
        MECscore                                   -- float
        variants                                   -- list of BlockVariant, filled
                                                      through addVariant()
    """

    def __init__(self, blockline):
        ll             = blockline.strip().split()
        self.offset    = int(ll[2])
        self.total_len = int(ll[4])
        self.phased    = int(ll[6])
        self.span      = int(ll[8])
        self.MECscore  = float(ll[10])
        self.fragments = int(ll[12])
        # Must live on the instance: it was previously a local that vanished
        # when __init__ returned, leaving addVariant nothing to append to.
        self.variants  = []

    def addVariant(self, variantline):
        """Parse ``variantline`` as a BlockVariant and append it to this block."""
        self.variants.append(BlockVariant(variantline))


class HapCutReader(object):
    """Loads a HapCut output file into a list of Block objects.

    After construction ``self.blocks`` is a list with one Block per block
    header found in the file, each holding its parsed variants in file order.
    """

    def __init__(self, fn):
        self.blocks = self.read_file_to_blocks(fn)

    def read_file_to_blocks(self, fn):
        """Parse the file at ``fn`` and return its blocks as a list of Block.

        A line starting with "BLOCK" opens a new block; every following
        non-blank line is handed to that block's addVariant() until the next
        header. Errors raised while parsing a header or variant line propagate
        to the caller.
        """
        blocks = []
        currBlock = None
        with open(fn) as f:
            for l in f:
                if l.startswith("BLOCK"):
                    # Start a fresh block for every header, including the
                    # second and later ones, and keep it so the final block of
                    # the file is not lost.
                    currBlock = Block(l)
                    blocks.append(currBlock)
                    continue
                # Lines ahead of the first header are ignored. The previous
                # code skipped the first line unconditionally; ignoring
                # pre-header lines covers that case and also works when the
                # file opens directly with a block header.
                if currBlock is None:
                    continue
                stripped = l.strip()
                # NOTE(review): HapCut output appears to close each block with
                # a line of asterisks; such lines are skipped -- confirm
                # against the files actually being read.
                if not stripped or stripped.startswith("*"):
                    continue
                currBlock.addVariant(l)
        return blocks

                        

class HapCutRead(object):
    """One fragment line from a HapCut fragment ("hair") file.

    Line layout (whitespace-separated):
        Column 1 is the number of blocks (consecutive sets of SNPs covered by
        the fragment).
        Column 2 is the fragment id.
        After that come (offset, alleles) pairs: the offset of the first SNP of
        a block followed by a string with one allele character per SNP in it.
        The last column is a string with the quality values (Sanger fastq
        format) for all alleles covered by the fragment, concatenated over all
        blocks.

    Example: a fragment covering SNPs 2, 3 and 5 with alleles 0, 1 and 0 is
        2 read_id 2 01 5 0 AAC
    where AAC holds the three quality values. The 0/1 encoding follows the VCF
    convention: 0 is reference, 1 is alternate.

    Attributes:
        blockcount -- number of blocks (int)
        read_id    -- fragment id (str)
        positions  -- list of int SNP indices covered by the fragment
        alleles    -- list of one-character allele calls, matched to positions
        qualities  -- quality string, one character per allele call
    """

    def __init__(self, hairline):
        hairlist = hairline.strip().split()
        self.blockcount = int(hairlist[0])     # number of blocks
        self.read_id    = hairlist[1]          # read_id
        self.positions  = []                   # indices covered by the read
        self.alleles    = []                   # allele call at each position
        # Walk the (offset, alleles) pairs; the final column (qualities) is
        # excluded by stopping at len(hairlist) - 1.
        for i in range(2, len(hairlist) - 1, 2):
            # The offset arrives as text and has to be an int before it can
            # seed range().
            position = int(hairlist[i])
            allele = hairlist[i + 1]
            self.positions.extend(range(position, position + len(allele)))
            self.alleles.extend(allele)
        self.qualities  = hairlist[-1]         # qualities matched to the allele calls

class HairReader:
    """Reads a fragment file and keeps one HapCutRead per line in ``self.reads``."""

    def __init__ (self, fn):
        self.reads = []
        with open (fn) as handle:
            # Every line of the file becomes one parsed fragment, in file order.
            self.reads.extend(HapCutRead(line) for line in handle)

In [None]:
'''

extract_reads
Ryan Neff
7/5/2015

Usage: Extracts reads from a bam file corresponding to a particular haplotype, with haplotype
definitions from HapCut, and places them into a separate file.

Inputs:
    bam_file (string)
        The filename of a bam file on which the haplotype cuts were generated.
    read_array (list of <HapCutRead>)
        An array of HapCutRead objects that list which reads correspond to a particular haplotype block.
    blockvar_array (list of <BlockVariant>)
        An array of all BlockVariant objects in read_array that help convert read blocks to chromosomal positions. 
Outputs:
    out_file (string)
        The filename of the output (bam file).
        
'''

# required imports - additional to ipy_setup.py
import pysam
import subprocess
from itertools import groupby

def extract_reads(bam_file, read_array, blockvar_array, out_file):
    """Copy the BAM records that correspond to the given HapCut reads into a new BAM.

    Parameters:
        bam_file (str)
            Path of the BAM file the haplotype cuts were generated from. Region
            queries are used, so presumably an index must sit next to it.
        read_array (list of HapCutRead-like)
            Objects exposing ``positions`` (variant ids covered by the read) and
            ``read_id``. Reads with no positions are skipped.
        blockvar_array (list of BlockVariant-like)
            Objects exposing ``var_id``, ``chrom`` and ``pos``; used to turn a
            variant id into a chromosomal location.
        out_file (str or None)
            Output BAM path. When None, a timestamped name derived from
            ``bam_file`` is used.

    Returns:
        The path of the BAM file that was written (previously nothing was
        returned, which made an auto-generated name impossible to discover).

    Raises:
        ValueError if a read refers to a variant id missing from blockvar_array.
    """
    if out_file is None:
        out_file = bam_file + ".extract_reads_" + time.strftime("%m%d%y_%H%M%S") + '.bam'

    # Index the variants once; the first variant carrying a given id wins,
    # which matches the earlier linear next(...) lookup.
    variant_by_id = {}
    for variant in blockvar_array:
        variant_by_id.setdefault(variant.var_id, variant)

    # Bucket the wanted read ids by the first variant each read covers so the
    # BAM is queried once per variant. Unlike itertools.groupby this does not
    # need read_array to be sorted.
    wanted_by_variant = {}
    for read in read_array:
        if not read.positions:
            continue
        wanted_by_variant.setdefault(read.positions[0], set()).add(read.read_id)

    # Validate before touching the output file so a bad input leaves no
    # half-written BAM behind.
    missing = sorted(v for v in wanted_by_variant if v not in variant_by_id)
    if missing:
        raise ValueError("No block variant found for variant id(s): %r" % (missing,))

    # Each file is opened exactly once (the old asserts opened extra handles
    # that were never closed) and is closed even if an error occurs.
    bam_fp = pysam.AlignmentFile(bam_file, 'rb')
    try:
        out_fp = pysam.AlignmentFile(out_file, "wb", template=bam_fp)
        try:
            for var_id in sorted(wanted_by_variant):
                variant = variant_by_id[var_id]
                wanted = wanted_by_variant[var_id]
                # fetch() yields aligned reads; pileup() yields per-column
                # objects that cannot be written to the output file.
                # NOTE(review): pysam start/stop are 0-based half-open. The
                # interval is kept as it was (pos, pos + 1); if ``pos`` is
                # 1-based this should probably be (pos - 1, pos) -- confirm.
                for bamread in bam_fp.fetch(variant.chrom, variant.pos, variant.pos + 1):
                    # NOTE(review): assumes HapCutRead.read_id is the BAM
                    # query name -- confirm against the fragment file.
                    if bamread.query_name in wanted:
                        out_fp.write(bamread)
        finally:
            out_fp.close()
    finally:
        bam_fp.close()

    return out_file

In [None]:
def greedy_partitioner(read_array, blockvar_array, out_b1, out_b2, out_am):
    '''
    Assign every read to haplotype 1, haplotype 2 or "ambiguous" by majority
    vote over the variants it covers.

    For each read, every covered variant id (``read.positions``) is looked up
    in ``blockvar_array`` and the read's allele at that variant (the entry at
    the same index of ``read.alleles``) is compared with the variant's ``hap1``
    and ``hap2`` alleles. A match with hap1 votes -1, a match with hap2 votes
    +1. A negative total sends the read to ``out_b1``, a positive total to
    ``out_b2`` and a tie (including a read with no positions) to ``out_am``.

    Inputs:
        read_array      iterable of objects with ``positions`` and ``alleles``
        blockvar_array  iterable of objects with ``var_id``, ``hap1``, ``hap2``
        out_b1, out_b2, out_am
                        lists that are appended to in place

    Raises ValueError when a covered variant id is unknown or a read allele
    matches neither haplotype.
    '''
    # Index the variants once instead of scanning the list for every position;
    # the first variant carrying a given id wins, as with a linear search.
    variant_by_id = {}
    for variant in blockvar_array:
        variant_by_id.setdefault(variant.var_id, variant)

    for read in read_array:
        vote = 0
        # positions and alleles are parallel lists, so walk them together
        # rather than indexing alleles by variant id.
        for var_id, allele in zip(read.positions, read.alleles):
            variant = variant_by_id.get(var_id)
            if variant is None:
                raise ValueError("No block variant found for variant id %r." % (var_id,))
            # Haplotype alleles may be ints while read alleles are one-character
            # strings; compare them as text so 0 and "0" are the same allele.
            call = str(allele)
            if str(variant.hap1) == call:
                vote -= 1
            elif str(variant.hap2) == call:
                vote += 1
            else:
                raise ValueError("Unexpected error: read allele matched no haplotypes.")
        if vote < 0:
            out_b1.append(read)
        elif vote > 0:
            out_b2.append(read)
        else:
            out_am.append(read)