In [1]:
import numpy as np
import scipy as sc
import pandas as pd
import itertools as it
import subprocess as sp
import sys
import re
import collections
import json
import pysam
import enum
import bisect

import colorlog

import pgenlib as pg

In [2]:
import matplotlib
# Pick the 'agg' backend here so that it is already in effect when pyplot
# is imported on the next line.
matplotlib.use('agg')
from matplotlib import pyplot as plt

In [3]:
# Add the current working directory to the module search path. The next cell
# imports Position, FastaFile, BimFile, Read and Nucleotide -- presumably
# project modules that sit next to this notebook (confirm).
sys.path.append('./')

In [4]:
from Position import Position
from FastaFile import FastaFile
from BimFile import BimFile
from Read import Read
from Nucleotide import Nucleotide

In [12]:
# Input locations, all relative to the notebook's working directory.
# BAM file with the aligned reads; opened with pysam in a later cell.
bamfile_name = './data/rel3.chr20.12500.10k.head.bam'
# Path prefix and extension handed to FastaFile -- presumably combined into
# per-chromosome file names such as './data/chr20.fa' (TODO confirm in FastaFile).
fasta_name = './data/chr'
fast_ext = 'fa'
# Variant list (.bim) handed to BimFile.
bim_name = './data/ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes-pgen.bim'

In [13]:
# Build the three input handles used by every cell below.
# Reference sequence wrapper (project class); passed to Read(...) later on.
fasta = FastaFile(fasta_name, fast_ext=fast_ext)
# Variant table wrapper (project class); queried for positions and alleles.
bim = BimFile(bim_name)
# Alignment file opened with mode 'rb'. NOTE: `bam` is a one-pass iterator and
# is never closed in this notebook; re-run this cell to start from the first
# read again.
bam = pysam.AlignmentFile(filename = bamfile_name, mode = 'rb')

In [7]:
def filter_mismatches(read, bim, quality_threshod=14):
    """Return the mismatches on ``read`` that agree with a SNP listed in ``bim``.

    A mismatch is kept only when all of the following hold:
      - it is reported by ``read.get_mismatches_on_polymorphic_sites`` for the
        bim positions covered by the read (that helper is given the quality
        threshold, so the base-quality filtering happens there)
      - the base on the read equals allele 1 of the bim record
        (treated as the minor allele)
      - the reference base equals allele 2 of the bim record
        (treated as the major allele)

    Args:
        read: read object exposing ``l``, ``r``, ``chr`` and
            ``get_mismatches_on_polymorphic_sites``.
        bim: bim-file object exposing ``find_index_interval``, ``get_bp``,
            ``find_index_list``, ``get_allele_1`` and ``get_allele_2``.
        quality_threshod: minimum base call quality, forwarded unchanged to
            ``get_mismatches_on_polymorphic_sites``. (The spelling is part of
            the public signature and is kept for existing callers.)

    Returns:
        list: the subset of mismatch objects that passed every filter, in the
        order the read reported them.

    Raises:
        KeyError: presumably when a bim allele string is not a member of
            ``Nucleotide`` -- the driver loop in this notebook catches exactly
            this exception (TODO confirm against Nucleotide).
    """
    # Everything below must refer to the `read` argument. The previous version
    # used the notebook-global `r`, which only worked when the caller's read
    # happened to be bound to that global name.
    chrom = read.chr

    # Index interval on the bim file that the read spans.
    read_bim_l, read_bim_r = bim.find_index_interval(query_l=read.l, query_r=read.r)
    # Polymorphic positions (from the bim file) inside that interval.
    read_snp_pos = bim.get_bp(chrom)[read_bim_l:read_bim_r]

    # Candidate mismatches located on those polymorphic positions.
    read_poly_mm = read.get_mismatches_on_polymorphic_sites(read_snp_pos, quality_threshod)

    # Map each candidate back to its row index in the bim file.
    read_poly_mm_pos = np.array([mm.reference_position for mm in read_poly_mm])
    read_poly_mm_bim_id = bim.find_index_list(
        [Position(chrom, ref_pos) for ref_pos in read_poly_mm_pos])

    # Fetch the per-chromosome allele columns once instead of once per SNP.
    allele_1 = bim.get_allele_1(chrom)
    allele_2 = bim.get_allele_2(chrom)
    # Translate both alleles eagerly for every candidate so that an allele
    # that cannot be looked up in Nucleotide still raises for the whole read.
    read_bim_a1 = [Nucleotide[allele_1[bim_id]] for bim_id in read_poly_mm_bim_id]
    read_bim_a2 = [Nucleotide[allele_2[bim_id]] for bim_id in read_poly_mm_bim_id]

    # Keep only mismatches of the form reference == allele 2, read == allele 1.
    return [mm for i, mm in enumerate(read_poly_mm)
            if mm.reference == read_bim_a2[i] and mm.read == read_bim_a1[i]]

In [8]:
def find_haplotype(read_poly_mm, read_bim_l, read_bim_r, chrom=None, bim_file=None):
    """Encode a read as a 0/1 vector over the bim rows it spans.

    Args:
        read_poly_mm: mismatch objects kept for the read; each must expose
            ``reference_position``.
        read_bim_l: first bim row index of the read's interval (inclusive).
        read_bim_r: end bim row index of the read's interval (exclusive).
        chrom: chromosome used to build the ``Position`` lookups. When omitted,
            falls back to ``r.chr`` of the notebook-global read ``r``, which is
            what this function always did before the keyword existed.
        bim_file: bim-file object providing ``find_index_list``. When omitted,
            falls back to the notebook-global ``bim``.

    Returns:
        numpy.ndarray: one entry per bim row in ``[read_bim_l, read_bim_r)``;
        0 where the row matches one of ``read_poly_mm``, 1 everywhere else.
    """
    # Legacy behaviour: pull the read and the bim file from notebook globals.
    # Callers should pass both explicitly so the result does not depend on
    # whichever objects are currently bound to `r` and `bim`.
    if chrom is None:
        chrom = r.chr
    if bim_file is None:
        bim_file = bim

    read_poly_mm_pos = np.array([mm.reference_position for mm in read_poly_mm])
    read_poly_mm_bim_id = bim_file.find_index_list(
        [Position(chrom, ref_pos) for ref_pos in read_poly_mm_pos])

    # Build the lookup set once; it used to be rebuilt for every row index.
    mismatch_rows = set(read_poly_mm_bim_id)
    read_hap = np.array([0 if row in mismatch_rows else 1
                         for row in range(read_bim_l, read_bim_r)])
    return read_hap

In [28]:
def process_read(r, bim, quality_threshod=14):
    """Locate a read on the bim file and render its haplotype as a string.

    Args:
        r: read object; its ``l`` and ``r`` attributes are used as the query
            bounds for the bim interval lookup.
        bim: bim-file object providing ``find_index_interval``.
        quality_threshod: forwarded unchanged to ``filter_mismatches``.

    Returns:
        tuple: ``(bim_l, bim_r, hap_str)`` -- the bim index interval of the
        read and the haplotype with every element passed through ``str`` and
        concatenated.

    NOTE: ``find_haplotype`` is called with only the mismatches and the
    interval; it is not handed ``r`` or ``bim`` from here.
    """
    # Where does the read sit on the bim index?
    bim_start, bim_stop = bim.find_index_interval(query_l=r.l, query_r=r.r)

    # Mismatches that survive the polymorphic-site filters.
    kept_mismatches = filter_mismatches(r, bim, quality_threshod)

    # Haplotype vector over the interval, then its string form.
    haplotype = find_haplotype(kept_mismatches, bim_start, bim_stop)
    haplotype_text = ''.join(map(str, haplotype))

    return (bim_start, bim_stop, haplotype_text)


In [11]:
# Walk over every alignment in the BAM file and print one record per read.
for x in bam:
    # The read is bound to the global name `r` on purpose: find_haplotype,
    # when called without an explicit chromosome, looks up `r.chr` from the
    # notebook globals.
    r = Read(x, fasta)
    try:
        read_bim_l, read_bim_r, read_hap_str = process_read(r, bim, quality_threshod=14)
    except KeyError as e:
        # Report the read as unprocessable and move on. Presumably raised when
        # a bim allele cannot be looked up in Nucleotide (TODO confirm).
        # Single pre-formatted argument so the output is the same under
        # Python 2 and Python 3.
        print("KeyError %s" % (e,))
    else:
        # chr, start, end, name, MAPQ, bim_l, bim_r, hapstr
        read_output = (r.reference_name, r.l.get_pos(), r.r.get_pos(), r.query_name,
                       r.mapping_quality, read_bim_l, read_bim_r, read_hap_str)
        # Use a dedicated comprehension variable: under Python 2 it leaks into
        # the enclosing scope and would otherwise overwrite the loop's `x`.
        print([str(field) for field in read_output])


In [14]:
# Rewind before fetching: the `for x in bam` loop earlier in the notebook
# drains the iterator, so without this a top-to-bottom run raises
# StopIteration here. After the rewind this always yields the first read.
bam.reset()
# Builtin next() works on both Python 2 and Python 3 iterators.
x = next(bam)

In [15]:
# Wrap the alignment fetched above together with the reference sequence.
# NOTE: this rebinds the global `r`, which find_haplotype reads.
r = Read(x, fasta)

In [18]:
# Run the full pipeline on this single read: bim interval plus haplotype string.
read_bim_l, read_bim_r, read_hap_str = process_read(r, bim, quality_threshod=14)

In [22]:
# Display the value returned by r.n_mismatch() for this read
# (7 in the saved output below).
r.n_mismatch()


7