In [4]:
import os
import sys
import pysam
import vcf
import itertools
import string
import numpy as np
from collections import Counter
from copy import deepcopy

In [17]:
def readVcfToDict (vcf_reader, sample_id = "hg002"):
    snpChromDict = {}
    snpChromDictUnphased = {}
    chrom = None
    rcount = 0
    for record in vcf_reader:
        #print record
        rcount += 1
        if chrom != record.CHROM:
            print >>sys.stderr, "...reading chrom %s" %(record.CHROM)
            snpChromDict.setdefault(record.CHROM, {})
            snpChromDictUnphased.setdefault(record.CHROM, {})
        if rcount % 1000 == 0:
            print >>sys.stderr, "...read %s vars" %(rcount)
        if record.CHROM != "000000F":
            print >>sys.stderr, "...exiting out of function early after first contig"
            return snpChromDict, snpChromDictUnphased
        chrom, ref, alt, pos = record.CHROM, record.REF, record.ALT, record.POS

        gt_sample = record.genotype(sample_id)['GT']
        if gt_sample != None:
            if "|" in gt_sample:
                mat_gt, pat_gt = map(int, gt_sample.split("|"))
                if mat_gt == pat_gt:
                    continue
                snpChromDict[chrom][int(pos)] = (mat_gt, pat_gt)
            if "/" in gt_sample:
                mat_gt, pat_gt = map(int, gt_sample.split("/"))
                if mat_gt == pat_gt:
                    continue
                snpChromDictUnphased[chrom][int(pos)] = (mat_gt, pat_gt)
    return snpChromDict, snpChromDictUnphased

In [6]:
def parseHapCutOut (hapcutoutfn):
    with open (hapcutoutfn) as f:
        f.readline()
        block = []
        for l in f:
            if l[0] == "B":
                yield block
                block = []
            else:
                ll = l.split("\t")
                if len(ll) > 5:
                    hap1, hap2, chrom, pos = ll[1:5]
                    hap1, hap2, pos = int(hap1), int(hap2), int(pos)
                    block.append((hap1, hap2, chrom, pos))
                
        yield block

In [7]:
def calculateSwitchError (switch_list):
    switches = 0
    if len(switch_list) == 0:
        return switches, 0
    curr_s = switch_list[0]
    for s in switch_list[1:]:
        if s != curr_s:
            switches += 1
        curr_s = s
    if len(switch_list) == 1:
        return 0, 0
    else:
        return switches, switches/(float(len(switch_list))-1)

In [8]:
def measureBlockConcordance (block, snpDict, snpDictUnphased={}):
    pcount = 0
    mcount = 0
    missing_from_trio = 0
    missing_from_file = 0
    missing_other = 0
    switch_list = []
    for var in block:
        pos = var[3]
        if pos in snpDict:
            if snpDict[pos][0] == var[0]:
                mcount += 1
                switch_list.append(0)
            elif snpDict[pos][1] == var[0]:
                pcount += 1
                switch_list.append(1)
            else:
                missing_other += 1
        elif pos in snpDictUnphased:
            missing_from_trio += 1
        else:
            missing_from_file += 1
    if missing_from_trio + missing_from_file == len(block):
        return 0, max(pcount, mcount), missing_from_trio, missing_from_file, missing_other, switch_list
    else:
        denom = (len(block)-missing_from_trio-missing_from_file)
        return float(max(pcount, mcount))/denom, max(pcount, mcount), missing_from_trio, missing_from_file, missing_other, switch_list


In [9]:
def calculateX50 ( rawvals ):
    vals = deepcopy(rawvals)
    vals.sort(reverse=True)
    vsum = sum(vals)
    currsum = 0
    for v in vals:
        currsum += v
        if currsum >= .5*vsum:
            return v

In [22]:
def getVCF(vcffn):
    print >>sys.stderr, "opening vcf file %s" %(vcffn)
    vcf_reader = vcf.Reader(filename=vcffn)
    print >>sys.stderr, "reading vcf file . . ."
    snpChromDict, snpChromDictUnphased = readVcfToDict (vcf_reader)
    print >>sys.stderr, "finished reading vcf file"
    return snpChromDict, snpChromDictUnphased

In [23]:
def phaseLocalRegions (snpChromDict, snpChromDictUnphased, hapcutfn):    
    blocklens = []
    blockspans = []
    triomissings = []
    filemissings = []
    concordances = []
    concounts = []
    switch_counts = []
    switch_errors = []
    othermissings = []
    print >>sys.stderr, "reading hapcut blocks: "
    bcount = 0
    for block in parseHapCutOut(hapcutfn):
        bcount += 1
        if bcount % 10000 == 0:
            print >>sys.stderr, ". . . read %i blocks" %(bcount)
        positions = map(lambda x: x[3], block)
        span = max(positions)-min(positions)
        blocklens.append(len(positions))
        blockspans.append(span)
        chrom = block[0][2]
        concordance, concount, triomissing, filemissing, othermissing, switch_list = \
            measureBlockConcordance ( block, snpChromDict[chrom], snpChromDictUnphased[chrom])
        switches, switch_error = calculateSwitchError (switch_list)
        switch_counts.append(switches)
        switch_errors.append(switch_error)
        triomissings.append(triomissing)
        concordances.append(concordance)
        filemissings.append(filemissing)
        othermissings.append(othermissing)
        concounts.append(concount)
    denomsum = sum(blocklens) -  sum(filemissings) - sum(triomissings)
    conrate = sum(concounts)/float(denomsum)
    switchrate = float(sum(switch_counts))/denomsum
    print "switches              : %i" %(sum(switch_counts))
    print "concordant snps       : %i" %(sum(concounts))
    print "total snps in blocks  : %i" %(sum(blocklens))
    print "missing from trio snps: %i" %(sum(triomissings))
    print "missing from file snps: %i" %(sum(filemissings))
    print "other missing     snps: %i" %(sum(othermissings))
    print "denominator snps      : %i" %(denomsum)
    print "haplotype accuracy    : %f" %(conrate)
    print "switch error rate     : %f" %(switchrate)
    
    print "S50: %i" %(calculateX50 ( blocklens ))
    print "N50: %i" %(calculateX50 ( blockspans ))


In [25]:
vcf_fn = "/dev/shm/aj_trio_phasebytrans.phased-trio.vcf"
hapcut_fn1 = "/Users/alibashir/sshfs/jason_new/hapcut_outputs/hg002_re_000000F/long_short_reads/hg002_hapcut_longshort_regen_000000F.hapcut"
hapcut_fn2 = "/Users/alibashir/sshfs/jason_new/hapcut_outputs/hg002_re_000000F/long_short_reads/hg002_hapcut_longshort_000000F.hapcut"
hapcut_fn3 = "/Users/alibashir/sshfs/jason_new/hapcut_outputs/hg002_re_000000F/hg002_hapcut_000000F.hapcut"
hapcut_fn4 = "/Users/alibashir/sshfs/jason_new/hapcut_outputs/hg002_re_000000F/hg002_more_hapcut.000000F.hapcut"
hapcut_fn5 = "/hpc/users/neffr01/jason_new/hapcut_outputs/hg002_re_000000F/fake_hairs/hg002_methyl_snps_longread.hapcut"

In [27]:
#snpChromDict, snpChromDictUnphased = getVCF(vcf_fn)
phaseLocalRegions (snpChromDict, snpChromDictUnphased, hapcut_fn5)

switches              : 1207
concordant snps       : 5137
total snps in blocks  : 90818
missing from trio snps: 11443
missing from file snps: 71237
other missing     snps: 0
denominator snps      : 8138
haplotype accuracy    : 0.631236
switch error rate     : 0.148317
S50: 1989
N50: 608155


reading hapcut blocks: 


In [68]:
phaseLocalRegions (vcf_fn, hapcut_fn1)
phaseLocalRegions (vcf_fn, hapcut_fn2)
phaseLocalRegions (vcf_fn, hapcut_fn3)
phaseLocalRegions (vcf_fn, hapcut_fn4)

opening vcf file /Users/alibashir/sshfs/jason_new/trio_analysis/aj_trio_phasebytrans.phased-trio.vcf
reading vcf file . . .
...reading chrom 000000F
...read 100000 vars
...reading chrom 000001F
...exiting out of function early after first contig
finished reading vcf file
reading hapcut blocks: 
opening vcf file /Users/alibashir/sshfs/jason_new/trio_analysis/aj_trio_phasebytrans.phased-trio.vcf
reading vcf file . . .
...reading chrom 000000F
...read 100000 vars
...reading chrom 000001F

switches              : 3294
concordant snps       : 6384
total snps in blocks  : 73369
missing from trio snps: 14845
missing from file snps: 46911
other missing     snps: 0
denominator snps      : 11613
haplotype accuracy    : 0.549729
switch error rate     : 0.283648
S50: 58877
N50: 27040168
switches              : 4262


...exiting out of function early after first contig
finished reading vcf file
reading hapcut blocks: 
opening vcf file /Users/alibashir/sshfs/jason_new/trio_analysis/aj_trio_phasebytrans.phased-trio.vcf
reading vcf file . . .
...reading chrom 000000F
...read 100000 vars
...reading chrom 000001F


concordant snps       : 11697
total snps in blocks  : 51861
missing from trio snps: 26575
missing from file snps: 7723
other missing     snps: 0
denominator snps      : 17563
haplotype accuracy    : 0.666002
switch error rate     : 0.242669
S50: 1103
N50: 626525
switches              : 410


...exiting out of function early after first contig
finished reading vcf file
reading hapcut blocks: 
opening vcf file /Users/alibashir/sshfs/jason_new/trio_analysis/aj_trio_phasebytrans.phased-trio.vcf
reading vcf file . . .
...reading chrom 000000F
...read 100000 vars
...reading chrom 000001F


concordant snps       : 7155
total snps in blocks  : 19682
missing from trio snps: 11458
missing from file snps: 136
other missing     snps: 0
denominator snps      : 8088
haplotype accuracy    : 0.884644
switch error rate     : 0.050692
S50: 204
N50: 220011
switches              : 336
concordant snps       : 7049
total snps in blocks  : 19901
missing from trio snps: 11622
missing from file snps: 148
other missing     snps: 0
denominator snps      : 8131
haplotype accuracy    : 0.866929
switch error rate     : 0.041323
S50: 259
N50: 287557



...exiting out of function early after first contig
finished reading vcf file
reading hapcut blocks: 
