In [1]:
%matplotlib inline

from __future__ import print_function

import sys
import os
import logging
from logging.config import dictConfig
import numpy as np
import pandas as pd
import itertools as it
import collections as cl
import bisect

import pgenlib as pg

# from numba import jit

logging_config = dict(
    version = 1,
    formatters = {
        'f': {'format':
              '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'}
        },
    handlers = {
        'h': {'class': 'logging.StreamHandler',
              'formatter': 'f',
              'level': logging.DEBUG}
        },
    root = {
        'handlers': ['h'],
        'level': logging.DEBUG,
        },
)
dictConfig(logging_config)

In [2]:
logger_nb = logging.getLogger('notebook')
logger_nb.info('hello')

2017-07-03 19:41:19,416 notebook     INFO     hello


In [3]:
read_dir='/oak/stanford/groups/mrivas/public_data/nanopore-wgs-consortium/rel3/hg19/chr20'
hap_basename='rel3.chr20.12500.10k-chr20impv1-keep-maf0005-snv-biallelic-geno01-hwe1e-10.head.hap'
hap_f='{}/{}'.format(read_dir, hap_basename)

data_dir='/oak/stanford/groups/mrivas/users/ytanigaw/nanopore-data'

block_tsv_f='{}/unphased_data/{}'.format(
    data_dir,
    'chr20impv1-keep-maf0005-snv-biallelic-geno01-hwe1e-10-block-stronglow050-stronghigh083-infofrac10.tsv'
)

prior_count_dir='{}/{}'.format(data_dir, 'prior_count')
log_likelihood_dir='{}/{}/{}'.format(data_dir, 'log_likelihood', hap_basename[:-4])
posterior_dir='{}/{}/{}'.format(data_dir, 'posterior', hap_basename[:-4])

In [4]:
def read_prior_cnts(block_df, prior_count_dir):
    
    logger_cnt = logging.getLogger('read_prior_cnts')    
    logger_cnt.info(
        'reading prior counts from {}'.format(prior_count_dir)
    )        
    
    prior_cnt_keys = [None] * len(block_df)
    prior_cnt_vals = [None] * len(block_df)
    for block_id in range(len(block_df)):
        if(block_id % 100 == 0):
            logger_cnt.info(
                'reading block {} of {}'.format(block_id, len(block_df))
            )    
        cnt = np.load('{}/{}.npz'.format(prior_count_dir, block_id))
        prior_cnt_keys[block_id] = cnt['keys']
        prior_cnt_vals[block_id] = cnt['vals']
        
    logger_cnt.info(
        'prior counts is loaded on memory'
    )    
        
    return prior_cnt_keys, prior_cnt_vals

In [5]:
def read_log_posterior(block_df, posterior_dir):
    
    logger_post = logging.getLogger('read_posterior')    
    logger_post.info(
        'reading posterior from {}'.format(posterior_dir)
    )        
    
    skipped_blocks = []
    
    log_posterior = dict([])
    for block_id in range(len(block_df)):
        if(block_id % 100 == 0):
            logger_post.info(
                'reading block {} of {}'.format(block_id, len(block_df))
            )    
        npz_file = '{}/{}.npz'.format(posterior_dir, block_id)
        if not os.path.isfile(npz_file):
            skipped_blocks.append(block_id)
        else:
            lp = np.load(npz_file)
            log_posterior[block_id] = lp['lp']

    if(len(skipped_blocks) > 0):
        logger_post.info(
            'skipped blocks are {}'.format(skipped_blocks)
        )
        
    
    logger_post.info(
        'log posterior is loaded on memory'
    )    
        
    return log_posterior

In [4]:
block_df = pd.read_csv(block_tsv_f, sep='\t')

In [5]:
block_df.head()

Unnamed: 0,chrom,chromStart,chromEnd,bimStart,bimEnd,nt_size,n_SNPs,n_haps,n_uniq_haps
0,chr20,61097,94951,0,78,33854,78,119900,4833
1,chr20,94951,266791,78,915,171840,837,24264,16646
2,chr20,266791,288775,915,991,21984,76,120130,6487
3,chr20,288775,344630,991,1218,55855,227,70042,16571
4,chr20,344630,346675,1218,1224,2045,6,202090,26


In [8]:
prior_cnt_keys, prior_cnt_vals = read_prior_cnts(block_df, prior_count_dir)

2017-07-03 19:35:30,307 read_prior_cnts INFO     reading prior counts from /oak/stanford/groups/mrivas/users/ytanigaw/nanopore-data/prior_count
2017-07-03 19:35:30,309 read_prior_cnts INFO     reading block 0 of 642
2017-07-03 19:35:31,277 read_prior_cnts INFO     reading block 100 of 642
2017-07-03 19:35:32,507 read_prior_cnts INFO     reading block 200 of 642
2017-07-03 19:35:33,802 read_prior_cnts INFO     reading block 300 of 642
2017-07-03 19:35:34,836 read_prior_cnts INFO     reading block 400 of 642
2017-07-03 19:35:35,975 read_prior_cnts INFO     reading block 500 of 642
2017-07-03 19:35:36,843 read_prior_cnts INFO     reading block 600 of 642
2017-07-03 19:35:37,336 read_prior_cnts INFO     prior counts is loaded on memory


In [9]:
log_posterior = read_log_posterior(block_df, posterior_dir)

2017-07-03 19:35:37,343 read_posterior INFO     reading posterior from /oak/stanford/groups/mrivas/users/ytanigaw/nanopore-data/posterior/rel3.chr20.12500.10k-chr20impv1-keep-maf0005-snv-biallelic-geno01-hwe1e-10.head
2017-07-03 19:35:37,345 read_posterior INFO     reading block 0 of 642
2017-07-03 19:35:37,350 read_posterior INFO     reading block 100 of 642
2017-07-03 19:35:37,352 read_posterior INFO     reading block 200 of 642
2017-07-03 19:35:37,353 read_posterior INFO     reading block 300 of 642
2017-07-03 19:35:37,355 read_posterior INFO     reading block 400 of 642
2017-07-03 19:35:37,356 read_posterior INFO     reading block 500 of 642
2017-07-03 19:35:37,358 read_posterior INFO     reading block 600 of 642
2017-07-03 19:35:37,359 read_posterior INFO     skipped blocks are [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 5

In [11]:
!ls $data_dir/unphased_data/chr20impv1-keep-maf0005-snv-biallelic-pg.pgen

/oak/stanford/groups/mrivas/users/ytanigaw/nanopore-data/unphased_data/chr20impv1-keep-maf0005-snv-biallelic-pg.pgen


In [6]:
def read_alleles_block(pgen_f, block_df, block_id):
    """wrapper function of pgen.PgenReader.read_alleles_range for a LD block"""
    bim_interval = [block_df.bimStart[block_id], block_df.bimEnd[block_id]]
    with pg.PgenReader(pgen_f) as pgr:
        buf_ndary = np.zeros(
            (bim_interval[1] - bim_interval[0], pgr.get_raw_sample_ct() * 2), 
            dtype=np.int32
        )
        pgr.read_alleles_range(bim_interval[0], bim_interval[1], buf_ndary)
    return buf_ndary

In [7]:
b0 = read_alleles_block(
    '{}/unphased_data/chr20impv1-keep-maf0005-snv-biallelic-pg.pgen'.format(data_dir),
    block_df, 0
)

In [8]:
b0.shape

(78, 224676)

In [9]:
tmpDir='{}/tmp'.format(data_dir)

In [40]:
with pg.PgenWriter(
    '{}/test.pgen'.format(tmpDir),
    b0.shape[1], b0.shape[0], True
) as pgw:
    pgw.append_biallelic_batch(b0, all_phased = True)

RuntimeError: PgenWriter.close() called when number of written variants (0) unequal to initially declared value (78).

In [10]:
b0.dtype

dtype('int32')

In [11]:
pgw = pg.PgenWriter(
    '{}/test.pgen'.format(tmpDir),
    b0.shape[1], b0.shape[0], True
)

In [13]:
pgw.append_alleles(b0[0])

In [None]:
for snp in range(b0.shape[0]):
    print(snp)
    pgw.append_alleles(b0[snp])