In [1]:
from __future__ import print_function

import sys
import os
import logging
from logging.config import dictConfig
import numpy as np
import collections as cl

import pgenlib as pg

# Notebook-wide logging setup: the root logger accepts everything from DEBUG
# upwards and hands it to a single StreamHandler that prints timestamp,
# logger name, level and message.
logging_config = {
    'version': 1,
    'formatters': {
        'f': {
            'format': '%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        },
    },
    'handlers': {
        'h': {
            'class': 'logging.StreamHandler',
            'formatter': 'f',
            'level': logging.DEBUG,
        },
    },
    'root': {
        'handlers': ['h'],
        'level': logging.DEBUG,
    },
}
dictConfig(logging_config)

In [2]:
# Input .pgen file; passed to read_alleles_range() further down.
pgen_in = '/oak/stanford/groups/mrivas/users/ytanigaw/nanopore-data/unphased_data/chr20impv1-keep-maf0005-snv-biallelic-pg.pgen'
# Output path handed to pg.PgenWriter in the write experiments below.
pgen_out = '/oak/stanford/groups/mrivas/users/ytanigaw/sandbox/pgen_write/test.pg'

In [3]:
def read_alleles_range(pgen_f, rangeStart, rangeEnd):
    """Read a contiguous range of variants from a .pgen file into an array.

    Args:
        pgen_f: path of the .pgen file, passed to ``pg.PgenReader``.
        rangeStart: start index forwarded to ``PgenReader.read_alleles_range``.
        rangeEnd: end index forwarded to the same call.

    Returns:
        A 2-D ``np.int32`` array with ``rangeEnd - rangeStart`` rows and
        ``2 * get_raw_sample_ct()`` columns, filled in by the reader.
    """
    log = logging.getLogger('read_alleles_range')
    log.debug(
        'reading alleles range {}:{} from {}'.format(rangeStart, rangeEnd, pgen_f)
    )

    with pg.PgenReader(pgen_f) as reader:
        # Two columns per sample; the reader writes into this array in place.
        out_shape = (rangeEnd - rangeStart, reader.get_raw_sample_ct() * 2)
        alleles = np.zeros(out_shape, dtype=np.int32)
        reader.read_alleles_range(rangeStart, rangeEnd, alleles)

    log.debug('The shape of the numpy nd-array is {}'.format(alleles.shape))
    return alleles

In [4]:
# Load range 0:5 of the input file; the result has rangeEnd - rangeStart = 5
# rows and two int32 columns per sample.
buf = read_alleles_range(pgen_in, 0, 5)

2017-07-04 00:27:26,593 read_alleles_range DEBUG    reading alleles range 0:5 from /oak/stanford/groups/mrivas/users/ytanigaw/nanopore-data/unphased_data/chr20impv1-keep-maf0005-snv-biallelic-pg.pgen
2017-07-04 00:27:26,607 read_alleles_range DEBUG    The shape of the numpy nd-array is (5, 224676)


In [5]:
# Confirm the element type and dimensions of the array that was just read.
buf.dtype, buf.shape

(dtype('int32'), (5, 224676))

In [9]:
# Open a writer sized from buf. buf carries two columns per sample (see
# read_alleles_range), so the column count is halved. Floor division keeps the
# value an int under Python 3, where `/` would pass a float; under Python 2 the
# result is unchanged.
pgw = pg.PgenWriter(pgen_out, buf.shape[1] // 2, buf.shape[0], True)

In [None]:
# Append rows 0 through 4 of buf to the writer, one append_alleles call per row.
for variant_idx in range(5):
    pgw.append_alleles(buf[variant_idx])

In [8]:
# Finalize the writer. As the recorded output below shows, close() raises
# RuntimeError when the number of variants written so far differs from the
# count declared when the writer was created.
pgw.close()

RuntimeError: PgenWriter.close() called when number of written variants (5) unequal to initially declared value (112338).

In [7]:
# Append row 0 of buf to the open writer.
pgw.append_alleles(buf[0])

In [8]:
# Append rows 1 through 6 of buf, one append_alleles call per row.
# NOTE(review): rows 5 and 6 exist only if buf holds at least 7 variants; the
# recorded read in cell In [4] loaded 5, so buf must have been re-read with a
# wider range before this cell was run - confirm.
for variant_idx in range(1, 7):
    pgw.append_alleles(buf[variant_idx])

In [9]:
# Tally the distinct values in row 7 of buf. The recorded output shows the
# values -9, 0 and 1 (-9 is presumably the missing-call code - confirm against
# the pgenlib documentation).
cl.Counter(buf[7])

Counter({-9: 4598, 0: 145208, 1: 74870})

In [10]:
# Same tally for row 6 of buf, for comparison with row 7.
cl.Counter(buf[6])

Counter({-9: 2910, 0: 218107, 1: 3659})

In [None]:
# Append row 7 of buf to the open writer.
pgw.append_alleles(buf[7])

In [None]:
# Append row 8 of buf to the open writer.
pgw.append_alleles(buf[8])

In [None]:
# Append row 9 of buf to the open writer.
pgw.append_alleles(buf[9])

In [None]:
# Append rows 1 through 9 of buf one at a time. Each index is printed before
# its append, presumably so the last number shown identifies the row being
# written if a call fails.
for variant_idx in range(1, 10):
    print(variant_idx)
    pgw.append_alleles(buf[variant_idx])

In [5]:
# Write all of buf in one call, letting the context manager close the writer.
# buf carries two columns per sample (see read_alleles_range), so the column
# count is halved with floor division: under Python 3 `/` would pass a float,
# and under Python 2 the result is unchanged.
# NOTE(review): the recorded RuntimeError reports 0 variants written when the
# writer was closed, which suggests append_alleles_batch did not write the rows
# before close() ran - confirm the dtype/layout the batch call expects.
with pg.PgenWriter(pgen_out, buf.shape[1] // 2, buf.shape[0], True) as pgw:
    pgw.append_alleles_batch(buf)

RuntimeError: PgenWriter.close() called when number of written variants (0) unequal to initially declared value (10).

In [None]:
    logger_cnt = logging.getLogger('read_prior_cnts')    
    logger_cnt.info(
        'reading prior counts from {}'.format(prior_count_dir)
    )        
    
    prior_cnt_keys = [None] * len(block_df)
    prior_cnt_vals = [None] * len(block_df)
    for block_id in range(len(block_df)):
        if(block_id % 100 == 0):
            logger_cnt.info(
                'reading block {} of {}'.format(block_id, len(block_df))
            )    
        cnt = np.load('{}/{}.npz'.format(prior_count_dir, block_id))
        prior_cnt_keys[block_id] = cnt['keys']
        prior_cnt_vals[block_id] = cnt['vals']
        
    logger_cnt.info(
        'prior counts is loaded on memory'
    )    
        
    return prior_cnt_keys, prior_cnt_vals

In [None]:
def read_alleles_block(pgen_f, block_df, block_id):
    """Read the alleles of one LD block through ``pg.PgenReader.read_alleles_range``.

    Args:
        pgen_f: path of the .pgen file, passed to ``pg.PgenReader``.
        block_df: object whose ``bimStart`` and ``bimEnd`` attributes are
            indexed by ``block_id`` to obtain the block's variant bounds.
        block_id: index of the block to read.

    Returns:
        A 2-D ``np.int32`` array with ``bimEnd - bimStart`` rows and
        ``2 * get_raw_sample_ct()`` columns, filled in by the reader.
    """
    block_start = block_df.bimStart[block_id]
    block_end = block_df.bimEnd[block_id]
    with pg.PgenReader(pgen_f) as reader:
        # Two columns per sample; the reader writes into this array in place.
        out_shape = (block_end - block_start, reader.get_raw_sample_ct() * 2)
        alleles = np.zeros(out_shape, dtype=np.int32)
        reader.read_alleles_range(block_start, block_end, alleles)
    return alleles