# [technical note] [WIP] pgenlib

- Yosuke Tanigawa (ytanigaw@stanford.edu)
- 2017.02.02

## objective 
- Learn how to call pgenlib through its Python API
- Demonstrate whether we can infer haplotypes from informative SNPs on long reads

## data set in mind
- `/share/PI/mrivas/data/nanopore-wgs-consortium-old/nanopore-wgs.25000.sorted.10k.mapq50.ext.sorted.informative.q14.snps`

## dependencies

In [1]:
import numpy as np
import pgenlib as pg
import pandas as pd
import subprocess as sp
import sys

In [2]:
import matplotlib
matplotlib.use('agg')
from matplotlib import pyplot as plt

## how to read pgen file (or bed file)
- In the following sections, I will try basic APIs based on specification

- To read a bed file, one needs to know the number of samples (raw_sample_ct), which we can obtain from the corresponding fam file
- Here is a wrapper function to read a bed file

In [4]:
# NOTE: this call fails -- PgenReader rejects the .bgen file because its first
# two bytes do not match the .pgen magic number (see the RuntimeError below).
# NOTE(review): the variable is named chr11 but the path points to chr20 -- confirm.
chr11 = pg.PgenReader('/share/PI/mrivas/ukbb/download/chr20impv1.bgen')

RuntimeError: /share/PI/mrivas/ukbb/download/chr20impv1.bgen is not a .pgen file (first two bytes don't match the magic number).


In [3]:
def pgen_reader_wrapper(data_dir, chrom):
    '''
    This function opens the bed file of one chromosome with pgenlib
    Inputs:
      data_dir: directory that contains chrom<chrom>.bed and chrom<chrom>.fam
      chrom:    chromosome label used in the file names
    Returns:
      pg.PgenReader object for <data_dir>/chrom<chrom>.bed
    Raises:
      subprocess.CalledProcessError if `wc` exits with a non-zero status
      (e.g. the fam file does not exist); any error raised by
      pg.PgenReader is propagated unchanged
    Side effect:
      prints the exception type when counting the samples fails
    '''
    file_base = '{}/chrom{}'.format(data_dir, chrom)
    try:
        # `wc -l` prints "<line count> <path>"; the first token is the
        # number of lines of the fam file, used here as the sample count
        # (assumes one line per sample -- TODO confirm)
        raw_sample_ct = int(
            sp.check_output(['wc', '-l', '{}.fam'.format(file_base)]).split()[0]
        )
    except Exception:
        # single-argument print(...) behaves the same on Python 2 and 3
        print("Unexpected error: {}".format(sys.exc_info()[0]))
        raise
    # PgenReader needs the sample count to read a bed file
    return pg.PgenReader('{}.bed'.format(file_base), raw_sample_ct)

## region of interest
- The first 20 informative SNP sites in the data file are as follows:

In [4]:
# Shell pipeline (IPython `!` syntax): skip the header line, take the 7th
# column, and put each ';'-separated entry on its own line.
regions_raw = !cat /share/PI/mrivas/data/nanopore-wgs-consortium-old/nanopore-wgs.25000.sorted.10k.mapq50.ext.sorted.informative.q14.snps|awk '{if(NR > 1){print $7}}'|sed -e 's/;/\n/g'
# Each entry is a comma-separated record; split it into its fields.
regions = [r.split(',') for r in regions_raw]

In [5]:
regions[:20]

[['chr10:1226991', 'A', 'G', 'rs4880485', 'True', '17'],
 ['chr10:1227181', 't', 'C', 'rs7914227', 'True', '17'],
 ['chr10:1227299', 'g', 'A', 'rs7894015', 'True', '20'],
 ['chr10:1227887', 't', 'G', 'rs72760988', 'True', '14'],
 ['chr10:1228338', 'A', 'G', 'rs7091963', 'False', '14'],
 ['chr10:1228623', 'T', 'C', 'rs3750679', 'True', '14'],
 ['chr10:1229473', 'A', 'T', '*', 'None', '16'],
 ['chr10:1230234', 'C', 'T', '*', 'None', '14'],
 ['chr10:1230523', 'T', 'C', 'rs876630', 'True', '14'],
 ['chr10:1232525', 'C', 'T', '!', 'None', '14'],
 ['chr10:1234056', 'A', 'G', 'rs904961', 'True', '19'],
 ['chr10:1235018', 'G', 'A', 'rs1392831', 'True', '21'],
 ['chr10:1235728', 'C', 'A', '*', 'None', '15'],
 ['chr10:1237920', 'A', 'G', 'rs4880785', 'True', '19'],
 ['chr10:1237932', 'G', 'A', 'rs1392826', 'True', '14'],
 ['chr10:1238878', 'g', 'A', 'rs34864641', 'True', '18'],
 ['chr10:1239269', 'T', 'A', '*', 'None', '16'],
 ['chr10:1241902', 'G', 'T', '*', 'None', '15'],
 ['chr10:1245531', 'C

- let's focus on chromosome 10 for now

In [6]:
data_dir = '/share/PI/mrivas/data/ukbb/cal'
chrom = 10

In [7]:
chr10 = pgen_reader_wrapper(data_dir, chrom)

In [8]:
type(chr10)

Python.pgenlib.PgenReader

- We now have the genotype information in memory via pgenlib

### # of samples, # of variants, phase info

In [9]:
chr10.get_raw_sample_ct()

152729

In [10]:
chr10.get_variant_ct()

40082

In [11]:
chr10.hardcall_phase_present()

False

### access genotype info of a specific locus

- need to prepare numpy array to store results

In [12]:
res_ary = np.zeros(chr10.get_raw_sample_ct(), dtype=np.int8)

- one can query the genotypes of a specific variant by its index

In [13]:
chr10.read(0, res_ary)

In [14]:
res_ary

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [15]:
set(res_ary)

{-9, 0, 1}

In [16]:
sum(res_ary == 0)

148004

In [17]:
sum(res_ary == 1)

6

In [18]:
sum(res_ary == -9)

4719

## read map file for marker info
- we need to map genomic coordinates to SNP markers (line numbers in the map file)

In [19]:
# Load the tab-separated marker table of the chromosome; the file has no
# header line, so the column names are supplied here.
map10 = pd.read_csv(
    '{}/chrom{}.bim'.format(data_dir, chrom),
    sep='\t',
    names=['chr', 'id', 'morgan', 'bp', 'pri', 'sec'],
)

In [20]:
def query_map_file_by_genome_index(map_f, index):
    '''
    This function prints the first markers found at or after a genomic position
    Inputs:
      map_f: marker table (pandas DataFrame) with a 'bp' column
      index: genomic position (bp) to look up
    Returns:
      None
    Side effect:
      prints the query, then up to three rows of map_f whose 'bp' is
      greater than or equal to index - 1, in the row order of map_f
    '''
    # single-argument print(...) behaves the same on Python 2 and 3
    print('query: {}'.format(index))
    print(map_f.loc[index - 1 <= map_f['bp']].head(3))

In [21]:
query_map_file_by_genome_index(map10, 122691)

query: 122691
    chr             id  morgan      bp  pri  sec
10   10   Affx-2576943       0  127924    1    2
11   10  Affx-35481884       0  131716    1    2
12   10  Affx-35481888       0  133243    1    2


- not all the SNPs have a match in the haplotype reference

In [22]:
# Turn the first field of every record ('<chromosome>:<position>') into a
# [chromosome, position] pair, with the position converted to an integer.
idxs = []
for r in regions:
    parts = r[0].split(':')
    idxs.append([parts[0], int(parts[1])])
# Keep only the positions that lie on chr10.
idxs_chr10 = [region_bp for region_chrom, region_bp in idxs
              if region_chrom == 'chr10']

In [23]:
idxs_chr10[:10]

[1226991,
 1227181,
 1227299,
 1227887,
 1228338,
 1228623,
 1229473,
 1230234,
 1230523,
 1232525]

In [24]:
for index in idxs_chr10[:30]:
    query_map_file_by_genome_index(map10, index)

query: 1226991
     chr             id  morgan       bp  pri  sec
376   10   Affx-2625122       0  1227597    2    1
377   10  Affx-35483131       0  1228575    1    2
378   10  Affx-35483132       0  1229915    1    2
query: 1227181
     chr             id  morgan       bp  pri  sec
376   10   Affx-2625122       0  1227597    2    1
377   10  Affx-35483131       0  1228575    1    2
378   10  Affx-35483132       0  1229915    1    2
query: 1227299
     chr             id  morgan       bp  pri  sec
376   10   Affx-2625122       0  1227597    2    1
377   10  Affx-35483131       0  1228575    1    2
378   10  Affx-35483132       0  1229915    1    2
query: 1227887
     chr             id  morgan       bp  pri  sec
377   10  Affx-35483131       0  1228575    1    2
378   10  Affx-35483132       0  1229915    1    2
379   10  Affx-52348440       0  1230769    2    1
query: 1228338
     chr             id  morgan       bp  pri  sec
377   10  Affx-35483131       0  1228575    1    2
378   1

- This makes me suspect that we might be working with the wrong version of the genome

In [25]:
def dist_to_probe(map_f, index):
    '''
    This function computes the distance from a genomic position to the
    first marker of map_f located at or after that position
    Inputs:
      map_f: marker table (pandas DataFrame) with a 'bp' column
      index: genomic position (bp)
    Returns:
      'bp' of the first matching row minus index (always >= 0),
      or -1 if no row of map_f has 'bp' >= index
    Side effect:
      prints a message when no marker is found
    Note:
      only markers at or after index are considered, and the first matching
      row in table order is used; it is the closest such marker only when
      map_f is sorted by 'bp'
    '''
    candidates = map_f.loc[index <= map_f['bp']].head(1)['bp']
    try:
        # .values replaces Series.as_matrix(), which no longer exists in
        # current pandas; indexing an empty selection raises IndexError
        probe_pos = candidates.values[0]
    except IndexError:
        # single-argument print(...) behaves the same on Python 2 and 3
        print("Unexpected error: {}".format(sys.exc_info()[0]))
        print("query = {}".format(index))
        return(-1)
    return(probe_pos - index)

In [26]:
dists = np.array([dist_to_probe(map10, idx) for idx in idxs_chr10])

In [27]:
len(idxs_chr10)

2534

In [28]:
sum(dists == 0)

27

- only 27 out of 2534 'informative' SNPs have a hit in the UKBB population

In [None]:
def make_hist(x, title = None, xlabel = None, ylabel = None, filename = None):
    '''
    This function generates a 20-bin histogram of a vector x and optionally
    saves it to a file
    Inputs:
      x: data vector
      title:    title of the plot (omitted if None)
      xlabel:   label on x-axis (omitted if None)
      ylabel:   label on y-axis (omitted if None)
      filename: name of the image file (if given, save to file)
    Returns:
      the matplotlib figure object that holds the histogram
    Side effect:
      save an image file if filename is given
    '''
    fig = plt.figure(figsize=(8,6))
    ax = fig.add_subplot(1, 1, 1)
    ax.hist(x, 20)

    # identity comparison is the idiomatic test for "argument not supplied"
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    if title is not None:
        ax.set_title(title)
    if filename is not None:
        fig.savefig(filename)
    # hand the figure back so that callers can adjust or display it
    return fig

In [None]:
# dist_to_probe returns -1 when no marker was found; drop those sentinel
# values so that they are not plotted as if they were distances
make_hist(dists[dists >= 0], 
          title = 'Distance between SNP on reads and nearest SNP marker in UKBB', 
          xlabel = 'Distance (bp)', ylabel = 'Frequency', filename='20170202_pgenlib.png')

In [None]:
# the lower bound excludes the -1 sentinel that dist_to_probe returns when
# no marker was found; without it those entries pass the `<= 100` filter
make_hist([d for d in dists if 0 <= d <= 100], 
          title = 'Distance between SNP on reads and nearest SNP marker in UKBB (<= 100bp)', 
          xlabel = 'Distance (bp)', ylabel = 'Frequency', filename='20170202_pgenlib_lt100bp.png')

In [None]:
# the lower bound excludes the -1 sentinel that dist_to_probe returns when
# no marker was found; without it those entries pass the `<= 20` filter
make_hist([d for d in dists if 0 <= d <= 20], 
          title = 'Distance between SNP on reads and nearest SNP marker in UKBB (<= 20bp)', 
          xlabel = 'Distance (bp)', ylabel = 'Frequency', filename='20170202_pgenlib_lt20bp.png')