In [1]:
import numpy as np
import pgenlib as pg
import pandas as pd
import subprocess as sp
import sys, re

In [2]:
import matplotlib
matplotlib.use('agg')
from matplotlib import pyplot as plt

# prepare params & files

In [3]:
chromosome = '20'

In [4]:
file_head = '/share/PI/mrivas/data/1000genomes/ALL.chr{}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes-pgen'.format(chromosome)

In [5]:
pgen = {}
bim  = {}
genome_index_to_variant_index = {}
bim_col_names = ['chr', 'id', 'morgan', 'bp', 'pri', 'sec']

### population reference data

In [6]:
pgen[chromosome] = pg.PgenReader('{}.pgen'.format(file_head))
bim[chromosome]  = pd.read_csv('{}.bim'.format(file_head),
                               sep = '\t', names = bim_col_names)
genome_index_to_variant_index[chromosome] = \
dict(zip(bim[chromosome]['bp'], range(len(bim[chromosome]))))

### our observation

In [7]:
snps_f =  '/share/PI/mrivas/data/nanopore-wgs-consortium-old/nanopore-wgs.25000.sorted.10k.mapq50.ext.sorted.informative.q14.snps'
reads = pd.read_csv(snps_f, sep = '\t')

In [8]:
def parse_snps_str(snps_str, filter_by_dbsnp = True):
    list_of_snps = [re.split(',|:',snp) for snp in snps_str.split(';')]
    if(filter_by_dbsnp):
        '''
        keep only snps with valid snp id
        '''
        list_of_snps = np.array([snp for snp in list_of_snps
                                 if snp[4][:2] == 'rs'])
    return(list_of_snps)

In [9]:
# reconstruct list of snps
reads['snps'] = np.array([parse_snps_str(reads.ix[read_num, 6]) 
                          for read_num in range(len(reads))])

In [10]:
# add field of chromosome number
reads['chr'] = np.array([reads['snps'][read_num][0][0][3:]
                         for read_num in range(len(reads))])

In [11]:
# extracting snp positions
reads['snp_pos'] = np.array([[int(x) for x in reads.ix[read_num, 'snps'][:, 1]]
                             for read_num in range(len(reads))])

In [43]:
def snp_poss_to_variant_idxs(snp_poss, conv_dict):
    return(np.array([conv_dict[snp_pos] for snp_pos 
                     in snp_poss if snp_pos in conv_dict]))
    

In [45]:
read_num = 334
snp_poss_to_variant_idxs(reads['snp_pos'][read_num], 
                         genome_index_to_variant_index[reads['chr'][read_num]])

array([20623, 20732, 20813, 21017, 21054, 21139, 21143, 21187, 21253,
       21331, 21537, 21540, 21562, 21607, 21632])

In [46]:
# take subset of fragments on the chromosome of interest
reads_chr20 = reads[reads['chr'] == '20']
print reads_chr20.shape

(13, 10)


In [47]:
reads_chr20.head(3)

Unnamed: 0,name,#mismatches,#mismatches_with_hits_to_dbSNP,#SNPs_with_var_id,#SNPs_with_var_id(validated),length,"snps([<pos>,<ref>,<seq>,<varid>,<validated>,<baseCallQ>;]+)",snps,chr,snp_pos
334,1f3ad947-2a90-4c4c-8155-c8ca34c3a4d9_Basecall_...,47,17,16,14,32044,"chr20:712806,T,A,*,None,16;chr20:712968,C,G,*,...","[[chr20, 714008, a, G, rs2317021, True, 18], [...",20,"[714008, 717649, 720916, 727476, 728499, 73089..."
335,5ac37613-6f90-4320-9c8e-fefb98f49f25_Basecall_...,35,14,13,12,29431,"chr20:1341776,a,G,*,None,14;chr20:1342191,c,T,...","[[chr20, 1344929, C, A, rs6033551, True, 18], ...",20,"[1344929, 1345368, 1346236, 1346381, 1347754, ..."
336,021d57af-e9b8-49d6-b200-7793894f18a2_Basecall_...,40,16,15,14,27679,"chr20:4376042,A,G,*,None,17;chr20:4376044,C,A,...","[[chr20, 4379857, c, A, rs3848822, True, 17], ...",20,"[4379857, 4381876, 4382054, 4385424, 4385756, ..."


In [48]:
reads_chr20.ix[:, 'snps']

334    [[chr20, 714008, a, G, rs2317021, True, 18], [...
335    [[chr20, 1344929, C, A, rs6033551, True, 18], ...
336    [[chr20, 4379857, c, A, rs3848822, True, 17], ...
337    [[chr20, 15106978, c, T, rs975993, True, 15], ...
338    [[chr20, 16114674, g, A, rs971314, True, 16], ...
339    [[chr20, 17213181, c, G, rs8116844, True, 14],...
340    [[chr20, 35005210, c, T, rs538777470, False, 1...
341    [[chr20, 43923028, a, G, rs66503531, True, 15]...
342    [[chr20, 47824040, t, A, rs6095462, True, 16],...
343    [[chr20, 48302652, C, T, rs1016234, True, 14],...
344    [[chr20, 51259768, T, C, rs856409, True, 17], ...
345    [[chr20, 53759041, a, G, rs6069139, True, 19],...
346    [[chr20, 54342375, T, C, rs13041144, True, 14]...
Name: snps, dtype: object

In [53]:
variant_idxs = \
np.array([snp_poss_to_variant_idxs(reads_chr20['snp_pos'][read_num], 
                                   genome_index_to_variant_index[reads['chr'][read_num]])
          for read_num in range(334, 347)])

In [55]:
genome_index_to_variant_index['20'][714008]

20623

In [57]:
variant_idxs[0]

array([20623, 20732, 20813, 21017, 21054, 21139, 21143, 21187, 21253,
       21331, 21537, 21540, 21562, 21607, 21632])

In [58]:
def combine_alleles(allele_int32, raw_sample_ct):
    '''
    Input:
      allele_int32: array of length 2m
    Output:
      allele_combined: array of length m
    map {[0, 0] : [0],
         [0, 1] : [1],
         [1, 0] : [2],
         [1, 1] : [3],
         [-9, ?] : [-9],
         [?, -9] : [-9]}
    '''
    alleles_combined = np.array([allele_int32[i] * 2 + allele_int32[i + 1] 
                                 if (allele_int32[i] != -9 and allele_int32[i+1] != -9)
                                 else -9
                                 for i in range(raw_sample_ct)])
    return(alleles_combined)

In [59]:
def read_haplotype_list_wrapper(pgen, variant_idxs):
    sample_ct = pgen.get_raw_sample_ct()
    read_alleles_list_buf = np.zeros((len(variant_idxs), 
                                      2 * sample_ct), 
                                     dtype=np.int32)
    haplotype_list = \
    np.array([combine_alleles(read_alleles_list, sample_ct)
              for read_alleles_list in read_alleles_list_buf])
    return(haplotype_list)

In [60]:
haplotype_list = read_haplotype_list_wrapper(pgen[chromosome], variant_idxs[0])

In [61]:
haplotype_list.shape

(15, 2504)

In [62]:
haplotype_list

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## converting to string representation

In [67]:
haplotype_list_str = [''.join([str(x) for x in haplotype]) for haplotype in haplotype_list]

## number of haplotypes in this region

In [72]:
len(set(haplotype_list_str))

1