In [3]:
import gcsfs #module for google cloud connection
import os
import allel
import zarr
import pandas as pd
import h5py
import petl as etl
import numpy as np
from matplotlib import pyplot
import seaborn as sns
import h5py
import pyfasta

In [4]:
gcs =  gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token='cloud') ## cloud connection function

In [5]:
geno_path = os.path.join("ag1000g-release/phase2.AR1/variation/main/zarr/all/ag1000g.phase2.ar1")  ## Adding phase2 genotype path 
gcsmap = gcsfs.mapping.GCSMap(geno_path, gcs=gcs) ## link callset
callset= zarr.Group(gcsmap, read_only=True) ## read data

In [6]:
geno_path = os.path.join("ag1000g-release/phase2.AR1/variation/main/zarr/biallelic/ag1000g.phase2.ar1.pass.biallelic")  ## Adding phase2 genotype path 
gcsmap = gcsfs.mapping.GCSMap(geno_path, gcs=gcs) ## link callset
callset_biallel= zarr.Group(gcsmap, read_only=True) ## read data

In [7]:
calldata_out= h5py.File('data/outgroup_allele_counts_phase2.h5', mode='r')
calldata_out.keys()

<KeysViewHDF5 ['2L', '2R', '3L', '3R', 'X']>

---------------------------

### Loading positions

In [62]:
pos_all = allel.SortedIndex(callset["3L"]["variants/POS"][:])
pos_all

0,1,2,3,4,...,18167051,18167052,18167053,18167054,18167055
15,19,20,23,24,...,41963165,41963183,41963184,41963288,41963345


In [63]:
pos_bi = allel.SortedIndex(callset_biallel["3L"]["variants/POS"])
pos_bi

0,1,2,3,4,...,7897661,7897662,7897663,7897664,7897665
9790,9791,9798,9812,9815,...,41956530,41956532,41956537,41956541,41956551


In [61]:
loc1, loc2 = pos_bi.locate_intersection(pos_all)
loc1, loc2

(array([ True,  True,  True, ...,  True,  True,  True]),
 array([False, False, False, ..., False, False, False]))

In [58]:
pos_p2_sel = pos_bi[loc1]
pos_p2_sel

0,1,2,3,4,...,8906418,8906419,8906420,8906421,8906422
25050,51212,51214,51226,51245,...,49356421,49356424,49356425,49356426,49356429


In [32]:
pos_p1_sel = pos_all[loc2]
pos_p1_sel

0,1,2,3,4,...,8906418,8906419,8906420,8906421,8906422
25050,51212,51214,51226,51245,...,49356421,49356424,49356425,49356426,49356429


------------------------------

In [33]:
variants_all = allel.VariantChunkedTable(callset["2L"]["variants"], 
                                     names=['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'numalt'],
                                     index='POS')
variants_all

Unnamed: 0,POS,REF,ALT,DP,MQ,QD,numalt,Unnamed: 8
0,20,b'C',[b'T' b'' b''],6,1.8,13.06,1,
1,103,b'C',[b'T' b'' b''],252,1.54,1.47,1,
2,163,b'G',[b'A' b'' b''],709,2.21,1.75,1,
...,...,...,...,...,...,...,...,...
21442862,49361815,b'A',[b'G' b'' b''],5011,0.62,0.08,1,
21442863,49362329,b'C',[b'T' b'' b''],929,0.12,0.1,1,
21442864,49362335,b'A',[b'T' b'' b''],916,0.1,0.06,1,


In [35]:
variants_all_filt = variants_all.compress(loc2)
variants_all_filt

Unnamed: 0,POS,REF,ALT,DP,MQ,QD,numalt,Unnamed: 8
0,25050,b'G',[b'T' b'' b''],38417,51.53,8.02,1,
1,51212,b'T',[b'A' b'' b''],39639,55.75,13.58,1,
2,51214,b'G',[b'C' b'' b''],39461,55.73,19.83,1,
...,...,...,...,...,...,...,...,...
8906420,49356425,b'G',[b'T' b'' b''],28138,41.81,9.58,1,
8906421,49356426,b'G',[b'T' b'' b''],28311,41.57,10.3,1,
8906422,49356429,b'A',[b'G' b'' b''],28106,40.92,10.13,1,


In [44]:
phase2_all_ref = variants_all_filt["REF"][:]
phase2_all_ref, phase2_all_ref.shape

(array([b'G', b'T', b'G', ..., b'G', b'G', b'A'], dtype='|S1'), (8906423,))

In [45]:
phase2_all_alt = variants_all_filt["ALT"][:]
phase2_all_alt, phase2_all_alt.shape

(array([[b'T', b'', b''],
        [b'A', b'', b''],
        [b'C', b'', b''],
        ...,
        [b'T', b'', b''],
        [b'T', b'', b''],
        [b'G', b'', b'']], dtype='|S1'), (8906423, 3))

In [38]:
variants_bi = allel.VariantChunkedTable(callset_biallel["2L"]["variants"], 
                                     names=['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'numalt'],
                                     index='POS')
variants_bi

Unnamed: 0,POS,REF,ALT,DP,MQ,QD,numalt,Unnamed: 8
0,25050,b'G',b'T',38417,51.53,8.02,1,
1,51212,b'T',b'A',39639,55.75,13.58,1,
2,51214,b'G',b'C',39461,55.73,19.83,1,
...,...,...,...,...,...,...,...,...
8906420,49356425,b'G',b'T',28138,41.81,9.58,1,
8906421,49356426,b'G',b'T',28311,41.57,10.3,1,
8906422,49356429,b'A',b'G',28106,40.92,10.13,1,


In [39]:
variants_bi_filt = variants_bi.compress(loc1)
variants_bi_filt

Unnamed: 0,POS,REF,ALT,DP,MQ,QD,numalt,Unnamed: 8
0,25050,b'G',b'T',38417,51.53,8.02,1,
1,51212,b'T',b'A',39639,55.75,13.58,1,
2,51214,b'G',b'C',39461,55.73,19.83,1,
...,...,...,...,...,...,...,...,...
8906420,49356425,b'G',b'T',28138,41.81,9.58,1,
8906421,49356426,b'G',b'T',28311,41.57,10.3,1,
8906422,49356429,b'A',b'G',28106,40.92,10.13,1,


In [40]:
phase2_bi_ref = variants_bi_filt["REF"][:]
phase2_bi_ref, phase2_bi_ref.shape

(array([b'G', b'T', b'G', ..., b'G', b'G', b'A'], dtype='|S1'), (8906423,))

In [41]:
phase2_bi_alt = variants_bi_filt["ALT"][:]
phase2_bi_alt, phase2_bi_ref.shape

(array([b'T', b'A', b'C', ..., b'T', b'T', b'G'], dtype='|S1'), (8906423,))

In [42]:
phase2_bi_refalt = np.column_stack([phase2_bi_ref, phase2_bi_alt])
phase2_bi_refalt, phase2_bi_refalt.shape

(array([[b'G', b'T'],
        [b'T', b'A'],
        [b'G', b'C'],
        ...,
        [b'G', b'T'],
        [b'G', b'T'],
        [b'A', b'G']], dtype='|S1'), (8906423, 2))

In [46]:
mapping = allel.create_allele_mapping(phase2_all_ref, phase2_all_alt, phase2_bi_refalt)
mapping

array([[ 0,  1, -1, -1],
       [ 0,  1, -1, -1],
       [ 0,  1, -1, -1],
       ...,
       [ 0,  1, -1, -1],
       [ 0,  1, -1, -1],
       [ 0,  1, -1, -1]], dtype=int8)

------------------------------

### <i> An.christyi </i> dataset

Subset christyi allele count and map its alleles

In [47]:
calldata_chr = calldata_out['2L']['chri']
calldata_chr[:], calldata_chr.shape

(array([[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        ...,
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]], dtype=int32), (21442865, 4))

In [48]:
loc2.shape, calldata_chr.shape

((21442865,), (21442865, 4))

In [49]:
loc2

array([False, False, False, ..., False, False, False])

In [50]:
ac_snp_chri = allel.AlleleCountsArray(calldata_chr)
ac_snp_chri

Unnamed: 0,0,1,2,3,Unnamed: 5
0,0,0,0,0,
1,0,0,0,0,
2,0,0,0,0,
...,...,...,...,...,...
21442862,0,0,0,0,
21442863,0,0,0,0,
21442864,0,0,0,0,


In [51]:
ac_snp_chri = ac_snp_chri.compress(loc2)
ac_snp_chri

Unnamed: 0,0,1,2,3,Unnamed: 5
0,0,0,0,0,
1,0,0,0,0,
2,0,0,0,0,
...,...,...,...,...,...
8906420,0,0,0,0,
8906421,0,0,0,0,
8906422,0,0,0,0,


In [52]:
mapping.shape, ac_snp_chri.shape

((8906423, 4), (8906423, 4))

In [53]:
chri_allele_count = ac_snp_chri.map_alleles(mapping)
chri_allele_count

Unnamed: 0,0,1,Unnamed: 3
0,0,0,
1,0,0,
2,0,0,
...,...,...,...
8906420,0,0,
8906421,0,0,
8906422,0,0,


----------------------------

In [9]:
def out_map(out_pop, chrom):

    pos_all = allel.SortedIndex(callset[chrom]["variants/POS"][:])
    pos_bi = allel.SortedIndex(callset_biallel[chrom]["variants/POS"])
    loc1, loc2 = pos_bi.locate_intersection(pos_all)
    pos_p2_sel = pos_bi[loc1]
    pos_p1_sel = pos_all[loc2]
    variants_all = allel.VariantChunkedTable(callset[chrom]["variants"], 
                                         names=['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'numalt'],
                                         index='POS')
    variants_bi = allel.VariantChunkedTable(callset_biallel[chrom]["variants"], 
                                         names=['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'numalt'],
                                         index='POS')
    
    variants_all_filt = variants_all.compress(loc2)
    phase2_all_ref = variants_all_filt["REF"][:]
    phase2_all_alt = variants_all_filt["ALT"][:]

    variants_bi_filt = variants_bi.compress(loc1)
    phase2_bi_ref = variants_bi_filt["REF"][:]
    phase2_bi_alt = variants_bi_filt["ALT"][:]
    
    phase2_bi_refalt = np.column_stack([phase2_bi_ref, phase2_bi_alt])
    mapping = allel.create_allele_mapping(phase2_all_ref, phase2_all_alt, phase2_bi_refalt)
    calldata_outgroup= h5py.File('data/outgroup_allele_counts_phase2.h5', mode='r')
    calldata_out_pop = calldata_outgroup[chrom][out_pop]
    ac_out_pop = allel.AlleleCountsArray(calldata_out_pop)
    ac_out_pop = ac_out_pop.compress(loc2)
    pop_map_ac = ac_out_pop.map_alleles(mapping)

    return pop_map_ac   

In [10]:
ac_chri_3L = out_map('chri', '3L')
ac_chri_3R = out_map('chri', '3R')
ac_chri_2L = out_map('chri', '2L')
ac_chri_2R = out_map('chri', '2R')
ac_chri_X = out_map('chri', 'X')

In [None]:
ac_chri_3L = out_map('chri', '3L')
ac_chri_3R = out_map('chri', '3R')
ac_chri_2L = out_map('chri', '2L')
ac_chri_2R = out_map('chri', '2R')
ac_chri_X = out_map('chri', 'X')