# Loading Modules

In [1]:
import gcsfs #module for google cloud connection
import os
import allel
import zarr
import pandas as pd
import numpy as np
from matplotlib import pyplot
import seaborn as sns
import h5py

### Loading datasets of phase1 and phase2

In [2]:
# First handle: authenticate against the project using the 'cache' token mode.
gcs_orig = gcsfs.GCSFileSystem(
    project='malariagen-jupyterhub',
    token='cache',
)
# Working handle: reuse the credentials object held by the first handle's
# session, so every later cell goes through `gcs`.
gcs = gcsfs.GCSFileSystem(
    project='malariagen-jupyterhub',
    token=gcs_orig.session.credentials,
)

In [3]:
# Bucket-relative key of the phase2 *biallelic* PASS call set.
# Object-store keys always use "/" as separator, so the key is built with
# "/".join instead of os.path.join (which follows the local OS separator).
# Alternative (non-biallelic) PASS call set, kept for reference:
#   ag1000g-release/phase2.AR1/variation/main/zarr/pass/ag1000g.phase2.ar1.pass
phase2path = "/".join([
    "ag1000g-release", "phase2.AR1", "variation", "main", "zarr",
    "biallelic", "ag1000g.phase2.ar1.pass.biallelic",
])

In [4]:
# Wrap the phase2 bucket path in a mapping object backed by the `gcs` filesystem.
gcsmap2 = gcsfs.mapping.GCSMap(
    phase2path,
    gcs=gcs,
)

In [5]:
# Open the phase2 call set as a read-only zarr group on top of the GCS mapping.
calldata_phase2 = zarr.Group(store=gcsmap2, read_only=True)

In [6]:
# Bucket-relative key of the phase1 PASS call set.
# Object-store keys always use "/" as separator, so the key is built with
# "/".join instead of os.path.join (which follows the local OS separator).
phase1path = "/".join([
    "ag1000g-release", "phase1.AR3", "variation", "main", "zarr",
    "ag1000g.phase1.ar3.pass",
])

In [7]:
# Wrap the phase1 bucket path in a mapping object backed by the `gcs` filesystem.
gcsmap1 = gcsfs.mapping.GCSMap(
    phase1path,
    gcs=gcs,
)

In [8]:
# Open the phase1 call set as a read-only zarr group on top of the GCS mapping.
calldata_phase1 = zarr.Group(store=gcsmap1, read_only=True)

--------------------------------------------

## Create a function mapping outgroup allele counts from phase1 onto phase2 alleles

In [9]:
def out_map(out_pop, chrom,
            outgroup_path='/gcs/phase1.AR3/extras/outgroup_allele_counts.h5'):
    """Return outgroup allele counts for ``chrom`` re-coded to phase2 alleles.

    The function keeps only the sites whose position is present in both the
    phase1 and the phase2 call sets, builds an allele mapping from the phase1
    REF/ALT coding to the phase2 REF/ALT coding at those sites, and applies
    that mapping to the allele counts of the requested outgroup.

    Parameters
    ----------
    out_pop : str
        Key of the outgroup inside the HDF5 file (the notebook uses e.g.
        ``'epir'`` and ``'meru'``).
    chrom : str
        Chromosome arm key present in both call sets and in the HDF5 file
        (e.g. ``'2L'``). It is used consistently for positions, REF/ALT
        alleles and outgroup counts.
    outgroup_path : str, optional
        Location of the outgroup allele-count HDF5 file.

    Returns
    -------
    allel.AlleleCountsArray
        Outgroup allele counts at the shared sites, as produced by
        ``AlleleCountsArray.map_alleles``.

    Notes
    -----
    Reads the module-level ``calldata_phase1`` and ``calldata_phase2`` groups.
    Assumes the outgroup counts are row-aligned with the phase1 variants of
    ``chrom`` (they are filtered with the phase1 mask) -- TODO confirm.
    """
    ###### Create the new allele map from phase1 to phase2 ######

    pos_phase1 = allel.SortedIndex(calldata_phase1[chrom]["variants/POS"][:])
    pos_phase2 = allel.SortedIndex(calldata_phase2[chrom]["variants/POS"][:])

    # locate_intersection returns one boolean mask per index: the first over
    # the index it is called on (phase2), the second over its argument (phase1).
    in_phase2, in_phase1 = pos_phase2.locate_intersection(pos_phase1)

    # Only REF and ALT are needed for the mapping, so read just those arrays
    # (for the requested chromosome) and keep the shared sites.
    phase1_variants = calldata_phase1[chrom]["variants"]
    phase1_ref = np.compress(in_phase1, phase1_variants["REF"][:], axis=0)
    phase1_alt = np.compress(in_phase1, phase1_variants["ALT"][:], axis=0)

    phase2_variants = calldata_phase2[chrom]["variants"]
    phase2_ref = np.compress(in_phase2, phase2_variants["REF"][:], axis=0)
    phase2_alt = np.compress(in_phase2, phase2_variants["ALT"][:], axis=0)

    # Target allele coding: one row per shared site, REF first, then ALT.
    phase2refalt = np.column_stack([phase2_ref, phase2_alt])
    mapping = allel.create_allele_mapping(phase1_ref, phase1_alt, phase2refalt)

    ###### Now mapping on our selected outgroup ######

    # Load the counts into memory inside the context manager so the HDF5
    # handle is released before returning.
    with h5py.File(outgroup_path, mode='r') as outgroup_file:
        ac_out_pop = allel.AlleleCountsArray(outgroup_file[chrom][out_pop][:])

    ac_out_pop = ac_out_pop.compress(in_phase1)
    return ac_out_pop.map_alleles(mapping)

In [14]:
# Outgroup 'epir' on chromosome arm 2L, re-coded to phase2 alleles.
asd = out_map(out_pop='epir', chrom='2L')
asd

Unnamed: 0,0,1,Unnamed: 3
0,1,0,
1,0,0,
2,1,0,
...,...,...,...
7635921,0,0,
7635922,0,0,
7635923,0,0,


In [15]:
# Outgroup 'meru' on chromosome arm 2L, re-coded to phase2 alleles.
esd = out_map(out_pop='meru', chrom='2L')
esd

Unnamed: 0,0,1,Unnamed: 3
0,20,0,
1,20,0,
2,20,0,
...,...,...,...
7635921,0,0,
7635922,0,0,
7635923,0,0,


------------------------------

In [21]:
# Open the outgroup allele-count HDF5 file read-only for interactive inspection.
calldata_outgroup = h5py.File(
    '/gcs/phase1.AR3/extras/outgroup_allele_counts.h5',
    'r',
)

In [22]:
# List the outgroup keys stored under chromosome arm 2L.
calldata_outgroup['2L'].keys()

<KeysViewHDF5 ['arab', 'chri', 'epir', 'mela', 'meru', 'quad']>

In [68]:
# Peek at ten consecutive rows of the 'chri' allele counts on 2L.
# Uses `calldata_outgroup` (opened above); the previous name
# `calldata_out_count` is not defined anywhere in this notebook.
calldata_outgroup['2L']['chri'][1000_000:1000_010]

array([[1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0]], dtype=int32)

In [69]:
# Handle on the 'chri' allele counts for 2L, then show its contents and shape.
# Uses `calldata_outgroup` (opened above); the previous name
# `calldata_out_count` is not defined anywhere in this notebook.
calldata_chr_count = calldata_outgroup['2L']['chri']
calldata_chr_count[:], calldata_chr_count.shape

(array([[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        ...,
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]], dtype=int32), (10377280, 4))

In [26]:
# Materialise `calldata_chr` as a NumPy array.
# NOTE(review): `calldata_chr` is not defined in any visible cell -- it
# presumably comes from a deleted/earlier cell; define it here before relying
# on Restart & Run All.
b = np.asarray(calldata_chr)

In [41]:
# Convert the array to a plain Python list.
# NOTE(review): this rebinds `asd`, which earlier held the out_map('epir', '2L') result.
asd = b.tolist()

In [45]:
# Keep every entry except the b'.' placeholder.
a = [entry for entry in asd if entry != b'.']

In [47]:
# Number of entries left after dropping the b'.' placeholders.
len(a)

7793733

In [94]:
# Load the full phase2 REF column for 2L and show it together with its shape.
phase2_ref = calldata_phase2["2L/variants/REF"][:]
phase2_ref, phase2_ref.shape

(array([b'G', b'T', b'G', ..., b'G', b'G', b'A'], dtype='|S1'), (8906423,))

In [95]:
# Load the full phase2 ALT column for 2L and show it together with its shape.
phase2_alt = calldata_phase2["2L/variants/ALT"][:]
phase2_alt, phase2_alt.shape

(array([b'T', b'A', b'C', ..., b'T', b'T', b'G'], dtype='|S1'), (8906423,))