# Loading Modules

In [None]:
import gcsfs #module for google cloud connection
import os
import allel
import zarr
import pandas as pd
import numpy as np
from matplotlib import pyplot
import seaborn as sns
import h5py

Phase1 and Phase2 are different, either the outgroup alleles are different from phase2. They're based on phase 1 pass and REF and ALT alleles are different on the 2 phases. 
So the 1st step is find the positions shared from phase1 and phase2. After do that, apply these positions to phase2. Then map outgroup chromosomes.

### Loading datasets of phase1 and phase2

In [82]:
gcs_orig = gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token='cache')
gcs =  gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token=gcs_orig.session.credentials)

In [83]:
phase2path = os.path.join("ag1000g-release", "phase2.AR1", "variation", "main", "zarr", "biallelic", "ag1000g.phase2.ar1.pass")

In [84]:
gcsmap2 = gcsfs.mapping.GCSMap(phase2path, gcs=gcs)

In [85]:
phase1path = os.path.join("ag1000g-release", "phase1.AR3", "variation", "main", "zarr", "ag1000g.phase1.ar3.pass")

In [86]:
gcsmap1 = gcsfs.mapping.GCSMap(phase1path, gcs=gcs)

In [87]:
calldata_phase1= zarr.Group(gcsmap1, read_only=True)

-----------------------------------------------------

### Loading positions

In [92]:
pos_phase1 = allel.SortedIndex(calldata_phase1["2L"]["variants/POS"][:])
pos_phase1

0,1,2,3,4,...,10377275,10377276,10377277,10377278,10377279
44688,44691,44732,44736,44756,...,49356424,49356425,49356426,49356429,49356435


In [93]:
pos_phase2 = allel.SortedIndex(calldata_phase2["2L"]["variants/POS"])
pos_phase2

0,1,2,3,4,...,8906418,8906419,8906420,8906421,8906422
25050,51212,51214,51226,51245,...,49356421,49356424,49356425,49356426,49356429


Now I have to take only phase1 positions in phase2

In [96]:
loc1, loc2 = pos_phase2.locate_intersection(pos_phase1)
loc1, loc2

(array([False,  True,  True, ...,  True,  True,  True]),
 array([False, False, False, ...,  True,  True, False]))

In [99]:
pos_p2_sel = pos_phase2[loc1]
pos_p2_sel

0,1,2,3,4,...,7635919,7635920,7635921,7635922,7635923
51212,51214,51226,51245,51251,...,49356421,49356424,49356425,49356426,49356429


------------------------------

### Selecting same SNPs from christi dataset

In [100]:
calldata_out= h5py.File('/gcs/phase1.AR3/extras/outgroup_alleles.h5', mode='r')
calldata_out

<HDF5 file "outgroup_alleles.h5" (mode r)>

In [101]:
calldata_chr = calldata_out['2L']['chri']
calldata_chr[:], calldata_chr.shape

(array([b'.', b'.', b'.', ..., b'.', b'.', b'.'], dtype='|S1'), (10377280,))

In [None]:
sel_snp_chri = calldata_chr[loc2]
sel_snp_chri

----------------------------------------------------------------

#### Sandbox

In [81]:
variants_phase1 = allel.VariantChunkedTable(calldata_phase1["2L"]["variants"], 
                                     names=['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'numalt'],
                                     index='POS')
variants_phase1

Unnamed: 0,POS,REF,ALT,DP,MQ,QD,numalt,Unnamed: 8
0,44688,b'C',[b'T' b'' b''],27446,48.74,7.0,1,
1,44691,b'C',[b'A' b'' b''],27568,48.91,11.43,1,
2,44732,b'C',[b'T' b'' b''],28429,50.41,15.6,1,
...,...,...,...,...,...,...,...,...
10377277,49356426,b'G',[b'T' b'' b''],19560,41.48,10.36,1,
10377278,49356429,b'A',[b'G' b'' b''],19450,40.82,10.0,1,
10377279,49356435,b'G',[b'A' b'' b''],18687,39.57,14.0,1,


In [9]:
calldata_phase2= zarr.Group(gcsmap2, read_only=True)

In [10]:
variants_phase2 = allel.VariantChunkedTable(calldata_phase2["2L"]["variants"], 
                                     names=['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'numalt'],
                                     index='POS')
variants_phase2

Unnamed: 0,POS,REF,ALT,DP,MQ,QD,numalt,Unnamed: 8
0,25050,b'G',b'T',38417,51.53,8.02,1,
1,51212,b'T',b'A',39639,55.75,13.58,1,
2,51214,b'G',b'C',39461,55.73,19.83,1,
...,...,...,...,...,...,...,...,...
8906420,49356425,b'G',b'T',28138,41.81,9.58,1,
8906421,49356426,b'G',b'T',28311,41.57,10.3,1,
8906422,49356429,b'A',b'G',28106,40.92,10.13,1,


In [60]:
calldata_out= h5py.File('/gcs/phase1.AR3/extras/outgroup_alleles.h5', mode='r')
calldata_out

<HDF5 file "outgroup_alleles.h5" (mode r)>

In [61]:
calldata_out.keys()

<KeysViewHDF5 ['2L', '2R', '3L', '3R', 'X']>

In [62]:
calldata_out['2L'].keys()

<KeysViewHDF5 ['arab', 'chri', 'epir', 'mela', 'meru', 'quad']>

In [63]:
calldata_out['2L']['chri'][1000_000:1000_010]

array([b'C', b'G', b'C', b'C', b'C', b'C', b'G', b'C', b'C', b'T'],
      dtype='|S1')

In [64]:
calldata_chr = calldata_out['2L']['chri']
calldata_chr[:], calldata_chr.shape

(array([b'.', b'.', b'.', ..., b'.', b'.', b'.'], dtype='|S1'), (10377280,))

In [65]:
calldata_out_count= h5py.File('/gcs/phase1.AR3/extras/outgroup_allele_counts.h5', mode='r')
calldata_out_count

<HDF5 file "outgroup_allele_counts.h5" (mode r)>

In [66]:
calldata_out_count.keys()

<KeysViewHDF5 ['2L', '2R', '3L', '3R', 'X']>

In [67]:
calldata_out_count['2L'].keys()

<KeysViewHDF5 ['arab', 'chri', 'epir', 'mela', 'meru', 'quad']>

In [68]:
calldata_out_count['2L']['chri'][1000_000:1000_010]

array([[1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0]], dtype=int32)

In [69]:
calldata_chr_count = calldata_out_count['2L']['chri']
calldata_chr_count[:], calldata_chr_count.shape

(array([[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        ...,
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]], dtype=int32), (10377280, 4))

In [26]:
b = np.asarray(calldata_chr)

In [41]:
asd = b.tolist()

In [45]:
a = list(filter(lambda x: x!= b'.', asd))

In [47]:
len(a)

7793733

In [94]:
phase2_ref = calldata_phase2["2L"]["variants"]["REF"][:]
phase2_ref, phase2_ref.shape

(array([b'G', b'T', b'G', ..., b'G', b'G', b'A'], dtype='|S1'), (8906423,))

In [95]:
phase2_alt = calldata_phase2["2L"]["variants"]["ALT"][:]
phase2_alt, phase2_alt.shape

(array([b'T', b'A', b'C', ..., b'T', b'T', b'G'], dtype='|S1'), (8906423,))

In [96]:
allel.create_allele_mapping(phase2_ref, phase2_alt, calldata_chr)

ValueError: arrays do not have matching length for dimension 0