# Outgroup allele counts phase2 dataset creation

In this notebook I created the new dataset with allele counts for outgroups.
This notebook is structured in 5 steps:

1) <b>Loading step</b>, I loaded the Ag1000G phase2 pass data, the ingroup data and the outgroup data

2) <b>Alignment step</b>, I aligned each outgroup genome with the phase2 pass dataset

3) <b>Writing step</b>, I wrote the new allele counts of the outgroups to an HDF5 dataset

4) <b>Mapping step</b>, I mapped the outgroup allele count dataset to the phase2 biallelic dataset to keep only the biallelic variants for each outgroup

5) <b>Building step</b>, I wrote the new biallelic allele counts of the outgroups to a new zarr dataset


Import modules:

In [19]:
#import gcsfs  # module for Google Cloud connection
import os
import allel
import zarr
import pandas as pd
import h5py
import petl as etl
import numpy as np
from matplotlib import pyplot
import seaborn as sns
import h5py
#import pyfasta

Import Alistair modules:

In [20]:
%run 'imports_20150407.ipynb'

-----------------------------

## 1) Loading step

Loading phase2 calldata:

In [None]:
# Location of the Ag1000G phase2 AR1 "pass" zarr callset inside the bucket.
geno_path = "ag1000g-release/phase2.AR1/variation/main/zarr/pass/ag1000g.phase2.ar1.pass"
# Expose that location as a key/value store. NOTE(review): `gcs` is not defined
# in this notebook -- presumably it comes from the %run imports notebook; confirm.
gcsmap_p2 = gcsfs.mapping.GCSMap(geno_path, gcs=gcs)
# Open the callset hierarchy read-only.
callset = zarr.Group(gcsmap_p2, read_only=True)

---------------------------

Load ingroup calldata:

In [22]:
# Species codes of the ingroup callsets.
ingroup_species = ('arab', 'meru', 'mela', 'quad')
# One HDF5 callset per species; the {species} field is filled in below.
ingroup_callset_fn_template = '/bucket/outgroup/UnifiedGenotyper/{species}_ref_ug_vqsr_cnvrt_sort.h5'
# Open every callset read-only, keyed by species code.
agc_callsets = dict(
    (code, h5py.File(ingroup_callset_fn_template.format(species=code), mode='r'))
    for code in ingroup_species
)

Test mela:

In [23]:
# Display the file handle stored for the 'mela' callset.
agc_callsets['mela']

<HDF5 file "mela_ref_ug_vqsr_cnvrt_sort.h5" (mode r)>

Loading outgroup calldata:

In [24]:
# Species codes of the outgroups; iterated by the writing step and substituted
# into the {species} field of the outgroup variants file template.
outgroup_species = ['chri', 'epir']

In [25]:
# Path template of the per-chromosome .npy variants arrays of each outgroup;
# {species} and {chrom} are filled in by align_outgroup_ac.
outgroup_variants_fn_template = '/bucket/{species}_fake_cnvrt_sort.vcf.gz.vcfnp_cache/variants.{chrom}.npy'

--------------------------

## 2) Align step

Align ingroup calldata to the phase2 calldata:

In [26]:
def align_ingroup_ac(chrom, species):
    """Align the allele counts of one ingroup species to the Ag1000G variants.

    Parameters
    ----------
    chrom : str
        Chromosome key present in both `callset` and `agc_callsets[species]`.
    species : str
        Key into `agc_callsets`.

    Returns
    -------
    numpy.ndarray
        int32 array of shape (number of Ag1000G variants on `chrom`, 4).
        Column 0 holds the ingroup reference allele count; columns 1-3 hold
        the ingroup alternate allele count placed in the column of the
        Ag1000G ALT allele it equals. Rows for positions missing from the
        ingroup callset, and columns with no matching allele, stay zero.

    Notes
    -----
    Only the first alternate allele of the ingroup callset is used (column 1
    of its allele counts). The comparison against each Ag1000G ALT column
    assumes the ingroup ALT field is one allele per record -- TODO confirm.
    """
    # load Ag1000G variant positions and alternate alleles
    variants = callset[chrom]['variants']
    pos = allel.SortedIndex(variants['POS'][:], copy=False)
    alt = variants['ALT'][:]

    # load ingroup variant positions and alternate alleles
    callset_other = agc_callsets[species][chrom]
    variants_other = callset_other['variants']
    pos_other = allel.SortedIndex(variants_other['POS'][:], copy=False)
    alt_other = variants_other['ALT'][:]

    # locate intersection between callsets
    loc_isec, loc_other_isec = pos.locate_intersection(pos_other)

    # exclude duplicates: a record is a duplicate when it repeats the position
    # of the record before it. The first record has no predecessor, so it is
    # never a duplicate (np.roll would wrap around and compare it with the
    # last record, wrongly dropping it for one-record or all-equal inputs).
    pos_other_values = np.asarray(pos_other)
    loc_other_dup = np.zeros(pos_other_values.shape[0], dtype=bool)
    loc_other_dup[1:] = pos_other_values[1:] == pos_other_values[:-1]
    loc_other_isec &= ~loc_other_dup
    assert nnz(loc_isec) == nnz(loc_other_isec)
    log(pos.size, 'variants in Ag1000G')
    log(nnz(loc_isec), 'variants in intersection')

    # filter data to the intersection
    alt_isec = alt[loc_isec]
    alt_other_isec = alt_other[loc_other_isec]

    # load ingroup genotypes and count alleles
    genotype_other_isec = allel.GenotypeChunkedArray(
        callset_other['calldata']['genotype']
    ).compress(loc_other_isec, axis=0)
    ac_other_isec = genotype_other_isec.count_alleles()[:]

    # setup array to store ingroup allele counts with alleles remapped to Ag1000G
    n_variants_isec = nnz(loc_isec)
    ac_am = np.zeros((n_variants_isec, 4), dtype='i4')

    # fill in reference allele counts
    ac_am[:, 0] = ac_other_isec[:, 0]

    # fill in alternate allele counts: the ingroup ALT count goes into the
    # column of whichever of the three Ag1000G ALT alleles it equals
    for alt_index in range(3):
        loc_match = alt_isec[:, alt_index] == alt_other_isec
        ac_am[loc_match, alt_index + 1] = ac_other_isec[loc_match, 1]

    # finally extend to all Ag1000G variant positions
    ac_aligned = np.zeros((pos.shape[0], 4), dtype='i4')
    ac_aligned[loc_isec] = ac_am
    for i in range(4):
        log(i, nnz(ac_aligned[:, i]))

    return ac_aligned

In [27]:
# Try the ingroup alignment on a single chromosome/species and display the result.
ac_aligned = align_ingroup_ac('3L', 'mela')
ac_aligned

10640388 variants in Ag1000G
9769468 variants in intersection
0 9259374
1 330829
2 87100
3 8495


array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       ...,
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]], dtype=int32)

Align outgroup calldata to the phase2 calldata:

In [29]:
def align_outgroup_ac(chrom, species):
    """Align the alleles observed in one outgroup to the Ag1000G variants.

    Parameters
    ----------
    chrom : str
        Chromosome key in `callset`; also the {chrom} field of
        `outgroup_variants_fn_template`.
    species : str
        The {species} field of `outgroup_variants_fn_template`.

    Returns
    -------
    numpy.ndarray
        int32 array of shape (number of Ag1000G variants on `chrom`, 4).
        For each position shared with the outgroup, column 0 is set to 1 when
        the outgroup ALT is b'.', and column k (1-3) is set to 1 when the
        outgroup ALT equals the k-th Ag1000G ALT allele. Every other entry
        stays zero.
    """
    # load Ag1000G variant positions and alternate alleles
    variants = callset[chrom]['variants']
    pos = allel.SortedIndex(variants['POS'][:], copy=False)
    alt = variants['ALT'][:]

    # load outgroup variant positions and alternate alleles (memory-mapped)
    variants_other = np.load(
        outgroup_variants_fn_template.format(species=species, chrom=chrom),
        mmap_mode='r',
    )
    pos_other = allel.SortedIndex(variants_other['POS'], copy=False)
    alt_other = variants_other['ALT']

    # locate intersection between callsets
    loc_isec, loc_other_isec = pos.locate_intersection(pos_other)

    # exclude duplicates: a record is a duplicate when it repeats the position
    # of the record before it. The first record has no predecessor, so it is
    # never a duplicate (np.roll would wrap around and compare it with the
    # last record, wrongly dropping it for one-record or all-equal inputs).
    pos_other_values = np.asarray(pos_other)
    loc_other_dup = np.zeros(pos_other_values.shape[0], dtype=bool)
    loc_other_dup[1:] = pos_other_values[1:] == pos_other_values[:-1]
    loc_other_isec &= ~loc_other_dup
    assert nnz(loc_isec) == nnz(loc_other_isec)
    log(pos.size, 'variants in Ag1000G')
    log(nnz(loc_isec), 'variants in intersection')

    # filter data to the intersection
    alt_isec = alt[loc_isec]
    alt_other_isec = alt_other[loc_other_isec]

    # setup array to store outgroup allele counts with alleles remapped to Ag1000G
    n_variants_isec = nnz(loc_isec)
    ac_am = np.zeros((n_variants_isec, 4), dtype='i4')

    # reference allele observed
    ac_am[alt_other_isec == b'.', 0] = 1
    # alternate allele observed: mark the column of the matching Ag1000G ALT
    for alt_index in range(3):
        loc_match = alt_isec[:, alt_index] == alt_other_isec
        ac_am[loc_match, alt_index + 1] = 1

    # finally extend to all Ag1000G variant positions
    ac_aligned = np.zeros((pos.shape[0], 4), dtype='i4')
    ac_aligned[loc_isec] = ac_am
    for i in range(4):
        log(i, nnz(ac_aligned[:, i]))

    return ac_aligned

Test with christyi:

In [11]:
# Try the outgroup alignment on a single chromosome/species and display the result.
ac_aligned = align_outgroup_ac('3L', 'chri')
ac_aligned

18167056 variants in Ag1000G
9887983 variants in intersection
0 7203267
1 1189266
2 300458
3 36233


array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       ...,
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]], dtype=int32)

--------------------------------

## 3) Writing step

Write the aligned allele counts to a new HDF5 dataset:

In [22]:
# Species groups paired with the function that aligns them; for every
# chromosome the ingroup species are processed first, then the outgroups.
species_aligners = (
    (ingroup_species, align_ingroup_ac),
    (outgroup_species, align_outgroup_ac),
)

# Append mode: datasets already present in the file are kept and skipped,
# so an interrupted run can be resumed.
with h5py.File('data/outgroup_allele_counts_phase2.h5', mode='a') as outgroup_allele_counts:
    for chrom in chromosomes:
        h5g = outgroup_allele_counts.require_group(chrom)
        for species_group, align_ac in species_aligners:
            for species in species_group:
                if species in h5g:
                    log(chrom, species, 'skipping')
                    continue
                log(chrom, species, 'building')
                ac_aligned = align_ac(chrom, species)
                h5d = h5g.create_dataset(species, data=ac_aligned, chunks=True)

2R arab building
15425222 variants in Ag1000G
14834299 variants in intersection
0 14576684
1 428646
2 94803
3 7051
2R meru building
15425222 variants in Ag1000G
14685803 variants in intersection
0 13977209
1 523340
2 118070
3 9512
2R mela building
15425222 variants in Ag1000G
14362070 variants in intersection
0 13635602
1 503871
2 112333
3 9142
2R quad building
15425222 variants in Ag1000G
14709343 variants in intersection
0 14292439
1 412504
2 93303
3 7233
2R chri building
15425222 variants in Ag1000G
11969950 variants in intersection
0 8806507
1 1437481
2 300048
3 28887
2R epir building
15425222 variants in Ag1000G
12052087 variants in intersection
0 8480598
1 1539524
2 323175
3 28921
2L arab building
11524923 variants in Ag1000G
11049210 variants in intersection
0 10800836
1 332781
2 78528
3 5890
2L meru building
11524923 variants in Ag1000G
10957865 variants in intersection
0 10484963
1 352624
2 82733
3 6598
2L mela building
11524923 variants in Ag1000G
10669729 variants in interse

----------------------------------------------

## 4) Mapping step

In [30]:
# Re-open the HDF5 file produced by the writing step, read-only, and list its
# top-level groups.
calldata_out= h5py.File('data/outgroup_allele_counts_phase2.h5', mode='r')
calldata_out.keys()

<KeysViewHDF5 ['2L', '2R', '3L', '3R', 'X']>

In [None]:
# Location of the Ag1000G phase2 AR1 biallelic zarr callset inside the bucket.
geno_bi_path = "ag1000g-release/phase2.AR1/variation/main/zarr/biallelic/ag1000g.phase2.ar1.pass.biallelic"
# Expose that location as a key/value store. NOTE(review): `gcs` is not defined
# in this notebook -- presumably it comes from the %run imports notebook; confirm.
gcsmap_bi = gcsfs.mapping.GCSMap(geno_bi_path, gcs=gcs)
# Open the biallelic callset hierarchy read-only.
callset_biallel = zarr.Group(gcsmap_bi, read_only=True)

In [35]:
def out_map(out_pop, chrom, h5_path='data/outgroup_allele_counts_phase2.h5'):
    """Map one species' aligned allele counts onto the phase2 biallelic sites.

    Parameters
    ----------
    out_pop : str
        Dataset name of the species inside the chromosome group of the HDF5 file.
    chrom : str
        Chromosome key in `callset`, `callset_biallel` and the HDF5 file.
    h5_path : str, optional
        HDF5 file holding the aligned allele counts. Defaults to the file
        written by the writing step of this notebook; the previous version
        read 'outgroup_allele_counts_phase2.h5' from the working directory,
        which that step never creates.

    Returns
    -------
    allel.AlleleCountsArray
        Allele counts restricted to the positions shared by the pass and
        biallelic callsets, with alleles re-indexed to the REF/ALT of the
        biallelic callset.
    """
    # positions shared by the biallelic and the full (pass) callsets
    pos_all = allel.SortedIndex(callset[chrom]["variants/POS"][:])
    pos_bi = allel.SortedIndex(callset_biallel[chrom]["variants/POS"][:])
    loc_bi, loc_all = pos_bi.locate_intersection(pos_all)

    # REF/ALT alleles of both callsets at the shared positions; only these
    # two fields are needed to build the allele mapping
    phase2_all_ref = callset[chrom]["variants/REF"][:][loc_all]
    phase2_all_alt = callset[chrom]["variants/ALT"][:][loc_all]
    phase2_bi_ref = callset_biallel[chrom]["variants/REF"][:][loc_bi]
    phase2_bi_alt = callset_biallel[chrom]["variants/ALT"][:][loc_bi]

    # for every shared site, map each allele index of the full callset to its
    # index among the biallelic (REF, ALT) alleles
    phase2_bi_refalt = np.column_stack([phase2_bi_ref, phase2_bi_alt])
    mapping = allel.create_allele_mapping(phase2_all_ref, phase2_all_alt, phase2_bi_refalt)

    # read the counts into memory and close the file straight away, so that
    # repeated calls do not accumulate open handles
    with h5py.File(h5_path, mode='r') as calldata_outgroup:
        ac_out_pop = allel.AlleleCountsArray(calldata_outgroup[chrom][out_pop][:])

    ac_out_pop = ac_out_pop.compress(loc_all)
    pop_map_ac = ac_out_pop.map_alleles(mapping)

    return pop_map_ac

In [37]:
# 'chri' allele counts on the biallelic sites, one array per chromosome arm.
chri_2R, chri_2L, chri_3R, chri_3L, chri_X = (
    out_map('chri', arm) for arm in ('2R', '2L', '3R', '3L', 'X')
)

In [38]:
# 'epir' allele counts on the biallelic sites, one array per chromosome arm.
epir_2R, epir_2L, epir_3R, epir_3L, epir_X = (
    out_map('epir', arm) for arm in ('2R', '2L', '3R', '3L', 'X')
)

In [39]:
# 'arab' allele counts on the biallelic sites, one array per chromosome arm.
arab_2R, arab_2L, arab_3R, arab_3L, arab_X = (
    out_map('arab', arm) for arm in ('2R', '2L', '3R', '3L', 'X')
)

In [40]:
# 'meru' allele counts on the biallelic sites, one array per chromosome arm.
meru_2R, meru_2L, meru_3R, meru_3L, meru_X = (
    out_map('meru', arm) for arm in ('2R', '2L', '3R', '3L', 'X')
)

In [41]:
# 'mela' allele counts on the biallelic sites, one array per chromosome arm.
mela_2R, mela_2L, mela_3R, mela_3L, mela_X = (
    out_map('mela', arm) for arm in ('2R', '2L', '3R', '3L', 'X')
)

In [42]:
# 'quad' allele counts on the biallelic sites, one array per chromosome arm.
quad_2R, quad_2L, quad_3R, quad_3L, quad_X = (
    out_map('quad', arm) for arm in ('2R', '2L', '3R', '3L', 'X')
)

## 5) Building step

In [43]:
# Open the output zarr store in append mode; the mapped allele counts are
# written into it below.
root = zarr.open('outgroup_alleles_phase2.zarr', mode='a')

In [44]:
#foo = root.create_group('foo')
#bar = foo.create_dataset('bar', data=my_array)
# shortcuts
# Store the mapped 2R allele counts, one array per species, under '2R/'.
for _species, _counts in (
    ('chri', chri_2R),
    ('epir', epir_2R),
    ('mela', mela_2R),
    ('meru', meru_2R),
    ('arab', arab_2R),
    ('quad', quad_2R),
):
    bar = root.create_dataset('2R/' + _species, data=_counts)

In [45]:
# Store the mapped 2L allele counts, one array per species, under '2L/'.
for _species, _counts in (
    ('chri', chri_2L),
    ('epir', epir_2L),
    ('mela', mela_2L),
    ('meru', meru_2L),
    ('arab', arab_2L),
    ('quad', quad_2L),
):
    bar = root.create_dataset('2L/' + _species, data=_counts)

In [46]:
# Store the mapped 3R allele counts, one array per species, under '3R/'.
for _species, _counts in (
    ('chri', chri_3R),
    ('epir', epir_3R),
    ('mela', mela_3R),
    ('meru', meru_3R),
    ('arab', arab_3R),
    ('quad', quad_3R),
):
    bar = root.create_dataset('3R/' + _species, data=_counts)

In [47]:
# Store the mapped 3L allele counts, one array per species, under '3L/'.
for _species, _counts in (
    ('chri', chri_3L),
    ('epir', epir_3L),
    ('mela', mela_3L),
    ('meru', meru_3L),
    ('arab', arab_3L),
    ('quad', quad_3L),
):
    bar = root.create_dataset('3L/' + _species, data=_counts)

In [48]:
# Store the mapped X allele counts, one array per species, under 'X/'.
for _species, _counts in (
    ('chri', chri_X),
    ('epir', epir_X),
    ('mela', mela_X),
    ('meru', meru_X),
    ('arab', arab_X),
    ('quad', quad_X),
):
    bar = root.create_dataset('X/' + _species, data=_counts)

In [49]:
# Show the hierarchy of the zarr store after the writes above.
root.tree()

In [50]:
# Read one of the stored arrays ('2L/arab') back from the zarr store into
# memory and display it.
my_array= root['2L/arab'][:]
my_array

array([[24,  0],
       [24,  0],
       [24,  0],
       ...,
       [ 0,  0],
       [ 0,  0],
       [ 0,  0]], dtype=int32)

----------------------------------

In [51]:
# View the '2L/arab' array from the zarr store as an allele counts array.
allel.AlleleCountsArray(root['2L/arab'])

Unnamed: 0,0,1,Unnamed: 3
0,24,0,
1,24,0,
2,24,0,
...,...,...,...
8906420,0,0,
8906421,0,0,
8906422,0,0,


In [52]:
# View the same species/arm from calldata_out (the HDF5 file of the writing
# step), for comparison with the zarr copy.
allel.AlleleCountsArray(calldata_out['2L/arab'])

Unnamed: 0,0,1,2,3,Unnamed: 5
0,24,0,0,0,
1,24,0,0,0,
2,24,0,0,0,
...,...,...,...,...,...
11524920,0,0,0,0,
11524921,0,0,0,0,
11524922,0,0,0,0,


In [53]:
# NOTE(review): this rebinds calldata_out, which until here referred to the
# HDF5 file opened read-only in the mapping step; that handle is not closed
# explicitly before the name is reused for the zarr store.
calldata_out= zarr.open('outgroup_alleles_phase2.zarr')


In [54]:
# Show the hierarchy of the re-opened zarr store.
calldata_out.tree()