# Notebook for creating a new outgroup dataset for phase2

Import modules:

In [74]:
import gcsfs #module for google cloud connection
import os
import allel
import zarr
import pandas as pd
import h5py
import petl as etl
import numpy as np
from matplotlib import pyplot
import seaborn as sns
import h5py
import pyfasta

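Run Alistair's shared imports notebook (the helpers `log`, `nnz` and `chromosomes` used below are not defined in this notebook and presumably come from it):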

In [75]:
%run '../../imports_20150407.ipynb'

allel 1.2.1


-----------------------------

Loading phase2 calldata:

In [76]:
# Filesystem handle for Google Cloud Storage under the 'malariagen-jupyterhub' project.
# NOTE(review): token='cloud' presumably takes credentials from the hosting cloud
# environment rather than a local key file — confirm against the gcsfs documentation.
gcs =  gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token='cloud')  # reused below to map the callset path

In [77]:
# Bucket path of the phase2 AR1 genotype callset (zarr format).
# A plain string is enough here: os.path.join() with a single argument returns it unchanged.
geno_path = "ag1000g-release/phase2.AR1/variation/main/zarr/all/ag1000g.phase2.ar1"
# Expose the bucket path as a mapping that zarr can use as a store.
gcsmap = gcsfs.mapping.GCSMap(geno_path, gcs=gcs)
# Open the callset read-only; it is indexed below as callset[chrom]['variants'][...].
callset = zarr.Group(gcsmap, read_only=True)

Loading outgroup calldata:

In [84]:
# Species codes of the outgroups to process; each code is substituted for the
# {species} placeholder of the outgroup variants file template.
outgroup_species = ['chri', 'epir']

In [80]:
# Template for the per-species, per-chromosome outgroup variant arrays (.npy files);
# {species} and {chrom} are filled in by align_outgroup_ac() before np.load().
# NOTE(review): absolute path inside one user's home directory — confirm it exists
# (or adjust it) before running this notebook in another environment.
outgroup_variants_fn_template = '/home/jovyan/notebooks/data/{species}_fake_cnvrt_sort.vcf.gz.vcfnp_cache/variants.{chrom}.npy'

--------------------------

Align outgroup calldata to the phase2 calldata:

In [81]:
def align_outgroup_ac(chrom, species):
    """Align outgroup allele observations to the Ag1000G variant positions.

    For every Ag1000G variant on `chrom`, record which allele (reference or
    one of the three alternate alleles) was observed in the outgroup `species`.

    Parameters
    ----------
    chrom : str
        Chromosome key, used both to index the module-level `callset` and to
        fill the {chrom} placeholder of `outgroup_variants_fn_template`.
    species : str
        Outgroup species code, fills the {species} placeholder of the template.

    Returns
    -------
    numpy.ndarray
        Array of dtype 'i4' with shape (number of Ag1000G variants, 4).
        Column 0 is set to 1 where the outgroup ALT is b'.' (reference allele
        observed); column k (1..3) is set to 1 where the outgroup ALT equals the
        k-th Ag1000G alternate allele. Rows stay all-zero for positions that are
        absent from the outgroup file or whose outgroup allele matches none of
        the Ag1000G alleles.

    Raises
    ------
    AssertionError
        If, after dropping repeated outgroup positions, the two callsets do not
        select the same number of intersecting variants.
    """

    # load Ag1000G variant positions and alternate alleles
    variants = callset[chrom]['variants']
    pos = allel.SortedIndex(variants['POS'][:], copy=False)
    alt = variants['ALT'][:]

    # load outgroup variant positions and alternate alleles
    variants_other = np.load(outgroup_variants_fn_template.format(species=species, chrom=chrom), mmap_mode='r')
    pos_other = allel.SortedIndex(variants_other['POS'], copy=False)
    alt_other = variants_other['ALT']

    # locate intersection between callsets
    loc_isec, loc_other_isec = pos.locate_intersection(pos_other)

    # exclude duplicates: flag each outgroup record whose position repeats the
    # previous record, so only the first record at any position is kept.
    # A shifted-slice comparison is used instead of np.roll(), which wraps
    # around and would compare the first record against the last one (wrongly
    # dropping the first record of a single-record file, for example).
    pos_other_values = np.asarray(variants_other['POS'])
    loc_other_dup = np.zeros(pos_other_values.shape[0], dtype=bool)
    loc_other_dup[1:] = pos_other_values[1:] == pos_other_values[:-1]
    loc_other_isec &= ~loc_other_dup
    assert nnz(loc_isec) == nnz(loc_other_isec), \
        'intersection sizes differ between Ag1000G and outgroup callsets'
    log(pos.size, 'variants in Ag1000G')
    log(nnz(loc_isec), 'variants in intersection')

    # filter data to the intersection
    alt_isec = alt[loc_isec]
    alt_other_isec = alt_other[loc_other_isec]

    # setup array to store outgroup allele counts with alleles remapped to Ag1000G
    n_alleles = 4  # reference allele + three alternate allele columns
    n_variants_isec = nnz(loc_isec)
    ac_am = np.zeros((n_variants_isec, n_alleles), dtype='i4')

    # reference allele observed
    ac_am[alt_other_isec == b'.', 0] = 1
    # alternate allele observed: column k of the counts <- ALT column k-1
    for allele in range(1, n_alleles):
        ac_am[alt_isec[:, allele - 1] == alt_other_isec, allele] = 1

    # finally extend to all Ag1000G variant positions
    ac_aligned = np.zeros((pos.shape[0], n_alleles), dtype='i4')
    ac_aligned[loc_isec] = ac_am
    for i in range(n_alleles):
        log(i, nnz(ac_aligned[:, i]))

    return ac_aligned

Test run for christyi (`chri`) on chromosome arm 3L:

In [82]:
# Single-chromosome trial of the alignment before building the full dataset.
ac_aligned = align_outgroup_ac(chrom='3L', species='chri')
ac_aligned

18167056 variants in Ag1000G
9887983 variants in intersection
0 7203267
1 1189266
2 300458
3 36233


array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       ...,
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]], dtype=int32)

--------------------------------

Write the aligned allele counts to a new HDF5 dataset:

In [87]:
# Build the aligned outgroup allele counts for every chromosome/species pair and
# store them as <chrom>/<species> datasets. The file is opened in append mode and
# existing datasets are skipped, so re-running the cell only builds what is missing.
with h5py.File('outgroup_allele_counts_phase2.h5', mode='a') as outgroup_allele_counts:
    for chrom in chromosomes:
        h5g = outgroup_allele_counts.require_group(chrom)
        for species in outgroup_species:
            if species in h5g:
                # dataset already present in the file: nothing to do
                log(chrom, species, 'skipping')
                continue
            log(chrom, species, 'building')
            ac_aligned = align_outgroup_ac(chrom, species)
            h5d = h5g.create_dataset(species, data=ac_aligned, chunks=True)

2R chri skipping
2R epir skipping
2L chri skipping
2L epir building
21442865 variants in Ag1000G
12910149 variants in intersection
0 8914412
1 1718989
2 400287
3 39988
3R chri skipping
3R epir building
24943504 variants in Ag1000G
14216276 variants in intersection
0 9957236
1 1807128
2 469254
3 54642
3L chri skipping
3L epir building
18167056 variants in Ag1000G
10108545 variants in intersection
0 7099424
1 1277364
2 324472
3 36962
X chri skipping
X epir building
9389639 variants in Ag1000G
2325640 variants in intersection
0 1570560
1 315456
2 61950
3 5518
