# Notebook to create a new outgroup dataset for phase2

This notebook is structured in 5 steps:

1) Loading step

2) Alignment step

3) Writing step

4) Mapping step

5) Building step


Import modules:

In [2]:
%run 'imports.ipynb'

-----------------------------

## 1) Loading step

Loading phase2 calldata:

In [4]:
# Google Cloud Storage filesystem handle used by the callset loaders below
gcs = gcsfs.GCSFileSystem(
    project='malariagen-jupyterhub',
    token='cloud',
)

In [5]:
# Full phase2 (AR1) callset: bucket path -> GCS key/value mapping -> read-only zarr group
geno_path = "ag1000g-release/phase2.AR1/variation/main/zarr/all/ag1000g.phase2.ar1"
gcsmap = gcsfs.mapping.GCSMap(geno_path, gcs=gcs)
callset = zarr.Group(gcsmap, read_only=True)

In [6]:
# Biallelic, pass-filtered phase2 (AR1) callset, opened the same way as the full callset
geno_bi_path = "ag1000g-release/phase2.AR1/variation/main/zarr/biallelic/ag1000g.phase2.ar1.pass.biallelic"
gcsmap_bi = gcsfs.mapping.GCSMap(geno_bi_path, gcs=gcs)
callset_biallel = zarr.Group(gcsmap_bi, read_only=True)

---------------------------

Load ingroup calldata:

In [31]:
# species keys whose callsets are read from HDF5 files (a tuple)
ingroup_species = ('arab', 'meru', 'mela', 'quad')


In [None]:
# Open one read-only HDF5 callset per ingroup species, keyed by species name.
ingroup_callset_fn_template = '/bucket/outgroup/{species}_ref_ug_vqsr_cnvrt_sort.h5'
agc_callsets = dict(
    (name, h5py.File(ingroup_callset_fn_template.format(species=name), mode='r'))
    for name in ingroup_species
)

Test mela:

In [None]:
# display the opened HDF5 file object for species key 'mela' as a quick check
agc_callsets['mela']

Loading outgroup calldata:

In [32]:
# species keys whose variants are read from .npy caches (see align_outgroup_ac)
outgroup_species = ['chri', 'epir']

In [None]:
# path template for the per-species, per-chromosome variants array; filled with
# `species` and `chrom` via str.format before loading with np.load
outgroup_variants_fn_template = '/bucket/{species}_fake_cnvrt_sort.vcf.gz.vcfnp_cache/variants.{chrom}.npy'

--------------------------

## 2) Alignment step

Align ingroup calldata to the phase2 calldata:

In [13]:
def align_ingroup_ac(chrom, species):
    """Align ingroup allele counts onto the Ag1000G phase2 variant positions.

    Parameters
    ----------
    chrom : str
        Chromosome key used to index both `callset` and `agc_callsets[species]`.
    species : str
        Key into `agc_callsets` selecting the ingroup HDF5 callset.

    Returns
    -------
    ndarray, int32, shape (n_ag1000g_variants, 4)
        Column 0 holds the ingroup reference allele count. Columns 1-3 hold the
        ingroup's first-alternate allele count (column 1 of its allele counts),
        placed in the column of the Ag1000G alternate allele it equals.
        Rows for positions that are not in the intersection stay all-zero.

    Raises
    ------
    AssertionError
        If the two intersection masks select a different number of records
        after duplicate ingroup positions have been dropped.
    """

    # load Ag1000G variant positions and alternate alleles
    variants = callset[chrom]['variants']
    pos = allel.SortedIndex(variants['POS'][:], copy=False)
    alt = variants['ALT'][:]

    # load ingroup variant positions and alternate alleles
    variants_other = agc_callsets[species][chrom]['variants']
    pos_other = allel.SortedIndex(variants_other['POS'][:], copy=False)
    alt_other = variants_other['ALT'][:]

    # locate intersection between callsets
    loc_isec, loc_other_isec = pos.locate_intersection(pos_other)

    # exclude duplicates: a record is a duplicate when it repeats the position
    # of the record before it. The first record has no predecessor, so it is
    # never a duplicate (np.roll would wrap around and compare it to the last).
    pos_other_values = np.asarray(pos_other)
    loc_other_dup = np.zeros(pos_other_values.shape[0], dtype=bool)
    loc_other_dup[1:] = pos_other_values[1:] == pos_other_values[:-1]
    loc_other_isec &= ~loc_other_dup
    assert nnz(loc_isec) == nnz(loc_other_isec)
    log(pos.size, 'variants in Ag1000G')
    log(nnz(loc_isec), 'variants in intersection')

    # filter data to the intersection
    alt_isec = alt[loc_isec]
    alt_other_isec = alt_other[loc_other_isec]

    # load ingroup genotypes and count alleles
    genotype_other = allel.GenotypeChunkedArray(agc_callsets[species][chrom]['calldata']['genotype'])
    genotype_other_isec = genotype_other.compress(loc_other_isec, axis=0)
    ac_other_isec = genotype_other_isec.count_alleles()[:]

    # setup array to store ingroup allele counts with alleles remapped to Ag1000G
    n_variants_isec = nnz(loc_isec)
    ac_am = np.zeros((n_variants_isec, 4), dtype='i4')

    # fill in reference allele counts
    ac_am[:, 0] = ac_other_isec[:, 0]

    # fill in alternate allele counts: wherever the ingroup ALT equals the
    # Ag1000G ALT at index alt_index, store the ingroup count in column alt_index + 1
    for alt_index in range(3):
        loc_match = alt_isec[:, alt_index] == alt_other_isec
        ac_am[loc_match, alt_index + 1] = ac_other_isec[loc_match, 1]

    # finally extend to all Ag1000G variant positions
    ac_aligned = np.zeros((pos.shape[0], 4), dtype='i4')
    ac_aligned[loc_isec] = ac_am
    for i in range(4):
        log(i, nnz(ac_aligned[:, i]))

    return ac_aligned

In [26]:
# try the ingroup alignment on chromosome '3L' for species key 'mela' and display the result
ac_aligned = align_ingroup_ac('3L', 'mela')
ac_aligned

18167056 variants in Ag1000G
13772829 variants in intersection
0 12797186
1 627764
2 176462
3 19444


array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       ...,
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]], dtype=int32)

Align outgroup calldata to the phase2 calldata:

In [14]:
def align_outgroup_ac(chrom, species):
    """Align outgroup alleles onto the Ag1000G phase2 variant positions.

    Parameters
    ----------
    chrom : str
        Chromosome key used to index `callset` and to fill
        `outgroup_variants_fn_template`.
    species : str
        Species key used to fill `outgroup_variants_fn_template`.

    Returns
    -------
    ndarray, int32, shape (n_ag1000g_variants, 4)
        A 1 in column 0 where the outgroup ALT is b'.' (reference observed).
        A 1 in column 1, 2 or 3 where the outgroup ALT equals the Ag1000G
        alternate allele at index 0, 1 or 2. Rows for positions that are not
        in the intersection stay all-zero.

    Raises
    ------
    AssertionError
        If the two intersection masks select a different number of records
        after duplicate outgroup positions have been dropped.
    """

    # load Ag1000G variant positions and alternate alleles
    variants = callset[chrom]['variants']
    pos = allel.SortedIndex(variants['POS'][:], copy=False)
    alt = variants['ALT'][:]

    # load outgroup variant positions and alternate alleles (memory-mapped)
    variants_other = np.load(outgroup_variants_fn_template.format(species=species, chrom=chrom), mmap_mode='r')
    pos_other = allel.SortedIndex(variants_other['POS'], copy=False)
    alt_other = variants_other['ALT']

    # locate intersection between callsets
    loc_isec, loc_other_isec = pos.locate_intersection(pos_other)

    # exclude duplicates: a record is a duplicate when it repeats the position
    # of the record before it. The first record has no predecessor, so it is
    # never a duplicate (np.roll would wrap around and compare it to the last).
    pos_other_values = np.asarray(pos_other)
    loc_other_dup = np.zeros(pos_other_values.shape[0], dtype=bool)
    loc_other_dup[1:] = pos_other_values[1:] == pos_other_values[:-1]
    loc_other_isec &= ~loc_other_dup
    assert nnz(loc_isec) == nnz(loc_other_isec)
    log(pos.size, 'variants in Ag1000G')
    log(nnz(loc_isec), 'variants in intersection')

    # filter data to the intersection
    alt_isec = alt[loc_isec]
    alt_other_isec = alt_other[loc_other_isec]

    # setup array to store outgroup allele counts with alleles remapped to Ag1000G
    n_variants_isec = nnz(loc_isec)
    ac_am = np.zeros((n_variants_isec, 4), dtype='i4')

    # reference allele observed
    loc_ref = alt_other_isec == b'.'
    ac_am[loc_ref, 0] = 1

    # alternate allele observed: mark the column of the matching Ag1000G ALT
    for alt_index in range(3):
        loc_match = alt_isec[:, alt_index] == alt_other_isec
        ac_am[loc_match, alt_index + 1] = 1

    # finally extend to all Ag1000G variant positions
    ac_aligned = np.zeros((pos.shape[0], 4), dtype='i4')
    ac_aligned[loc_isec] = ac_am
    for i in range(4):
        log(i, nnz(ac_aligned[:, i]))

    return ac_aligned

Try it on christyi:

In [11]:
# try the outgroup alignment on chromosome '3L' for species key 'chri' and display the result
ac_aligned = align_outgroup_ac('3L', 'chri')
ac_aligned

18167056 variants in Ag1000G
9887983 variants in intersection
0 7203267
1 1189266
2 300458
3 36233


array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       ...,
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]], dtype=int32)

--------------------------------

## 3) Writing step

Write the aligned allele counts to a new HDF5 dataset:

In [None]:
# Fill the HDF5 file with one dataset per chromosome and species. The file is
# opened in append mode and datasets that already exist are skipped.
with h5py.File('data/outgroup_allele_counts_phase2.h5', mode='a') as outgroup_allele_counts:
    for chrom in chromosomes:
        h5g = outgroup_allele_counts.require_group(chrom)
        # ingroup species first, then outgroup species, each with its own aligner
        for species_group, align_ac in ((ingroup_species, align_ingroup_ac),
                                        (outgroup_species, align_outgroup_ac)):
            for species in species_group:
                if species in h5g:
                    log(chrom, species, 'skipping')
                    continue
                log(chrom, species, 'building')
                ac_aligned = align_ac(chrom, species)
                h5d = h5g.create_dataset(species, data=ac_aligned, chunks=True)

----------------------------------------------

## 4) Mapping step

In [3]:
# reopen the aligned allele counts read-only and list the top-level groups;
# the handle is left open because the last cell of the notebook reads from it
calldata_out= h5py.File('data/outgroup_allele_counts_phase2.h5', mode='r')
calldata_out.keys()

<KeysViewHDF5 ['2L', '2R', '3L', '3R', 'X']>

In [4]:
# inspect the dataset stored for chromosome '2L', species key 'chri'
calldata_out['2L']['chri']

<HDF5 dataset "chri": shape (11524923, 4), type "<i4">

In [12]:
def out_map(species, chrom):
    """Map one species' aligned allele counts onto the biallelic callset's alleles.

    Only the requested chromosome is processed. The earlier version looped over
    every entry of `chromosomes`, shadowing the `chrom` argument, and returned
    the result for the last chromosome whatever was asked for.

    Parameters
    ----------
    species : str
        Dataset name inside the chromosome group of
        'data/outgroup_allele_counts_phase2.h5'.
    chrom : str
        Chromosome key used to index `callset`, `callset_biallel` and the
        HDF5 file.

    Returns
    -------
    The result of `AlleleCountsArray.map_alleles` for the variants shared by
    the full and biallelic callsets, with alleles renumbered to the biallelic
    callset's REF/ALT columns.
    """
    # locate the variants shared by the full and the biallelic callsets
    pos_all = allel.SortedIndex(callset[chrom]["variants/POS"][:])
    pos_bi = allel.SortedIndex(callset_biallel[chrom]["variants/POS"])
    loc1, loc2 = pos_bi.locate_intersection(pos_all)

    # only POS (the index), REF and ALT are needed to build the allele mapping
    variants_all = allel.VariantChunkedTable(callset[chrom]["variants"],
                                             names=['POS', 'REF', 'ALT'],
                                             index='POS')
    variants_bi = allel.VariantChunkedTable(callset_biallel[chrom]["variants"],
                                            names=['POS', 'REF', 'ALT'],
                                            index='POS')

    variants_all_filt = variants_all.compress(loc2)
    phase2_all_ref = variants_all_filt["REF"][:]
    phase2_all_alt = variants_all_filt["ALT"][:]

    variants_bi_filt = variants_bi.compress(loc1)
    phase2_bi_ref = variants_bi_filt["REF"][:]
    phase2_bi_alt = variants_bi_filt["ALT"][:]

    # for each shared variant, map the full callset's allele indices onto the
    # biallelic callset's (REF, ALT) alleles
    phase2_bi_refalt = np.column_stack([phase2_bi_ref, phase2_bi_alt])
    mapping = allel.create_allele_mapping(phase2_all_ref, phase2_all_alt, phase2_bi_refalt)

    # read the stored counts into memory so the file can be closed right away
    with h5py.File('data/outgroup_allele_counts_phase2.h5', mode='r') as calldata_outgroup:
        ac_out_pop = allel.AlleleCountsArray(calldata_outgroup[chrom][species][:])

    ac_out_pop = ac_out_pop.compress(loc2)
    pop_map_ac = ac_out_pop.map_alleles(mapping)

    return pop_map_ac

In [None]:
# out_map results for species key 'chri', called in the order 2R, 2L, 3R, 3L, X
chri_2R, chri_2L, chri_3R, chri_3L, chri_X = [
    out_map('chri', arm) for arm in ('2R', '2L', '3R', '3L', 'X')
]

In [None]:
# out_map results for species key 'epir', called in the order 2R, 2L, 3R, 3L, X
epir_2R, epir_2L, epir_3R, epir_3L, epir_X = [
    out_map('epir', arm) for arm in ('2R', '2L', '3R', '3L', 'X')
]

In [None]:
# out_map results for species key 'arab', called in the order 2R, 2L, 3R, 3L, X
arab_2R, arab_2L, arab_3R, arab_3L, arab_X = [
    out_map('arab', arm) for arm in ('2R', '2L', '3R', '3L', 'X')
]

In [None]:
# out_map results for species key 'meru', called in the order 2R, 2L, 3R, 3L, X
meru_2R, meru_2L, meru_3R, meru_3L, meru_X = [
    out_map('meru', arm) for arm in ('2R', '2L', '3R', '3L', 'X')
]

In [None]:
# out_map results for species key 'mela', called in the order 2R, 2L, 3R, 3L, X
mela_2R, mela_2L, mela_3R, mela_3L, mela_X = [
    out_map('mela', arm) for arm in ('2R', '2L', '3R', '3L', 'X')
]

In [None]:
# out_map results for species key 'quad', called in the order 2R, 2L, 3R, 3L, X
quad_2R, quad_2L, quad_3R, quad_3L, quad_X = [
    out_map('quad', arm) for arm in ('2R', '2L', '3R', '3L', 'X')
]

-------------------------

## 5) Building step

In [None]:
# open the output zarr store in append mode; the datasets below are created inside it
root = zarr.open('outgroup_alleles_phase2.zarr', mode='a')

In [None]:
# create one dataset per species under the '2R' group; a path with a slash is
# passed straight to create_dataset, so no explicit create_group call is made
for _path, _counts in (
    ('2R/chri', chri_2R),
    ('2R/epir', epir_2R),
    ('2R/mela', mela_2R),
    ('2R/meru', meru_2R),
    ('2R/arab', arab_2R),
    ('2R/quad', quad_2R),
):
    bar = root.create_dataset(_path, data=_counts)

In [None]:
# create one dataset per species under the '2L' group
for _path, _counts in (
    ('2L/chri', chri_2L),
    ('2L/epir', epir_2L),
    ('2L/mela', mela_2L),
    ('2L/meru', meru_2L),
    ('2L/arab', arab_2L),
    ('2L/quad', quad_2L),
):
    bar = root.create_dataset(_path, data=_counts)

In [None]:
# create one dataset per species under the '3R' group
for _path, _counts in (
    ('3R/chri', chri_3R),
    ('3R/epir', epir_3R),
    ('3R/mela', mela_3R),
    ('3R/meru', meru_3R),
    ('3R/arab', arab_3R),
    ('3R/quad', quad_3R),
):
    bar = root.create_dataset(_path, data=_counts)

In [None]:
# create one dataset per species under the '3L' group
for _path, _counts in (
    ('3L/chri', chri_3L),
    ('3L/epir', epir_3L),
    ('3L/mela', mela_3L),
    ('3L/meru', meru_3L),
    ('3L/arab', arab_3L),
    ('3L/quad', quad_3L),
):
    bar = root.create_dataset(_path, data=_counts)

In [None]:
# create one dataset per species under the 'X' group
for _path, _counts in (
    ('X/chri', chri_X),
    ('X/epir', epir_X),
    ('X/mela', mela_X),
    ('X/meru', meru_X),
    ('X/arab', arab_X),
    ('X/quad', quad_X),
):
    bar = root.create_dataset(_path, data=_counts)

In [None]:
# show the group/dataset hierarchy of the zarr store
root.tree()

In [None]:
# read data again
my_array= root['2L/arab'][:]
my_array

In [None]:
# view the zarr copy of '2L/arab' as an allele counts array
allel.AlleleCountsArray(root['2L/arab'])

In [None]:
# view the HDF5 '2L/arab' dataset (the input to out_map) as an allele counts array, for comparison
allel.AlleleCountsArray(calldata_out['2L/arab'])