In [21]:
%run imports.ipynb
import cython
import vcfnp

In [4]:
# Ag1000G phase 2 (AR1) biallelic PASS callset, stored as a zarr hierarchy.
callset_fn = '/kwiat/1/anopheles/ag1000g/data/phase2/release/AR1/variation/main/zarr/biallelic/ag1000g.phase2.ar1.pass.biallelic/'
# Open read-only: this notebook only reads from the callset, and zarr's default
# mode ('a') would silently create an empty group if the path were wrong.
callset = zarr.open(callset_fn, mode='r')

In [10]:
# Species whose alleles are read from per-species VCF files.
outgroup_species = ('epir', 'chri')
outgroup_vcf_fn_template = '/kwiat/1/anopheles/ag1000g/data/phase1/release/AR3/agc/{species}_fake_cnvrt_sort.vcf.gz'

# Species whose alleles are read from per-species HDF5 callsets.
ingroup_species = ('arab', 'meru', 'mela', 'quad')
ingroup_callset_fn_template = '/kwiat/1/anopheles/ag1000g/data/phase1/release/AR3/agc/UnifiedGenotyper/{species}_ref_ug_vqsr_cnvrt_sort.h5'
# One read-only HDF5 file handle per ingroup species, keyed by species code.
ingroup_callsets = {
    name: h5py.File(ingroup_callset_fn_template.format(species=name), mode='r')
    for name in ingroup_species
}

In [15]:
def call_ingroup_allele(chrom, species):
    """Call the major allele of an ingroup species at every site of a chromosome.

    Parameters
    ----------
    chrom : str
        Key of the chromosome group inside the species callset.
    species : str
        Key into the module-level ``ingroup_callsets`` mapping.

    Returns
    -------
    pos : allel.SortedIndex
        Positions read from the callset's ``variants/POS`` dataset.
    ingroup_allele : ndarray
        One allele per position: the allele with the highest allele count, or
        ``b'.'`` at sites where every allele count is zero (no allele observed).

    Notes
    -----
    ``np.argmax`` returns the first maximum, so ties are resolved towards the
    lowest allele index (REF before ALT).
    """
    callset_chrom = ingroup_callsets[species][chrom]

    # extract position and alleles
    variants = callset_chrom['variants']
    pos = allel.SortedIndex(variants['POS'][:], copy=False)
    # column 0 holds REF and the remaining column(s) hold ALT, so the column
    # index is used below as the allele index
    alleles = np.column_stack([variants['REF'][:], variants['ALT'][:]])

    # extract genotypes and count alleles
    g = allel.GenotypeChunkedArray(callset_chrom['calldata']['genotype'])
    ac = np.asarray(g.count_alleles()[:])

    # determine major allele (one column pick per row)
    major_allele_idx = np.argmax(ac, axis=1)
    rows = np.arange(alleles.shape[0])
    ingroup_allele = alleles[rows, major_allele_idx]

    # an all-zero count row makes argmax return 0, which would report REF for a
    # site where nothing was observed; mark those sites as missing instead
    no_calls = ac.sum(axis=1) == 0
    ingroup_allele[no_calls] = b'.'

    return pos, ingroup_allele


In [16]:
def call_outgroup_allele(chrom, species):
    """Read the allele carried by an outgroup species at each VCF record.

    Loads the CHROM/POS/REF/ALT fields for region ``chrom`` from the species'
    VCF (path built from ``outgroup_vcf_fn_template``) and returns
    ``(pos, outgroup_allele)``. The called allele is REF where ALT equals
    ``b'.'`` and ALT otherwise.
    """
    vcf_fn = outgroup_vcf_fn_template.format(species=species)

    # load one single-character REF and ALT per record
    records = vcfnp.variants(
        vcf_fn,
        region=chrom,
        fields=['CHROM', 'POS', 'REF', 'ALT'],
        dtypes={'REF': 'a1', 'ALT': 'a1'},
        arities={'ALT': 1},
        progress=5000000,
        cache=True,
    )

    positions = allel.SortedIndex(records['POS'], copy=False)
    ref_allele, alt_allele = records['REF'], records['ALT']

    # ALT of '.' means no alternate allele was recorded: fall back to REF
    alt_is_missing = alt_allele == b'.'
    called_allele = np.where(alt_is_missing, ref_allele, alt_allele)

    return positions, called_allele
    

In [17]:
def call_species_allele(chrom, species):
    """Return ``(pos, allele)`` for ``species`` on ``chrom``.

    Dispatches to ``call_ingroup_allele`` when ``species`` is listed in
    ``ingroup_species`` and to ``call_outgroup_allele`` when it is listed in
    ``outgroup_species``.

    Raises
    ------
    ValueError
        If ``species`` is in neither list. ``ValueError`` is a subclass of
        ``Exception``, so existing ``except Exception`` handlers still apply.
    """
    if species in ingroup_species:
        return call_ingroup_allele(chrom, species)
    if species in outgroup_species:
        return call_outgroup_allele(chrom, species)

    known_species = tuple(ingroup_species) + tuple(outgroup_species)
    raise ValueError(
        'unknown species {!r}; expected one of {!r}'.format(species, known_species)
    )

In [23]:
def align_species_allele(chrom, species):
    """Align another species' called alleles onto the Ag1000G variant positions.

    Parameters
    ----------
    chrom : str
        Chromosome key, used for both the species call and ``callset``.
    species : str
        Species code passed to ``call_species_allele``.

    Returns
    -------
    ndarray, dtype ``S1``
        One entry per position in ``callset[chrom]['variants']['POS']``: the
        species' allele where that position is also present for the species,
        ``b'.'`` elsewhere. When the species has several records at the same
        position, the first one is used.
    """
    # determine species allele
    pos_other, allele_other = call_species_allele(chrom, species)

    # extract Ag1000G variant positions
    pos = allel.SortedIndex(callset[chrom]['variants']['POS'][:], copy=False)
    n_variants = pos.shape[0]

    # set up output array; np.full avoids building a Python list with one item
    # per variant and keeps the S1 dtype even when there are no variants
    allele_other_aln = np.full(n_variants, b'.', dtype='S1')

    # find intersection
    loc1, loc2 = pos_other.locate_intersection(pos)

    # exclude duplicates: flag every record whose position equals the previous
    # record's. The first record is never flagged (a wrap-around comparison
    # against the last record would wrongly drop it for a one-record input).
    pos_other_values = np.asarray(pos_other)
    loc_dup = np.zeros(pos_other_values.shape[0], dtype=bool)
    loc_dup[1:] = pos_other_values[1:] == pos_other_values[:-1]
    loc_isec_other = np.asarray(loc1) & ~loc_dup

    # fill output array
    allele_other_aln[np.asarray(loc2)] = allele_other[loc_isec_other]

    # check shape
    assert allele_other_aln.shape[0] == n_variants

    return allele_other_aln

In [26]:
# every non-Ag1000G species to align: ingroup species first, then outgroups
other_species = (*ingroup_species, *outgroup_species)

In [29]:
# Persist each species' alleles, aligned to the Ag1000G variant positions, as
# one dataset per (chromosome group, species) in an HDF5 file. The file is
# opened in append mode and existing datasets are left untouched, so the cell
# can be re-run to fill in only what is missing.
outgroup_alleles_fn = '/home/beniamino/notebook/data/outgroup_alleles_phase2.h5'
with h5py.File(outgroup_alleles_fn, mode='a') as outgroup_alleles:
    for chrom in chromosomes:
        for species in other_species:
            h5g = outgroup_alleles.require_group(chrom)
            if species not in h5g:
                log(chrom, species, 'building')
                data = align_species_allele(chrom, species)
                h5d = h5g.create_dataset(species, data=data, chunks=True)
                log(h5d)
            else:
                # already stored by an earlier run
                log(chrom, species, 'skipping')


2R arab building
<HDF5 dataset "arab": shape (12047846,), type "|S1">
2R meru building
<HDF5 dataset "meru": shape (12047846,), type "|S1">
2R mela building
<HDF5 dataset "mela": shape (12047846,), type "|S1">
2R quad building
<HDF5 dataset "quad": shape (12047846,), type "|S1">
2R epir building


[vcfnp] 2019-10-01 10:46:18.446485 :: caching is enabled
[vcfnp] 2019-10-01 10:46:18.447948 :: cache file available
[vcfnp] 2019-10-01 10:46:18.449253 :: loading from cache file /kwiat/1/anopheles/ag1000g/data/phase1/release/AR3/agc/epir_fake_cnvrt_sort.vcf.gz.vcfnp_cache/variants.2R.npy


<HDF5 dataset "epir": shape (12047846,), type "|S1">
2R chri building


[vcfnp] 2019-10-01 10:46:44.982067 :: caching is enabled
[vcfnp] 2019-10-01 10:46:45.004302 :: cache file available
[vcfnp] 2019-10-01 10:46:45.005290 :: loading from cache file /kwiat/1/anopheles/ag1000g/data/phase1/release/AR3/agc/chri_fake_cnvrt_sort.vcf.gz.vcfnp_cache/variants.2R.npy


<HDF5 dataset "chri": shape (12047846,), type "|S1">
2L arab building
<HDF5 dataset "arab": shape (8906423,), type "|S1">
2L meru building
<HDF5 dataset "meru": shape (8906423,), type "|S1">
2L mela building
<HDF5 dataset "mela": shape (8906423,), type "|S1">
2L quad building
<HDF5 dataset "quad": shape (8906423,), type "|S1">
2L epir building


[vcfnp] 2019-10-01 10:49:56.136820 :: caching is enabled
[vcfnp] 2019-10-01 10:49:56.139362 :: cache file available
[vcfnp] 2019-10-01 10:49:56.140349 :: loading from cache file /kwiat/1/anopheles/ag1000g/data/phase1/release/AR3/agc/epir_fake_cnvrt_sort.vcf.gz.vcfnp_cache/variants.2L.npy


<HDF5 dataset "epir": shape (8906423,), type "|S1">
2L chri building


[vcfnp] 2019-10-01 10:50:10.593054 :: caching is enabled
[vcfnp] 2019-10-01 10:50:10.594968 :: cache file available
[vcfnp] 2019-10-01 10:50:10.595907 :: loading from cache file /kwiat/1/anopheles/ag1000g/data/phase1/release/AR3/agc/chri_fake_cnvrt_sort.vcf.gz.vcfnp_cache/variants.2L.npy


<HDF5 dataset "chri": shape (8906423,), type "|S1">
3R arab building
<HDF5 dataset "arab": shape (10752701,), type "|S1">
3R meru building
<HDF5 dataset "meru": shape (10752701,), type "|S1">
3R mela building
<HDF5 dataset "mela": shape (10752701,), type "|S1">
3R quad building
<HDF5 dataset "quad": shape (10752701,), type "|S1">
3R epir building


[vcfnp] 2019-10-01 10:53:19.988451 :: caching is enabled
[vcfnp] 2019-10-01 10:53:19.989894 :: cache file available
[vcfnp] 2019-10-01 10:53:19.990847 :: loading from cache file /kwiat/1/anopheles/ag1000g/data/phase1/release/AR3/agc/epir_fake_cnvrt_sort.vcf.gz.vcfnp_cache/variants.3R.npy


<HDF5 dataset "epir": shape (10752701,), type "|S1">
3R chri building


[vcfnp] 2019-10-01 10:53:31.056317 :: caching is enabled
[vcfnp] 2019-10-01 10:53:31.057892 :: cache file available
[vcfnp] 2019-10-01 10:53:31.059318 :: loading from cache file /kwiat/1/anopheles/ag1000g/data/phase1/release/AR3/agc/chri_fake_cnvrt_sort.vcf.gz.vcfnp_cache/variants.3R.npy


<HDF5 dataset "chri": shape (10752701,), type "|S1">
3L arab building
<HDF5 dataset "arab": shape (7897666,), type "|S1">
3L meru building
<HDF5 dataset "meru": shape (7897666,), type "|S1">
3L mela building
<HDF5 dataset "mela": shape (7897666,), type "|S1">
3L quad building
<HDF5 dataset "quad": shape (7897666,), type "|S1">
3L epir building


[vcfnp] 2019-10-01 10:56:04.351871 :: caching is enabled
[vcfnp] 2019-10-01 10:56:04.353540 :: cache file available
[vcfnp] 2019-10-01 10:56:04.354268 :: loading from cache file /kwiat/1/anopheles/ag1000g/data/phase1/release/AR3/agc/epir_fake_cnvrt_sort.vcf.gz.vcfnp_cache/variants.3L.npy


<HDF5 dataset "epir": shape (7897666,), type "|S1">
3L chri building


[vcfnp] 2019-10-01 10:56:15.750204 :: caching is enabled
[vcfnp] 2019-10-01 10:56:15.751248 :: cache file available
[vcfnp] 2019-10-01 10:56:15.752453 :: loading from cache file /kwiat/1/anopheles/ag1000g/data/phase1/release/AR3/agc/chri_fake_cnvrt_sort.vcf.gz.vcfnp_cache/variants.3L.npy


<HDF5 dataset "chri": shape (7897666,), type "|S1">
X arab building
<HDF5 dataset "arab": shape (4472265,), type "|S1">
X meru building
<HDF5 dataset "meru": shape (4472265,), type "|S1">
X mela building
<HDF5 dataset "mela": shape (4472265,), type "|S1">
X quad building
<HDF5 dataset "quad": shape (4472265,), type "|S1">
X epir building


[vcfnp] 2019-10-01 10:57:22.444679 :: caching is enabled
[vcfnp] 2019-10-01 10:57:22.447198 :: cache file available
[vcfnp] 2019-10-01 10:57:22.448163 :: loading from cache file /kwiat/1/anopheles/ag1000g/data/phase1/release/AR3/agc/epir_fake_cnvrt_sort.vcf.gz.vcfnp_cache/variants.X.npy


<HDF5 dataset "epir": shape (4472265,), type "|S1">
X chri building


[vcfnp] 2019-10-01 10:57:26.324041 :: caching is enabled
[vcfnp] 2019-10-01 10:57:26.325178 :: cache file available
[vcfnp] 2019-10-01 10:57:26.325723 :: loading from cache file /kwiat/1/anopheles/ag1000g/data/phase1/release/AR3/agc/chri_fake_cnvrt_sort.vcf.gz.vcfnp_cache/variants.X.npy


<HDF5 dataset "chri": shape (4472265,), type "|S1">
