## Functions 

In [2]:
import csv
import os

import allel
import gcsfs  # module for google cloud connection
import h5py
import numpy as np
import pandas as pd
import petl as etl
import pyfasta
import seaborn as sns
import zarr
from matplotlib import pyplot

In [None]:
# Disabled alternative ways of constructing the GCS filesystem using other
# `token` settings ('cache', reused session credentials, 'cloud').
# None of these lines execute; nothing is defined by this cell.
#gcs_orig = gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token='cache')
#gcs =  gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token=gcs_orig.session.credentials)
##gcs =  gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token='cloud')

In [2]:
# Google Cloud Storage filesystem handle used by the data-loading cells below.
# It is created with token='anon' and access='read_only', i.e. no credentials
# are supplied and no write access is requested.
gcs_bucket_fs = gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token='anon', access='read_only')

In [None]:
 ###### Adding phase1 genotype path ######
# Bucket-relative location of the phase 1 (AR3) zarr callset. The previous
# single-argument os.path.join() returned this string unchanged, so the literal
# is used directly.
geno_path_p1 = "ag1000g-release/phase1.AR3/variation/main/zarr/ag1000g.phase1.ar3.pass"
# `gcsacmap` is a scratch name that other loading cells reassign; use
# `calldata_phase1` (not `gcsacmap`) from other cells.
gcsacmap = gcs_bucket_fs.get_mapper(root=geno_path_p1)
# Root group of the callset, opened read-only; indexed as calldata_phase1[chrom][...].
calldata_phase1 = zarr.Group(gcsacmap, read_only=True)

In [None]:
 ###### Adding phase2 genotype path (biallelic callset) ######
# Bucket-relative location of the phase 2 (AR1) biallelic zarr callset. The
# previous single-argument os.path.join() returned this string unchanged, so the
# literal is used directly.
geno_path = "ag1000g-release/phase2.AR1/variation/main/zarr/biallelic/ag1000g.phase2.ar1.pass.biallelic"
# `gcsacmap` is a scratch name that other loading cells reassign; use
# `calldata_phase2` (not `gcsacmap`) from other cells.
gcsacmap = gcs_bucket_fs.get_mapper(root=geno_path)
# Root group of the callset, opened read-only; indexed as calldata_phase2[chrom][...].
calldata_phase2 = zarr.Group(gcsacmap, read_only=True)

In [3]:
 ###### Adding phase2 haplotype path ######
# Bucket-relative location of the phase 2 (AR1) haplotype zarr hierarchy.
hap_path = 'ag1000g-release/phase2.AR1/haplotypes/main/zarr/ag1000g.phase2.ar1.haplotypes'
# Mapping view of that location on the anonymous read-only GCS filesystem.
store = gcs_bucket_fs.get_mapper(root=hap_path)
# Root group of the haplotype callset, opened read-only; plot_dxy reads
# calldata_hap_phase2[chrom]["calldata/GT"] from it.
calldata_hap_phase2= zarr.Group(store, read_only=True)

In [5]:
 ###### Adding phase2 accessibility path ######
# Bucket-relative location of the phase 2 (AR1) accessibility zarr hierarchy.
# The previous single-argument os.path.join() returned this string unchanged,
# so the literal is used directly.
accessibility_path = "ag1000g-release/phase2.AR1/accessibility/accessibility.zarr"
# `gcsacmap` is a scratch name that other loading cells reassign; use
# `accessibility` (not `gcsacmap`) from other cells.
gcsacmap = gcs_bucket_fs.get_mapper(root=accessibility_path)
# Root group of the accessibility data, opened read-only.
accessibility = zarr.Group(gcsacmap, read_only=True)

In [3]:
# Absolute path of the reference genome FASTA on the notebook server.
fasta_fn = '/home/jovyan/notebooks/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa'
# NOTE(review): no key_fn is passed, so record keys are whatever pyfasta derives
# from the FASTA header lines -- confirm they match the chromosome names
# ('chrom' arguments) used to index the zarr callsets before using genome[chrom].
genome = pyfasta.Fasta(fasta_fn)

In [3]:
# Seaborn's current color palette; not referenced by the functions in this
# section -- presumably consumed by plotting cells elsewhere in the notebook.
palette = sns.color_palette()

-----------------------------------------------

In [2]:
def out_map(out_pop, chrom):
    """Return outgroup allele counts re-encoded against the phase 2 alleles.

    The function intersects phase 1 and phase 2 variant positions on ``chrom``,
    builds an allele mapping from the phase 1 REF/ALT alleles to the phase 2
    REF/ALT alleles at the shared positions, and applies that mapping to the
    outgroup allele counts restricted to the same positions.

    Parameters
    ----------
    out_pop : str
        Name of the dataset inside the ``chrom`` group of
        ``outgroup_allele_counts.h5`` holding the outgroup allele counts.
    chrom : str
        Chromosome key used to index ``calldata_phase1``, ``calldata_phase2``
        and the HDF5 file.

    Returns
    -------
    The result of ``AlleleCountsArray.map_alleles``: one row per position shared
    by phase 1 and phase 2, with allele columns in phase 2 order.

    Notes
    -----
    The outgroup counts are compressed with the phase 1 side of the intersection,
    so the HDF5 dataset is assumed to be row-aligned with the phase 1 variants
    -- TODO confirm against how the file was produced.
    """

    ###### Create the new allele map from phase1 to phase2 ######

    variants_phase1 = calldata_phase1[chrom]["variants"]
    variants_phase2 = calldata_phase2[chrom]["variants"]
    pos_phase1 = allel.SortedIndex(variants_phase1["POS"][:])
    pos_phase2 = allel.SortedIndex(variants_phase2["POS"][:])

    # loc1 flags phase 2 variants and loc2 flags phase 1 variants whose position
    # occurs in both callsets.
    loc1, loc2 = pos_phase2.locate_intersection(pos_phase1)

    # Only REF and ALT are needed for the mapping, so read just those two fields
    # instead of materialising a table with DP/MQ/QD/numalt as well.
    phase1_ref = np.compress(loc2, variants_phase1["REF"][:], axis=0)
    phase1_alt = np.compress(loc2, variants_phase1["ALT"][:], axis=0)
    phase2_ref = np.compress(loc1, variants_phase2["REF"][:], axis=0)
    phase2_alt = np.compress(loc1, variants_phase2["ALT"][:], axis=0)
    phase2refalt = np.column_stack([phase2_ref, phase2_alt])
    mapping = allel.create_allele_mapping(phase1_ref, phase1_alt, phase2refalt)

    ###### Now Mapping on our selected Outgroup ######

    # Read the counts fully into memory inside the context manager so the HDF5
    # handle is released even if a later step raises.
    with h5py.File('/gcs/phase1.AR3/extras/outgroup_allele_counts.h5', mode='r') as calldata_outgroup:
        ac_out_pop = allel.AlleleCountsArray(calldata_outgroup[chrom][out_pop][:])

    ac_out_pop = ac_out_pop.compress(loc2)
    pop_map_ac = ac_out_pop.map_alleles(mapping)

    return pop_map_ac

In [2]:
def abba_baba(chrom, a, b, c, out_pop, windows_size):
    """Run an ABBA-BABA (Patterson's D) test and plot per-block D along ``chrom``.

    Parameters
    ----------
    chrom : str
        Chromosome key used to index ``calldata_phase1`` / ``calldata_phase2``.
    a, b, c : str
        Population labels as they appear in the ``population`` column of the
        sample metadata; their allele counts are passed, in this order, as the
        first three arguments of ``allel.average_patterson_d``.
    out_pop : array-like
        Allele counts for the outgroup, passed unchanged as the fourth
        allele-counts argument of ``allel.average_patterson_d``. Despite the
        name this is not a population label; presumably it is the array
        returned by ``out_map`` for the same chromosome -- verify against caller.
    windows_size : int
        Number of variants per block, used both for the block-jackknife in
        ``allel.average_patterson_d`` and for the plotting windows.

    Side effects
    ------------
    Prints the estimate, standard error and Z-score, appends them as one row to
    ``file.csv`` in the current working directory, and draws a matplotlib figure.
    Nothing is returned.
    """

    ###### loading phase2 metadata ######
    metadata = pd.read_csv("/home/jovyan/notebooks/samples.meta.txt", sep="\t")
    pop_select = metadata.population.isin({a, b, c}).values
    pop_subset = metadata[pop_select]

    ###### loading phase2 genome and subset ######
    pos_phase1 = allel.SortedIndex(calldata_phase1[chrom]["variants/POS"][:])
    pos_phase2 = allel.SortedIndex(calldata_phase2[chrom]["variants/POS"][:])
    # Keep only phase 2 variants whose position also exists in phase 1; the
    # phase 1 side of the intersection is not needed here.
    loc1, _ = pos_phase2.locate_intersection(pos_phase1)
    pos_p2_sel = pos_phase2.compress(loc1)
    genotypes_phase2 = allel.GenotypeChunkedArray(calldata_phase2[chrom]["calldata/GT"])
    geno_p2_subset = genotypes_phase2.subset(sel0=loc1, sel1=pop_select)

    ###### perform allele count to my subpopulations ######
    # groupby(...).indices are positions within pop_subset, which line up with
    # the sample axis of geno_p2_subset because both were filtered by pop_select.
    grp_indices = pop_subset.groupby("population").indices
    ac_subpops = geno_p2_subset.count_alleles_subpops(grp_indices)

    ###### perform abba baba test ######
    result = allel.average_patterson_d(ac_subpops[a], ac_subpops[b], ac_subpops[c], out_pop, windows_size)
    # First four entries: overall estimate, standard error, Z-score, per-block values.
    d_avg, d_se, d_z, d_blocks = result[:4]
    print('Estimated value of the statistic using all data:', d_avg)
    print('Estimated standard error:', d_se)
    print('Z-score:', d_z)

    # newline='' lets the csv module control line endings; the with-block closes
    # the file, so no explicit close() is needed.
    with open('file.csv', 'a', newline='') as csv_file:
        csv.writer(csv_file).writerow([d_avg, d_se, d_z])

    ###### compute windows with equal numbers of SNPs ######
    windows = allel.moving_statistic(pos_p2_sel, statistic=lambda v: [v[0], v[-1]], size=windows_size)
    # x coordinate of each block = midpoint of its first and last variant position.
    x = np.asarray(windows).mean(axis=1)

    ###### Plot the test ######
    fig, ax = pyplot.subplots(figsize=(12, 4))
    sns.despine(ax=ax, offset=10)
    ax.plot(x, d_blocks, lw=.5)
    ax.set_ylabel("D Value")
    ax.set_xlabel('Chromosome %s position (bp)' %chrom)
    ax.set_xlim(0, pos_p2_sel.max())
    ax.set_title(('ABBA BABA Test between %s, %s, %s and outgroup population on chromosome %s') % (a,b,c,chrom))

In [None]:
def pbs_plot(chrom, a, b, c, windows_size):
    """Plot windowed PBS (``allel.pbs``) for three populations along ``chrom``.

    Parameters
    ----------
    chrom : str
        Chromosome key used to index ``calldata_phase2``.
    a, b, c : str
        Population labels from the ``population`` column of the sample
        metadata; their allele counts are passed to ``allel.pbs`` in this order.
    windows_size : int
        Number of variants per window, for both the statistic and the x axis.

    Draws a matplotlib figure; nothing is returned.
    """

    def _window_bounds(values):
        # First and last variant position of one window.
        return [values[0], values[-1]]

    ###### phase2 metadata restricted to the three populations ######
    sample_table = pd.read_csv("/home/jovyan/notebooks/samples.meta.txt", sep="\t")
    in_trio = sample_table.population.isin({a, b, c}).values
    # Positional sample indices per population, relative to the filtered table
    # (and therefore to the filtered genotype columns below).
    samples_by_pop = sample_table[in_trio].groupby("population").indices

    ###### phase2 genotypes for those samples, all variants ######
    gt_all = allel.GenotypeChunkedArray(calldata_phase2[chrom]["calldata/GT"])
    variant_pos = allel.SortedIndex(calldata_phase2[chrom]["variants/POS"])
    gt_trio = gt_all.subset(sel1=in_trio)

    ###### allele counts per population, then PBS ######
    counts = gt_trio.count_alleles_subpops(samples_by_pop)
    pbs_values = allel.pbs(counts[a], counts[b], counts[c], windows_size)

    ###### window midpoints for the x axis ######
    bounds = allel.moving_statistic(variant_pos, statistic=_window_bounds, size=windows_size)
    midpoints = np.asarray(bounds).mean(axis=1)

    ###### draw ######
    figure, axes = pyplot.subplots(figsize=(12, 4))
    sns.despine(ax=axes, offset=10)
    axes.plot(midpoints, pbs_values, lw=.5)
    axes.set_ylabel("PBS Value")
    axes.set_xlabel('Chromosome %s position (bp)' %chrom)
    axes.set_xlim(0, variant_pos.max())
    axes.set_title(('PBS between %s, %s, %s populations on chromosome %s') % (a,b,c,chrom))

In [1]:
def plot_dxy(pop1, pop2, chrom, window_size=20000, min_n_bases=1):
    """Plot windowed absolute divergence (Dxy) between two populations.

    Allele counts are computed from the phase 2 haplotype callset
    (``calldata_hap_phase2``) and divergence is computed with
    ``allel.windowed_divergence`` in fixed-size windows along ``chrom``.

    Parameters
    ----------
    pop1, pop2 : str
        Population labels from the ``population`` column of the metadata file.
    chrom : str
        Chromosome key used to index ``calldata_hap_phase2``.
    window_size : int, optional
        Window width in base pairs (``size`` of ``windowed_divergence``).
    min_n_bases : int, optional
        Windows whose ``n_bases`` is below this value are not plotted.

    Prints the shapes of the two allele-count arrays and the position array,
    draws a matplotlib figure, and returns nothing.
    """
    # NOTE(review): the sibling functions read
    # "/home/jovyan/notebooks/samples.meta.txt" with sep="\t"; this relative
    # path / comma separator is kept as written -- confirm which one is right.
    metadata = pd.read_csv("samples.meta.txt", sep=",")
    pop_select = metadata.population.isin({pop1,pop2}).values
    pop_subset = metadata[pop_select]

    # NOTE(review): pop_select is applied to the sample axis of the haplotype
    # callset -- confirm its samples are in the same order/number as the metadata.
    hap_calls = calldata_hap_phase2[chrom]
    genotypes_phase2 = allel.GenotypeChunkedArray(hap_calls["calldata/GT"])
    geno_p2_subset = genotypes_phase2.subset(sel1=pop_select)

    grp_indices = pop_subset.groupby("population").indices
    ac_subpops = geno_p2_subset.count_alleles_subpops(grp_indices)
    ac1 = ac_subpops[pop1]
    ac2 = ac_subpops[pop2]

    # Positions must come from the same callset as the allele counts; they were
    # previously read from the genotype callset (calldata_phase2), which is not
    # row-aligned with the haplotype data.
    pos = hap_calls["variants/POS"][:]
    # Sanity output: the first dimension of all three shapes should agree.
    print(ac1.shape, ac2.shape, pos.shape)

    dxy, windows, n_bases, counts = allel.windowed_divergence(pos, ac1, ac2,
                                                              size=window_size,
                                                              start=1,
                                                              stop=pos.max())
    enough_bases = n_bases >= min_n_bases
    x = np.mean(windows[enough_bases], axis=1)
    y = dxy[enough_bases]

    # The file imports `pyplot` (there is no `plt` alias in scope).
    fig, ax = pyplot.subplots(figsize=(14, 4))
    sns.despine(ax=ax, offset=10)
    ax.plot(x, y, lw=.5)
    # The undefined `seq` used to supply the chromosome length; use the last
    # variant position instead, as the other plotting functions here do.
    ax.set_xlim(0, pos.max())
    ax.set_title('%s vs %s (%s)' % (pop1, pop2, chrom), fontsize=14)
    ax.set_ylabel('Dxy')
    ax.set_xlabel('position')
    # Fixed y range so plots for different population pairs are comparable.
    ax.set_ylim(0,0.018)

----------------------------------------------