## Functions 

In [2]:
def out_map(out_pop, chrom):
    """Return outgroup allele counts re-coded onto the phase2 alleles.

    Parameters
    ----------
    out_pop : str
        Name of the outgroup dataset to read under ``chrom`` in the outgroup
        allele-counts HDF5 file.
    chrom : str
        Chromosome key used to index ``calldata_phase1``, ``calldata_phase2``
        and the outgroup file.

    Returns
    -------
    The outgroup allele counts, restricted to the positions shared by phase1
    and phase2 and passed through ``map_alleles`` with the phase1 -> phase2
    allele mapping.

    Notes
    -----
    Relies on the module-level ``calldata_phase1`` and ``calldata_phase2``
    callsets. The outgroup counts are compressed with the phase1 mask, so the
    outgroup file is presumably indexed on the phase1 variants -- confirm.
    """

    ###### Create the new allele map from phase1 to phase2 ######

    pos_phase1 = allel.SortedIndex(calldata_phase1[chrom]["variants/POS"][:])
    pos_phase2 = allel.SortedIndex(calldata_phase2[chrom]["variants/POS"][:])
    # loc1 is the mask over phase2 positions, loc2 the mask over phase1 positions.
    loc1, loc2 = pos_phase2.locate_intersection(pos_phase1)

    variant_fields = ['POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'numalt']

    variants_phase1 = allel.VariantChunkedTable(calldata_phase1[chrom]["variants"],
                                                names=variant_fields,
                                                index='POS')
    variants_phase1_filt = variants_phase1.compress(loc2, axis=0)
    phase1_ref = variants_phase1_filt["REF"][:]
    phase1_alt = variants_phase1_filt["ALT"][:]

    variants_phase2 = allel.VariantChunkedTable(calldata_phase2[chrom]["variants"],
                                                names=variant_fields,
                                                index='POS')
    variants_phase2_filt = variants_phase2.compress(loc1, axis=0)
    phase2_ref = variants_phase2_filt["REF"][:]
    phase2_alt = variants_phase2_filt["ALT"][:]

    phase2refalt = np.column_stack([phase2_ref, phase2_alt])
    mapping = allel.create_allele_mapping(phase1_ref, phase1_alt, phase2refalt)

    ###### Now Mapping on our selected Outgroup ######

    # Open the file in a context manager so the handle is always released;
    # all reads from it happen before the block exits.
    with h5py.File('/gcs/phase1.AR3/extras/outgroup_allele_counts.h5', mode='r') as calldata_outgroup:
        ac_out_pop = allel.AlleleCountsArray(calldata_outgroup[chrom][out_pop])
        ac_out_pop = ac_out_pop.compress(loc2)
        pop_map_ac = ac_out_pop.map_alleles(mapping)

    return pop_map_ac

In [2]:
def abba_baba_plot(chrom, a, b, c, out_pop, windows_size):
    """Run a windowed ABBA-BABA (Patterson's D) test and plot it along a chromosome.

    Parameters
    ----------
    chrom : str
        Chromosome key into ``calldata_phase1`` / ``calldata_phase2``.
    a, b, c : str
        Population names matched against the ``population`` column of the
        sample metadata.
    out_pop
        Passed unchanged as the fourth allele-counts argument of
        ``allel.average_patterson_d`` (presumably the result of ``out_map``
        for the same chromosome -- confirm with callers).
    windows_size : int
        Number of variants per block/window.

    Side effects
    ------------
    Prints the estimate, standard error and Z-score, appends those three
    values as one row to ``file.csv`` in the working directory, and draws a
    figure. Returns ``None``.
    """

    ###### loading phase2 metadata ######
    metadata = pd.read_csv("/home/jovyan/notebooks/samples.meta.txt", sep="\t")
    pop_select = metadata.population.isin({a, b, c}).values
    pop_subset = metadata[pop_select]

    ###### loading phase2 genome and subset ######
    pos_phase1 = allel.SortedIndex(calldata_phase1[chrom]["variants/POS"][:])
    pos_phase2 = allel.SortedIndex(calldata_phase2[chrom]["variants/POS"][:])
    # Only the phase2-side mask is needed here: keep phase2 variants that
    # also exist in phase1.
    loc1, _ = pos_phase2.locate_intersection(pos_phase1)
    pos_p2_sel = pos_phase2.compress(loc1)
    genotypes_phase2_call = calldata_phase2[chrom]["calldata/GT"]
    genotypes_phase2 = allel.GenotypeChunkedArray(genotypes_phase2_call)
    geno_p2_subset = genotypes_phase2.subset(sel0=loc1, sel1=pop_select)

    ###### perform allele count to my subpopulations ######
    # The group indices are positions within pop_subset, which matches the
    # sample axis of the genotype subset taken with the same mask.
    grp = pop_subset.groupby("population")
    grp_indices = grp.indices
    ac_subpops = geno_p2_subset.count_alleles_subpops(grp_indices)

    ###### perform abba baba test ######
    y = allel.average_patterson_d(ac_subpops[a], ac_subpops[b], ac_subpops[c], out_pop, windows_size)
    print('Estimated value of the statistic using all data:', y[0])
    print('Estimated standard error:', y[1])
    print('Z-score:', y[2])

    import csv
    # newline='' is what the csv module expects for files handed to a writer;
    # the with-block closes the file, so no explicit close() is needed.
    with open('file.csv', 'a', newline='') as csvFile:
        csv.writer(csvFile).writerow([y[0], y[1], y[2]])

    ###### compute windows with equal numbers of SNPs ######
    windows = allel.moving_statistic(pos_p2_sel, statistic=lambda v: [v[0], v[-1]], size=windows_size)
    # Midpoint of each window's first and last position.
    x = np.asarray(windows).mean(axis=1)

    ###### Plot the test ######
    fig, ax = pyplot.subplots(figsize=(12, 4))
    sns.despine(ax=ax, offset=10)
    # y[3] is the fourth element of the result (presumably the per-block values).
    ax.plot(x, y[3], lw=.5)
    ax.set_ylabel("D Value")
    ax.set_xlabel('Chromosome %s position (bp)' % chrom)
    ax.set_xlim(0, pos_p2_sel.max())
    ax.set_title(('ABBA BABA Test between %s, %s, %s and outgroup population on chromosome %s') % (a, b, c, chrom))

In [None]:
def pbs_plot(chrom, a, b, c, windows_size):
    """Plot the population branch statistic for three populations along a chromosome.

    Parameters
    ----------
    chrom : str
        Chromosome key into the module-level ``calldata_phase2`` callset.
    a, b, c : str
        Population names matched against the ``population`` metadata column;
        their allele counts are handed to ``allel.pbs`` in this order.
    windows_size : int
        Number of variants per window.

    Draws a figure; returns ``None``.
    """

    # --- sample metadata, restricted to the three requested populations ---
    samples = pd.read_csv("/home/jovyan/notebooks/samples.meta.txt", sep="\t")
    in_trio = samples.population.isin({a, b, c}).values
    trio_samples = samples[in_trio]

    # --- phase2 genotypes for those samples, plus variant positions ---
    callset = calldata_phase2[chrom]
    genotypes = allel.GenotypeChunkedArray(callset["calldata/GT"])
    pos = allel.SortedIndex(callset["variants/POS"])
    trio_genotypes = genotypes.subset(sel1=in_trio)

    # --- allele counts per population ---
    # groupby indices are positions inside trio_samples, i.e. the sample axis
    # of trio_genotypes.
    sample_groups = trio_samples.groupby("population").indices
    counts = trio_genotypes.count_alleles_subpops(sample_groups)

    # --- windowed PBS ---
    pbs = allel.pbs(counts[a], counts[b], counts[c], windows_size)

    # --- x coordinates: midpoint of first/last position in each window ---
    def window_span(values):
        return [values[0], values[-1]]

    spans = allel.moving_statistic(pos, statistic=window_span, size=windows_size)
    midpoints = np.asarray(spans).mean(axis=1)

    # --- plot ---
    _, axes = pyplot.subplots(figsize=(12, 4))
    sns.despine(ax=axes, offset=10)
    axes.plot(midpoints, pbs, lw=.5)
    axes.set_ylabel("PBS Value")
    axes.set_xlabel('Chromosome %s position (bp)' % chrom)
    axes.set_xlim(0, pos.max())
    axes.set_title(('PBS between %s, %s, %s populations on chromosome %s') % (a, b, c, chrom))

In [1]:
def plot_dxy(pop1, pop2, chrom, window_size=20000, min_n_bases=1):
    """Plot windowed absolute divergence (Dxy) between two populations.

    Parameters
    ----------
    pop1, pop2 : str
        Population names matched against the ``population`` metadata column.
    chrom : str
        Chromosome key into ``calldata_hap_phase2`` and ``calldata_phase2``.
    window_size : int
        Window size passed as ``size`` to ``allel.windowed_divergence``.
    min_n_bases : int
        Windows with fewer than this many bases (``n_bases``) are not plotted.

    Prints the shapes of the inputs and draws a figure; returns ``None``.

    Notes
    -----
    Uses a module-level ``seq`` that is not defined in this function
    (presumably the reference sequence of ``chrom`` -- confirm).
    NOTE(review): genotypes are read from ``calldata_hap_phase2`` but the
    positions from ``calldata_phase2``; confirm both index the same variants.
    """
    # The metadata file is tab-separated, as in the other functions that read
    # this same path; a comma separator would leave no ``population`` column.
    metadata = pd.read_csv("/home/jovyan/notebooks/samples.meta.txt", sep="\t")
    pop_select = metadata.population.isin({pop1, pop2}).values
    pop_subset = metadata[pop_select]

    genotypes_phase2_call = calldata_hap_phase2[chrom]["calldata/GT"]
    genotypes_phase2 = allel.GenotypeChunkedArray(genotypes_phase2_call)
    geno_p2_subset = genotypes_phase2.subset(sel1=pop_select)

    # Group indices are positions within pop_subset, matching the sample axis
    # of the genotype subset taken with the same mask.
    grp_indices = pop_subset.groupby("population").indices
    ac_subpops = geno_p2_subset.count_alleles_subpops(grp_indices)
    ac1 = ac_subpops[pop1]
    ac2 = ac_subpops[pop2]

    pos = calldata_phase2[chrom]['variants']['POS'][:]
    print(ac1.shape, ac2.shape, pos.shape, seq.shape)
    dxy, windows, n_bases, counts = allel.windowed_divergence(pos, ac1, ac2,
                                                              size=window_size,
                                                              start=1,
                                                              stop=pos.max())

    # Keep only windows with enough bases; x is each window's midpoint.
    keep = n_bases >= min_n_bases
    x = np.mean(windows[keep], axis=1)
    y = dxy[keep]

    fig, ax = plt.subplots(figsize=(14, 4))
    sns.despine(ax=ax, offset=10)
    ax.plot(x, y, lw=.5)
    ax.set_xlim(0, seq.size)
    ax.set_title('%s vs %s (%s)' % (pop1, pop2, chrom), fontsize=14)
    ax.set_ylabel('Dxy')
    ax.set_xlabel('position')
    ax.set_ylim(0, 0.018)

In [1]:
# Chromosome arm names: the four autosomal arms, then the same set plus X.
autosomes = ('2R', '2L', '3R', '3L')
chromosomes = (*autosomes, 'X')


class GenomeFigure(object):
    """Figure with one subplot per chromosome, arranged in two columns.

    Parameters
    ----------
    genome : mapping
        Indexed by chromosome name; each value is converted with
        ``np.array`` and only its ``size`` is used (the sequence length).
    *args, **kwargs
        Forwarded to ``plt.figure``, except for the optional keyword
        ``chromosomes`` (default ``['2R', '2L', '3R', '3L', 'X']``), which
        selects and orders the panels.

    Attributes set: ``fig`` (the figure), ``ax`` (dict of chromosome name to
    axes) and ``chromosomes``.

    All panels span the same number of base pairs (the longest sequence).
    Panels in the right-hand column start at 0; panels in the left-hand
    column end at their own sequence length.
    """

    def __init__(self, genome, *args, **kwargs):
        self.chromosomes = kwargs.pop('chromosomes', ['2R', '2L', '3R', '3L', 'X'])
        # Materialise each sequence once and keep only its length.
        sizes = {chrom: np.array(genome[chrom]).size for chrom in self.chromosomes}
        maxchrsize = max(sizes.values())
        # Keep the original three-row layout, but grow the grid when more
        # than six chromosomes are requested so add_subplot does not fail.
        nrows = max(3, (len(self.chromosomes) + 1) // 2)
        tick_step = int(5e6)

        fig = plt.figure(*args, **kwargs)
        self.fig = fig
        self.ax = dict()
        for i, chrom in enumerate(self.chromosomes):
            ax = fig.add_subplot(nrows, 2, i + 1)
            self.ax[chrom] = ax
            size = sizes[chrom]
            if i % 2 == 1:
                # Right-hand column: y axis on the right, x starts at 0.
                sns.despine(ax=ax, offset=10, top=True, left=True, right=False)
                ax.set_xlim(0, maxchrsize)
                ax.yaxis.tick_right()
                ax.yaxis.set_label_position('right')
            else:
                # Left-hand column: same span, ending at this sequence's length.
                ax.set_xlim(size - maxchrsize, size)
                ax.yaxis.tick_left()
                sns.despine(ax=ax, offset=10, top=True, left=False, right=True)
            # Derive the labels (in Mb) from the tick positions so the two
            # always have the same length, whatever the sequence size.
            ticks = list(range(0, size, tick_step))
            ax.set_xticks(ticks)
            ax.set_xticklabels([tick // 1000000 for tick in ticks])
            ax.set_title(chrom, fontweight='bold')
            ax.xaxis.tick_bottom()
        fig.tight_layout()

    def apply(self, f, **kwargs):
        """Call ``f(chrom, ax, **kwargs)`` for each chromosome panel.

        The optional keyword ``chromosomes`` restricts or reorders the panels
        visited; it defaults to all panels of the figure and is not passed
        on to ``f``.
        """
        chromosomes = kwargs.pop('chromosomes', self.chromosomes)
        for chrom in chromosomes:
            ax = self.ax[chrom]
            f(chrom, ax, **kwargs)
        
        
def subplots(*args, **kwargs):
    """Create a figure via ``plt.subplots`` and despine its axes.

    All arguments are forwarded to ``plt.subplots``. The returned axes object
    is passed to ``sns.despine`` with an offset of 10, and the
    ``(figure, axes)`` pair is returned.
    """
    figure, axes = plt.subplots(*args, **kwargs)
    sns.despine(offset=10, ax=axes)
    return figure, axes

In [None]:
# Lookups already performed by load_ac, keyed by (chrom, pop).
ac_cache = dict()


def load_ac(chrom, pop):
    """Return the allele-counts entry for ``pop`` on ``chrom``, with caching.

    Populations listed in the module-level ``out_species`` are read from
    ``calldata_out``; all others from ``calldata_biallel``. The result of
    every lookup is stored in ``ac_cache`` so repeated calls return the same
    object without touching the callsets again.

    Raises whatever the underlying callset raises for an unknown ``chrom``
    or ``pop`` (nothing is cached in that case).
    """
    key = (chrom, pop)
    try:
        return ac_cache[key]
    except KeyError:
        pass

    source = calldata_out if pop in out_species else calldata_biallel
    ac = source[chrom][pop]
    # Cache outgroup and ingroup lookups alike.
    ac_cache[key] = ac
    return ac
    
# Results of four-population tests already computed, keyed by test parameters.
fourpop_cache = {}

In [None]:
def f4_analysis(chroms, A, B, C, D, regions=None, blen=100000, plot=False, ax=None):
    """Run one blockwise Patterson's D test for populations A, B, C, D.

    Parameters
    ----------
    chroms : sequence of str
        Chromosome names; allele counts from all of them are stacked.
    A, B, C, D : str
        Population names, resolved through ``load_ac``.
    regions : sequence of slice, optional
        One slice per chromosome, applied to that chromosome's allele counts.
        Defaults to the whole of every chromosome in ``chroms``.
    blen : int
        Block length passed to ``allel.blockwise_patterson_d``.
    plot, ax
        Accepted for compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(d, d_se, d_z, d_vb, d_vj)`` as returned by
        ``allel.blockwise_patterson_d``.

    Results are memoised in the module-level ``fourpop_cache``, keyed on the
    chromosomes, regions, populations and block length.
    """
    if regions is None:
        # No restriction: take every variant on every chromosome.
        regions = [slice(None)] * len(chroms)

    region_str = ",".join(["{0}_{1}".format(r.start, r.stop) for r in regions])
    key = (",".join(chroms), region_str, A, B, C, D, blen)

    if key in fourpop_cache:
        # re-use from cache
        d, d_se, d_z, d_vb, d_vj = fourpop_cache[key]

    else:
        def stacked_counts(pop):
            # Allele counts of one population, concatenated over chromosomes.
            return np.vstack([load_ac(chrom, pop)[region]
                              for chrom, region in zip(chroms, regions)])

        # run D test
        d, d_se, d_z, d_vb, d_vj = allel.blockwise_patterson_d(stacked_counts(A),
                                                               stacked_counts(B),
                                                               stacked_counts(C),
                                                               stacked_counts(D),
                                                               blen=blen)

        # cache for re-use
        fourpop_cache[key] = d, d_se, d_z, d_vb, d_vj

    return d, d_se, d_z, d_vb, d_vj

def f4_analys(As, Bs, Cs, Ds, chroms=chromosomes, regions=None, blen=100000):
    """Run Patterson's D for every combination of the given populations.

    Parameters
    ----------
    As, Bs, Cs, Ds : str or list/tuple of str
        Candidate populations for each of the four positions. A single value
        is treated as a one-element list.
    chroms : str or list/tuple of str
        Chromosomes to analyse together (default: the module-level
        ``chromosomes``).
    regions : sequence of slice, optional
        One slice per chromosome. Defaults to the whole of every chromosome
        in ``chroms``.
    blen : int
        Block length forwarded to ``f4_analysis``.

    Returns
    -------
    list of list
        A header row ``['chromosome', 'test', 'D', 'SE', 'Z']`` followed by
        one row per test. Combinations with ``A == B`` or ``C == D`` are
        skipped.
    """
    def as_sequence(value):
        # Wrap a single value so every input can be iterated uniformly.
        return value if isinstance(value, (list, tuple)) else [value]

    # normalise inputs
    chroms = as_sequence(chroms)
    As = as_sequence(As)
    Bs = as_sequence(Bs)
    Cs = as_sequence(Cs)
    Ds = as_sequence(Ds)

    if regions is None:
        # One whole-chromosome slice per chromosome. The count must match
        # len(chroms): chromosomes and regions are paired with zip, which
        # would silently drop any chromosome without a region.
        regions = [slice(None)] * len(chroms)

    # setup output table
    tbl = [['chromosome', 'test', 'D', 'SE', 'Z']]
    chrom_label = ",".join(chroms)

    for A in As:
        for B in Bs:
            if A == B:
                continue
            for C in Cs:
                for D in Ds:
                    if C == D:
                        continue
                    d, d_se, d_z, _, _ = f4_analysis(chroms, A, B, C, D,
                                                     regions=regions, blen=blen)
                    test = 'D(%s, %s; %s, %s)' % (A, B, C, D)
                    tbl.append([chrom_label, test, d, d_se, d_z])
    return tbl

def tbl_display(tbl):
    """Display a results table with formatted numbers and Z-score row colours.

    The ``D``, ``SE`` and ``Z`` fields are formatted with ``'%.3f'``,
    ``'%.4f'`` and ``'%.1f'`` respectively. Rows whose Z is above 5 get a
    ``#afa`` background, rows whose Z is below -5 get ``#aaf``, and all other
    rows are white.
    """
    def row_style(row):
        z = float(row.Z)
        if z > 5:
            colour = '#afa'
        elif z < -5:
            colour = '#aaf'
        else:
            colour = 'white'
        return 'background-color: %s' % colour

    view = etl.wrap(tbl)
    for field, fmt in (('D', '%.3f'), ('SE', '%.4f'), ('Z', '%.1f')):
        view = view.interpolate(field, fmt)
    view.displayall(index_header=False, tr_style=row_style)

In [None]:
def count_alleles_pos(chrom, start, stop):
    """Count alleles per population for variants within a position range.

    Parameters
    ----------
    chrom : str
        Chromosome key into the module-level ``callset_biallel``.
    start, stop : int
        Position range passed to ``SortedIndex.intersect_range``.

    Returns
    -------
    The result of ``count_alleles_subpops`` on the selected variants, grouped
    by the ``population`` column of ``samples.meta.txt`` (read from the
    current working directory). All samples in the metadata are used.
    """
    metadata = pd.read_csv("samples.meta.txt", sep="\t")

    pos_all = allel.SortedIndex(callset_biallel[chrom]["variants/POS"])
    pos_in_range = pos_all.intersect_range(start, stop)
    # Only the mask over the full position index is needed.
    loc, _ = pos_all.locate_intersection(pos_in_range)

    geno = allel.GenotypeChunkedArray(callset_biallel[chrom]["calldata/GT"])
    geno_subset = geno.subset(sel0=loc)

    # The overall allele count is not part of the result, so it is not
    # computed here.
    grp_indices = metadata.groupby("population").indices
    ac_subpops = geno_subset.count_alleles_subpops(grp_indices)

    return ac_subpops

In [None]:
def tbl_d_plot(table, title, sort=None):
    """Plot a table of Patterson's D results as an error-bar chart and a heatmap.

    Parameters
    ----------
    table
        Table with at least the fields ``test``, ``D``, ``SE`` and ``Z``
        (for example the output of ``f4_analys``).
    title : str
        Figure title.
    sort
        Sort key passed to the petl ``sort`` call; also shown in the x label
        of the first panel.

    Draws and shows the figure; returns ``None``.
    """
    # plot
    tbl = etl.wrap(table).sort(sort)
    # Figure height scales with the number of tests.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, tbl.nrows()*.2))

    ## First Plot - D with standard-error bars, one row per test
    x = tbl.values('D').array()
    y = np.arange(tbl.nrows())
    ax1.axvline(0, color='r', lw=1)
    xerr = tbl.values('SE').array()
    ax1.errorbar(x, y, xerr=xerr, fmt='o', lw=1, color='k', mew=1, mfc='k')
    ax1.set_yticks(y)
    ylbls = ['%s; Z = %.1f' % (v.test, v.Z) for v in tbl.records()]
    ax1.set_yticklabels(ylbls)
    ax1.yaxis.tick_right()
    ax1.xaxis.tick_top()
    ax1.xaxis.set_label_position('top')
    ax1.set_xlim(-.2, .2)
    ax1.set_ylim(-1, y.size)
    ax1.grid(axis='both', color='#dddddd')
    ax1.set_xlabel("Patterson's $D$ sorted by %s" % (sort))
    fig.suptitle('%s' % (title), y=1.02, fontsize=16)

    ## Second plot - Heatmap of Z, tests as rows and rounded D as columns
    results = etl.todataframe(tbl)
    df1 = results[['test', 'D', 'Z']].round(2)
    heatmap1_data = pd.pivot_table(df1, values='Z',
                                   index=['test'],
                                   columns='D')
    # Colour limits must be ordered (vmin <= vmax). The reversed colormap
    # keeps the mapping the inverted limits asked for: positive Z towards
    # red, negative Z towards blue.
    sns.heatmap(heatmap1_data, cmap="RdBu_r", vmin=-15, vmax=15, annot=True, ax=ax2)
    ax2.set_xlim(-.7)
    ax2.xaxis.set_label_position('top')
    ax2.xaxis.tick_top()
    plt.xticks(rotation=70)
    ax2.set_xlabel("Patterson's $D$ sorted by population")
    plt.text(1.02, 0.5, 'Z-score', {'fontsize': 15},
             horizontalalignment='left',
             verticalalignment='center',
             rotation=90,
             clip_on=False,
             transform=plt.gca().transAxes)

    ax2.grid(False, 'major')
    ax2.grid(True, 'minor')
    # Minor ticks half a cell above each row centre, so the minor grid
    # separates the rows.
    ax2.set_yticks([t - 0.5 for t in ax2.get_yticks()], minor=True)
    sns.despine()
    fig.tight_layout()
    plt.show()

----------------------------------------------