# ABBA-BABA Test
In this notebook I performed the ABBA-BABA Test using the new outgroup allele counts <b>(see 09112019_outgroup_allele_count_phase2_dataset_creation)</b> and the new allele count dataset for the phase2.

I have used part of <b>Alistair's notebook (20150529 Admixture, introgression.ipynb)</b> on the Ag1000G github repository

In [2]:
%run imports.ipynb

In [3]:
chromosomes = ['3L', '3R', '2L', '2R', 'X']

In [4]:
out_species = ['arab', 'chri', 'epir', 'mela', 'meru', 'quad']

In [27]:
calldata_out= zarr.open('data/outgroup_alleles_phase2.zarr')
calldata_out.tree()

In [28]:
calldata_biallel= zarr.open('data/phase2_allele_count.zarr')
calldata_biallel.tree()

In [8]:
allel.AlleleCountsArray(calldata_out['2L']['chri'])

Unnamed: 0,0,1,Unnamed: 3
0,0,0,
1,0,0,
2,0,0,
...,...,...,...
8906420,0,0,
8906421,0,0,
8906422,0,0,


In [9]:
metadata = pd.read_csv("samples.meta.txt", sep="\t")

In [10]:
ac_cache = dict()

def load_ac(chrom, pop):
    if (chrom, pop) in ac_cache:
        return ac_cache[chrom, pop]
    else:
        # only take the first two alleles
        if pop in out_species:
            ac = calldata_out[chrom][pop]
        else:
            ac = calldata_biallel[chrom][pop]
            ac_cache[chrom, pop] = ac
        return ac

In [11]:
ac_cache.keys()

dict_keys([])

In [12]:
# don't repeat unnecessarily
fourpop_cache = dict()

In [13]:
len(genome['3L'])

41963435

In [14]:
def f4_analysis(chrom, A, B, C, D, blen=100000, plot=False, ax=None):
    
    key = (chrom, A, B, C, D, blen)
    if key in fourpop_cache:
        # re-use from cache
        d, d_se, d_z, d_vb, d_vj = fourpop_cache[key]
        
    else:
        # load allele counts
        aca = load_ac(chrom, A)
        acb = load_ac(chrom, B)
        acc = load_ac(chrom, C)
        acd = load_ac(chrom, D)

        # run D test
        d, d_se, d_z, d_vb, d_vj = allel.blockwise_patterson_d(aca, acb, acc, acd, blen=blen)
        
        # cache for re-use
        fourpop_cache[key] = d, d_se, d_z, d_vb, d_vj
    
    return d, d_se, d_z, d_vb, d_vj


def f4_analyses(As, Bs, Cs, Ds, chroms=chromosomes, blen=100000):
    
    # normalise inputs
    if not isinstance(chroms, (list, tuple)):
        chroms = [chroms]
    if not isinstance(As, (list, tuple)):
        As = [As]
    if not isinstance(Bs, (list, tuple)):
        Bs = [Bs]
    if not isinstance(Cs, (list, tuple)):
        Cs = [Cs]
    if not isinstance(Ds, (list, tuple)):
        Ds = [Ds]
        
    # setup output table
    tbl = [['chromosome', 'test', 'D', 'SE', 'Z']]
    for chrom in chroms:
        for A in As:
            for B in Bs:
                if A != B:
                    for C in Cs:
                        for D in Ds:
                            if C != D:
                                d, d_se, d_z, _, _ = f4_analysis(chrom, A, B, C, D, blen=blen)
                                test = 'D(%s, %s; %s, %s)' % (A, B, C, D)
                                row = [chrom, test, d, d_se, d_z]
                                tbl += [row]
    
    # display results
    (etl
     .wrap(tbl)
     .interpolate('D', '%.3f')
     .interpolate('SE', '%.4f')
     .interpolate('Z', '%.1f')
     .displayall(index_header=False, tr_style=lambda row: 'background-color: %s' % ('#afa' if float(row.Z) > 5 else '#aaf' if float(row.Z) < -5 else 'white')))

    
def f4_plot(chrom, A, B, C, D, blen=100000, ax=None, ylim=(-.6, .6)):
    d, d_se, d_z, d_vb, d_vj = f4_analysis(chrom, A, B, C, D, blen=blen)

    # load variant positions
    pos = callset_biallel[chrom]['variants']['POS'][:]

    block_starts = allel.moving_statistic(pos, statistic=np.min, size=blen)
    block_stops = allel.moving_statistic(pos, statistic=np.max, size=blen)
    block_widths = block_stops - block_starts

    if ax is None:
        fig, ax = subplots(figsize=(7, 2))
    ax.bar(block_starts, d_vb, width=block_widths, linewidth=0)
    ax.axhline(0, color='k', linestyle='--')
    ax.set_ylabel('D', rotation=0, ha='center', va='center')
    ax.set_ylim(*ylim)
    
    return ax

    
def f4_fig(A, B, C, D, blen=100000, ylim=(-.6, .6)):
    gf = GenomeFigure(genome, figsize=(8, 5))
    gf.apply(lambda chrom, ax: f4_plot(chrom, A, B, C, D, ax=ax, ylim=ylim))
    gf.fig.suptitle('D(%s, %s; %s, %s)' % (A, B, C, D), fontsize=12, fontweight='bold')
    gf.fig.tight_layout()
    return gf


In [22]:
A = 'mela'
B = 'quad'
C = 'meru'
D = 'chri', 'epir'
f4_analyses(A, B, C, D)

chromosome,test,D,SE,Z
3L,"D(mela, quad; meru, chri)",-0.332,0.0194,-17.1
3L,"D(mela, quad; meru, epir)",-0.299,0.0189,-15.9
3R,"D(mela, quad; meru, chri)",-0.165,0.0119,-13.9
3R,"D(mela, quad; meru, epir)",-0.132,0.0116,-11.3
2L,"D(mela, quad; meru, chri)",-0.15,0.0099,-15.2
2L,"D(mela, quad; meru, epir)",-0.123,0.0095,-12.9
2R,"D(mela, quad; meru, chri)",-0.115,0.0084,-13.6
2R,"D(mela, quad; meru, epir)",-0.077,0.0092,-8.3
X,"D(mela, quad; meru, chri)",-0.15,0.0136,-11.1
X,"D(mela, quad; meru, epir)",-0.114,0.0139,-8.1


In [40]:
an_col = 'AOcol', 'BFcol', 'CIcol', 'GHcol', 'GNcol'
an_gam = 'GHgam', 'CMgam', 'BFgam', 'GNgam', 'GQgam', 'UGgam', 'GAgam', 'FRgam'
an_na = 'KE', 'GM', 'GW'

-----------------------------------------

## D statistics Analysis with Mayotte

### Using <i>An.christyi</i> as outgroup:

In [30]:
A = an_gam
B = 'FRgam'
C = an_col
D = 'chri'

In [31]:
f4_analyses(A, B, C, D)

chromosome,test,D,SE,Z
3L,"D(GHgam, FRgam; AOcol, chri)",-0.008,0.005,-1.6
3L,"D(GHgam, FRgam; BFcol, chri)",0.002,0.0042,0.5
3L,"D(GHgam, FRgam; CIcol, chri)",0.005,0.0041,1.2
3L,"D(GHgam, FRgam; GHcol, chri)",0.005,0.004,1.2
3L,"D(GHgam, FRgam; GNcol, chri)",0.005,0.0042,1.1
3L,"D(CMgam, FRgam; AOcol, chri)",-0.01,0.0051,-2.0
3L,"D(CMgam, FRgam; BFcol, chri)",-0.003,0.0043,-0.6
3L,"D(CMgam, FRgam; CIcol, chri)",-0.002,0.0044,-0.4
3L,"D(CMgam, FRgam; GHcol, chri)",-0.002,0.0045,-0.4
3L,"D(CMgam, FRgam; GNcol, chri)",-0.001,0.0045,-0.3


### Using <i>An.melas</i> as outgroup:

In [36]:
A = an_gam
B = 'FRgam'
C = an_col
D = 'mela'

In [37]:
f4_analyses(A, B, C, D)

chromosome,test,D,SE,Z
3L,"D(GHgam, FRgam; AOcol, mela)",-0.015,0.006,-2.5
3L,"D(GHgam, FRgam; BFcol, mela)",-0.0,0.005,-0.0
3L,"D(GHgam, FRgam; CIcol, mela)",0.002,0.0048,0.5
3L,"D(GHgam, FRgam; GHcol, mela)",0.002,0.0047,0.5
3L,"D(GHgam, FRgam; GNcol, mela)",0.003,0.0051,0.6
3L,"D(CMgam, FRgam; AOcol, mela)",-0.017,0.006,-2.8
3L,"D(CMgam, FRgam; BFcol, mela)",-0.006,0.0052,-1.2
3L,"D(CMgam, FRgam; CIcol, mela)",-0.005,0.0052,-1.0
3L,"D(CMgam, FRgam; GHcol, mela)",-0.005,0.0052,-1.0
3L,"D(CMgam, FRgam; GNcol, mela)",-0.004,0.0053,-0.7


--------------------------------------------

## D Statistic Analysis with Gabon

### Using <i>An.christyi</i> as outgroup

In [32]:
A = an_gam
B = 'GAgam'
C = an_col
D = 'chri'

In [33]:
f4_analyses(A, B, C, D)

chromosome,test,D,SE,Z
3L,"D(GHgam, GAgam; AOcol, chri)",-0.024,0.0036,-6.5
3L,"D(GHgam, GAgam; BFcol, chri)",-0.006,0.0026,-2.4
3L,"D(GHgam, GAgam; CIcol, chri)",-0.006,0.0024,-2.4
3L,"D(GHgam, GAgam; GHcol, chri)",-0.006,0.0024,-2.6
3L,"D(GHgam, GAgam; GNcol, chri)",-0.006,0.0026,-2.2
3L,"D(CMgam, GAgam; AOcol, chri)",-0.026,0.0036,-7.2
3L,"D(CMgam, GAgam; BFcol, chri)",-0.011,0.0026,-4.3
3L,"D(CMgam, GAgam; CIcol, chri)",-0.012,0.0027,-4.6
3L,"D(CMgam, GAgam; GHcol, chri)",-0.013,0.0027,-4.8
3L,"D(CMgam, GAgam; GNcol, chri)",-0.012,0.0027,-4.4


### Using <i>An.melas</i> as outgroup:

In [38]:
A = an_gam
B = 'FRgam'
C = an_col
D = 'mela'

In [39]:
f4_analyses(A, B, C, D)

chromosome,test,D,SE,Z
3L,"D(GHgam, FRgam; AOcol, mela)",-0.015,0.006,-2.5
3L,"D(GHgam, FRgam; BFcol, mela)",-0.0,0.005,-0.0
3L,"D(GHgam, FRgam; CIcol, mela)",0.002,0.0048,0.5
3L,"D(GHgam, FRgam; GHcol, mela)",0.002,0.0047,0.5
3L,"D(GHgam, FRgam; GNcol, mela)",0.003,0.0051,0.6
3L,"D(CMgam, FRgam; AOcol, mela)",-0.017,0.006,-2.8
3L,"D(CMgam, FRgam; BFcol, mela)",-0.006,0.0052,-1.2
3L,"D(CMgam, FRgam; CIcol, mela)",-0.005,0.0052,-1.0
3L,"D(CMgam, FRgam; GHcol, mela)",-0.005,0.0052,-1.0
3L,"D(CMgam, FRgam; GNcol, mela)",-0.004,0.0053,-0.7


--------------------------

## D statistics with hybrid populations:

### Using <i>An.christyi</i> as outgroup:

In [44]:
A = an_gam
B = an_na
C = an_col
D = 'chri'

In [45]:
f4_analyses(A, B, C, D)

chromosome,test,D,SE,Z
3L,"D(GHgam, KE; AOcol, chri)",-0.015,0.0056,-2.8
3L,"D(GHgam, KE; BFcol, chri)",-0.02,0.0049,-4.0
3L,"D(GHgam, KE; CIcol, chri)",-0.023,0.0047,-4.9
3L,"D(GHgam, KE; GHcol, chri)",-0.026,0.0046,-5.6
3L,"D(GHgam, KE; GNcol, chri)",-0.022,0.0051,-4.4
3L,"D(GHgam, GM; AOcol, chri)",0.02,0.0063,3.1
3L,"D(GHgam, GM; BFcol, chri)",0.0,0.0067,0.0
3L,"D(GHgam, GM; CIcol, chri)",-0.01,0.0068,-1.5
3L,"D(GHgam, GM; GHcol, chri)",-0.008,0.0072,-1.2
3L,"D(GHgam, GM; GNcol, chri)",-0.022,0.0069,-3.3


### Using <i>An.melas</i> as outgroup:

In [44]:
A = an_gam
B = an_na
C = an_col
D = 'mela'

In [46]:
f4_analyses(A,B,C,D)

chromosome,test,D,SE,Z
3L,"D(GHgam, KE; AOcol, chri)",-0.015,0.0056,-2.8
3L,"D(GHgam, KE; BFcol, chri)",-0.02,0.0049,-4.0
3L,"D(GHgam, KE; CIcol, chri)",-0.023,0.0047,-4.9
3L,"D(GHgam, KE; GHcol, chri)",-0.026,0.0046,-5.6
3L,"D(GHgam, KE; GNcol, chri)",-0.022,0.0051,-4.4
3L,"D(GHgam, GM; AOcol, chri)",0.02,0.0063,3.1
3L,"D(GHgam, GM; BFcol, chri)",0.0,0.0067,0.0
3L,"D(GHgam, GM; CIcol, chri)",-0.01,0.0068,-1.5
3L,"D(GHgam, GM; GHcol, chri)",-0.008,0.0072,-1.2
3L,"D(GHgam, GM; GNcol, chri)",-0.022,0.0069,-3.3
