# ABBA-BABA Test
In this notebook I perform the ABBA-BABA test (Patterson's D) using the new outgroup allele counts <b>(see 09112019_outgroup_allele_count_phase2_dataset_creation)</b> and the new phase 2 allele count dataset.

Part of the code is adapted from <b>Alistair's notebook (20150529 Admixture, introgression.ipynb)</b> in the Ag1000G GitHub repository.

-------------------

Load modules and callset paths:

In [1]:
%run imports.ipynb

In [2]:
# Create a Dask cluster on Kubernetes, requesting 30 workers.
from dask_kubernetes import KubeCluster
cluster = KubeCluster(n_workers=30)
# Bare last expression: the notebook renders the cluster widget as cell output.
cluster

VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [3]:
# Connect a Dask distributed client to `cluster`.
from dask.distributed import Client
client = Client(cluster)
# Bare last expression: displays the client summary (scheduler, dashboard, workers).
client

0,1
Client  Scheduler: tcp://10.33.212.2:39921  Dashboard: /user/carlo%20mariade%20marco1/proxy/8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [4]:
# Chromosome names known to this notebook; also the default `chroms` of f4_analys.
chromosomes = ["3L", "3R", "2L", "2R", "X"]

In [5]:
# Labels that load_ac reads from the outgroup callset (calldata_out) instead of
# the phase 2 callset (calldata_biallel).
out_species = ["arab", "chri", "epir", "mela", "meru", "quad"]

In [6]:
# Population labels (values of the `population` column in the sample metadata),
# grouped into tuples so they can be passed straight to f4_analys.
# NOTE(review): presumably "col" = An. coluzzii, "gam" = An. gambiae and
# an_na = populations without a species assignment -- confirm.
an_col = ("AOcol", "BFcol", "GNcol", "CIcol", "GHcol")
an_gam = ("GHgam", "CMgam", "BFgam", "GNgam", "GQgam", "UGgam", "GAgam", "FRgam")
an_na = ("KE", "GM", "GW")

In [7]:
# Outgroup allele counts; load_ac indexes this as calldata_out[chrom][pop].
calldata_out = zarr.open('data/outgroup_alleles_phase2.zarr')

In [8]:
# Phase 2 allele counts; load_ac indexes this as calldata_biallel[chrom][pop].
calldata_biallel = zarr.open('data/phase2_biallel_allele_count.zarr')

In [9]:
# Per-sample metadata (tab-separated); the `population` column holds the labels
# used in an_col / an_gam / an_na.
metadata = pd.read_csv("samples.meta.txt", sep="\t")
# Show only the first rows rather than the whole table.
metadata.head()

Unnamed: 0,ox_code,src_code,population,country,location,site,contributor,contact,year,m_s,sex,n_sequences,mean_coverage,ebi_sample_acc,latitude,longitude
0,AA0040-C,Twifo_Praso__E2,GHcol,Ghana,Twifo Praso,Twifo Praso,David Weetman,David Weetman,2012,M,F,95033368,30.99,ERS311878,5.60858,-1.54926
1,AA0041-C,Twifo_Praso__H3,GHcol,Ghana,Twifo Praso,Twifo Praso,David Weetman,David Weetman,2012,M,F,95843804,31.7,ERS311886,5.60858,-1.54926
2,AA0042-C,Takoradi_C7,GHcol,Ghana,Takoradi,Takoradi,David Weetman,David Weetman,2012,M,F,107420666,35.65,ERS311894,4.91217,-1.77397
3,AA0043-C,Takoradi_H8,GHcol,Ghana,Takoradi,Takoradi,David Weetman,David Weetman,2012,M,F,95993752,29.46,ERS311902,4.91217,-1.77397
4,AA0044-C,Takoradi_D10,GHcol,Ghana,Takoradi,Takoradi,David Weetman,David Weetman,2012,M,F,103044262,33.67,ERS311910,4.91217,-1.77397


In [10]:
# Memo of allele count arrays already looked up, keyed by (chrom, pop).
ac_cache = {}


def load_ac(chrom, pop):
    """Return the allele count array for one population on one chromosome.

    Labels listed in ``out_species`` are read from ``calldata_out``; every
    other label is read from ``calldata_biallel``. The result of each lookup is
    stored in ``ac_cache`` so repeated requests for the same ``(chrom, pop)``
    return the same object without touching the stores again.

    Parameters
    ----------
    chrom : str
        Chromosome key, e.g. ``'3L'``.
    pop : str
        Population or outgroup label, e.g. ``'GHcol'`` or ``'chri'``.

    Returns
    -------
    Whatever ``calldata_out[chrom][pop]`` / ``calldata_biallel[chrom][pop]``
    yields (presumably a zarr array -- the stores are opened with ``zarr.open``).

    Raises
    ------
    KeyError
        Propagated from the store if ``chrom`` or ``pop`` is not present.
    """
    key = (chrom, pop)
    if key not in ac_cache:
        # Route outgroup labels to the outgroup callset, everything else to
        # the phase 2 callset.
        source = calldata_out if pop in out_species else calldata_biallel
        # Cache BOTH kinds of lookup; previously only the phase 2 branch was
        # stored, so outgroup arrays were re-fetched on every call.
        ac_cache[key] = source[chrom][pop]
    return ac_cache[key]

In [11]:
# Memo of D-test results, filled and read by f4_analysis.
fourpop_cache = {}

-----------------------------------------

In [12]:
def f4_analysis(chroms, A, B, C, D, regions=None, blen=100000, plot=False, ax=None):
    """Run the blockwise Patterson's D test for D(A, B; C, D), with caching.

    For each population the allele counts of every chromosome are fetched with
    ``load_ac``, sliced with the matching entry of ``regions``, stacked across
    chromosomes and handed to ``allel.blockwise_patterson_d``. Results are
    memoised in ``fourpop_cache``.

    Parameters
    ----------
    chroms : str or sequence of str
        Chromosome name(s) to combine. A bare string is treated as one name.
    A, B, C, D : str
        Population labels understood by ``load_ac``.
    regions : sequence of slice, optional
        One slice per chromosome, applied to the rows of that chromosome's
        allele count array (``load_ac(chrom, pop)[region]``). ``None`` (the
        default) selects every row of every chromosome.
    blen : int
        Block size forwarded to ``allel.blockwise_patterson_d``.
    plot, ax
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    tuple
        ``(d, d_se, d_z, d_vb, d_vj)`` exactly as returned by
        ``allel.blockwise_patterson_d`` (the first three are reported as
        D, SE and Z by ``f4_analys``).

    Raises
    ------
    ValueError
        If ``regions`` does not contain exactly one slice per chromosome.
    """
    # A bare string would otherwise be iterated character by character.
    chroms = [chroms] if isinstance(chroms, str) else list(chroms)

    # The signature advertises regions=None, so honour it instead of failing
    # when the value is iterated.
    if regions is None:
        regions = [slice(None)] * len(chroms)
    else:
        regions = list(regions)

    # zip() below would silently drop the surplus entries while the cache key
    # (and the caller's label) still named every chromosome.
    if len(regions) != len(chroms):
        raise ValueError(
            "regions must have one slice per chromosome: got %d region(s) for %d chromosome(s)"
            % (len(regions), len(chroms))
        )

    region_str = ",".join(["{0}_{1}".format(r.start, r.stop) for r in regions])
    key = (",".join(chroms), region_str, A, B, C, D, blen)

    if key in fourpop_cache:
        # re-use from cache
        return fourpop_cache[key]

    def stacked_counts(pop):
        # allele counts of `pop`, restricted per chromosome and concatenated
        return dask.array.vstack(
            [load_ac(chrom, pop)[region] for chrom, region in zip(chroms, regions)]
        )

    # run D test
    result = allel.blockwise_patterson_d(
        stacked_counts(A), stacked_counts(B), stacked_counts(C), stacked_counts(D),
        blen=blen,
    )
    d, d_se, d_z, d_vb, d_vj = result

    # cache for re-use
    fourpop_cache[key] = d, d_se, d_z, d_vb, d_vj
    return d, d_se, d_z, d_vb, d_vj

def f4_analys(As, Bs, Cs, Ds, chroms=chromosomes, regions=None, blen=100000):
    """Run D(A, B; C, D) for every combination of populations and display a table.

    Each of ``As``, ``Bs``, ``Cs``, ``Ds`` and ``chroms`` may be a single value
    or a list/tuple of values. Combinations with ``A == B`` or ``C == D`` are
    skipped. Every remaining combination is evaluated with ``f4_analysis`` and
    shown as one row with columns chromosome, test, D, SE and Z. Rows with
    Z > 5 or Z < -5 get a coloured background.

    Parameters
    ----------
    As, Bs, Cs, Ds : str or list/tuple of str
        Population labels for the four positions of the test.
    chroms : str or list/tuple of str
        Chromosome name(s) combined in each test.
    regions : sequence of slice, optional
        One slice per chromosome, forwarded to ``f4_analysis``. ``None`` (the
        default) uses the whole of every chromosome in ``chroms``.
    blen : int
        Block size forwarded to ``f4_analysis``.

    Returns
    -------
    None
        The table is displayed, not returned.
    """
    def as_sequence(value):
        # wrap a lone value so it can be looped over like a list
        return value if isinstance(value, (list, tuple)) else [value]

    # normalise inputs
    chroms = as_sequence(chroms)
    As = as_sequence(As)
    Bs = as_sequence(Bs)
    Cs = as_sequence(Cs)
    Ds = as_sequence(Ds)

    # Default to one whole-chromosome slice PER chromosome. A fixed count of 4
    # silently dropped the fifth of the default chromosomes.
    if regions is None:
        regions = [slice(None)] * len(chroms)

    # setup output table
    tbl = [['chromosome', 'test', 'D', 'SE', 'Z']]
    chrom_label = ",".join(chroms)

    for A in As:
        for B in Bs:
            if A == B:
                continue
            for C in Cs:
                for D in Ds:
                    if C == D:
                        continue
                    d, d_se, d_z, _, _ = f4_analysis(chroms, A, B, C, D,
                                                     regions=regions, blen=blen)
                    test = 'D(%s, %s; %s, %s)' % (A, B, C, D)
                    tbl.append([chrom_label, test, d, d_se, d_z])

    def row_style(row):
        # highlight rows whose |Z| exceeds 5: green if positive, blue if negative
        z = float(row.Z)
        colour = '#afa' if z > 5 else '#aaf' if z < -5 else 'white'
        return 'background-color: %s' % colour

    # display results
    (etl
     .wrap(tbl)
     .interpolate('D', '%.3f')
     .interpolate('SE', '%.4f')
     .interpolate('Z', '%.1f')
     .displayall(index_header=False, tr_style=row_style))

In [13]:
# Populations for D(A, B; C, D): each label in an_col as A, 'KE' as B,
# each label in an_gam as C, and the outgroup 'chri' as D.
A = an_col
B = 'KE'
C = an_gam
D = 'chri'

In [15]:

# Run every D(A, B; C, D) combination on 3L and 3R combined, one slice per chromosome.
# NOTE(review): the slices are applied as row indices of the allele count arrays
# (load_ac(chrom, pop)[region]), but the values look like base-pair coordinates --
# confirm they select the intended variants.
f4_analys(A, B, C, D, chroms=("3L", "3R"),
                regions=(slice(1, 15000000), slice(37000000 , 53200684)))

chromosome,test,D,SE,Z
"3L,3R","D(AOcol, KE; GHgam, chri)",0.001,0.0076,0.2
"3L,3R","D(AOcol, KE; CMgam, chri)",0.0,0.0074,0.0
"3L,3R","D(AOcol, KE; BFgam, chri)",0.0,0.0076,0.1
"3L,3R","D(AOcol, KE; GNgam, chri)",0.001,0.0075,0.1
"3L,3R","D(AOcol, KE; GQgam, chri)",0.017,0.0053,3.1
"3L,3R","D(AOcol, KE; UGgam, chri)",-0.008,0.0072,-1.1
"3L,3R","D(AOcol, KE; GAgam, chri)",0.012,0.0048,2.5
"3L,3R","D(AOcol, KE; FRgam, chri)",-0.062,0.0064,-9.7
"3L,3R","D(BFcol, KE; GHgam, chri)",-0.018,0.0052,-3.6
"3L,3R","D(BFcol, KE; CMgam, chri)",-0.022,0.0052,-4.3


------------------------------

In [16]:
# Same test layout with 'GM' in position B: D(an_col, GM; an_gam, chri) on 3L + 3R.
# NOTE(review): region slices index array rows, not genomic positions -- confirm intent.
A = an_col
B = 'GM'
C = an_gam
D = 'chri'
f4_analys(A, B, C, D, chroms=("3L", "3R"),
                regions=(slice(1, 15000000), slice(37000000 , 53200684)))

chromosome,test,D,SE,Z
"3L,3R","D(AOcol, GM; GHgam, chri)",0.04,0.0046,8.8
"3L,3R","D(AOcol, GM; CMgam, chri)",0.043,0.0048,9.0
"3L,3R","D(AOcol, GM; BFgam, chri)",0.041,0.0048,8.6
"3L,3R","D(AOcol, GM; GNgam, chri)",0.041,0.0046,8.9
"3L,3R","D(AOcol, GM; GQgam, chri)",0.061,0.0055,11.1
"3L,3R","D(AOcol, GM; UGgam, chri)",0.048,0.0049,9.8
"3L,3R","D(AOcol, GM; GAgam, chri)",0.067,0.0054,12.4
"3L,3R","D(AOcol, GM; FRgam, chri)",0.063,0.0058,10.8
"3L,3R","D(BFcol, GM; GHgam, chri)",0.021,0.006,3.6
"3L,3R","D(BFcol, GM; CMgam, chri)",0.021,0.0061,3.4


In [17]:
# Same test layout with 'GW' in position B: D(an_col, GW; an_gam, chri) on 3L + 3R.
# NOTE(review): region slices index array rows, not genomic positions -- confirm intent.
A = an_col
B = 'GW'
C = an_gam
D = 'chri'
f4_analys(A, B, C, D, chroms=("3L", "3R"),
                regions=(slice(1, 15000000), slice(37000000 , 53200684)))

chromosome,test,D,SE,Z
"3L,3R","D(AOcol, GW; GHgam, chri)",0.051,0.0043,11.9
"3L,3R","D(AOcol, GW; CMgam, chri)",0.054,0.0047,11.4
"3L,3R","D(AOcol, GW; BFgam, chri)",0.052,0.0046,11.1
"3L,3R","D(AOcol, GW; GNgam, chri)",0.052,0.0045,11.4
"3L,3R","D(AOcol, GW; GQgam, chri)",0.07,0.0057,12.2
"3L,3R","D(AOcol, GW; UGgam, chri)",0.058,0.0051,11.4
"3L,3R","D(AOcol, GW; GAgam, chri)",0.076,0.0059,12.9
"3L,3R","D(AOcol, GW; FRgam, chri)",0.072,0.0063,11.5
"3L,3R","D(BFcol, GW; GHgam, chri)",0.032,0.0069,4.6
"3L,3R","D(BFcol, GW; CMgam, chri)",0.032,0.0071,4.5


-------------------------

In [21]:
# NOTE: rebinds an_col to the same labels minus 'AOcol', which the cells below
# use as a separate population. Any earlier cell re-run after this point sees
# the shortened tuple, so re-run the original an_col definition first.
an_col = 'BFcol', 'GNcol', 'CIcol', 'GHcol'

In [22]:
# D(an_col, AOcol; an_gam, chri) on 3L + 3R, with 'AOcol' fixed in position B.
# NOTE(review): region slices index array rows, not genomic positions -- confirm intent.
A = an_col
B = 'AOcol'
C = an_gam
D = 'chri'
f4_analys(A, B, C, D, chroms=("3L", "3R"),
                regions=(slice(1, 15000000), slice(37000000 , 53200684)))

chromosome,test,D,SE,Z
"3L,3R","D(BFcol, AOcol; GHgam, chri)",-0.02,0.0049,-4.2
"3L,3R","D(BFcol, AOcol; CMgam, chri)",-0.023,0.0043,-5.4
"3L,3R","D(BFcol, AOcol; BFgam, chri)",-0.02,0.0049,-4.1
"3L,3R","D(BFcol, AOcol; GNgam, chri)",-0.021,0.0045,-4.7
"3L,3R","D(BFcol, AOcol; GQgam, chri)",-0.034,0.0039,-8.6
"3L,3R","D(BFcol, AOcol; UGgam, chri)",-0.026,0.004,-6.6
"3L,3R","D(BFcol, AOcol; GAgam, chri)",-0.039,0.0034,-11.2
"3L,3R","D(BFcol, AOcol; FRgam, chri)",-0.031,0.0041,-7.5
"3L,3R","D(GNcol, AOcol; GHgam, chri)",-0.032,0.0044,-7.2
"3L,3R","D(GNcol, AOcol; CMgam, chri)",-0.036,0.004,-9.2


In [24]:
# D(AOcol, an_col; an_gam, chri) on 3L + 3R: positions A and B swapped relative
# to the D(an_col, AOcol; an_gam, chri) run.
# NOTE(review): region slices index array rows, not genomic positions -- confirm intent.
A = 'AOcol'
B = an_col
C = an_gam
D = 'chri'
f4_analys(A, B, C, D, chroms=("3L", "3R"),
                regions=(slice(1, 15000000), slice(37000000 , 53200684)))

chromosome,test,D,SE,Z
"3L,3R","D(AOcol, BFcol; GHgam, chri)",0.02,0.0049,4.2
"3L,3R","D(AOcol, BFcol; CMgam, chri)",0.023,0.0043,5.4
"3L,3R","D(AOcol, BFcol; BFgam, chri)",0.02,0.0049,4.1
"3L,3R","D(AOcol, BFcol; GNgam, chri)",0.021,0.0045,4.7
"3L,3R","D(AOcol, BFcol; GQgam, chri)",0.034,0.0039,8.6
"3L,3R","D(AOcol, BFcol; UGgam, chri)",0.026,0.004,6.6
"3L,3R","D(AOcol, BFcol; GAgam, chri)",0.039,0.0034,11.2
"3L,3R","D(AOcol, BFcol; FRgam, chri)",0.031,0.0041,7.5
"3L,3R","D(AOcol, GNcol; GHgam, chri)",0.032,0.0044,7.2
"3L,3R","D(AOcol, GNcol; CMgam, chri)",0.036,0.004,9.2


In [23]:
# D(an_col, an_gam; AOcol, chri) on 3L + 3R, with 'AOcol' fixed in position C.
# NOTE(review): region slices index array rows, not genomic positions -- confirm intent.
A = an_col
B = an_gam
C = 'AOcol'
D = 'chri'
f4_analys(A, B, C, D, chroms=("3L", "3R"),
                regions=(slice(1, 15000000), slice(37000000 , 53200684)))

chromosome,test,D,SE,Z
"3L,3R","D(BFcol, GHgam; AOcol, chri)",0.053,0.0082,6.5
"3L,3R","D(BFcol, CMgam; AOcol, chri)",0.055,0.0084,6.6
"3L,3R","D(BFcol, BFgam; AOcol, chri)",0.057,0.0084,6.7
"3L,3R","D(BFcol, GNgam; AOcol, chri)",0.058,0.0084,6.9
"3L,3R","D(BFcol, GQgam; AOcol, chri)",0.036,0.0065,5.5
"3L,3R","D(BFcol, UGgam; AOcol, chri)",0.054,0.0079,6.8
"3L,3R","D(BFcol, GAgam; AOcol, chri)",0.03,0.0066,4.6
"3L,3R","D(BFcol, FRgam; AOcol, chri)",0.046,0.007,6.5
"3L,3R","D(GNcol, GHgam; AOcol, chri)",0.042,0.0079,5.3
"3L,3R","D(GNcol, CMgam; AOcol, chri)",0.044,0.0081,5.4
