# ABBA-BABA Test
In this notebook I perform the ABBA-BABA test using the new outgroup allele counts <b>(see 09112019_outgroup_allele_count_phase2_dataset_creation)</b> and the new allele count dataset for phase 2.

I have used part of <b>Alistair's notebook (20150529 Admixture, introgression.ipynb)</b> from the Ag1000G GitHub repository.

-------------------

Loading modules and callset paths:

In [18]:
%run imports.ipynb

In [2]:
# Start a Dask cluster on Kubernetes, requesting 30 workers.
from dask_kubernetes import KubeCluster
cluster = KubeCluster(n_workers=30)
# Bare last expression: renders the cluster widget as the cell output.
cluster

VBox(children=(HTML(value='<h2>KubeCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [3]:
# Attach a distributed client to the `cluster` object so Dask work is scheduled on it.
from dask.distributed import Client
client = Client(cluster)
# Bare last expression: renders the client summary as the cell output.
# NOTE(review): the saved summary reports 0 workers -- presumably the workers had
# not joined yet when it was captured; confirm before launching heavy computations.
client

0,1
Client  Scheduler: tcp://10.34.231.25:41795  Dashboard: /user/carlo%20mariade%20marco1/proxy/8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [4]:
# Chromosome arms analysed by default (this list is the default `chroms` of f4_analys).
chromosomes = [
    '3L', '3R',
    '2L', '2R',
    'X',
]

In [5]:
# Labels of the outgroup populations; load_ac reads any population listed here
# from the outgroup store instead of the phase 2 store.
out_species = [
    'arab', 'chri', 'epir',
    'mela', 'meru', 'quad',
]

In [6]:
# Population labels grouped into three tuples. an_gam and an_col are used as the
# A and C population sets of the D tests further down.
# NOTE(review): the meaning of the `an_na` grouping is not stated anywhere in this notebook.
an_col = ('AOcol', 'BFcol', 'GNcol', 'CIcol', 'GHcol')
an_gam = ('GHgam', 'CMgam', 'BFgam', 'GNgam', 'GQgam', 'UGgam', 'GAgam', 'FRgam')
an_na = ('KE', 'GM', 'GW')

In [7]:
# Outgroup allele counts; load_ac indexes this store as calldata_out[chrom][pop].
# Opened read-only: zarr's default mode ('a') would silently create an empty
# group if the path were wrong and would permit accidental writes to the data.
calldata_out = zarr.open('data/outgroup_alleles_phase2.zarr', mode='r')

In [8]:
# Phase 2 biallelic allele counts; load_ac indexes this store as calldata_biallel[chrom][pop].
# Opened read-only: zarr's default mode ('a') would silently create an empty
# group if the path were wrong and would permit accidental writes to the data.
calldata_biallel = zarr.open('data/phase2_biallel_allele_count.zarr', mode='r')

In [9]:
# Sample metadata, read from a tab-delimited text file; the last line previews the first rows.
# NOTE(review): the saved preview contains an "Unnamed: 0" column, which suggests
# the file carries an unlabelled index column -- consider index_col=0 if nothing
# downstream relies on that column.
metadata = pd.read_csv("samples.meta.txt", delimiter="\t")
metadata.head()

Unnamed: 0,ox_code,src_code,population,country,location,site,contributor,contact,year,m_s,sex,n_sequences,mean_coverage,ebi_sample_acc,latitude,longitude
0,AA0040-C,Twifo_Praso__E2,GHcol,Ghana,Twifo Praso,Twifo Praso,David Weetman,David Weetman,2012,M,F,95033368,30.99,ERS311878,5.60858,-1.54926
1,AA0041-C,Twifo_Praso__H3,GHcol,Ghana,Twifo Praso,Twifo Praso,David Weetman,David Weetman,2012,M,F,95843804,31.7,ERS311886,5.60858,-1.54926
2,AA0042-C,Takoradi_C7,GHcol,Ghana,Takoradi,Takoradi,David Weetman,David Weetman,2012,M,F,107420666,35.65,ERS311894,4.91217,-1.77397
3,AA0043-C,Takoradi_H8,GHcol,Ghana,Takoradi,Takoradi,David Weetman,David Weetman,2012,M,F,95993752,29.46,ERS311902,4.91217,-1.77397
4,AA0044-C,Takoradi_D10,GHcol,Ghana,Takoradi,Takoradi,David Weetman,David Weetman,2012,M,F,103044262,33.67,ERS311910,4.91217,-1.77397


In [10]:
# Memo of allele-count objects already fetched, keyed by (chrom, pop).
ac_cache = dict()

def load_ac(chrom, pop):
    """Return the allele counts stored for population `pop` on chromosome `chrom`.

    Populations listed in ``out_species`` are read from ``calldata_out``; every
    other population is read from ``calldata_biallel``. Each lookup is memoised
    in ``ac_cache`` so repeated requests return the same object.

    Parameters
    ----------
    chrom : str
        Chromosome key in the stores (e.g. '3L').
    pop : str
        Population (or outgroup) key under that chromosome.

    Returns
    -------
    object
        Whatever is stored at ``[chrom][pop]`` in the selected store.
    """
    key = (chrom, pop)
    if key in ac_cache:
        return ac_cache[key]

    store = calldata_out if pop in out_species else calldata_biallel
    ac = store[chrom][pop]
    # Cache every lookup: the cache write used to sit inside the non-outgroup
    # branch only, so outgroup populations were re-read on every call.
    ac_cache[key] = ac
    return ac

In [11]:
# Memo of D-test results; f4_analysis keys it on (chromosomes, regions, A, B, C, D, blen).
fourpop_cache = {}

-----------------------------------------

In [12]:
def f4_analysis(chroms, A, B, C, D, regions=None, blen=100000, plot=False, ax=None):
    """Run one blockwise Patterson's D test, D(A, B; C, D), with memoisation.

    For each population the allele counts are loaded per chromosome with
    ``load_ac``, sliced by the matching entry of `regions`, stacked, and passed
    to ``allel.blockwise_patterson_d``. The result is stored in
    ``fourpop_cache`` keyed on chromosomes, region bounds, populations and block
    length, and reused on later calls with the same key.

    Parameters
    ----------
    chroms : str or sequence of str
        Chromosome key(s) to analyse.
    A, B, C, D : str
        Population keys understood by ``load_ac``.
    regions : slice or sequence of slice, optional
        One slice per chromosome. Defaults to the whole of every chromosome.
        NOTE(review): each slice is applied directly as a row index on the
        allele-count array; if the values are meant as genomic positions,
        confirm that row index and position coincide in these stores.
    blen : int
        Block size handed to ``allel.blockwise_patterson_d``.
    plot, ax
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    tuple
        ``(d, d_se, d_z, d_vb, d_vj)`` exactly as returned by
        ``allel.blockwise_patterson_d``.
    """
    # Accept a bare chromosome name or a bare slice as a one-element sequence;
    # a bare string would otherwise be iterated character by character.
    if isinstance(chroms, str):
        chroms = [chroms]
    if regions is None:
        # The declared default used to crash when iterated below.
        regions = [slice(None)] * len(chroms)
    elif isinstance(regions, slice):
        regions = [regions]

    region_str = ",".join("{0}_{1}".format(r.start, r.stop) for r in regions)
    key = (",".join(chroms), region_str, A, B, C, D, blen)

    if key not in fourpop_cache:

        def stacked(pop):
            # one allele-count slice per chromosome, stacked row-wise
            return dask.array.vstack([load_ac(chrom, pop)[region]
                                      for chrom, region in zip(chroms, regions)])

        # run D test and cache for re-use
        d, d_se, d_z, d_vb, d_vj = allel.blockwise_patterson_d(
            stacked(A), stacked(B), stacked(C), stacked(D), blen=blen)
        fourpop_cache[key] = d, d_se, d_z, d_vb, d_vj

    d, d_se, d_z, d_vb, d_vj = fourpop_cache[key]
    return d, d_se, d_z, d_vb, d_vj

def f4_analys(As, Bs, Cs, Ds, chroms=chromosomes, regions=None, blen=100000):
    """Run D(A, B; C, D) for every combination of the given populations and display a table.

    Every argument among `As`, `Bs`, `Cs`, `Ds` may be a single population key
    or a list/tuple of keys. Combinations with ``A == B`` or ``C == D`` are
    skipped. Each remaining test is computed with ``f4_analysis`` and the rows
    are displayed as a table in which rows with Z > 5 are shaded green and rows
    with Z < -5 are shaded blue.

    Parameters
    ----------
    As, Bs, Cs, Ds : str or list/tuple of str
        Population keys for the four positions of the test.
    chroms : str or list/tuple of str
        Chromosome key(s) to analyse together.
    regions : slice or list/tuple of slice, optional
        One slice per chromosome; defaults to the whole of every chromosome.
    blen : int
        Block size forwarded to ``f4_analysis``.

    Raises
    ------
    ValueError
        If the number of regions differs from the number of chromosomes.

    Returns
    -------
    None
        The table is displayed as a side effect.
    """

    def as_seq(value):
        # wrap a bare value so it can be iterated like a list/tuple
        return value if isinstance(value, (list, tuple)) else [value]

    # normalise inputs
    chroms = as_seq(chroms)
    if regions is None:
        # Size the default to the chromosomes actually requested. A fixed length
        # of 4 used to be paired with five default chromosomes, silently dropping
        # the last one from the analysis while the table still listed it.
        regions = [slice(None)] * len(chroms)
    else:
        regions = as_seq(regions)
    if len(regions) != len(chroms):
        raise ValueError(
            'expected one region per chromosome, got %d regions for %d chromosomes'
            % (len(regions), len(chroms)))
    As, Bs, Cs, Ds = as_seq(As), as_seq(Bs), as_seq(Cs), as_seq(Ds)

    # setup output table
    tbl = [['chromosome', 'test', 'D', 'SE', 'Z']]
    chrom_label = ",".join(chroms)

    for A in As:
        for B in Bs:
            if A == B:
                continue
            for C in Cs:
                for D in Ds:
                    if C == D:
                        continue
                    d, d_se, d_z, _, _ = f4_analysis(chroms, A, B, C, D,
                                                     regions=regions, blen=blen)
                    test = 'D(%s, %s; %s, %s)' % (A, B, C, D)
                    tbl.append([chrom_label, test, d, d_se, d_z])

    def row_style(row):
        # highlight strongly positive / strongly negative Z scores
        z = float(row.Z)
        colour = '#afa' if z > 5 else '#aaf' if z < -5 else 'white'
        return 'background-color: %s' % colour

    # display results
    (etl
     .wrap(tbl)
     .interpolate('D', '%.3f')
     .interpolate('SE', '%.4f')
     .interpolate('Z', '%.1f')
     .displayall(index_header=False, tr_style=row_style))
    

In [13]:
# Populations for D(A, B; C, D): A ranges over an_gam, B is GAgam,
# C ranges over an_col and D is the chri outgroup (f4_analys skips A == B).
A, B, C, D = an_gam, 'GAgam', an_col, 'chri'

In [17]:
# Run the tests on 3L and 3R together, one slice per chromosome.
# NOTE(review): f4_analysis applies these slices as row indices on the
# allele-count arrays; confirm that matches the intended genomic windows.
f4_analys(
    A, B, C, D,
    chroms=("3L", "3R"),
    regions=(
        slice(15000000, 41000000),  # 3L
        slice(1000000, 37000000),   # 3R
    ),
)

chromosome,test,D,SE,Z
"3L,3R","D(GHgam, GAgam; AOcol, chri)",-0.027,0.0029,-9.2
"3L,3R","D(GHgam, GAgam; BFcol, chri)",-0.011,0.0031,-3.6
"3L,3R","D(GHgam, GAgam; GNcol, chri)",-0.009,0.0026,-3.4
"3L,3R","D(GHgam, GAgam; CIcol, chri)",-0.01,0.0027,-3.7
"3L,3R","D(GHgam, GAgam; GHcol, chri)",-0.011,0.0032,-3.4
"3L,3R","D(CMgam, GAgam; AOcol, chri)",-0.03,0.0029,-10.6
"3L,3R","D(CMgam, GAgam; BFcol, chri)",-0.017,0.0029,-5.9
"3L,3R","D(CMgam, GAgam; GNcol, chri)",-0.015,0.0024,-6.4
"3L,3R","D(CMgam, GAgam; CIcol, chri)",-0.016,0.0024,-6.6
"3L,3R","D(CMgam, GAgam; GHcol, chri)",-0.018,0.003,-5.8


In [42]:
# Same call as the previous cell (3L and 3R together, one slice per chromosome).
# NOTE(review): the saved output of this cell lists only AOcol and BFcol as C,
# so it appears to have been produced with a different `C` than `an_col`;
# re-run from a fresh kernel or remove this duplicate cell.
f4_analys(
    A, B, C, D,
    chroms=("3L", "3R"),
    regions=(
        slice(15000000, 41000000),  # 3L
        slice(1000000, 37000000),   # 3R
    ),
)

chromosome,test,D,SE,Z
"3L,3R","D(GHgam, GAgam; AOcol, chri)",-0.027,0.0029,-9.2
"3L,3R","D(GHgam, GAgam; BFcol, chri)",-0.011,0.0031,-3.6
"3L,3R","D(CMgam, GAgam; AOcol, chri)",-0.03,0.0029,-10.6
"3L,3R","D(CMgam, GAgam; BFcol, chri)",-0.017,0.0029,-5.9
"3L,3R","D(BFgam, GAgam; AOcol, chri)",-0.028,0.0026,-10.8
"3L,3R","D(BFgam, GAgam; BFcol, chri)",-0.013,0.0023,-5.8
"3L,3R","D(GNgam, GAgam; AOcol, chri)",-0.03,0.0026,-11.2
"3L,3R","D(GNgam, GAgam; BFcol, chri)",-0.014,0.0022,-6.4
"3L,3R","D(GQgam, GAgam; AOcol, chri)",-0.015,0.003,-5.1
"3L,3R","D(GQgam, GAgam; BFcol, chri)",-0.01,0.003,-3.5


In [14]:
# Populations for D(A, B; C, D): A ranges over an_gam, B is KE,
# C ranges over an_col and D is the chri outgroup.
A, B, C, D = an_gam, 'KE', an_col, 'chri'

In [15]:
# Run the tests on 3L and 3R together, one slice per chromosome.
# NOTE(review): f4_analysis applies these slices as row indices on the
# allele-count arrays; confirm that matches the intended genomic windows.
f4_analys(
    A, B, C, D,
    chroms=("3L", "3R"),
    regions=(
        slice(15000000, 41000000),  # 3L
        slice(1000000, 37000000),   # 3R
    ),
)

chromosome,test,D,SE,Z
"3L,3R","D(GHgam, KE; AOcol, chri)",-0.046,0.0082,-5.6
"3L,3R","D(GHgam, KE; BFcol, chri)",-0.049,0.008,-6.2
"3L,3R","D(GHgam, KE; GNcol, chri)",-0.048,0.0077,-6.2
"3L,3R","D(GHgam, KE; CIcol, chri)",-0.053,0.0079,-6.7
"3L,3R","D(GHgam, KE; GHcol, chri)",-0.057,0.0084,-6.7
"3L,3R","D(CMgam, KE; AOcol, chri)",-0.049,0.009,-5.5
"3L,3R","D(CMgam, KE; BFcol, chri)",-0.055,0.0089,-6.1
"3L,3R","D(CMgam, KE; GNcol, chri)",-0.054,0.0085,-6.3
"3L,3R","D(CMgam, KE; CIcol, chri)",-0.059,0.0088,-6.7
"3L,3R","D(CMgam, KE; GHcol, chri)",-0.063,0.0093,-6.8


In [19]:
# Populations for D(A, B; C, D): A ranges over an_gam, B is FRgam,
# C ranges over an_col and D is the chri outgroup (f4_analys skips A == B).
A, B, C, D = an_gam, 'FRgam', an_col, 'chri'

In [20]:
# Run the tests on 3L and 3R together, one slice per chromosome.
# NOTE(review): f4_analysis applies these slices as row indices on the
# allele-count arrays; confirm that matches the intended genomic windows.
f4_analys(
    A, B, C, D,
    chroms=("3L", "3R"),
    regions=(
        slice(15000000, 41000000),  # 3L
        slice(1000000, 37000000),   # 3R
    ),
)

chromosome,test,D,SE,Z
"3L,3R","D(GHgam, FRgam; AOcol, chri)",-0.016,0.0047,-3.3
"3L,3R","D(GHgam, FRgam; BFcol, chri)",-0.004,0.0037,-1.1
"3L,3R","D(GHgam, FRgam; GNcol, chri)",-0.002,0.0033,-0.5
"3L,3R","D(GHgam, FRgam; CIcol, chri)",-0.003,0.0036,-0.9
"3L,3R","D(GHgam, FRgam; GHcol, chri)",-0.003,0.0036,-1.0
"3L,3R","D(CMgam, FRgam; AOcol, chri)",-0.019,0.0053,-3.5
"3L,3R","D(CMgam, FRgam; BFcol, chri)",-0.01,0.0044,-2.2
"3L,3R","D(CMgam, FRgam; GNcol, chri)",-0.008,0.0042,-1.9
"3L,3R","D(CMgam, FRgam; CIcol, chri)",-0.009,0.0044,-2.1
"3L,3R","D(CMgam, FRgam; GHcol, chri)",-0.01,0.0045,-2.2
