# Treemix Ag1000G phase2
For build a dataset Treemix I need unlinked SNPs. So I have to prune my allele count datasets to obtain SNPs in high LD.
For doing this I need:
    - Phase2 Genotype callset
    - Phase2 Allele count
    - Outgroup Allele count

In this notebook I edited an old Alistair's notebook of the Phase1 of Ag1000G (<b>20151001 treemix prep 4</b>). On my phase2 datasets I have already the biallelic allele counts so I skipped the searching and filtering for biallelic SNPs

Import my modules:

In [1]:
%run imports.ipynb

Import callsets:

In [2]:
callset_pass= callset_biallel
allele_counts= zarr.open('data/phase2_biallel_allele_count.zarr/')
outgroup_allele_counts= zarr.open('data/outgroup_alleles_phase2.zarr/')

------------------------

Define functions to locate biallelic allele counts on a range for my outgroup and phase2 datasets:

In [3]:
def outgroup_ascertainment(chrom, start, stop, outgroups):
    
    # locate region
    pos = allel.SortedIndex(callset_pass[chrom]['variants']['POS'][:])
    locr = pos.locate_range(start, stop)
    
    # ascertain SNPs
    loca = np.zeros(pos.shape, dtype='b1')
    loca[locr] = True
    log('outgroup ascertainment, initial', nnz(loca))
    for s in outgroups:
        ac = allel.AlleleCountsArray(outgroup_allele_counts[chrom][s][:])
        # non-missing
        locs = (ac.sum(axis=1) > 0)
        loca &= locs
        log(s, nnz(loca))
        
    return loca
        

In [4]:
def ingroup_ascertainment(chrom, start, stop, segpops):

    # locate region
    pos = allel.SortedIndex(callset_pass[chrom]['variants']['POS'][:])
    locr = pos.locate_range(start, stop)

    # ascertain SNPs
    loca = np.zeros(pos.shape, dtype='b1')
    loca[locr] = True
    log('ingroup ascertainment, initial', nnz(loca))

    
    # require segregating
    for pop in segpops:
        ac = allel.AlleleCountsArray(allele_counts[chrom][pop][:])
        loc_seg = ac.min(axis=1) > 0
        loca &= loc_seg
        log('after require segregating in', pop, nnz(loca))
        
    return loca

Define function for ld pruning. LD-pruning remove SNPs with an high correlation. Using windows this function compute pairwise LD between all SNPs within each window, then removing one SNP from each correlated pair.

Define function for generating treemix file:

In [5]:
def to_treemix(acs, fn):
    pops = sorted(acs.keys())
    n_variants = acs[pops[0]].shape[0]
    n_alleles = acs[pops[0]].shape[1]
    assert n_alleles == 2, 'only biallelic variants supported'
    for pop in pops[1:]:
        assert n_variants == acs[pop].shape[0], 'bad number of variants for pop %s' % pop
        assert n_alleles == acs[pop].shape[1], 'bad number of alleles for pop %s' % pop
        
    with open(fn, 'wt', encoding='ascii') as f:
        print(' '.join(pops), file=f)
        for i in range(n_variants):
            print(' '.join([','.join(map(str, acs[pop][i])) for pop in pops]), file=f)


Define a new function that randomly downsample if I have a large dataset and applies ld-pruning on it:

In [6]:
def downsample_and_prune(chrom, start, stop, loc_asc,
                         n=100000, ldp_size=500, ldp_step=250, ldp_threshold=.1, ldp_n_iter=1):

    # all variant positions
    pos = allel.SortedIndex(callset_pass[chrom]['variants']['POS'][:])
    posa = pos[loc_asc]

    # randomly downsample
    if n < posa.shape[0]:
        posds = np.random.choice(posa, n, replace=False)
        posds.sort()
        posds = allel.SortedIndex(posds)
    else:
        # skip downsampling
        posds = posa
    locds = pos.locate_keys(posds)    

    # load genotype data
    genotype = allel.GenotypeChunkedArray(callset_pass[chrom]['calldata/GT'])
    geno_subset = genotype.subset(sel0=loc_asc)
    gn = geno_subset.to_n_alt()

    
    # prune    
    for i in range(ldp_n_iter):
        loc_unlinked = allel.locate_unlinked(gn, size=ldp_size, step=ldp_step, threshold=ldp_threshold)
        n = np.count_nonzero(loc_unlinked)
        n_remove = gn.shape[0] - n
        log('iteration', i+1, 'retaining', n, 'removing', n_remove, 'variants')
        gnu = gn.compress(loc_unlinked, axis=0)
        posu = pos.compress(loc_unlinked)
        locu = pos.locate_keys(posu)

    return locu

Define last function, the analysis function that includes all function below and applies these on my populations, outgroups, chromosomes of interest.

In [7]:
def run_analysis(rname, chrom, start, stop, outgroups, segpops,
                 n=100000, ldp_size=500, ldp_step=250, ldp_threshold=.1, ldp_n_iter=1):

    # initial ascertainment
    loc_og_asc = outgroup_ascertainment(chrom, start, stop, outgroups=outgroups)
    loc_ig_asc = ingroup_ascertainment(chrom, start, stop, segpops=segpops)
    loc_asc = loc_og_asc & loc_ig_asc
    log('initial ascertainment', nnz(loc_asc))
    
    # downsample and prune
    locu = downsample_and_prune(chrom, start, stop, loc_asc, 
                                n=n, ldp_size=ldp_size, ldp_step=ldp_step, 
                                ldp_threshold=ldp_threshold, ldp_n_iter=ldp_n_iter)
    
    # write allele counts
    acsu = dict()
    for pop in populations:
        acsu[pop] = allele_counts[chrom][pop][:, :2][locu]
    for pop in outgroups:
        acsu[pop] = outgroup_allele_counts[chrom][pop][:, :2][locu]

    outdir = 'd/data/treemix/seg_%s_og_%s_ldp_%s' % ('_'.join(segpops), '_'.join(outgroups), ldp_n_iter)
    !mkdir -pv {outdir}
    fn = os.path.join(outdir, '%s.allele_counts.txt' % rname)
    to_treemix(acsu, fn)
    !gzip -fv {fn}


Declaring values for generating my treemix file and ran on it for chromosome 3R, 3L, X, and the X region involved on speciation between <i>An.gambiae</i> and <i>An.coluzzii</i>

In [8]:
outgroups = ['chri', 'arab', 'quad', 'meru', 'mela']
segpops = ['AOcol', 'BFcol', 'CIcol', 'GHcol', 'GNcol','GHgam', 'CMgam', 'BFgam', 'GNgam', 'GQgam', 'UGgam', 'GAgam', 'FRgam','KE', 'GM', 'GW']
n = 200000
ldp_n_iter = 1
region_3R_24mbp = '3R-24Mbp', '3R', 1, 24_000_000
region_3L_free = '3L-free', '3L', 18_000_000, 41_000_000

-----------------------------
## Treemix on 24Mbp 3R-free

In [23]:
rname, chrom, start, stop = region_3R_24mbp
log(rname, chrom, start, stop)
run_analysis(rname, chrom, start, stop, outgroups, segpops, n=n, ldp_n_iter=ldp_n_iter)

3R-24Mbp 3R 1 24000000
outgroup ascertainment, initial 5760020
chri 4113161
arab 4087191
quad 4024544
meru 3969408
mela 3839974
ingroup ascertainment, initial 5760020
after require segregating in AOcol 627470
after require segregating in BFcol 424553
after require segregating in CIcol 373166
after require segregating in GHcol 350110
after require segregating in GNcol 181299
after require segregating in GHgam 162341
after require segregating in CMgam 162312
after require segregating in BFgam 162127
after require segregating in GNgam 161151
after require segregating in GQgam 140491
after require segregating in UGgam 140467
after require segregating in GAgam 137080
after require segregating in FRgam 91304
after require segregating in KE 63790
after require segregating in GM 63730
after require segregating in GW 63725
initial ascertainment 36930
iteration 1 retaining 29203 removing 7727 variants
d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAgam_FRga

--------------------

In [27]:
df = pd.read_csv('d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAgam_FRgam_KE_GM_GW_og_chri_arab_quad_meru_mela_ldp_1/3R-24Mbp.allele_counts.txt.gz', sep = ' ')
df

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,...,GNgam,GQgam,GW,KE,UGgam,arab,chri,mela,meru,quad
0,1560,1500,1840,1420,5940,460,1380,1100,240,1300,...,800,180,1811,960,2240,240,10,80,200,200
1,1560,1500,1840,1420,5940,460,1380,1100,240,1300,...,800,180,1780,960,2231,240,00,80,200,200
2,1551,1500,1840,1420,5940,460,1380,1091,240,1300,...,800,180,1820,960,2240,240,10,80,200,200
3,1560,1500,1840,1420,5940,460,1371,1100,240,1300,...,800,180,1820,960,2240,240,10,80,200,200
4,1560,13812,10174,1357,31563,046,20118,1055,123,3397,...,377,117,60122,897,7217,240,10,80,200,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29198,1560,1500,1840,1420,5931,480,1380,1100,240,1300,...,800,180,1820,960,2240,240,10,80,200,200
29199,1560,1500,1840,1420,5940,480,1380,1100,240,1300,...,800,180,1820,960,2231,240,10,80,200,200
29200,1560,1500,1831,1420,5940,480,1380,1100,240,1300,...,800,180,1820,960,2240,240,10,80,200,200
29201,13917,1500,1840,1411,5940,480,1380,1100,240,1300,...,800,180,1820,960,2240,240,10,80,200,200


In [33]:
chrom_3_chri = df.drop(df.columns[[16,18,19,20]], axis=1)
chrom_3_chri = chrom_3_chri[chrom_3_chri.chri != '0,0']
chrom_3_chri

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,GNcol,GNgam,GQgam,GW,KE,UGgam,chri
0,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,800,180,1811,960,2240,10
2,1551,1500,1840,1420,5940,460,1380,1091,240,1300,80,800,180,1820,960,2240,10
3,1560,1500,1840,1420,5940,460,1371,1100,240,1300,80,800,180,1820,960,2240,10
4,1560,13812,10174,1357,31563,046,20118,1055,123,3397,80,377,117,60122,897,7217,10
5,1560,1500,1840,1420,5931,460,1380,1100,240,1300,80,800,180,1820,960,2240,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29198,1560,1500,1840,1420,5931,480,1380,1100,240,1300,80,800,180,1820,960,2240,10
29199,1560,1500,1840,1420,5940,480,1380,1100,240,1300,80,800,180,1820,960,2231,10
29200,1560,1500,1831,1420,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,10
29201,13917,1500,1840,1411,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,10


In [35]:
chrom_3_chri.to_csv('chrom_3L_chri.txt.gz',index=False, sep=" ")

---------------------------------------
## Treemix on 3L (18 Mbp to 41 Mbp)

In [9]:
rname, chrom, start, stop = region_3L_free
log(rname, chrom, start, stop)
run_analysis(rname, chrom, start, stop, outgroups, segpops, n=n, ldp_n_iter=ldp_n_iter)

3L-free 3L 18000000 41000000
outgroup ascertainment, initial 5358122
chri 3463220
arab 3444965
quad 3374908
meru 3322704
mela 3199401
ingroup ascertainment, initial 5358122
after require segregating in AOcol 600947
after require segregating in BFcol 402265
after require segregating in CIcol 352318
after require segregating in GHcol 330333
after require segregating in GNcol 170038
after require segregating in GHgam 151361
after require segregating in CMgam 151320
after require segregating in BFgam 151100
after require segregating in GNgam 150049
after require segregating in GQgam 131227
after require segregating in UGgam 131198
after require segregating in GAgam 127860
after require segregating in FRgam 85762
after require segregating in KE 60541
after require segregating in GM 60502
after require segregating in GW 60501
initial ascertainment 31391
iteration 1 retaining 24749 removing 6642 variants
d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAga

In [10]:
df = pd.read_csv('d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAgam_FRgam_KE_GM_GW_og_chri_arab_quad_meru_mela_ldp_1/3L-free.allele_counts.txt.gz', sep = ' ')
df

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,...,GNgam,GQgam,GW,KE,UGgam,arab,chri,mela,meru,quad
0,1560,1500,1840,1420,5913,480,1380,1100,240,1300,...,791,171,1820,960,2240,00,00,00,00,00
1,1560,1500,1840,1420,5940,480,1380,1100,240,1300,...,800,180,1811,960,2240,00,00,00,00,00
2,1560,1500,1831,1420,5940,480,1380,1100,240,1300,...,800,180,1820,960,2240,00,00,00,00,00
3,1560,1500,1840,1420,5922,480,1380,1100,240,1300,...,800,180,1820,960,2240,00,00,00,00,00
4,1560,1500,1840,1411,5940,480,1380,1100,240,1300,...,800,180,1820,960,2240,00,00,00,00,00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24744,1560,1491,1840,1420,5940,480,1380,1100,240,1300,...,800,180,1811,960,2240,240,00,80,200,00
24745,1560,1500,1831,1420,5931,480,1380,1100,240,1300,...,800,180,1820,960,2240,240,00,80,200,00
24746,1560,1500,1840,1420,5940,480,1380,1100,240,1300,...,800,180,1766,8115,2240,240,00,00,200,00
24747,1560,1500,1840,1420,5940,480,1380,1100,240,1300,...,800,180,1820,960,2231,240,00,80,200,00


In [11]:
chrom_3_chri = df.drop(df.columns[[16,18,19,20]], axis=1)
chrom_3_chri = chrom_3_chri[chrom_3_chri.chri != '0,0']
chrom_3_chri

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,GNcol,GNgam,GQgam,GW,KE,UGgam,chri
319,1560,1500,1840,1420,5940,480,1380,1100,240,1300,80,791,180,1820,960,2240,10
320,1560,1500,1822,1420,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,10
321,1560,1500,1840,1420,5940,480,1380,1100,240,1300,80,800,180,1811,960,2240,10
322,1560,1500,1840,1420,5940,480,1353,1100,240,1300,80,800,180,1820,960,2240,10
323,1560,1491,1840,1420,5940,480,1380,1100,240,1300,80,800,180,1820,960,2231,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24684,1560,1491,1840,1420,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,10
24685,1560,1500,1840,1420,5940,480,1380,1019,240,1300,80,800,180,1820,960,2240,01
24686,1560,1491,1840,1420,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,10
24687,1540,1500,1840,1420,5931,480,1380,1100,240,1300,80,800,180,1820,960,2240,01


In [12]:
chrom_3_chri.to_csv('chrom_3L_chri.txt.gz',index=False, sep=" ")