# Treemix Ag1000G phase2
For build a dataset Treemix I need unlinked SNPs. So I have to prune my allele count datasets to obtain SNPs in high LD.
For doing this I need:
    - Phase2 Genotype callset
    - Phase2 Allele count
    - Outgroup Allele count

In this notebook I edited an old Alistair's notebook of the Phase1 of Ag1000G (<b>20151001 treemix prep 4</b>). On my phase2 datasets I have already the biallelic allele counts so I skipped the searching and filtering for biallelic SNPs

Import my modules:

In [3]:
%run imports.ipynb

Import callsets:

In [5]:
callset_pass= callset_biallel
allele_counts= zarr.open('data/phase2_biallel_allele_count.zarr/')
outgroup_allele_counts= zarr.open('data/outgroup_alleles_phase2.zarr/')

------------------------

Define functions to locate biallelic allele counts on a range for my outgroup and phase2 datasets:

In [6]:
def outgroup_ascertainment(chrom, start, stop, outgroups):
    
    # locate region
    pos = allel.SortedIndex(callset_pass[chrom]['variants']['POS'][:])
    locr = pos.locate_range(start, stop)
    
    # ascertain SNPs
    loca = np.zeros(pos.shape, dtype='b1')
    loca[locr] = True
    log('outgroup ascertainment, initial', nnz(loca))
    for s in outgroups:
        ac = allel.AlleleCountsArray(outgroup_allele_counts[chrom][s][:])
        # non-missing
        locs = (ac.sum(axis=1) > 0)
        loca &= locs
        log(s, nnz(loca))
        
    return loca
        

In [7]:
def ingroup_ascertainment(chrom, start, stop, segpops):

    # locate region
    pos = allel.SortedIndex(callset_pass[chrom]['variants']['POS'][:])
    locr = pos.locate_range(start, stop)

    # ascertain SNPs
    loca = np.zeros(pos.shape, dtype='b1')
    loca[locr] = True
    log('ingroup ascertainment, initial', nnz(loca))

    
    # require segregating
    for pop in segpops:
        ac = allel.AlleleCountsArray(allele_counts[chrom][pop][:])
        loc_seg = ac.min(axis=1) > 0
        loca &= loc_seg
        log('after require segregating in', pop, nnz(loca))
        
    return loca

Define function for ld pruning. LD-pruning remove SNPs with an high correlation. Using windows this function compute pairwise LD between all SNPs within each window, then removing one SNP from each correlated pair.

Define function for generating treemix file:

In [8]:
def to_treemix(acs, fn):
    pops = sorted(acs.keys())
    n_variants = acs[pops[0]].shape[0]
    n_alleles = acs[pops[0]].shape[1]
    assert n_alleles == 2, 'only biallelic variants supported'
    for pop in pops[1:]:
        assert n_variants == acs[pop].shape[0], 'bad number of variants for pop %s' % pop
        assert n_alleles == acs[pop].shape[1], 'bad number of alleles for pop %s' % pop
        
    with open(fn, 'wt', encoding='ascii') as f:
        print(' '.join(pops), file=f)
        for i in range(n_variants):
            print(' '.join([','.join(map(str, acs[pop][i])) for pop in pops]), file=f)


Define a new function that randomly downsample if I have a large dataset and applies ld-pruning on it:

In [10]:
def downsample_and_prune(chrom, start, stop, loc_asc,
                         n=100000, ldp_size=500, ldp_step=250, ldp_threshold=.1, ldp_n_iter=1):

    # all variant positions
    pos = allel.SortedIndex(callset_pass[chrom]['variants']['POS'][:])
    posa = pos[loc_asc]

    # randomly downsample
    if n < posa.shape[0]:
        posds = np.random.choice(posa, n, replace=False)
        posds.sort()
        posds = allel.SortedIndex(posds)
    else:
        # skip downsampling
        posds = posa
    locds = pos.locate_keys(posds)    

    # load genotype data
    genotype = allel.GenotypeChunkedArray(callset_pass[chrom]['calldata/GT'])
    geno_subset = genotype.subset(sel0=loc_asc)
    gn = geno_subset.to_n_alt()

    
    # prune    
    for i in range(ldp_n_iter):
        loc_unlinked = allel.locate_unlinked(gn, size=ldp_size, step=ldp_step, threshold=ldp_threshold)
        n = np.count_nonzero(loc_unlinked)
        n_remove = gn.shape[0] - n
        log('iteration', i+1, 'retaining', n, 'removing', n_remove, 'variants')
        gnu = gn.compress(loc_unlinked, axis=0)
        posu = pos.compress(loc_unlinked)
        locu = pos.locate_keys(posu)

    return locu

Define last function, the analysis function that includes all function below and applies these on my populations, outgroups, chromosomes of interest.

In [11]:
def run_analysis(rname, chrom, start, stop, outgroups, segpops,
                 n=100000, ldp_size=500, ldp_step=250, ldp_threshold=.1, ldp_n_iter=1):

    # initial ascertainment
    loc_og_asc = outgroup_ascertainment(chrom, start, stop, outgroups=outgroups)
    loc_ig_asc = ingroup_ascertainment(chrom, start, stop, segpops=segpops)
    loc_asc = loc_og_asc & loc_ig_asc
    log('initial ascertainment', nnz(loc_asc))
    
    # downsample and prune
    locu = downsample_and_prune(chrom, start, stop, loc_asc, 
                                n=n, ldp_size=ldp_size, ldp_step=ldp_step, 
                                ldp_threshold=ldp_threshold, ldp_n_iter=ldp_n_iter)
    
    # write allele counts
    acsu = dict()
    for pop in populations:
        acsu[pop] = allele_counts[chrom][pop][:, :2][locu]
    for pop in outgroups:
        acsu[pop] = outgroup_allele_counts[chrom][pop][:, :2][locu]

    outdir = 'd/data/treemix/seg_%s_og_%s_ldp_%s' % ('_'.join(segpops), '_'.join(outgroups), ldp_n_iter)
    !mkdir -pv {outdir}
    fn = os.path.join(outdir, '%s.allele_counts.txt' % rname)
    to_treemix(acsu, fn)
    !gzip -fv {fn}


Declaring values for generating my treemix file and ran on it for chromosome 3R, 3L, X, and the X region involved on speciation between <i>An.gambiae</i> and <i>An.coluzzii</i>

In [19]:
outgroups = ['chri', 'arab', 'quad', 'meru', 'mela']
segpops = ['AOcol', 'BFcol', 'CIcol', 'GHcol', 'GNcol','GHgam', 'CMgam', 'BFgam', 'GNgam', 'GQgam', 'UGgam', 'GAgam', 'FRgam','KE', 'GM', 'GW']
n = 200000
ldp_n_iter = 1
region_3R_first = '3R-first', '3R', 1, 2700000  
region_3R_second = '3R-second', '3R',  3000000, 37000000

-----------------------------
## Treemix on complete 3R-free

In [20]:
rname, chrom, start, stop = region_3R_first
log(rname, chrom, start, stop)
run_analysis(rname, chrom, start, stop, outgroups, segpops, n=n, ldp_n_iter=ldp_n_iter)

3R-first 3R 1 2700000
outgroup ascertainment, initial 742123
chri 563686
arab 560215
quad 554940
meru 543446
mela 528874
ingroup ascertainment, initial 742123
after require segregating in AOcol 71143
after require segregating in BFcol 48188
after require segregating in CIcol 42918
after require segregating in GHcol 40416
after require segregating in GNcol 22593
after require segregating in GHgam 19425
after require segregating in CMgam 19419
after require segregating in BFgam 19393
after require segregating in GNgam 19266
after require segregating in GQgam 16945
after require segregating in UGgam 16942
after require segregating in GAgam 16452
after require segregating in FRgam 10513
after require segregating in KE 7062
after require segregating in GM 7053
after require segregating in GW 7053
initial ascertainment 4416
iteration 1 retaining 3455 removing 961 variants
d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAgam_FRgam_KE_GM_GW_og_chri_arab_qu

In [21]:
rname, chrom, start, stop = region_3R_second
log(rname, chrom, start, stop)
run_analysis(rname, chrom, start, stop, outgroups, segpops, n=n, ldp_n_iter=ldp_n_iter)

3R-second 3R 3000000 37000000
outgroup ascertainment, initial 7708409
chri 5184049
arab 5146563
quad 5054996
meru 4986770
mela 4826265
ingroup ascertainment, initial 7708409
after require segregating in AOcol 865047
after require segregating in BFcol 577972
after require segregating in CIcol 506993
after require segregating in GHcol 474647
after require segregating in GNcol 245265
after require segregating in GHgam 216393
after require segregating in CMgam 216288
after require segregating in BFgam 215885
after require segregating in GNgam 214351
after require segregating in GQgam 185419
after require segregating in UGgam 185329
after require segregating in GAgam 181405
after require segregating in FRgam 122630
after require segregating in KE 86071
after require segregating in GM 85988
after require segregating in GW 85982
initial ascertainment 47277
iteration 1 retaining 36203 removing 11074 variants
d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_G

-------------------
## Merging files

In [22]:
treemix_3R_first = pd.read_csv('d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAgam_FRgam_KE_GM_GW_og_chri_arab_quad_meru_mela_ldp_1/3R-first.allele_counts.txt.gz', sep = ' ')
treemix_3R_first

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,...,GNgam,GQgam,GW,KE,UGgam,arab,chri,mela,meru,quad
0,1560,1500,1840,1420,5940,460,1380,1100,240,1300,...,800,180,1811,960,2240,240,10,80,200,200
1,1560,1500,1840,1420,5940,460,1380,1100,240,1300,...,800,180,1780,960,2231,240,00,80,200,200
2,1551,1500,1840,1420,5940,460,1380,1091,240,1300,...,800,180,1820,960,2240,240,10,80,200,200
3,1560,1500,1840,1420,5940,460,1371,1100,240,1300,...,800,180,1820,960,2240,240,10,80,200,200
4,1560,13812,10174,1357,31563,046,20118,1055,123,3397,...,377,117,60122,897,7217,240,10,80,200,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3450,1560,1500,1840,1420,5940,480,1380,1100,240,1300,...,800,180,1811,960,2240,240,01,80,200,200
3451,1560,1500,1840,1420,5940,480,1380,1100,240,1300,...,791,180,1820,960,2240,240,10,80,200,200
3452,1560,1500,1813,1420,5922,480,1380,1100,240,1300,...,782,180,1820,960,2240,240,10,80,200,200
3453,1560,1500,1840,1420,5931,480,1380,1100,240,1300,...,800,180,1820,960,2240,240,01,80,200,200


In [23]:
treemix_3R_second = pd.read_csv('d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAgam_FRgam_KE_GM_GW_og_chri_arab_quad_meru_mela_ldp_1/3R-second.allele_counts.txt.gz', sep = ' ')
treemix_3R_second

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,...,GNgam,GQgam,GW,KE,UGgam,arab,chri,mela,meru,quad
0,1560,1500,1840,1420,5940,460,1380,1100,240,1300,...,800,180,1811,960,2240,240,10,80,200,200
1,1560,1500,1840,1420,5940,460,1371,1100,240,1300,...,800,180,1820,960,2240,240,10,80,200,200
2,1560,1500,1840,1420,5940,460,1380,1100,240,1300,...,800,180,1780,960,2231,240,00,80,200,200
3,1560,1500,1840,1384,5940,460,1380,1100,240,12010,...,800,180,16220,960,2240,240,01,08,020,200
4,1560,1500,1831,1420,5940,460,1380,1100,240,1300,...,800,180,1820,960,2240,240,10,80,200,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36198,24132,21129,16816,22120,53559,480,1380,1694,222,8743,...,737,117,12953,3759,21113,024,00,08,200,200
36199,1560,1500,1822,1420,5940,480,1380,1100,240,1300,...,800,180,1793,960,2240,240,10,80,200,200
36200,1560,1500,1822,1420,5940,480,1380,1100,240,1300,...,800,180,1820,960,2240,240,00,80,200,200
36201,1560,1500,1840,1420,5931,480,1380,1100,240,1300,...,800,180,1820,960,2240,240,10,80,200,200


In [24]:
chrom_3R_gste_out = pd.concat([treemix_3R_first, treemix_3R_second])
chrom_3R_gste_out

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,...,GNgam,GQgam,GW,KE,UGgam,arab,chri,mela,meru,quad
0,1560,1500,1840,1420,5940,460,1380,1100,240,1300,...,800,180,1811,960,2240,240,10,80,200,200
1,1560,1500,1840,1420,5940,460,1380,1100,240,1300,...,800,180,1780,960,2231,240,00,80,200,200
2,1551,1500,1840,1420,5940,460,1380,1091,240,1300,...,800,180,1820,960,2240,240,10,80,200,200
3,1560,1500,1840,1420,5940,460,1371,1100,240,1300,...,800,180,1820,960,2240,240,10,80,200,200
4,1560,13812,10174,1357,31563,046,20118,1055,123,3397,...,377,117,60122,897,7217,240,10,80,200,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36198,24132,21129,16816,22120,53559,480,1380,1694,222,8743,...,737,117,12953,3759,21113,024,00,08,200,200
36199,1560,1500,1822,1420,5940,480,1380,1100,240,1300,...,800,180,1793,960,2240,240,10,80,200,200
36200,1560,1500,1822,1420,5940,480,1380,1100,240,1300,...,800,180,1820,960,2240,240,00,80,200,200
36201,1560,1500,1840,1420,5931,480,1380,1100,240,1300,...,800,180,1820,960,2240,240,10,80,200,200


In [25]:
chrom_3R_gste_out.to_csv('data/Treemix_data/3R_gste_out.txt',index=False, sep=" ")

--------
## Taking out only christyi

In [68]:
chrom_3R_gste_out_chri = chrom_3R_gste_out.drop(chrom_3R_gste_out.columns[[16,18,19,20]], axis=1)
chrom_3R_gste_out_chri

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,GNcol,GNgam,GQgam,GW,KE,UGgam,chri
0,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,800,180,1811,960,2240,10
1,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,800,180,1780,960,2231,00
2,1551,1500,1840,1420,5940,460,1380,1091,240,1300,80,800,180,1820,960,2240,10
3,1560,1500,1840,1420,5940,460,1371,1100,240,1300,80,800,180,1820,960,2240,10
4,1560,13812,10174,1357,31563,046,20118,1055,123,3397,80,377,117,60122,897,7217,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36198,24132,21129,16816,22120,53559,480,1380,1694,222,8743,26,737,117,12953,3759,21113,00
36199,1560,1500,1822,1420,5940,480,1380,1100,240,1300,80,800,180,1793,960,2240,10
36200,1560,1500,1822,1420,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,00
36201,1560,1500,1840,1420,5931,480,1380,1100,240,1300,80,800,180,1820,960,2240,10


In [69]:
chrom_3R_gste_out_chri_filt = chrom_3R_gste_out_chri[chrom_3R_gste_out_chri.chri != '0,0']
chrom_3R_gste_out_chri_filt

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,GNcol,GNgam,GQgam,GW,KE,UGgam,chri
0,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,800,180,1811,960,2240,10
2,1551,1500,1840,1420,5940,460,1380,1091,240,1300,80,800,180,1820,960,2240,10
3,1560,1500,1840,1420,5940,460,1371,1100,240,1300,80,800,180,1820,960,2240,10
4,1560,13812,10174,1357,31563,046,20118,1055,123,3397,80,377,117,60122,897,7217,10
5,1560,1500,1840,1420,5931,460,1380,1100,240,1300,80,800,180,1820,960,2240,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36196,7779,7476,1786,9547,55836,480,1380,7337,222,1255,53,764,117,1757,3759,21212,10
36197,1560,1500,1840,1420,5931,480,1380,1100,240,1300,80,800,180,1820,960,2240,10
36199,1560,1500,1822,1420,5940,480,1380,1100,240,1300,80,800,180,1793,960,2240,10
36201,1560,1500,1840,1420,5931,480,1380,1100,240,1300,80,800,180,1820,960,2240,10


In [70]:
chrom_3R_gste_out_chri_filt.to_csv('data/Treemix_data/chrom_3R_gste_out_chri_filt.txt',index=False, sep=" ")

--------------------
## Taking out only melas and merus

In [30]:
outgroups = ['meru', 'mela']

In [37]:
region_3R_free = '3R-free', '3R', 1, 37000000 
region_3L_free = '3L-free', '3L', 15000000, 41000000

In [38]:
rname, chrom, start, stop = region_3R_free
log(rname, chrom, start, stop)
run_analysis(rname, chrom, start, stop, outgroups, segpops,n=n, ldp_n_iter=ldp_n_iter)

3R-free 3R 1 37000000
outgroup ascertainment, initial 8535400
meru 8162228
mela 7826844
ingroup ascertainment, initial 8535400
after require segregating in AOcol 944999
after require segregating in BFcol 632162
after require segregating in CIcol 555287
after require segregating in GHcol 520052
after require segregating in GNcol 270605
after require segregating in GHgam 238337
after require segregating in CMgam 238226
after require segregating in BFgam 237796
after require segregating in GNgam 236126
after require segregating in GQgam 204576
after require segregating in UGgam 204483
after require segregating in GAgam 200000
after require segregating in FRgam 134437
after require segregating in KE 94097
after require segregating in GM 94004
after require segregating in GW 93998
initial ascertainment 84280
iteration 1 retaining 60344 removing 23936 variants
d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAgam_FRgam_KE_GM_GW_og_meru_mela_ldp_1/3R-free.

In [39]:
rname, chrom, start, stop = region_3L_free
log(rname, chrom, start, stop)
run_analysis(rname, chrom, start, stop, outgroups, segpops, n=n, ldp_n_iter=ldp_n_iter)

3L-free 3L 15000000 41000000
outgroup ascertainment, initial 5989818
meru 5673406
mela 5370857
ingroup ascertainment, initial 5989818
after require segregating in AOcol 672596
after require segregating in BFcol 448836
after require segregating in CIcol 392968
after require segregating in GHcol 368591
after require segregating in GNcol 190540
after require segregating in GHgam 168923
after require segregating in CMgam 168833
after require segregating in BFgam 168532
after require segregating in GNgam 167228
after require segregating in GQgam 146236
after require segregating in UGgam 146178
after require segregating in GAgam 142414
after require segregating in FRgam 96294
after require segregating in KE 68187
after require segregating in GM 68141
after require segregating in GW 68139
initial ascertainment 59314
iteration 1 retaining 43145 removing 16169 variants
d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAgam_FRgam_KE_GM_GW_og_meru_mela_ldp_1/3L

In [41]:
treemix_3R = pd.read_csv('d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAgam_FRgam_KE_GM_GW_og_meru_mela_ldp_1/3R-free.allele_counts.txt.gz', sep = ' ')
treemix_3R

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,GNcol,GNgam,GQgam,GW,KE,UGgam,mela,meru
0,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,800,180,1811,960,2240,80,200
1,1560,1500,1840,1420,5940,460,1371,1100,240,1300,80,800,180,1820,960,2240,80,200
2,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,800,180,1820,960,2213,80,200
3,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,782,180,1820,960,2231,80,200
4,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,791,180,1820,960,2240,80,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60339,1560,1500,1840,1420,5940,246,1380,1080,240,1300,80,800,180,1820,960,2231,80,200
60340,1560,1491,55129,1420,197397,246,8256,1091,717,4981,80,2555,711,9983,960,87137,08,020
60341,1560,1500,1831,1420,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,80,200
60342,1560,1500,16024,1420,53163,480,1344,1100,204,8248,80,6119,180,12755,960,20618,80,200


In [42]:
treemix_3L = pd.read_csv('d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAgam_FRgam_KE_GM_GW_og_meru_mela_ldp_1/3L-free.allele_counts.txt.gz', sep = ' ')
treemix_3L

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,GNcol,GNgam,GQgam,GW,KE,UGgam,mela,meru
0,1560,1500,1840,1420,5913,480,1380,1100,240,1300,80,791,171,1820,960,2240,00,00
1,1560,1500,1840,1420,5940,480,1380,1100,240,1300,80,800,180,1811,960,2240,00,00
2,1560,1500,1840,1411,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,00,00
3,1560,1491,1831,1420,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,00,00
4,1560,1500,1840,1420,5940,480,1380,1100,240,1300,80,791,180,1820,960,2240,00,00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43140,1551,1500,1840,1420,5940,480,1371,1100,240,1300,80,800,171,1820,960,2240,00,00
43141,1560,1500,1840,1420,5931,480,1380,1100,240,1300,80,800,180,1820,960,2231,00,00
43142,1560,1500,1840,1420,5922,3711,1371,1100,240,1300,80,800,180,1820,960,2231,00,00
43143,1560,1500,1840,1420,5922,480,1380,1100,240,1300,80,791,180,1820,960,2240,00,00


In [43]:
chrom_3_me = pd.concat([treemix_3L, treemix_3R])
chrom_3_me

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,GNcol,GNgam,GQgam,GW,KE,UGgam,mela,meru
0,1560,1500,1840,1420,5913,480,1380,1100,240,1300,80,791,171,1820,960,2240,00,00
1,1560,1500,1840,1420,5940,480,1380,1100,240,1300,80,800,180,1811,960,2240,00,00
2,1560,1500,1840,1411,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,00,00
3,1560,1491,1831,1420,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,00,00
4,1560,1500,1840,1420,5940,480,1380,1100,240,1300,80,791,180,1820,960,2240,00,00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60339,1560,1500,1840,1420,5940,246,1380,1080,240,1300,80,800,180,1820,960,2231,80,200
60340,1560,1491,55129,1420,197397,246,8256,1091,717,4981,80,2555,711,9983,960,87137,08,020
60341,1560,1500,1831,1420,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,80,200
60342,1560,1500,16024,1420,53163,480,1344,1100,204,8248,80,6119,180,12755,960,20618,80,200


In [44]:
chrom_3_me.to_csv('data/Treemix_data/chrom_3_me.txt',index=False, sep=" ")

------------------------------
filtering 3L

In [45]:
treemix_3L = pd.read_csv('d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAgam_FRgam_KE_GM_GW_og_meru_mela_ldp_1/3L-free.allele_counts.txt.gz', sep = ' ')
treemix_3L

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,GNcol,GNgam,GQgam,GW,KE,UGgam,mela,meru
0,1560,1500,1840,1420,5913,480,1380,1100,240,1300,80,791,171,1820,960,2240,00,00
1,1560,1500,1840,1420,5940,480,1380,1100,240,1300,80,800,180,1811,960,2240,00,00
2,1560,1500,1840,1411,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,00,00
3,1560,1491,1831,1420,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,00,00
4,1560,1500,1840,1420,5940,480,1380,1100,240,1300,80,791,180,1820,960,2240,00,00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43140,1551,1500,1840,1420,5940,480,1371,1100,240,1300,80,800,171,1820,960,2240,00,00
43141,1560,1500,1840,1420,5931,480,1380,1100,240,1300,80,800,180,1820,960,2231,00,00
43142,1560,1500,1840,1420,5922,3711,1371,1100,240,1300,80,800,180,1820,960,2231,00,00
43143,1560,1500,1840,1420,5922,480,1380,1100,240,1300,80,791,180,1820,960,2240,00,00


In [46]:
treemix_3L_filt = treemix_3L[treemix_3L.mela != '0,0']
treemix_3L_filt

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,GNcol,GNgam,GQgam,GW,KE,UGgam,mela,meru
188,1560,1500,1840,1420,5940,480,1380,1100,240,1300,80,800,180,1820,960,2204,08,00
189,1560,1500,1840,1420,5931,480,1380,1100,240,1300,80,800,180,1820,960,2240,80,00
190,1560,1500,1840,1420,5940,480,1380,1100,240,1300,80,800,180,1820,960,2231,80,00
191,1560,1500,1840,1420,5922,480,1380,1100,240,1300,80,800,180,1820,960,2240,30,00
192,1560,1500,1840,1420,5931,480,1380,1100,240,1300,80,800,180,1820,960,2240,80,00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43134,1560,1500,1840,1420,5931,480,1380,1100,240,1300,80,800,180,1820,960,2240,80,200
43135,1560,1500,1840,1420,5940,480,1380,1100,240,1291,80,800,180,17012,960,2240,80,200
43136,1560,1500,1840,1420,5931,480,1380,1100,240,1300,80,800,180,1820,960,2240,08,200
43137,1560,1500,1840,1420,5940,480,1380,1100,240,1291,80,800,180,17012,960,2240,80,200


In [48]:
treemix_3L_filt = treemix_3L_filt[treemix_3L_filt.meru != '0,0']
treemix_3L_filt

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,GNcol,GNgam,GQgam,GW,KE,UGgam,mela,meru
255,1560,1500,1840,1420,5940,480,1380,1100,240,1300,80,791,180,1820,960,2240,80,200
256,1560,1500,1840,1411,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,80,200
257,1560,1500,1822,1420,5940,480,1380,1100,240,1300,80,800,180,1820,960,2240,80,200
258,1560,1500,1840,1420,5940,480,1380,1100,240,1300,80,800,180,1811,960,2240,80,200
259,1560,1500,1840,1375,5940,480,1380,1073,240,1300,80,800,180,1820,960,2240,80,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43134,1560,1500,1840,1420,5931,480,1380,1100,240,1300,80,800,180,1820,960,2240,80,200
43135,1560,1500,1840,1420,5940,480,1380,1100,240,1291,80,800,180,17012,960,2240,80,200
43136,1560,1500,1840,1420,5931,480,1380,1100,240,1300,80,800,180,1820,960,2240,08,200
43137,1560,1500,1840,1420,5940,480,1380,1100,240,1291,80,800,180,17012,960,2240,80,200


In [49]:
treemix_3L_filt.to_csv('data/Treemix_data/treemix_3L_filt_me.txt',index=False, sep=" ")

---------------------------------------
gste out

In [31]:
rname, chrom, start, stop = region_3R_first
log(rname, chrom, start, stop)
run_analysis(rname, chrom, start, stop, outgroups, segpops, n=n, ldp_n_iter=ldp_n_iter)

3R-first 3R 1 2700000
outgroup ascertainment, initial 742123
meru 705002
mela 682293
ingroup ascertainment, initial 742123
after require segregating in AOcol 71143
after require segregating in BFcol 48188
after require segregating in CIcol 42918
after require segregating in GHcol 40416
after require segregating in GNcol 22593
after require segregating in GHgam 19425
after require segregating in CMgam 19419
after require segregating in BFgam 19393
after require segregating in GNgam 19266
after require segregating in GQgam 16945
after require segregating in UGgam 16942
after require segregating in GAgam 16452
after require segregating in FRgam 10513
after require segregating in KE 7062
after require segregating in GM 7053
after require segregating in GW 7053
initial ascertainment 6363
iteration 1 retaining 4698 removing 1665 variants
mkdir: created directory 'd/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAgam_FRgam_KE_GM_GW_og_meru_mela_ldp_1'
d/da

In [32]:
rname, chrom, start, stop = region_3R_second
log(rname, chrom, start, stop)
run_analysis(rname, chrom, start, stop, outgroups, segpops, n=n, ldp_n_iter=ldp_n_iter)

3R-second 3R 3000000 37000000
outgroup ascertainment, initial 7708409
meru 7376900
mela 7068272
ingroup ascertainment, initial 7708409
after require segregating in AOcol 865047
after require segregating in BFcol 577972
after require segregating in CIcol 506993
after require segregating in GHcol 474647
after require segregating in GNcol 245265
after require segregating in GHgam 216393
after require segregating in CMgam 216288
after require segregating in BFgam 215885
after require segregating in GNgam 214351
after require segregating in GQgam 185419
after require segregating in UGgam 185329
after require segregating in GAgam 181405
after require segregating in FRgam 122630
after require segregating in KE 86071
after require segregating in GM 85988
after require segregating in GW 85982
initial ascertainment 77052
iteration 1 retaining 54981 removing 22071 variants
d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAgam_FRgam_KE_GM_GW_og_meru_mela_ldp_1/

In [33]:
treemix_3R_first = pd.read_csv('d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAgam_FRgam_KE_GM_GW_og_meru_mela_ldp_1/3R-first.allele_counts.txt.gz', sep = ' ')
treemix_3R_first

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,GNcol,GNgam,GQgam,GW,KE,UGgam,mela,meru
0,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,800,180,1811,960,2240,80,200
1,1560,1500,1840,1420,5940,460,1371,1100,240,1300,80,800,180,1820,960,2240,80,200
2,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,800,180,1820,960,2213,80,200
3,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,782,180,1820,960,2231,80,200
4,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,791,180,1820,960,2240,80,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4693,1560,1500,1840,1420,5940,480,1380,1091,240,1300,80,800,180,1820,960,2240,80,200
4694,1560,1491,1840,1411,5940,480,1380,1100,240,1300,80,800,180,1811,960,2240,80,200
4695,1560,1500,1840,1420,5940,480,1380,1100,240,1282,80,800,180,1820,960,2222,80,200
4696,1560,1500,1822,1420,5895,480,1380,1100,240,1300,80,800,180,1820,960,2222,80,200


In [34]:
treemix_3R_second = pd.read_csv('d/data/treemix/seg_AOcol_BFcol_CIcol_GHcol_GNcol_GHgam_CMgam_BFgam_GNgam_GQgam_UGgam_GAgam_FRgam_KE_GM_GW_og_meru_mela_ldp_1/3R-second.allele_counts.txt.gz', sep = ' ')
treemix_3R_second

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,GNcol,GNgam,GQgam,GW,KE,UGgam,mela,meru
0,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,800,180,1811,960,2240,80,200
1,1560,1500,1840,1420,5940,460,1371,1100,240,1300,80,800,180,1820,960,2240,80,200
2,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,800,180,1780,960,2231,80,200
3,1560,1500,1840,1384,5940,460,1380,1100,240,12010,80,800,180,16220,960,2240,08,020
4,1560,1500,1831,1420,5940,460,1380,1100,240,1300,80,800,180,1820,960,2240,80,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54976,1560,1500,1840,1420,5931,480,1380,1100,240,1300,80,800,180,1820,960,2240,80,200
54977,1560,1491,1831,1420,5940,480,1380,1100,240,1273,80,800,180,1811,951,2240,80,200
54978,1560,1500,1840,1420,5940,480,1380,1100,240,1300,80,782,180,1820,960,2240,80,200
54979,1560,1500,1840,1420,5940,480,1380,1100,231,1300,80,800,180,1820,960,2240,80,200


In [35]:
chrom_3R_gste_out_me = pd.concat([treemix_3R_first, treemix_3R_second])
chrom_3R_gste_out_me

Unnamed: 0,AOcol,BFcol,BFgam,CIcol,CMgam,FRgam,GAgam,GHcol,GHgam,GM,GNcol,GNgam,GQgam,GW,KE,UGgam,mela,meru
0,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,800,180,1811,960,2240,80,200
1,1560,1500,1840,1420,5940,460,1371,1100,240,1300,80,800,180,1820,960,2240,80,200
2,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,800,180,1820,960,2213,80,200
3,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,782,180,1820,960,2231,80,200
4,1560,1500,1840,1420,5940,460,1380,1100,240,1300,80,791,180,1820,960,2240,80,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54976,1560,1500,1840,1420,5931,480,1380,1100,240,1300,80,800,180,1820,960,2240,80,200
54977,1560,1491,1831,1420,5940,480,1380,1100,240,1273,80,800,180,1811,951,2240,80,200
54978,1560,1500,1840,1420,5940,480,1380,1100,240,1300,80,782,180,1820,960,2240,80,200
54979,1560,1500,1840,1420,5940,480,1380,1100,231,1300,80,800,180,1820,960,2240,80,200


In [36]:
chrom_3R_gste_out_me_filt.to_csv('data/Treemix_data/chrom_3R_gste_out_me.txt',index=False, sep=" ")

---------