In [54]:
import numpy as np
import pandas as pd; pd.set_option('display.max_rows', 10000)
import allel 
import matplotlib.pyplot as plt
import zarr
import h5py
import seaborn as sns
from sklearn import metrics
from tqdm import tqdm
#from mpl_toolkits.basemap import Basemap

In [3]:
%run "~/ag1000g/selective_sweeps/scripts/sweeps_functions.py"
samples = pd.read_csv("../../data/samples.meta.txt", sep='\t')

### Ne estimates from Ag1000g populations (LDNe)

Estimates of effective population size were implemented in NeEstimator v2.1 (Do *et al*., 2014), using the Linkage Disequilibrium method (LDNe) of Waples and Do (2008). This method is widely used, and generally performs robustly in comparisons between single-sample estimators (see references for further reading).

In [65]:
Ne = pd.read_csv("Ne_analyses.LDNe", sep="\t")

In [73]:
Ne[Ne['AF'] == 'minAF_0.05']

Unnamed: 0,AF,chrom,pop,sample_size,independent_comparisons,overall_r^2,expected_r^2,Ne_estimate,Parametric_CI_lower,Parametric_CI_upper,Jackknife_CI_lower,Jackknife_CI_upper
0,minAF_0.05,3L,GHcol,55.0,6465956,0.019633,0.019242,851.2,806.9,900.7,338.5,Infinite
4,minAF_0.05,3R,GHcol,55.0,6705702,0.019729,0.01924,680.4,652.1,711.2,224.3,Infinite
8,minAF_0.05,3L,GHgam,12.0,13844967,0.109337,0.10824,279.1,259.7,301.6,98.9,Infinite
12,minAF_0.05,3R,GHgam,12.0,13760829,0.110819,0.10824,117.7,114.0,121.7,34.1,Infinite
16,minAF_0.05,3L,BFgam,92.0,2357246,0.011382,0.011249,2503.3,2167.7,2960.1,1317.0,21833.9
20,minAF_0.05,3R,BFgam,92.0,2520667,0.011443,0.011247,1699.6,1541.9,1892.6,1040.1,4508.3
24,minAF_0.05,3L,BFcol,75.0,3364929,0.0139,0.013903,Infinite,17897.2,Infinite,10892.7,Infinite
28,minAF_0.05,3R,BFcol,75.0,3173472,0.013897,0.013902,Infinite,19659.2,Infinite,9299.5,Infinite
32,minAF_0.05,3L,UGgam,112.0,2733796,0.009316,0.009184,2515.7,2249.7,2851.9,1960.6,3500.0
36,minAF_0.05,3R,UGgam,112.0,2759529,0.009329,0.009184,2304.8,2080.2,2583.0,1798.6,3198.8


Be aware that both methods to produce confidence intervals, the parametric and pseudo-jackknife method, may be suboptimal (Jones *et al*., 2016), unfortunately, the amendments they suggest have not yet been implemented.

### estimating Ne from theta=4Nemu

In [59]:
pops = samples.population.unique()
chroms = ['3L', '3R']

mu=3.5e-9

Ne = dict()
Ne_Ag = dict()

Ag_array  = zarr.open_array(f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/calldata/GT/")
pos  = zarr.open_array(f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/variants/POS")
print("-------------------  Arrays loaded ------------------------")
geno = allel.GenotypeChunkedArray(Ag_array)
        
for pop in pops:
    for chrom in chroms:
        
        pop_bool = samples.population == pop
        pop_geno = geno.compress(pop_bool, axis=1)
        
        print(f"Counting alleles {pop} {chrom}")
        ac = pop_geno.count_alleles()
        print("Computing theta")
        theta = allel.watterson_theta(pos, ac)
        print('done')
        Neff = theta/(4*mu)
        
        Ne[chrom] = Neff
    
    Ne_Ag[pop] = dict(Ne)



  0%|          | 0/16 [00:00<?, ?it/s][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

-------------------  Arrays loaded ------------------------
Counting alleles GHcol 3L
Computing theta





 50%|█████     | 1/2 [00:55<00:55, 55.29s/it][A[A[A

done
Counting alleles GHcol 3R
Computing theta





100%|██████████| 2/2 [01:55<00:00, 57.91s/it][A[A[A


  6%|▋         | 1/16 [01:55<28:57, 115.83s/it][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

done
Counting alleles GHgam 3L
Computing theta





 50%|█████     | 1/2 [00:50<00:50, 50.30s/it][A[A[A

done
Counting alleles GHgam 3R
Computing theta





100%|██████████| 2/2 [01:41<00:00, 50.55s/it][A[A[A


 12%|█▎        | 2/16 [03:36<25:59, 111.41s/it][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

done
Counting alleles BFgam 3L
Computing theta





 50%|█████     | 1/2 [01:17<01:17, 77.86s/it][A[A[A

done
Counting alleles BFgam 3R
Computing theta





100%|██████████| 2/2 [02:45<00:00, 82.50s/it][A[A[A


 19%|█▉        | 3/16 [06:21<27:37, 127.49s/it][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

done
Counting alleles BFcol 3L
Computing theta





 50%|█████     | 1/2 [01:22<01:22, 82.20s/it][A[A[A

done
Counting alleles BFcol 3R
Computing theta





100%|██████████| 2/2 [02:51<00:00, 85.80s/it][A[A[A


 25%|██▌       | 4/16 [09:13<28:08, 140.73s/it][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

done
Counting alleles UGgam 3L
Computing theta





 50%|█████     | 1/2 [01:47<01:47, 107.39s/it][A[A[A

done
Counting alleles UGgam 3R
Computing theta





100%|██████████| 2/2 [03:11<00:00, 95.54s/it] [A[A[A


 31%|███▏      | 5/16 [12:24<28:34, 155.84s/it][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

done
Counting alleles GM 3L
Computing theta





 50%|█████     | 1/2 [01:12<01:12, 72.23s/it][A[A[A

done
Counting alleles GM 3R
Computing theta





100%|██████████| 2/2 [02:27<00:00, 73.63s/it][A[A[A


 38%|███▊      | 6/16 [14:51<25:32, 153.27s/it][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

done
Counting alleles GW 3L
Computing theta





 50%|█████     | 1/2 [01:21<01:21, 81.61s/it][A[A[A

done
Counting alleles GW 3R
Computing theta





100%|██████████| 2/2 [02:40<00:00, 80.43s/it][A[A[A


 44%|████▍     | 7/16 [17:32<23:19, 155.55s/it][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

done
Counting alleles KE 3L
Computing theta





 50%|█████     | 1/2 [01:00<01:00, 60.36s/it][A[A[A

done
Counting alleles KE 3R
Computing theta





100%|██████████| 2/2 [02:02<00:00, 61.25s/it][A[A[A


 50%|█████     | 8/16 [19:35<19:25, 145.63s/it][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

done
Counting alleles CMgam 3L
Computing theta





 50%|█████     | 1/2 [02:23<02:23, 143.25s/it][A[A[A

done
Counting alleles CMgam 3R
Computing theta





100%|██████████| 2/2 [04:51<00:00, 145.71s/it][A[A[A


 56%|█████▋    | 9/16 [24:26<22:05, 189.37s/it][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

done
Counting alleles FRgam 3L
Computing theta





 50%|█████     | 1/2 [01:00<01:00, 60.65s/it][A[A[A

done
Counting alleles FRgam 3R
Computing theta





100%|██████████| 2/2 [02:05<00:00, 62.83s/it][A[A[A


 62%|██████▎   | 10/16 [26:32<17:01, 170.26s/it][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

done
Counting alleles GQgam 3L
Computing theta





 50%|█████     | 1/2 [00:58<00:58, 58.45s/it][A[A[A

done
Counting alleles GQgam 3R
Computing theta





100%|██████████| 2/2 [01:52<00:00, 56.50s/it][A[A[A


 69%|██████▉   | 11/16 [28:25<12:45, 153.09s/it][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

done
Counting alleles AOcol 3L
Computing theta





 50%|█████     | 1/2 [01:20<01:20, 80.95s/it][A[A[A

done
Counting alleles AOcol 3R
Computing theta





100%|██████████| 2/2 [02:44<00:00, 82.41s/it][A[A[A


 75%|███████▌  | 12/16 [31:10<10:26, 156.61s/it][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

done
Counting alleles GAgam 3L
Computing theta





 50%|█████     | 1/2 [01:29<01:29, 89.18s/it][A[A[A

done
Counting alleles GAgam 3R
Computing theta





100%|██████████| 2/2 [02:57<00:00, 88.83s/it][A[A[A


 81%|████████▏ | 13/16 [34:07<08:08, 162.92s/it][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

done
Counting alleles GNgam 3L
Computing theta





 50%|█████     | 1/2 [01:18<01:18, 78.31s/it][A[A[A

done
Counting alleles GNgam 3R
Computing theta





100%|██████████| 2/2 [02:42<00:00, 81.03s/it][A[A[A


 88%|████████▊ | 14/16 [36:49<05:25, 162.67s/it][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

done
Counting alleles GNcol 3L
Computing theta





 50%|█████     | 1/2 [01:05<01:05, 65.76s/it][A[A[A

done
Counting alleles GNcol 3R
Computing theta





100%|██████████| 2/2 [02:12<00:00, 66.18s/it][A[A[A


 94%|█████████▍| 15/16 [39:02<02:33, 153.58s/it][A[A


  0%|          | 0/2 [00:00<?, ?it/s][A[A[A

done
Counting alleles CIcol 3L
Computing theta





 50%|█████     | 1/2 [01:36<01:36, 96.00s/it][A[A[A

done
Counting alleles CIcol 3R
Computing theta





100%|██████████| 2/2 [03:08<00:00, 94.13s/it][A[A[A


100%|██████████| 16/16 [42:10<00:00, 158.16s/it][A[A

done





In [76]:
Ne_theta = pd.DataFrame.from_dict(Ne_Ag).T
Ne_theta.round()

Unnamed: 0,3L,3R
GHcol,952726.0,952726.0
GHgam,816605.0,816605.0
BFgam,1397081.0,1397081.0
BFcol,1242037.0,1242037.0
UGgam,1321118.0,1321118.0
GM,945927.0,945927.0
GW,1259574.0,1259574.0
KE,237080.0,237080.0
CMgam,1767792.0,1767792.0
FRgam,321054.0,321054.0


### References