##### Table S5: mean genome-wide FST between each cohort


In [1]:
from collections import defaultdict
import dask.array as da
import numpy as np
import pandas as pd
import re

import allel
import itertools

import ingenos

In [2]:
base_path = "/overflow/dschridelab/users/rrlove/aedes/"
md_path = f"{base_path}metadata/"
results_path = f"{base_path}results/"
zarr_path = "/proj/dschridelab/rrlove/aedes/vcf/filtered_110122/"

In [3]:
def return_data(zarr_path, data_type, data_name):
    
    fetch_str = f"{data_type}/{data_name}" 
    
    return da.from_zarr(zarr_path, component=fetch_str)

##### read in the genome-wide data

In [4]:
chroms_dict = defaultdict(dict)

chroms = ["AaegL5_1", "AaegL5_2", "AaegL5_3"]

for chrom in chroms:
    
    temp_in_path = zarr_path + chrom

    chroms_dict[chrom]["pos"] = return_data(temp_in_path, "variants", "POS")
    chroms_dict[chrom]["chrom"] = return_data(temp_in_path, "variants", "CHROM")
    chroms_dict[chrom]["ref"] = return_data(temp_in_path, "variants", "REF")
    chroms_dict[chrom]["alt"] = return_data(temp_in_path, "variants", "ALT")
    chroms_dict[chrom]["ac"] = return_data(temp_in_path, "variants", "AC")
    chroms_dict[chrom]["qd"] = return_data(temp_in_path, "variants", "QD")
    chroms_dict[chrom]["mq"] = return_data(temp_in_path, "variants", "MQ")
    chroms_dict[chrom]["fs"] = return_data(temp_in_path, "variants", "FS")
    chroms_dict[chrom]["mqrs"] = return_data(temp_in_path, "variants", "MQRankSum")
    chroms_dict[chrom]["rprs"] = return_data(temp_in_path, "variants", "ReadPosRankSum")
    chroms_dict[chrom]["sor"] = return_data(temp_in_path, "variants", "SOR")
    chroms_dict[chrom]["is_snp"] = return_data(temp_in_path, "variants", "is_snp")
    chroms_dict[chrom]["numalt"] = return_data(temp_in_path, "variants", "numalt")
    chroms_dict[chrom]["filter_pass"] = return_data(temp_in_path, "variants", "FILTER_PASS")

    chroms_dict[chrom]["gt"] = return_data(temp_in_path, "calldata", "GT")
    chroms_dict[chrom]["gq"] = return_data(temp_in_path, "calldata", "GQ")
    chroms_dict[chrom]["ad"] = return_data(temp_in_path, "calldata", "AD")
    chroms_dict[chrom]["pl"] = return_data(temp_in_path, "calldata", "PL")
    chroms_dict[chrom]["dp"] = return_data(temp_in_path, "calldata", "DP")
    
    print(chrom, chroms_dict[chrom]["gt"].shape)
    
samples = da.from_zarr(temp_in_path, component="samples")

AaegL5_1 (18443972, 131, 2)
AaegL5_2 (32930875, 131, 2)
AaegL5_3 (27660249, 131, 2)


In [5]:
np.sum([chroms_dict[chrom]["gt"].shape[0] for chrom in chroms])

79035096

##### read in the metadata

In [6]:
md = pd.read_table(
    md_path + "whole_sample_sorted_country.031522.csv", 
    sep="\t",)

md["locality"] = md["location"].str.split(": ", expand=True)[1]

md.head()

Unnamed: 0,sample_id,sample_short,location,sex,batch,sample_id_cat,country,locality
0,FEMALE_1-F1_CGCATGAT-TCAGGCTT_S1,FEMALE_1,Colombia: Rio Claro,F,1,FEMALE_1-F1_CGCATGAT-TCAGGCTT_S1,Colombia,Rio Claro
1,FEMALE_10-F10_GTGCCATA-ACTAGGAG_S2,FEMALE_10,Colombia: Rio Claro,F,1,FEMALE_10-F10_GTGCCATA-ACTAGGAG_S2,Colombia,Rio Claro
2,FEMALE_11-F11_CGTTGCAA-CGCTCTAT_S3,FEMALE_11,Colombia: Rio Claro,F,1,FEMALE_11-F11_CGTTGCAA-CGCTCTAT_S3,Colombia,Rio Claro
3,FEMALE_12-F12_TGAAGACG-TGGCATGT_S4,FEMALE_12,Colombia: Rio Claro,F,1,FEMALE_12-F12_TGAAGACG-TGGCATGT_S4,Colombia,Rio Claro
4,FEMALE_14-F14_ACGTTCAG-GCACAACT_S6,FEMALE_14,Colombia: Rio Claro,F,1,FEMALE_14-F14_ACGTTCAG-GCACAACT_S6,Colombia,Rio Claro


In [7]:
np.sum(~(md["sample_id"] == samples.compute()))

0

##### remove close kin

In [8]:
to_drop = np.loadtxt(md_path + "close_kin_removed_new_dataset.txt",
                    dtype=str)

to_drop_bool = (~(md["sample_short"].isin(to_drop)).values)

np.sum(to_drop_bool)

123

In [9]:
md_filtered = md.loc[to_drop_bool]

md_filtered.reset_index(drop=True, inplace=True)

md_filtered.shape

(123, 8)

##### set up filtering by country

In [10]:
country_bools = {}

countries = md_filtered["country"].unique()

for country in countries:
    
    country_bools[country] = (md_filtered["country"] == country).values
    
country_bools.keys()

dict_keys(['Colombia', 'Kenya', 'Senegal', 'Gabon', 'Brazil', 'USA'])

In [11]:
country_indices = {}

for country, flt in country_bools.items():
    
    country_indices[country] = md_filtered[flt].index
    
country

'USA'

##### read in the repetitive and non-uniquely-mapping regions

In [12]:
mask = pd.read_table(
    f"{base_path}refs/aegy/unified_mask/merged_rep_map_masks.110822.bed",
sep="\t", names=["chrom", "start", "end"])

mask.head()

Unnamed: 0,chrom,start,end
0,AaegL5_1,0,273
1,AaegL5_1,297,373
2,AaegL5_1,413,7449
3,AaegL5_1,7452,8789
4,AaegL5_1,9854,18833


##### calculate genome-wide FST in non-overlapping 500 kb windows

In [13]:
chrom_fst_dict = defaultdict(dict)

for chrom in chroms:
    
    pos = allel.SortedIndex(chroms_dict[chrom]["pos"])
    
    chrom_mask = mask[mask["chrom"] == chrom]
    
    mask_flt = (~(pos.locate_ranges(chrom_mask["start"] + 1, 
                                      chrom_mask["end"],
                                      strict=False)))

    gt = allel.GenotypeArray(chroms_dict[chrom]["gt"]).subset(sel0 = mask_flt,
                                        sel1 = to_drop_bool)
    
    gq_flt = chroms_dict[chrom]["gq"]\
    [:, to_drop_bool]\
    [mask_flt, :].compute()
    
    gt.mask = gq_flt < 20
    
    print(chrom, gt.shape)

    for country1, country2 in itertools.combinations(countries, 2):
        
        fst = allel.moving_weir_cockerham_fst(gt, 
                                              [country_indices[country1], 
                                               country_indices[country2]],
                                             size=500000)
    
        chrom_fst_dict[chrom][(country1, country2)] = fst
        
        print(country1, country2)

    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  gq_flt = chroms_dict[chrom]["gq"]\


AaegL5_1 (8773166, 123, 2)


  p = ac / an[:, np.newaxis, :]
  a = ((n_bar / n_C) *


Colombia Kenya
Colombia Senegal
Colombia Gabon
Colombia Brazil
Colombia USA
Kenya Senegal
Kenya Gabon
Kenya Brazil
Kenya USA
Senegal Gabon
Senegal Brazil
Senegal USA
Gabon Brazil
Gabon USA
Brazil USA


    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  gq_flt = chroms_dict[chrom]["gq"]\


AaegL5_2 (16681224, 123, 2)


  p = ac / an[:, np.newaxis, :]
  a = ((n_bar / n_C) *


Colombia Kenya
Colombia Senegal
Colombia Gabon
Colombia Brazil
Colombia USA
Kenya Senegal
Kenya Gabon
Kenya Brazil
Kenya USA
Senegal Gabon
Senegal Brazil
Senegal USA
Gabon Brazil
Gabon USA
Brazil USA


    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  gq_flt = chroms_dict[chrom]["gq"]\


AaegL5_3 (13548426, 123, 2)


  p = ac / an[:, np.newaxis, :]
  a = ((n_bar / n_C) *


Colombia Kenya
Colombia Senegal
Colombia Gabon
Colombia Brazil
Colombia USA
Kenya Senegal
Kenya Gabon
Kenya Brazil
Kenya USA
Senegal Gabon
Senegal Brazil
Senegal USA
Gabon Brazil
Gabon USA
Brazil USA


##### for each pair, calculate the mean and median per-chrom and overall

In [14]:
for country1, country2 in itertools.combinations(countries, 2):
    
    fst_list = []
    
    print(country1, country2, "\n")
    
    for chrom in chroms:
        
        fst = chrom_fst_dict[chrom][(country1, country2)]
        
        print(chrom, "mean fst: ", np.mean(fst))
        print(chrom, "median fst: ", np.median(fst), "\n")
        
        fst_list.extend(fst)
        
    print("whole genome mean fst: ", np.mean(fst_list))
    print("whole genome median fst: ", np.median(fst_list), "\n")

Colombia Kenya 

AaegL5_1 mean fst:  0.22952877845495298
AaegL5_1 median fst:  0.2407208024816144 

AaegL5_2 mean fst:  0.24331682891775763
AaegL5_2 median fst:  0.24203128734230303 

AaegL5_3 mean fst:  0.21940273538967237
AaegL5_3 median fst:  0.2248747284882983 

whole genome mean fst:  0.23188725251352416
whole genome median fst:  0.2324767251308455 

Colombia Senegal 

AaegL5_1 mean fst:  0.16234496143530594
AaegL5_1 median fst:  0.15137258594802824 

AaegL5_2 mean fst:  0.15933178369244988
AaegL5_2 median fst:  0.1621143138091599 

AaegL5_3 mean fst:  0.1628742628223907
AaegL5_3 median fst:  0.15999797833983223 

whole genome mean fst:  0.1612391987331896
whole genome median fst:  0.15999797833983223 

Colombia Gabon 

AaegL5_1 mean fst:  0.26487656762329387
AaegL5_1 median fst:  0.23582159719056053 

AaegL5_2 mean fst:  0.28573763992113654
AaegL5_2 median fst:  0.28612044330536845 

AaegL5_3 mean fst:  0.27153455968006185
AaegL5_3 median fst:  0.2773972356607906 

whole genome m