##### Table 2 and Table S9: linkage disequilibrium between the focal SNPs within the Colombia and USA cohorts, respectively

In [1]:
from collections import defaultdict
import dask.array as da
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform

import allel

import ingenos

In [2]:
base_path = "/overflow/dschridelab/users/rrlove/aedes/"
md_path = f"{base_path}metadata/"
results_path = f"{base_path}results/"
vcf_path = "/proj/dschridelab/rrlove/aedes/vcf/"
zarr_path = "/proj/dschridelab/rrlove/aedes/vcf/filtered_110122/"

In [3]:
chroms = ["AaegL5_1", "AaegL5_2", "AaegL5_3"]
countries = ["Brazil", "Colombia", "USA", "Kenya", "Senegal", "Gabon"]

##### read in the data and metadata

In [4]:
def return_data(zarr_path, data_type, data_name):
    
    fetch_str = f"{data_type}/{data_name}" 
    
    return da.from_zarr(zarr_path, component=fetch_str)

In [5]:
chroms_dict = defaultdict(dict)

for chrom in ["AaegL5_3"]:
    
    temp_in_path = zarr_path + chrom

    chroms_dict[chrom]["pos"] = return_data(temp_in_path, "variants", "POS")
    chroms_dict[chrom]["chrom"] = return_data(temp_in_path, "variants", "CHROM")
    chroms_dict[chrom]["ref"] = return_data(temp_in_path, "variants", "REF")
    chroms_dict[chrom]["alt"] = return_data(temp_in_path, "variants", "ALT")
    chroms_dict[chrom]["ac"] = return_data(temp_in_path, "variants", "AC")
    chroms_dict[chrom]["qd"] = return_data(temp_in_path, "variants", "QD")
    chroms_dict[chrom]["mq"] = return_data(temp_in_path, "variants", "MQ")
    chroms_dict[chrom]["fs"] = return_data(temp_in_path, "variants", "FS")
    chroms_dict[chrom]["mqrs"] = return_data(temp_in_path, "variants", "MQRankSum")
    chroms_dict[chrom]["rprs"] = return_data(temp_in_path, "variants", "ReadPosRankSum")
    chroms_dict[chrom]["sor"] = return_data(temp_in_path, "variants", "SOR")
    chroms_dict[chrom]["is_snp"] = return_data(temp_in_path, "variants", "is_snp")
    chroms_dict[chrom]["numalt"] = return_data(temp_in_path, "variants", "numalt")
    chroms_dict[chrom]["filter_pass"] = return_data(temp_in_path, "variants", "FILTER_PASS")

    chroms_dict[chrom]["gt"] = return_data(temp_in_path, "calldata", "GT")
    chroms_dict[chrom]["gq"] = return_data(temp_in_path, "calldata", "GQ")
    chroms_dict[chrom]["ad"] = return_data(temp_in_path, "calldata", "AD")
    chroms_dict[chrom]["pl"] = return_data(temp_in_path, "calldata", "PL")
    chroms_dict[chrom]["dp"] = return_data(temp_in_path, "calldata", "DP")
    
    print(chrom, chroms_dict[chrom]["gt"].shape)
    
samples = da.from_zarr(temp_in_path, component="samples")

AaegL5_3 (27660249, 131, 2)


In [6]:
md = pd.read_table(
    md_path + "whole_sample_sorted_country.031522.csv", 
    sep="\t",)

md["locality"] = md["location"].str.split(": ", expand=True)[1]

md.head()

Unnamed: 0,sample_id,sample_short,location,sex,batch,sample_id_cat,country,locality
0,FEMALE_1-F1_CGCATGAT-TCAGGCTT_S1,FEMALE_1,Colombia: Rio Claro,F,1,FEMALE_1-F1_CGCATGAT-TCAGGCTT_S1,Colombia,Rio Claro
1,FEMALE_10-F10_GTGCCATA-ACTAGGAG_S2,FEMALE_10,Colombia: Rio Claro,F,1,FEMALE_10-F10_GTGCCATA-ACTAGGAG_S2,Colombia,Rio Claro
2,FEMALE_11-F11_CGTTGCAA-CGCTCTAT_S3,FEMALE_11,Colombia: Rio Claro,F,1,FEMALE_11-F11_CGTTGCAA-CGCTCTAT_S3,Colombia,Rio Claro
3,FEMALE_12-F12_TGAAGACG-TGGCATGT_S4,FEMALE_12,Colombia: Rio Claro,F,1,FEMALE_12-F12_TGAAGACG-TGGCATGT_S4,Colombia,Rio Claro
4,FEMALE_14-F14_ACGTTCAG-GCACAACT_S6,FEMALE_14,Colombia: Rio Claro,F,1,FEMALE_14-F14_ACGTTCAG-GCACAACT_S6,Colombia,Rio Claro


In [7]:
np.sum(~(md["sample_id"] == samples.compute()))

0

##### remove closely-related specimens

In [8]:
to_drop = np.loadtxt(md_path + "close_kin_removed_new_dataset.txt",
                    dtype=str)

to_drop_bool = (~(md["sample_short"].isin(to_drop)).values)

np.sum(to_drop_bool)

123

In [9]:
md_filtered = md.loc[to_drop_bool]

md_filtered.shape

(123, 8)

##### set up by-country filters

In [10]:
country_bools = {}

for country in md_filtered["country"].unique():
    
    country_bools[country] = (md_filtered["country"] == country).values
    
country_bools.keys()

dict_keys(['Colombia', 'Kenya', 'Senegal', 'Gabon', 'Brazil', 'USA'])

##### define and identify the loci in question

In [11]:
focal_snps = [315939224, 315983763, 315999297, 316014588, 316080722]

focal_snps

[315939224, 315983763, 315999297, 316014588, 316080722]

In [12]:
pos = chroms_dict["AaegL5_3"]["pos"]

##### also read in the bcftools genotypes for F1534C

In [13]:
vgsc_dict = allel.read_vcf(vcf_path + "Vssc/AaegL5_3_Vssc_mpileup.vcf.gz", 
                           fields = ["calldata/GT", "calldata/GQ", "variants/CHROM", 
                                     "variants/POS", "variants/REF", "variants/ALT", 
                                     "samples"])



##### reorder the metadata to match the Vgsc vcf

In [14]:
md_vcf = md.copy()

md_vcf["sample_id_cat"] = md_vcf["sample_id"].astype("category")

md_vcf["sample_id_cat"].cat.set_categories(pd.Series(vgsc_dict["samples"]), 
                                   inplace=True)

md_vcf.sort_values("sample_id_cat", inplace=True)

md_vcf.head()

Unnamed: 0,sample_id,sample_short,location,sex,batch,sample_id_cat,country,locality
96,SRR11006847,SRR11006847,Brazil: Santarem,,PRJNA602495,SRR11006847,Brazil,Santarem
82,SRR11006830,SRR11006830,Gabon: Franceville,,PRJNA602495,SRR11006830,Gabon,Franceville
35,SRR11006666,SRR11006666,Kenya: KayaBomu,,PRJNA602495,SRR11006666,Kenya,KayaBomu
76,SRR11006824,SRR11006824,Gabon: Franceville,,PRJNA602495,SRR11006824,Gabon,Franceville
80,SRR11006828,SRR11006828,Gabon: Franceville,,PRJNA602495,SRR11006828,Gabon,Franceville


In [15]:
np.sum(~(md_vcf["sample_id"] == vgsc_dict["samples"]))

0

In [16]:
vgsc_file_order = []

for name in md["sample_id"].values:
    
    vgsc_file_order.append(np.where(vgsc_dict["samples"] == name)[0][0])

##### concatenate genotypes at the focal loci

In [17]:
pos = chroms_dict[chrom]["pos"].compute()
gt = allel.GenotypeArray(chroms_dict[chrom]["gt"])
gq = chroms_dict[chrom]["gq"]

gt.mask = (gq < 20).compute()

unflt_pos = vgsc_dict["variants/POS"]
unflt_gt = allel.GenotypeArray(vgsc_dict["calldata/GT"][:, np.array(vgsc_file_order)])

In [18]:
focal_snps_list = []

for snp in focal_snps:
    
    if snp == 315939224:
        
        flt = (unflt_pos == snp)
        
        print(snp, unflt_pos[flt])
        
        focal_snps_list.append(unflt_gt.subset(sel0 = flt, sel1 = to_drop_bool))
        
    else:
        
        flt = (pos == snp)
        
        print(snp, pos[flt])
        
        focal_snps_list.append(gt.subset(sel0 = flt, sel1 = to_drop_bool))

315939224 [315939224]
315983763 [315983763]
315999297 [315999297]
316014588 [316014588]
316080722 [316080722]


In [19]:
focal_gts = allel.GenotypeArray(np.concatenate(focal_snps_list))

focal_gts.shape

(5, 123, 2)

In [20]:
for country, country_flt in country_bools.items():
    
    country_gt = focal_gts.subset(sel1 = country_flt)
        
    ld = allel.rogers_huff_r(country_gt.to_n_alt(fill=-1))
    
    print(country, "\n", squareform(ld ** 2), "\n")

Colombia 
 [[0.              nan       nan       nan       nan]
 [      nan 0.        1.        0.8803087 1.       ]
 [      nan 1.        0.        0.8857143 1.       ]
 [      nan 0.8803087 0.8857143 0.        0.8857143]
 [      nan 1.        1.        0.8857143 0.       ]] 

Kenya 
 [[0.                nan 0.06355933        nan        nan]
 [       nan 0.                nan        nan        nan]
 [0.06355933        nan 0.                nan        nan]
 [       nan        nan        nan 0.                nan]
 [       nan        nan        nan        nan 0.        ]] 

Senegal 
 [[ 0. nan nan nan nan]
 [nan  0. nan nan nan]
 [nan nan  0. nan nan]
 [nan nan nan  0. nan]
 [nan nan nan nan  0.]] 

Gabon 
 [[ 0. nan nan nan nan]
 [nan  0. nan nan nan]
 [nan nan  0. nan nan]
 [nan nan nan  0. nan]
 [nan nan nan nan  0.]] 

Brazil 
 [[ 0. nan nan nan nan]
 [nan  0. nan nan nan]
 [nan nan  0. nan nan]
 [nan nan nan  0. nan]
 [nan nan nan nan  0.]] 

USA 
 [[0.         0.35814306 0.0341916

In [21]:
ld_by_country = {}

for country, country_flt in country_bools.items():
    
    country_gt = focal_gts.subset(sel1 = country_flt)
        
    ld = allel.rogers_huff_r(country_gt.to_n_alt(fill=-1))
    
    ld_by_country[country] = pd.DataFrame(squareform(ld ** 2), index=focal_snps, 
                                          columns=focal_snps)

In [22]:
ld_by_country["Colombia"]

Unnamed: 0,315939224,315983763,315999297,316014588,316080722
315939224,0.0,,,,
315983763,,0.0,1.0,0.880309,1.0
315999297,,1.0,0.0,0.885714,1.0
316014588,,0.880309,0.885714,0.0,0.885714
316080722,,1.0,1.0,0.885714,0.0


In [23]:
ld_by_country["Brazil"]

Unnamed: 0,315939224,315983763,315999297,316014588,316080722
315939224,0.0,,,,
315983763,,0.0,,,
315999297,,,0.0,,
316014588,,,,0.0,
316080722,,,,,0.0


In [24]:
ld_by_country["USA"]

Unnamed: 0,315939224,315983763,315999297,316014588,316080722
315939224,0.0,0.358143,0.034192,0.218757,0.01286
315983763,0.358143,0.0,0.414925,0.568166,0.368465
315999297,0.034192,0.414925,0.0,0.29103,0.390857
316014588,0.218757,0.568166,0.29103,0.0,0.7398
316080722,0.01286,0.368465,0.390857,0.7398,0.0
