In [1]:
import allel
from collections import namedtuple
import datetime
import h5py
import ingenos
import itertools
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from matplotlib import collections as mc
import numpy as np
import pandas as pd
import re
import seaborn as sns
from sklearn import model_selection
%matplotlib inline

##### set base directory

In [2]:
base = "/afs/crc.nd.edu/group/BesanskyNGS/data05/comp_karyo"

##### read in the top SNPs.

In [3]:
a_top = pd.read_csv(base + "/data/results/2La/comp/predictive_SNPs_train_set_0995_110918.tsv",
               sep = "\t", header=None)

j_top = pd.read_csv(base + "/data/results/2Rj/comp/predictive_SNPs_train_set_08_110918.tsv",
               sep = "\t", header=None)

b_top = pd.read_csv(base + "/data/results/2Rb/comp/predictive_SNPs_train_set_08_110918.tsv",
               sep = "\t", header=None)

c_col_top = pd.read_csv(base + "/data/results/2Rc/comp/col_predictive_SNPs_train_set_08_031919.tsv", 
                      sep="\t", header=None)

c_gam_top = pd.read_csv(base + "/data/results/2Rc/comp/gam_ss_predictive_SNPs_train_set_08_031919.tsv", 
                      sep="\t", header=None)

u_top = pd.read_csv(base + "/data/results/2Ru/comp/predictive_SNPs_train_set_08_110918.tsv",
               sep = "\t", header=None)

d_top = pd.read_csv(base + "/data/results/2Rd/comp/predictive_SNPs_train_set_08_052619.tsv",
               sep = "\t", header=None)

##### read in and prepare data from Main et al.

##### karyotypes taken from Main et al. 2015 supplemental file Molecular Ecology 2015 Main.xlsx, tab "Fig1_genotype_data"; VCF from Dryad, DOI: https://doi.org/10.5061/dryad.f3dn2

##### read in data, filter, and mask low-quality genotypes

In [4]:
Main_2L = allel.read_vcf(
    '/afs/crc.nd.edu/group/BesanskyNGS/data05/comp_karyo/data/Agam_normfilt_vcfs.vcf.gz',
                   fields = ['*'], region = "2L", types={'calldata/GQ': 'f4'})

Main_2R = allel.read_vcf(
    '/afs/crc.nd.edu/group/BesanskyNGS/data05/comp_karyo/data/Agam_normfilt_vcfs.vcf.gz',
                   fields = ['*'], region = "2R", types={'calldata/GQ': 'f4'})

##### read in and prepare metadata

In [5]:
Main_good_names = ["02SEL85","04SEL02","04SEL14","04SEL021","04SEL18","04SEL84","04SEL91",
                   "010sel134","O10SEL160",
               "2012SEL002","2012SEL003","2012SEL006","2012SEL009","2012sel012",
                   "2012SEL013","2012sel029","2012sel063"]

Main_sample_bool = [sample in Main_good_names for sample in Main_2R["samples"]]

Main_a = [2,2,1,1,2,2,2,2,2,1,2,2,2,2,2,2,2]
Main_b = [2,2,2,1,2,0,0,0,1,2,0,2,2,0,2,1,1]
Main_c = [2,2,0,1,2,0,0,0,1,0,0,2,2,0,2,1,1]
Main_d = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
Main_j = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
Main_u = [0,0,0,0,0,0,2,2,1,0,0,0,0,1,0,0,0]

Main_md = pd.DataFrame({"sample_ID" : pd.Series(Main_2R["samples"][Main_sample_bool]),
                       "2La" : pd.Series(Main_a),
                       "2Rb" : pd.Series(Main_b),
                       "2Rc" : pd.Series(Main_c),
                       "2Rd" : pd.Series(Main_d),
                       "2Rj" : pd.Series(Main_j),
                       "2Ru" : pd.Series(Main_u)})

Main_md = Main_md[["sample_ID","2La","2Rb","2Rc","2Rd","2Rj","2Ru"]]


In [6]:
Main_2R_gt = allel.GenotypeArray(Main_2R["calldata/GT"]).subset(sel1 = Main_sample_bool)

Main_2R_gq = Main_2R["calldata/GQ"][:,Main_sample_bool]

Main_2R_gt.mask = Main_2R_gq < 20

Main_2L_gt = allel.GenotypeArray(Main_2L["calldata/GT"]).subset(sel1 = Main_sample_bool)

Main_2L_gq = Main_2L["calldata/GQ"][:,Main_sample_bool]

Main_2L_gt.mask = Main_2L_gq < 20

  """
  # This is added back by InteractiveShellApp.init_path()


In [7]:
Inversion = namedtuple('Inversion',['SNPs','metadata','genotypes','inv_title'])

In [8]:
Main_inv_dict = {"2La" : Inversion(SNPs = a_top.values, metadata = Main_md, 
                                   genotypes = Main_2L_gt, inv_title = "2La"),
                 "2Rj" : Inversion(SNPs = j_top.values, metadata = Main_md, 
                             genotypes = Main_2R_gt, inv_title = "2Rj"),
           "2Rb" : Inversion(SNPs = b_top.values, metadata = Main_md, 
                             genotypes = Main_2R_gt, inv_title = "2Rb"),
        "2Rc_col" : Inversion(SNPs = col_top.values, metadata = Main_md, 
                               genotypes = Main_2R_gt, inv_title = "2Rc"),
           "2Rc_gam" : Inversion(SNPs = gam_top.values, metadata = Main_md, 
                                  genotypes = Main_2R_gt, inv_title = "2Rc"),
           "2Rd" : Inversion(SNPs = d_top.values, metadata = Main_md, 
                             genotypes = Main_2R_gt, inv_title = "2Rd"),
           "2Ru" : Inversion(SNPs = u_top.values, metadata = Main_md, 
                             genotypes = Main_2R_gt, inv_title = "2Ru")}

NameError: name 'col_top' is not defined

In [None]:
for inversion in Main_inv_dict.keys():
    
    ##set up objects
    SNPs = Main_inv_dict[inversion].SNPs
    md = Main_inv_dict[inversion].metadata
    gt = Main_inv_dict[inversion].genotypes
    col_name = Main_inv_dict[inversion].inv_title
    new_col_name = inversion + "_assigned"
    mean_name = inversion + "_means"
    sites_name = inversion + "_sites_called"
    match_name = inversion + "_sites_matching"
    match_proportion_name = inversion + "_pct_sites_matching"
    
    if inversion == "2La":
        
        pos = Main_2L["variants/POS"]
        
    else:
        
        pos = Main_2R["variants/POS"]
    
    ##identify sites found in the data
    site_indices = []
    
    for site in SNPs:
    
        where = np.where(pos == site)
        
        if len(where[0]) > 0:
                
            site_indices.append(where[0][0])
            
    print(inversion, "# targets: ", str(len(SNPs)), " # found: ", str(len(site_indices)))
    
    ##identify biallelic sites
    
    bi_bool = gt.subset(sel0 = site_indices).count_alleles().max_allele() <= 1
        
    alts = gt.subset(sel0 = site_indices).subset(sel0 = bi_bool).to_n_alt()
        
    is_called = gt.subset(sel0 = site_indices).subset(sel0 = bi_bool).is_called()
    
    av_gts = np.mean(np.ma.MaskedArray(
            alts, mask = ~is_called), axis=0).data
    
    match_dict = {0: None, 1: None, 2: None}
    
    for value in [0,1,2]:
    
        n_matches = np.sum(np.ma.MaskedArray(alts, mask = ~is_called) == value, axis=0)
        match_dict[value] = n_matches
    
    total_sites = np.sum(is_called, axis=0)
        
    karyos = []
    
    for alt in av_gts:
        
        if alt <= (2/3):
            
            karyos.append(0)
            
        elif alt > (2/3) and alt <= (4/3):
            
            karyos.append(1)
            
        else:
            
            karyos.append(2)
    
    match_list = []
    
    for index, karyo in enumerate(karyos):
        
        match_list.append(match_dict[karyo][index])
            
    md[new_col_name] = pd.Series(karyos)
    md[mean_name] = pd.Series(av_gts)
    md[sites_name] = pd.Series(total_sites)
    md[match_name] = pd.Series(match_list)
    md[match_proportion_name] = md[match_name] / md[sites_name]
    
    mismatches = np.sum(md[new_col_name] != md[col_name])
    
    print(inversion, " # mismatches: ", mismatches,"\n")
    print(av_gts)
    print(total_sites,"\n")

##### repeat with the Love et al. specimens. data from https://doi.org/10.5061/dryad.m2821

##### assemble the metadata. karyotypes come from previously unpublished data.

In [None]:
Love_good_names = ['KL0218','KL0220','KL0231','KL0333','KL0341','KL0370','KL0671','KL0899']

Love_a = [2,2,2,np.nan,np.nan,np.nan,np.nan,np.nan]
Love_b = [2,0,0,0,0,2,2,0]
Love_c = [2,2,2,2,2,2,2,2]
Love_d = [0,0,0,0,0,0,0,0]
Love_j = [2,2,2,2,2,2,2,2]
Love_u = [2,2,2,2,2,2,2,2]

Love_md = pd.DataFrame({"sample_ID" : pd.Series(Love_good_names),
                       "2La" : pd.Series(Love_a),
                       "2Rb" : pd.Series(Love_b),
                       "2Rc" : pd.Series(Love_c),
                       "2Rd" : pd.Series(Love_d),
                       "2Rj" : pd.Series(Love_j),
                       "2Ru" : pd.Series(Love_u)})

Love_md = Love_md[["sample_ID","2La","2Rb","2Rc","2Rd","2Rj","2Ru"]]

##### read in the data, filter it, and mask low-quality genotypes

In [None]:
Love_2R = allel.read_vcf(
    '/afs/crc.nd.edu/group/BesanskyNGS/data02/16G_bamako/all.Bamakoset.2.recode.vcf.gz',
                   fields = ['*'], region = "2R", types={'calldata/GQ': 'f4'})

Love_2L = allel.read_vcf(
    '/afs/crc.nd.edu/group/BesanskyNGS/data02/16G_bamako/all.Bamakoset.2.recode.vcf.gz',
                   fields = ['*'], region = "2L", types={'calldata/GQ': 'f4'})

Love_sample_bool = [sample in Love_good_names for sample in Love_2R["samples"]]

Love_2R_gt = allel.GenotypeArray(Love_2R["calldata/GT"]).subset(sel1 = Love_sample_bool)
Love_2R_gq = Love_2R["calldata/GQ"][:,Love_sample_bool]

Love_2R_gt.mask = Love_2R_gq < 20

Love_2L_gt = allel.GenotypeArray(Love_2L["calldata/GT"]).subset(sel1 = Love_sample_bool)

Love_2L_gq = Love_2L["calldata/GQ"][:,Love_sample_bool]

Love_2L_gt.mask = Love_2L_gq < 20

In [None]:
Love_inv_dict = {"2La" : Inversion(SNPs = a_top.values, metadata = Love_md, 
                                   genotypes = Love_2L_gt, inv_title = "2La"),
           "2Rb" : Inversion(SNPs = b_top.values, metadata = Love_md, 
                             genotypes = Love_2R_gt, inv_title = "2Rb"),
            "2Rc_col" : Inversion(SNPs = col_top.values, metadata = Love_md, 
                                   genotypes = Love_2R_gt, inv_title = "2Rc"),
           "2Rc_gam" : Inversion(SNPs = gam_top.values, metadata = Love_md, 
                                  genotypes = Love_2R_gt, inv_title = "2Rc"),
 "2Rd" : Inversion(SNPs = d_top.values, metadata = Love_md, 
                             genotypes = Love_2R_gt, inv_title = "2Rd"),
           "2Rj" : Inversion(SNPs = j_top.values, metadata = Love_md, 
                             genotypes = Love_2R_gt, inv_title = "2Rj"),
           "2Ru" : Inversion(SNPs = u_top.values, metadata = Love_md, 
                             genotypes = Love_2R_gt, inv_title = "2Ru")}

In [None]:
for inversion in Love_inv_dict.keys():
    
    ##set up objects
    SNPs = Love_inv_dict[inversion].SNPs
    md = Love_inv_dict[inversion].metadata
    gt = Love_inv_dict[inversion].genotypes
    col_name = Love_inv_dict[inversion].inv_title
    new_col_name = inversion + "_assigned"
    mean_name = inversion + "_means"
    sites_name = inversion + "_sites_called"
    match_name = inversion + "_sites_matching"
    match_proportion_name = inversion + "_pct_sites_matching"
    
    if inversion == "2La":
        
        pos = Love_2L["variants/POS"]
        
    else:
        
        pos = Love_2R["variants/POS"]
    
    ##identify sites found in the data
    site_indices = []
    
    for site in SNPs:
    
        where = np.where(pos == site)
        
        if len(where[0]) > 0:
                
            site_indices.append(where[0][0])
            
    print(inversion, "# targets: ", str(len(SNPs)), " # found: ", str(len(site_indices)))
    
    ##identify biallelic sites
    
    bi_bool = gt.subset(sel0 = site_indices).count_alleles().max_allele() <= 1
        
    alts = gt.subset(sel0 = site_indices).subset(sel0 = bi_bool).to_n_alt()
        
    is_called = gt.subset(sel0 = site_indices).subset(sel0 = bi_bool).is_called()
    
    av_gts = np.mean(np.ma.MaskedArray(
            alts, mask = ~is_called), axis=0).data
    
    match_dict = {0: None, 1: None, 2: None}
    
    for value in [0,1,2]:
    
        n_matches = np.sum(np.ma.MaskedArray(alts, mask = ~is_called) == value, axis=0)
        match_dict[value] = n_matches
    
    total_sites = np.sum(is_called, axis=0)
        
    karyos = []
    
    for alt in av_gts:
        
        if alt <= (2/3):
            
            karyos.append(0)
            
        elif alt > (2/3) and alt <= (4/3):
            
            karyos.append(1)
            
        else:
            
            karyos.append(2)
    
    match_list = []
    
    for index, karyo in enumerate(karyos):
        
        match_list.append(match_dict[karyo][index])
            
    md[new_col_name] = pd.Series(karyos)
    md[mean_name] = pd.Series(av_gts)
    md[sites_name] = pd.Series(total_sites)
    md[match_name] = pd.Series(match_list)
    md[match_proportion_name] = md[match_name] / md[sites_name]
    
    mismatches = np.sum(md[new_col_name] != md[col_name])
    
    print(inversion, " # mismatches: ", mismatches,"\n")
    print(av_gts)
    print(total_sites,"\n")

##### save the relevant metadata files, which now contain results.