In [None]:
import allel
from collections import namedtuple
import datetime
import h5py
import ingenos
import itertools
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from matplotlib import collections as mc
import numpy as np
import pandas as pd
import re
import seaborn as sns
from sklearn import model_selection
%matplotlib inline

##### set base directory

In [3]:
# Root directory of the comp_karyo project: the metadata inputs and the result
# outputs used below are addressed relative to this path.
base = "/afs/crc.nd.edu/group/BesanskyNGS/data05/comp_karyo"

##### read in data for 2R

In [None]:
# Load the merged 2R call set through the ingenos helper. The two returned objects
# are used below as the variant table (v_2R) and the genotype array (g_2R).
v_2R, g_2R = ingenos.import_data(
    "/afs/crc.nd.edu/group/BesanskyNGS2/inversion_genotyping/merged_p2_and_VObs_2R.h5", "2R")

##### read in data for 2L

In [None]:
# Location of the merged 2L call set and the name of the group holding that arm.
path_2L = "/afs/crc.nd.edu/group/BesanskyNGS2/inversion_genotyping/merged_p2_and_VObs_2L.h5"
chrom_2L = "2L"

# The file handle is left open on purpose: the chunked containers below (and the
# GQ lookup later in the notebook) read from it lazily.
callset_2L = h5py.File(path_2L, mode='r')[chrom_2L]

# Variant table limited to the listed fields, indexed by position.
v_2L = allel.VariantChunkedTable(
    callset_2L['variants'],
    names=['POS','REF','ALT','DP','MQ','QD','numalt'],
    index='POS',
)

# Genotype calls for the same arm.
g_2L = allel.GenotypeChunkedArray(callset_2L['calldata']['GT'])

##### read in metadata

In [1]:
# Per-specimen metadata, one table per chromosome arm. Despite the .csv extension
# both files are read as tab-separated.
md_2L = pd.read_csv(base + "/metadata/all_samples_2L_metadata_080318.csv", sep="\t")
md_2R = pd.read_csv(base + "/metadata/all_samples_2R_metadata_080318.csv", sep="\t")

NameError: name 'pd' is not defined

##### create filters to keep the correct partitions for each inversion

In [None]:
# Boolean specimen filters built from the 2R metadata.
country_2R = md_2R["country"]
species_2R = md_2R["species"]

# Specimens from none of the three excluded countries (NumPy boolean array).
non_outliers = (~country_2R.isin(["Kenya", "Gambia, The", "Guinea-Bissau"])).values

# Everything except Kenya (NumPy boolean array).
west = country_2R.ne("Kenya").values

# An. gambiae outside the three excluded countries and outside France
# (NumPy boolean array).
j_bool = (
    ~country_2R.isin(["Kenya", "Gambia, The", "Guinea-Bissau", "France"])
    & species_2R.eq("An. gambiae")
).values

# Species-specific filters over the non-outlier specimens, matched through their
# ox_code; these two stay pandas Series.
non_outlier_codes_2R = md_2R.loc[non_outliers, "ox_code"]

col_bool = md_2R["ox_code"].isin(non_outlier_codes_2R) & species_2R.eq("An. coluzzii")

gam_bool = md_2R["ox_code"].isin(non_outlier_codes_2R) & species_2R.eq("An. gambiae")

## IDing tag SNPs

##### please note: while the code for all inversions is included here, each section takes a long time to run. I recommend running each inversion separately, and definitely saving at intermediate steps, such as after all ten bootstrap iterations are complete, before averaging.

##### to identify candidate tag SNPs, the specimens are split into a training and a validation set. the code below includes options for using the sets used in this paper, or for generating one's own

##### read in the existing identifications

In [None]:
# Each .npy file stores one dict-like Python object inside a 0-d object array, so
# it has to be unpickled (allow_pickle=True) and unwrapped with .flat[0].
# Without allow_pickle=True, NumPy >= 1.16.3 refuses to load object arrays.
# NOTE: unpickling executes arbitrary code; only load these files from a trusted
# location.
splits = np.load(base + "/metadata/comp_karyo_splits/splits.npy",
                 allow_pickle=True).flat[0]

splits_d = np.load(base + "/metadata/comp_karyo_splits/2Rdj_splits.npy",
                   allow_pickle=True).flat[0]

# The 2Rd train/test assignment comes from the second file.
splits["2Rd"] = splits_d["2Rd"]

##### ALTERNATE OPTION: generate one's own splits

##### subset to specimens that could be called for PCA

In [None]:
def _pca_called_2R(column):
    """Return the rows of md_2R whose entry in `column` is not the string "None"."""
    return md_2R.loc[md_2R[column] != "None", :]


# 2La uses every specimen in the 2L metadata; no "None" filter is applied here.
md_2La = pd.DataFrame(md_2L)

# One metadata subset per 2R inversion, keeping only specimens with a PCA call.
md_2Rj = _pca_called_2R("new_PCA_2Rj")
md_2Rb = _pca_called_2R("new_PCA_2Rb")
md_2Rc = _pca_called_2R("new_PCA_2Rc")
md_2Rd = _pca_called_2R("new_PCA_2Rd")
md_2Ru = _pca_called_2R("new_PCA_2Ru")

In [None]:
# One independent, initially empty dict per inversion; filled with "train"/"test".
own_splits = {inversion: {} for inversion in ("2Rb", "2Rc", "2La", "2Rd", "2Ru", "2Rj")}

In [None]:
# Stratified 75/25 train/test split of the specimen codes for every inversion,
# stratified on that inversion's PCA karyotype column. The inversions are
# processed in the order 2La, 2Rj, 2Rb, 2Rc, 2Rd, 2Ru.
for inversion, md_subset, label_column in (
        ("2La", md_2La, "new_PCA_2La"),
        ("2Rj", md_2Rj, "new_PCA_2Rj"),
        ("2Rb", md_2Rb, "new_PCA_2Rb"),
        ("2Rc", md_2Rc, "new_PCA_2Rc"),
        ("2Rd", md_2Rd, "new_PCA_2Rd"),
        ("2Ru", md_2Ru, "new_PCA_2Ru")):

    own_splits[inversion]["train"], own_splits[inversion]["test"] = \
        model_selection.train_test_split(
            md_subset["ox_code"].values,
            train_size=0.75,
            test_size=0.25,
            stratify=md_subset[label_column].values)

##### if generating one's own splits, then subsequent references to the "splits" object should be replaced with "own_splits." an easy way to do this would be to simply not run the cell in which the "splits" object is created, and rename the "own_splits" dictionary, at creation, as "splits"

##### mask low-quality genotypes. We don't do this for PCA, because the results depend on thousands of SNPs. When identifying tag SNPs, however, individual SNPs are the focus and we need to exclude those of low quality.

In [None]:
# Open the merged 2R HDF5 file again to reach the per-call genotype quality (GQ).
merged_2R = h5py.File(
    "/afs/crc.nd.edu/group/BesanskyNGS2/inversion_genotyping/merged_p2_and_VObs_2R.h5", 
    mode="r")

# [:] reads the whole GQ dataset into memory.
gq_2R = merged_2R["2R"]['calldata']['GQ'][:]

# Mask every 2R genotype call whose GQ is below 20.
g_2R.mask = gq_2R < 20

In [None]:
# Per-call genotype quality for 2L, read fully into memory from the call set
# opened earlier in the notebook.
gq_2L = callset_2L['calldata']['GQ'][:]

# Mask every 2L genotype call whose GQ is below 20.
g_2L.mask = gq_2L < 20

##### 2La

In [None]:
# Ask ingenos for the site-filter expression of 2La (zero buffer, whole inversion).
# NOTE(review): presumably a POS range expression derived from
# ingenos.inversionDict — confirm in ingenos.
sites_2La = ingenos.construct_filter_expression("2La", ingenos.inversionDict,
                                                buffer=0, whole_inversion=True)

# Evaluate the expression against the 2L variant table; the result is passed below
# as the sites_bool selector.
filter_2La = v_2L.eval(sites_2La)

# Cell output: the sum of the filter (number of selected sites if it is boolean).
np.sum(filter_2La)

In [None]:
# Restrict both the genotypes and the metadata to the 2La training specimens.
in_train_2La = md_2L["ox_code"].isin(splits["2La"]["train"])

g_2La = g_2L.subset(sel1=in_train_2La.values)

md_2La = md_2L.loc[in_train_2La, :]

In [None]:
# Ten stratified 75/25 resamplings of the 2La training specimens. Each round scores
# SNP concordance on the 75% part, keeps the best-scoring sites, and counts how
# many specimens of the 25% part those sites would mis-assign.
# a_dict[i] = (train codes, test codes, concordance table, top positions, mismatches)
# NOTE(review): train_test_split has no random_state, so partitions differ per run.
# NOTE(review): v_2L[:] and the POS lookup are re-evaluated on every iteration.
a_dict = {}

for i in range(10):
    print(i)

    train, test = model_selection.train_test_split(
        md_2La["ox_code"].values,
        train_size=0.75,
        test_size=0.25,
        stratify=md_2La["new_PCA_2La"].values)

    train_bool = md_2La["ox_code"].isin(train).values
    test_bool = md_2La["ox_code"].isin(test).values

    # PCA karyotypes of the training part, cast to float.
    train_karyotypes = md_2La.loc[train_bool, "new_PCA_2La"].map(float).values

    a = ingenos.run_concordance_calculation(
        "2La", v_2L[:], g_2La, train_karyotypes,
        variance_threshold=0.05, sites_bool=filter_2La, samples_bool=train_bool)

    # Sites with all three "called_*" values above 0.9 and "min" above 0.995.
    well_called = (a[["called_0", "called_1", "called_2"]] > 0.9).all(axis=1)
    top = a.loc[well_called & (a["min"] > 0.995), "position"].values

    # Row index of each top position in the variant table (first match).
    site_indices = np.array([np.where(v_2L["POS"] == site)[0][0] for site in top])

    alts = g_2La.subset(sel0=site_indices, sel1=test_bool).to_n_alt()

    # Mean alt-allele count per specimen -> karyotype 0 (<= 0.66), 1 (<= 1.33) or 2.
    assigned = [0 if specimen <= 0.66 else (1 if specimen <= 1.33 else 2)
                for specimen in np.mean(alts, axis=0)]

    md_2La_test = md_2La.loc[test_bool, :].assign(assigned=pd.Series(assigned).values)

    mismatches = (md_2La_test["assigned"] != md_2La_test["new_PCA_2La"]).sum()

    a_dict[i] = (train, test, a, top, mismatches)

##### average the ten iterations

In [None]:
# Every position scored in any of the ten 2La iterations (duplicates included).
a_compiled = [position
              for iteration in range(10)
              for position in a_dict[iteration][2]["position"].values]

In [None]:
# Scaffold for the averaged 2La table: one row per distinct position, every other
# column initialised to NaN, columns in a fixed order.
a_average = pd.DataFrame({
    "position": sorted(set(a_compiled)),
    **dict.fromkeys(
        ("ref", "alt", "score_0", "score_1", "score_2", "overall_score",
         "called_0", "called_1", "called_2", "overall_called", "min"),
        np.nan),
}).loc[:, ["position", "ref", "alt", "score_0", "score_1", "score_2", "overall_score",
           "called_0", "called_1", "called_2", "overall_called", "min"]]

In [None]:
# Average the concordance statistics of every candidate 2La position over the
# bootstrap iterations in which it was scored, then add the averaged rows (with a
# "count" of contributing iterations) to a_average.
a_mean_rows = []

for pos in sorted(set(a_compiled)):

    rows = {}
    refs = []
    alts = []

    for i in range(10):

        row = a_dict[i][2][a_dict[i][2]["position"] == pos]

        if len(row) > 0:
            refs.append(row["ref"].values[0])
            alts.append(row["alt"].values[0])
            # ref/alt are strings and cannot be averaged; they are re-attached below.
            rows[i] = row.drop(["ref","alt"], axis=1).reset_index(drop=True)

    # Fix: the original `not A and B` parsed as `(not A) and B`, so a disagreement
    # in ALT alone was never reported. Both alleles must agree across iterations.
    if not (len(set(refs)) == 1 and len(set(alts)) == 1):
        raise ValueError(pos, rows)

    ref = refs[0]
    alt = alts[0]

    # Built-in sum: np.sum over a generator is deprecated.
    mean_row = sum(rows.values()) / len(rows)

    mean_row["count"] = len(rows)
    mean_row["ref"] = ref
    mean_row["alt"] = alt

    assert mean_row["position"].values[0] == pos, row

    a_mean_rows.append(mean_row)

# One concatenation instead of DataFrame.append per row (append was removed in
# pandas 2.0 and is quadratic in a loop).
a_average = pd.concat([a_average] + a_mean_rows, sort=False)

##### 2Rj

In [None]:
# Ask ingenos for the site-filter expression of 2Rj (zero buffer, whole inversion).
# NOTE(review): presumably a POS range expression derived from
# ingenos.inversionDict — confirm in ingenos.
sites_2Rj = ingenos.construct_filter_expression("2Rj", ingenos.inversionDict,
                                                buffer=0, whole_inversion=True)

# Evaluate the expression against the 2R variant table; the result is passed below
# as the sites_bool selector.
filter_2Rj = v_2R.eval(sites_2Rj)

# Cell output: the sum of the filter (number of selected sites if it is boolean).
np.sum(filter_2Rj)

In [None]:
# Restrict both the genotypes and the metadata to the 2Rj training specimens.
in_train_2Rj = md_2R["ox_code"].isin(splits["2Rj"]["train"])

g_2Rj = g_2R.subset(sel1=in_train_2Rj.values)

md_2Rj = md_2R.loc[in_train_2Rj, :]

In [None]:
# Ten stratified 75/25 resamplings of the 2Rj training specimens. Each round scores
# SNP concordance on the 75% part, keeps the best-scoring sites, and counts how
# many specimens of the 25% part those sites would mis-assign.
# j_dict[i] = (train codes, test codes, concordance table, top positions, mismatches)
# NOTE(review): train_test_split has no random_state, so partitions differ per run.
# NOTE(review): v_2R[:] and the POS lookup are re-evaluated on every iteration.
j_dict = {}

for i in range(10):
    print(i)

    train, test = model_selection.train_test_split(
        md_2Rj["ox_code"].values,
        train_size=0.75,
        test_size=0.25,
        stratify=md_2Rj["new_PCA_2Rj"].values)

    train_bool = md_2Rj["ox_code"].isin(train).values
    test_bool = md_2Rj["ox_code"].isin(test).values

    # PCA karyotypes of the training part, cast to float.
    train_karyotypes = md_2Rj.loc[train_bool, "new_PCA_2Rj"].map(float).values

    j = ingenos.run_concordance_calculation(
        "2Rj", v_2R[:], g_2Rj, train_karyotypes,
        variance_threshold=0.05, sites_bool=filter_2Rj, samples_bool=train_bool)

    # Sites with all three "called_*" values above 0.9 and "min" above 0.8.
    well_called = (j[["called_0", "called_1", "called_2"]] > 0.9).all(axis=1)
    top = j.loc[well_called & (j["min"] > 0.8), "position"].values

    # Row index of each top position in the variant table (first match).
    site_indices = np.array([np.where(v_2R["POS"] == site)[0][0] for site in top])

    alts = g_2Rj.subset(sel0=site_indices, sel1=test_bool).to_n_alt()

    # Mean alt-allele count per specimen -> karyotype 0 (<= 0.66), 1 (<= 1.33) or 2.
    assigned = [0 if specimen <= 0.66 else (1 if specimen <= 1.33 else 2)
                for specimen in np.mean(alts, axis=0)]

    md_2Rj_test = md_2Rj.loc[test_bool, :].assign(assigned=pd.Series(assigned).values)

    # Assigned values are rendered as float strings before comparison with the
    # metadata column.
    assigned_labels = md_2Rj_test["assigned"].map(float).map(str)
    mismatches = (assigned_labels != md_2Rj_test["new_PCA_2Rj"]).sum()

    j_dict[i] = (train, test, j, top, mismatches)

In [None]:
# Every position scored in any of the ten 2Rj iterations (duplicates included).
j_compiled = [position
              for iteration in range(10)
              for position in j_dict[iteration][2]["position"].values]

In [None]:
# Scaffold for the averaged 2Rj table: one row per distinct position, every other
# column initialised to NaN, columns in a fixed order.
j_average = pd.DataFrame({
    "position": sorted(set(j_compiled)),
    **dict.fromkeys(
        ("ref", "alt", "score_0", "score_1", "score_2", "overall_score",
         "called_0", "called_1", "called_2", "overall_called", "min"),
        np.nan),
}).loc[:, ["position", "ref", "alt", "score_0", "score_1", "score_2", "overall_score",
           "called_0", "called_1", "called_2", "overall_called", "min"]]

In [None]:
# Average the concordance statistics of every candidate 2Rj position over the
# bootstrap iterations in which it was scored, then add the averaged rows (with a
# "count" of contributing iterations) to j_average.
j_mean_rows = []

for pos in sorted(set(j_compiled)):

    rows = {}
    refs = []
    alts = []

    for i in range(10):

        row = j_dict[i][2][j_dict[i][2]["position"] == pos]

        if len(row) > 0:
            refs.append(row["ref"].values[0])
            alts.append(row["alt"].values[0])
            # ref/alt are strings and cannot be averaged; they are re-attached below.
            rows[i] = row.drop(["ref","alt"], axis=1).reset_index(drop=True)

    # Fix: the original `not A and B` parsed as `(not A) and B`, so a disagreement
    # in ALT alone was never reported. Both alleles must agree across iterations.
    if not (len(set(refs)) == 1 and len(set(alts)) == 1):
        raise ValueError(pos, rows)

    ref = refs[0]
    alt = alts[0]

    # Built-in sum: np.sum over a generator is deprecated.
    mean_row = sum(rows.values()) / len(rows)

    mean_row["count"] = len(rows)
    mean_row["ref"] = ref
    mean_row["alt"] = alt

    assert mean_row["position"].values[0] == pos, row

    j_mean_rows.append(mean_row)

# One concatenation instead of DataFrame.append per row (append was removed in
# pandas 2.0 and is quadratic in a loop).
j_average = pd.concat([j_average] + j_mean_rows, sort=False)

##### 2Rb

In [None]:
# Ask ingenos for the site-filter expression of 2Rb (zero buffer, whole inversion).
# NOTE(review): presumably a POS range expression derived from
# ingenos.inversionDict — confirm in ingenos.
sites_2Rb = ingenos.construct_filter_expression("2Rb", ingenos.inversionDict,
                                                buffer=0, whole_inversion=True)

# Evaluate the expression against the 2R variant table; the result is passed below
# as the sites_bool selector.
filter_2Rb = v_2R.eval(sites_2Rb)

# Cell output: the sum of the filter (number of selected sites if it is boolean).
np.sum(filter_2Rb)

In [None]:
# Restrict both the genotypes and the metadata to the 2Rb training specimens.
in_train_2Rb = md_2R["ox_code"].isin(splits["2Rb"]["train"])

g_2Rb = g_2R.subset(sel1=in_train_2Rb.values)

md_2Rb = md_2R.loc[in_train_2Rb, :]

In [None]:
# Ten stratified 75/25 resamplings of the 2Rb training specimens. Each round scores
# SNP concordance on the 75% part, keeps the best-scoring sites, and counts how
# many specimens of the 25% part those sites would mis-assign.
# b_dict[i] = (train codes, test codes, concordance table, top positions, mismatches)
# NOTE(review): train_test_split has no random_state, so partitions differ per run.
# NOTE(review): v_2R[:] and the POS lookup are re-evaluated on every iteration.
b_dict = {}

for i in range(10):
    print(i)

    train, test = model_selection.train_test_split(
        md_2Rb["ox_code"].values,
        train_size=0.75,
        test_size=0.25,
        stratify=md_2Rb["new_PCA_2Rb"].values)

    train_bool = md_2Rb["ox_code"].isin(train).values
    test_bool = md_2Rb["ox_code"].isin(test).values

    # PCA karyotypes of the training part, cast to float.
    train_karyotypes = md_2Rb.loc[train_bool, "new_PCA_2Rb"].map(float).values

    b = ingenos.run_concordance_calculation(
        "2Rb", v_2R[:], g_2Rb, train_karyotypes,
        variance_threshold=0.05, sites_bool=filter_2Rb, samples_bool=train_bool)

    # Sites with all three "called_*" values above 0.9 and "min" above 0.8.
    well_called = (b[["called_0", "called_1", "called_2"]] > 0.9).all(axis=1)
    top = b.loc[well_called & (b["min"] > 0.8), "position"].values

    # Row index of each top position in the variant table (first match).
    site_indices = np.array([np.where(v_2R["POS"] == site)[0][0] for site in top])

    alts = g_2Rb.subset(sel0=site_indices, sel1=test_bool).to_n_alt()

    # Mean alt-allele count per specimen -> karyotype 0 (<= 0.66), 1 (<= 1.33) or 2.
    assigned = [0 if specimen <= 0.66 else (1 if specimen <= 1.33 else 2)
                for specimen in np.mean(alts, axis=0)]

    md_2Rb_test = md_2Rb.loc[test_bool, :].assign(assigned=pd.Series(assigned).values)

    # Assigned values are rendered as float strings before comparison with the
    # metadata column.
    assigned_labels = md_2Rb_test["assigned"].map(float).map(str)
    mismatches = (assigned_labels != md_2Rb_test["new_PCA_2Rb"]).sum()

    b_dict[i] = (train, test, b, top, mismatches)

In [None]:
# Every position scored in any of the ten 2Rb iterations (duplicates included).
b_compiled = [position
              for iteration in range(10)
              for position in b_dict[iteration][2]["position"].values]

In [None]:
# Scaffold for the averaged 2Rb table: one row per distinct position, every other
# column initialised to NaN, columns in a fixed order.
b_average = pd.DataFrame({
    "position": sorted(set(b_compiled)),
    **dict.fromkeys(
        ("ref", "alt", "score_0", "score_1", "score_2", "overall_score",
         "called_0", "called_1", "called_2", "overall_called", "min"),
        np.nan),
}).loc[:, ["position", "ref", "alt", "score_0", "score_1", "score_2", "overall_score",
           "called_0", "called_1", "called_2", "overall_called", "min"]]

In [None]:
# Average the concordance statistics of every candidate 2Rb position over the
# bootstrap iterations in which it was scored, then add the averaged rows (with a
# "count" of contributing iterations) to b_average.
b_mean_rows = []

for pos in sorted(set(b_compiled)):

    rows = {}
    refs = []
    alts = []

    for i in range(10):

        row = b_dict[i][2][b_dict[i][2]["position"] == pos]

        if len(row) > 0:
            refs.append(row["ref"].values[0])
            alts.append(row["alt"].values[0])
            # ref/alt are strings and cannot be averaged; they are re-attached below.
            rows[i] = row.drop(["ref","alt"], axis=1).reset_index(drop=True)

    # Fix: the original `not A and B` parsed as `(not A) and B`, so a disagreement
    # in ALT alone was never reported. Both alleles must agree across iterations.
    if not (len(set(refs)) == 1 and len(set(alts)) == 1):
        raise ValueError(pos, rows)

    ref = refs[0]
    alt = alts[0]

    # Built-in sum: np.sum over a generator is deprecated.
    mean_row = sum(rows.values()) / len(rows)

    mean_row["count"] = len(rows)
    mean_row["ref"] = ref
    mean_row["alt"] = alt

    assert mean_row["position"].values[0] == pos, row

    b_mean_rows.append(mean_row)

# One concatenation instead of DataFrame.append per row (append was removed in
# pandas 2.0 and is quadratic in a loop).
b_average = pd.concat([b_average] + b_mean_rows, sort=False)

##### 2Rc

##### we identified concordant SNPs separately in coluzzii and gambiae in 2Rc. in addition, we dropped specimens that carried 2Ru in gambiae, which were outliers on the PCA, from the training set.

In [None]:
# Ask ingenos for the site-filter expression of 2Rc (zero buffer, whole inversion).
# NOTE(review): presumably a POS range expression derived from
# ingenos.inversionDict — confirm in ingenos.
sites_2Rc = ingenos.construct_filter_expression("2Rc", ingenos.inversionDict,
                                                buffer=0, whole_inversion=True)

# Evaluate the expression against the 2R variant table; the result is passed below
# as the sites_bool selector.
filter_2Rc = v_2R.eval(sites_2Rc)

# Cell output: the sum of the filter (number of selected sites if it is boolean).
np.sum(filter_2Rc)

In [None]:
# Restrict both the genotypes and the metadata to the 2Rc training specimens.
in_train_2Rc = md_2R["ox_code"].isin(splits["2Rc"]["train"])

g_2Rc = g_2R.subset(sel1=in_train_2Rc.values)

md_2Rc = md_2R.loc[in_train_2Rc, :]

##### coluzzii

In [None]:
# An. coluzzii specimens within the 2Rc training set. Note that this rebinds the
# col_bool name defined earlier in the notebook.
col_bool = md_2Rc["species"].eq("An. coluzzii").values

col_md = md_2Rc.loc[col_bool, :]

col_c = g_2Rc.subset(sel1=col_bool)

In [None]:
# Ten stratified 75/25 resamplings of the An. coluzzii 2Rc training specimens. Each
# round scores SNP concordance on the 75% part, keeps the best-scoring sites, and
# counts how many specimens of the 25% part those sites would mis-assign.
# col_c_dict[i] = (train codes, test codes, concordance table, top positions,
#                  mismatches)
# NOTE(review): train_test_split has no random_state, so partitions differ per run.
# NOTE(review): v_2R[:] and the POS lookup are re-evaluated on every iteration.
col_c_dict = {}

for i in range(10):
    print(i)

    train, test = model_selection.train_test_split(
        col_md["ox_code"].values,
        train_size=0.75,
        test_size=0.25,
        stratify=col_md["new_PCA_2Rc"].values)

    train_bool = col_md["ox_code"].isin(train).values
    test_bool = col_md["ox_code"].isin(test).values

    # PCA karyotypes of the training part; cast with int in this section.
    train_karyotypes = col_md.loc[train_bool, "new_PCA_2Rc"].map(int).values

    col_c_conc = ingenos.run_concordance_calculation(
        "2Rc", v_2R[:], col_c, train_karyotypes,
        variance_threshold=0.05, sites_bool=filter_2Rc, samples_bool=train_bool)

    # Sites with all three "called_*" values above 0.9 and "min" above 0.8.
    well_called = (col_c_conc[["called_0", "called_1", "called_2"]] > 0.9).all(axis=1)
    top = col_c_conc.loc[well_called & (col_c_conc["min"] > 0.8), "position"].values

    # Row index of each top position in the variant table (first match).
    site_indices = np.array([np.where(v_2R["POS"] == site)[0][0] for site in top])

    alts = col_c.subset(sel0=site_indices, sel1=test_bool).to_n_alt()

    # Mean alt-allele count per specimen -> karyotype 0 (<= 0.66), 1 (<= 1.33) or 2.
    assigned = [0 if specimen <= 0.66 else (1 if specimen <= 1.33 else 2)
                for specimen in np.mean(alts, axis=0)]

    col_md_test = col_md.loc[test_bool, :].assign(assigned=pd.Series(assigned).values)

    # Assigned values are rendered as integer strings before comparison with the
    # metadata column.
    assigned_labels = col_md_test["assigned"].map(str)
    mismatches = (assigned_labels != col_md_test["new_PCA_2Rc"]).sum()

    col_c_dict[i] = (train, test, col_c_conc, top, mismatches)

In [None]:
# Every position scored in any of the ten coluzzii 2Rc iterations (duplicates
# included).
col_c_compiled = [position
                  for iteration in range(10)
                  for position in col_c_dict[iteration][2]["position"].values]

In [None]:
# Scaffold for the averaged coluzzii 2Rc table: one row per distinct position,
# every other column initialised to NaN, columns in a fixed order.
col_c_average = pd.DataFrame({
    "position": sorted(set(col_c_compiled)),
    **dict.fromkeys(
        ("ref", "alt", "score_0", "score_1", "score_2", "overall_score",
         "called_0", "called_1", "called_2", "overall_called", "min"),
        np.nan),
}).loc[:, ["position", "ref", "alt", "score_0", "score_1", "score_2", "overall_score",
           "called_0", "called_1", "called_2", "overall_called", "min"]]

In [None]:
# Average the concordance statistics of every candidate coluzzii 2Rc position over
# the bootstrap iterations in which it was scored, then add the averaged rows (with
# a "count" of contributing iterations) to col_c_average.
col_c_mean_rows = []

# Fix: the original iterated over `compiled`, a name this notebook never defines;
# the coluzzii 2Rc positions live in col_c_compiled.
for pos in sorted(set(col_c_compiled)):

    rows = {}
    refs = []
    alts = []

    for i in range(10):

        row = col_c_dict[i][2][col_c_dict[i][2]["position"] == pos]

        if len(row) > 0:
            refs.append(row["ref"].values[0])
            alts.append(row["alt"].values[0])
            # ref/alt are strings and cannot be averaged; they are re-attached below.
            rows[i] = row.drop(["ref","alt"], axis=1).reset_index(drop=True)

    # Fix: the original `not A and B` parsed as `(not A) and B`, so a disagreement
    # in ALT alone was never reported. Both alleles must agree across iterations.
    if not (len(set(refs)) == 1 and len(set(alts)) == 1):
        raise ValueError(pos, rows)

    ref = refs[0]
    alt = alts[0]

    # Built-in sum: np.sum over a generator is deprecated.
    mean_row = sum(rows.values()) / len(rows)

    mean_row["count"] = len(rows)
    mean_row["ref"] = ref
    mean_row["alt"] = alt

    assert mean_row["position"].values[0] == pos, row

    col_c_mean_rows.append(mean_row)

# One concatenation instead of DataFrame.append per row (append was removed in
# pandas 2.0 and is quadratic in a loop).
col_c_average = pd.concat([col_c_average] + col_c_mean_rows, sort=False)

##### gambiae

In [None]:
# An. gambiae specimens of the 2Rc training set, minus those whose new_PCA_2Rj is
# "2.0" and minus two individually excluded specimens.
# NOTE(review): the notebook text above says 2Ru carriers were dropped, but this
# filter tests new_PCA_2Rj — confirm which column is intended.
gam_trim_bool = (
    md_2Rc["species"].eq("An. gambiae")
    & md_2Rc["new_PCA_2Rj"].ne("2.0")
    & ~md_2Rc["ox_code"].isin(["AZ0267-C", "AV0043-C"])
).values

gam_md = md_2Rc.loc[gam_trim_bool, :]

gam_c = g_2Rc.subset(sel1=gam_trim_bool)

In [None]:
# Ten stratified 75/25 resamplings of the trimmed An. gambiae 2Rc training
# specimens. Each round scores SNP concordance on the 75% part, keeps the
# best-scoring sites, and counts how many specimens of the 25% part those sites
# would mis-assign.
# gam_c_dict[i] = (train codes, test codes, concordance table, top positions,
#                  mismatches)
# NOTE(review): train_test_split has no random_state, so partitions differ per run.
# NOTE(review): v_2R[:] and the POS lookup are re-evaluated on every iteration.
gam_c_dict = {}

for i in range(10):
    print(i)

    train, test = model_selection.train_test_split(
        gam_md["ox_code"].values,
        train_size=0.75,
        test_size=0.25,
        stratify=gam_md["new_PCA_2Rc"].values)

    train_bool = gam_md["ox_code"].isin(train).values
    test_bool = gam_md["ox_code"].isin(test).values

    # PCA karyotypes of the training part, cast to float.
    train_karyotypes = gam_md.loc[train_bool, "new_PCA_2Rc"].map(float).values

    gam_c_conc = ingenos.run_concordance_calculation(
        "2Rc", v_2R[:], gam_c, train_karyotypes,
        variance_threshold=0.05, sites_bool=filter_2Rc, samples_bool=train_bool)

    # Sites with all three "called_*" values above 0.9 and "min" above 0.8.
    well_called = (gam_c_conc[["called_0", "called_1", "called_2"]] > 0.9).all(axis=1)
    top = gam_c_conc.loc[well_called & (gam_c_conc["min"] > 0.8), "position"].values

    # Row index of each top position in the variant table (first match).
    site_indices = np.array([np.where(v_2R["POS"] == site)[0][0] for site in top])

    alts = gam_c.subset(sel0=site_indices, sel1=test_bool).to_n_alt()

    # Mean alt-allele count per specimen -> karyotype 0 (<= 0.66), 1 (<= 1.33) or 2.
    assigned = [0 if specimen <= 0.66 else (1 if specimen <= 1.33 else 2)
                for specimen in np.mean(alts, axis=0)]

    gam_md_test = gam_md.loc[test_bool, :].assign(assigned=pd.Series(assigned).values)

    # Assigned values are rendered as integer strings before comparison with the
    # metadata column.
    assigned_labels = gam_md_test["assigned"].map(str)
    mismatches = (assigned_labels != gam_md_test["new_PCA_2Rc"]).sum()

    gam_c_dict[i] = (train, test, gam_c_conc, top, mismatches)

In [None]:
# Every position scored in any of the ten gambiae 2Rc iterations (duplicates
# included).
gam_c_compiled = [position
                  for iteration in range(10)
                  for position in gam_c_dict[iteration][2]["position"].values]

In [None]:
# Scaffold for the averaged gambiae 2Rc table: one row per distinct position,
# every other column initialised to NaN, columns in a fixed order.
gam_c_average = pd.DataFrame({
    "position": sorted(set(gam_c_compiled)),
    **dict.fromkeys(
        ("ref", "alt", "score_0", "score_1", "score_2", "overall_score",
         "called_0", "called_1", "called_2", "overall_called", "min"),
        np.nan),
}).loc[:, ["position", "ref", "alt", "score_0", "score_1", "score_2", "overall_score",
           "called_0", "called_1", "called_2", "overall_called", "min"]]

In [None]:
# Average the concordance statistics of every candidate gambiae 2Rc position over
# the bootstrap iterations in which it was scored, then add the averaged rows (with
# a "count" of contributing iterations) to gam_c_average.
gam_c_mean_rows = []

for pos in sorted(set(gam_c_compiled)):

    rows = {}
    refs = []
    alts = []

    for i in range(10):

        row = gam_c_dict[i][2][gam_c_dict[i][2]["position"] == pos]

        if len(row) > 0:
            refs.append(row["ref"].values[0])
            alts.append(row["alt"].values[0])
            # ref/alt are strings and cannot be averaged; they are re-attached below.
            rows[i] = row.drop(["ref","alt"], axis=1).reset_index(drop=True)

    # Fix: the original `not A and B` parsed as `(not A) and B`, so a disagreement
    # in ALT alone was never reported. Both alleles must agree across iterations.
    if not (len(set(refs)) == 1 and len(set(alts)) == 1):
        raise ValueError(pos, rows)

    ref = refs[0]
    alt = alts[0]

    # Built-in sum: np.sum over a generator is deprecated.
    mean_row = sum(rows.values()) / len(rows)

    mean_row["count"] = len(rows)
    mean_row["ref"] = ref
    mean_row["alt"] = alt

    assert mean_row["position"].values[0] == pos, row

    gam_c_mean_rows.append(mean_row)

# One concatenation instead of DataFrame.append per row (append was removed in
# pandas 2.0 and is quadratic in a loop).
gam_c_average = pd.concat([gam_c_average] + gam_c_mean_rows, sort=False)

##### 2Rd

##### for identifying concordant SNPs in 2Rd, we use the entire inversion

In [None]:
# Unlike the other inversions, the 2Rd site filter is a hand-written expression:
# positions strictly between the two hard-coded coordinates.
sites_2Rd = '( (POS > 31495381) & (POS < 42375004) )'

# Evaluate the expression against the 2R variant table; the result is passed below
# as the sites_bool selector.
filter_2Rd = v_2R.eval(sites_2Rd)

# Cell output: the sum of the filter (number of selected sites if it is boolean).
np.sum(filter_2Rd)

In [None]:
# Restrict both the genotypes and the metadata to the 2Rd training specimens.
in_train_2Rd = md_2R["ox_code"].isin(splits["2Rd"]["train"])

g_2Rd = g_2R.subset(sel1=in_train_2Rd.values)

md_2Rd = md_2R.loc[in_train_2Rd, :]

In [None]:
# Ten stratified 75/25 resamplings of the 2Rd training specimens. Each round scores
# SNP concordance on the 75% part, keeps the best-scoring sites, and counts how
# many specimens of the 25% part those sites would mis-assign.
# d_dict[i] = (train codes, test codes, concordance table, top positions, mismatches)
# NOTE(review): train_test_split has no random_state, so partitions differ per run.
# NOTE(review): v_2R[:] and the POS lookup are re-evaluated on every iteration.
d_dict = {}

for i in range(10):
    print(i)

    train, test = model_selection.train_test_split(
        md_2Rd["ox_code"].values,
        train_size=0.75,
        test_size=0.25,
        stratify=md_2Rd["new_PCA_2Rd"].values)

    train_bool = md_2Rd["ox_code"].isin(train).values
    test_bool = md_2Rd["ox_code"].isin(test).values

    # PCA karyotypes of the training part, cast to float.
    train_karyotypes = md_2Rd.loc[train_bool, "new_PCA_2Rd"].map(float).values

    d = ingenos.run_concordance_calculation(
        "2Rd", v_2R[:], g_2Rd, train_karyotypes,
        variance_threshold=0.05, sites_bool=filter_2Rd, samples_bool=train_bool)

    # Sites with all three "called_*" values above 0.9 and "min" above 0.8.
    well_called = (d[["called_0", "called_1", "called_2"]] > 0.9).all(axis=1)
    top = d.loc[well_called & (d["min"] > 0.8), "position"].values

    # Row index of each top position in the variant table (first match).
    site_indices = np.array([np.where(v_2R["POS"] == site)[0][0] for site in top])

    alts = g_2Rd.subset(sel0=site_indices, sel1=test_bool).to_n_alt()

    # Mean alt-allele count per specimen -> karyotype 0 (<= 0.66), 1 (<= 1.33) or 2.
    assigned = [0 if specimen <= 0.66 else (1 if specimen <= 1.33 else 2)
                for specimen in np.mean(alts, axis=0)]

    md_2Rd_test = md_2Rd.loc[test_bool, :].assign(assigned=pd.Series(assigned).values)

    # Assigned values are rendered as float strings before comparison with the
    # metadata column.
    assigned_labels = md_2Rd_test["assigned"].map(float).map(str)
    mismatches = (assigned_labels != md_2Rd_test["new_PCA_2Rd"]).sum()

    d_dict[i] = (train, test, d, top, mismatches)

In [None]:
# Every position scored in any of the ten 2Rd iterations (duplicates included).
d_compiled = [position
              for iteration in range(10)
              for position in d_dict[iteration][2]["position"].values]

In [None]:
# Scaffold for the averaged 2Rd table: one row per distinct position, every other
# column initialised to NaN, columns in a fixed order.
d_average = pd.DataFrame({
    "position": sorted(set(d_compiled)),
    **dict.fromkeys(
        ("ref", "alt", "score_0", "score_1", "score_2", "overall_score",
         "called_0", "called_1", "called_2", "overall_called", "min"),
        np.nan),
}).loc[:, ["position", "ref", "alt", "score_0", "score_1", "score_2", "overall_score",
           "called_0", "called_1", "called_2", "overall_called", "min"]]

In [None]:
# Average the concordance statistics of every candidate 2Rd position over the
# bootstrap iterations in which it was scored, then add the averaged rows (with a
# "count" of contributing iterations) to d_average.
d_mean_rows = []

for pos in sorted(set(d_compiled)):

    rows = {}
    refs = []
    alts = []

    for i in range(10):

        row = d_dict[i][2][d_dict[i][2]["position"] == pos]

        if len(row) > 0:
            refs.append(row["ref"].values[0])
            alts.append(row["alt"].values[0])
            # ref/alt are strings and cannot be averaged; they are re-attached below.
            rows[i] = row.drop(["ref","alt"], axis=1).reset_index(drop=True)

    # Fix: the original `not A and B` parsed as `(not A) and B`, so a disagreement
    # in ALT alone was never reported. Both alleles must agree across iterations.
    if not (len(set(refs)) == 1 and len(set(alts)) == 1):
        raise ValueError(pos, rows)

    ref = refs[0]
    alt = alts[0]

    # Built-in sum: np.sum over a generator is deprecated.
    mean_row = sum(rows.values()) / len(rows)

    mean_row["count"] = len(rows)
    mean_row["ref"] = ref
    mean_row["alt"] = alt

    assert mean_row["position"].values[0] == pos, row

    d_mean_rows.append(mean_row)

# One concatenation instead of DataFrame.append per row (append was removed in
# pandas 2.0 and is quadratic in a loop).
d_average = pd.concat([d_average] + d_mean_rows, sort=False)

##### 2Ru

In [None]:
# Ask ingenos for the site-filter expression of 2Ru (zero buffer, whole inversion).
# NOTE(review): presumably a POS range expression derived from
# ingenos.inversionDict — confirm in ingenos.
sites_2Ru = ingenos.construct_filter_expression("2Ru", ingenos.inversionDict,
                                                buffer=0, whole_inversion=True)

# Evaluate the expression against the 2R variant table; the result is passed below
# as the sites_bool selector.
filter_2Ru = v_2R.eval(sites_2Ru)

# Cell output: the sum of the filter (number of selected sites if it is boolean).
np.sum(filter_2Ru)

In [None]:
# Restrict both the genotypes and the metadata to the 2Ru training specimens.
in_train_2Ru = md_2R["ox_code"].isin(splits["2Ru"]["train"])

g_2Ru = g_2R.subset(sel1=in_train_2Ru.values)

md_2Ru = md_2R.loc[in_train_2Ru, :]

In [None]:
# Ten stratified 75/25 resamplings of the 2Ru training specimens. Each round scores
# SNP concordance on the 75% part, keeps the best-scoring sites, and counts how
# many specimens of the 25% part those sites would mis-assign.
# u_dict[i] = (train codes, test codes, concordance table, top positions, mismatches)
# NOTE(review): train_test_split has no random_state, so partitions differ per run.
# NOTE(review): v_2R[:] and the POS lookup are re-evaluated on every iteration.
u_dict = {}

for i in range(10):
    print(i)

    train, test = model_selection.train_test_split(
        md_2Ru["ox_code"].values,
        train_size=0.75,
        test_size=0.25,
        stratify=md_2Ru["new_PCA_2Ru"].values)

    train_bool = md_2Ru["ox_code"].isin(train).values
    test_bool = md_2Ru["ox_code"].isin(test).values

    # PCA karyotypes of the training part, cast to float.
    train_karyotypes = md_2Ru.loc[train_bool, "new_PCA_2Ru"].map(float).values

    u = ingenos.run_concordance_calculation(
        "2Ru", v_2R[:], g_2Ru, train_karyotypes,
        variance_threshold=0.05, sites_bool=filter_2Ru, samples_bool=train_bool)

    # Sites with all three "called_*" values above 0.9 and "min" above 0.8.
    well_called = (u[["called_0", "called_1", "called_2"]] > 0.9).all(axis=1)
    top = u.loc[well_called & (u["min"] > 0.8), "position"].values

    # Row index of each top position in the variant table (first match).
    site_indices = np.array([np.where(v_2R["POS"] == site)[0][0] for site in top])

    alts = g_2Ru.subset(sel0=site_indices, sel1=test_bool).to_n_alt()

    # Mean alt-allele count per specimen -> karyotype 0 (<= 0.66), 1 (<= 1.33) or 2.
    assigned = [0 if specimen <= 0.66 else (1 if specimen <= 1.33 else 2)
                for specimen in np.mean(alts, axis=0)]

    md_2Ru_test = md_2Ru.loc[test_bool, :].assign(assigned=pd.Series(assigned).values)

    # Assigned values are rendered as integer strings before comparison with the
    # metadata column.
    assigned_labels = md_2Ru_test["assigned"].map(str)
    mismatches = (assigned_labels != md_2Ru_test["new_PCA_2Ru"]).sum()

    u_dict[i] = (train, test, u, top, mismatches)

In [None]:
# Every position scored in any of the ten 2Ru iterations (duplicates included).
u_compiled = [position
              for iteration in range(10)
              for position in u_dict[iteration][2]["position"].values]

In [None]:
# Scaffold for the averaged 2Ru table: one row per distinct position, every other
# column initialised to NaN, columns in a fixed order.
u_average = pd.DataFrame({
    "position": sorted(set(u_compiled)),
    **dict.fromkeys(
        ("ref", "alt", "score_0", "score_1", "score_2", "overall_score",
         "called_0", "called_1", "called_2", "overall_called", "min"),
        np.nan),
}).loc[:, ["position", "ref", "alt", "score_0", "score_1", "score_2", "overall_score",
           "called_0", "called_1", "called_2", "overall_called", "min"]]

In [None]:
# Average the concordance statistics of every candidate 2Ru position over the
# bootstrap iterations in which it was scored, then add the averaged rows (with a
# "count" of contributing iterations) to u_average.
u_mean_rows = []

for pos in sorted(set(u_compiled)):

    rows = {}
    refs = []
    alts = []

    for i in range(10):

        row = u_dict[i][2][u_dict[i][2]["position"] == pos]

        if len(row) > 0:
            refs.append(row["ref"].values[0])
            alts.append(row["alt"].values[0])
            # ref/alt are strings and cannot be averaged; they are re-attached below.
            rows[i] = row.drop(["ref","alt"], axis=1).reset_index(drop=True)

    # Fix: the original `not A and B` parsed as `(not A) and B`, so a disagreement
    # in ALT alone was never reported. Both alleles must agree across iterations.
    if not (len(set(refs)) == 1 and len(set(alts)) == 1):
        raise ValueError(pos, rows)

    ref = refs[0]
    alt = alts[0]

    # Built-in sum: np.sum over a generator is deprecated.
    mean_row = sum(rows.values()) / len(rows)

    mean_row["count"] = len(rows)
    mean_row["ref"] = ref
    mean_row["alt"] = alt

    assert mean_row["position"].values[0] == pos, row

    u_mean_rows.append(mean_row)

# One concatenation instead of DataFrame.append per row (append was removed in
# pandas 2.0 and is quadratic in a loop).
u_average = pd.concat([u_average] + u_mean_rows, sort=False)

##### take the top SNPs for each inversion

In [None]:
# Final 2La tag SNPs: averaged "called_*" values all above 0.9, averaged "min"
# above 0.995, and scored in at least 8 of the 10 iterations.
a_keep = ((a_average[["called_0", "called_1", "called_2"]] > 0.9).all(axis=1)
          & (a_average["min"] > 0.995)
          & (a_average["count"] >= 8))

a_top = a_average.loc[a_keep, "position"].map(int)

In [None]:
# Final 2Rj tag SNPs: averaged "called_*" values all above 0.9, averaged "min"
# above 0.8, and scored in at least 8 of the 10 iterations.
j_keep = ((j_average[["called_0", "called_1", "called_2"]] > 0.9).all(axis=1)
          & (j_average["min"] > 0.8)
          & (j_average["count"] >= 8))

j_top = j_average.loc[j_keep, "position"].map(int)

In [None]:
# Final 2Rb tag SNPs: averaged "called_*" values all above 0.9, averaged "min"
# above 0.8, and scored in at least 8 of the 10 iterations.
b_keep = ((b_average[["called_0", "called_1", "called_2"]] > 0.9).all(axis=1)
          & (b_average["min"] > 0.8)
          & (b_average["count"] >= 8))

b_top = b_average.loc[b_keep, "position"].map(int)

In [None]:
# Final coluzzii 2Rc tag SNPs: averaged "called_*" values all above 0.9, averaged
# "min" above 0.8, and scored in at least 8 of the 10 iterations.
# Fix: the original read from `col_average`, which this notebook never defines;
# the averaged coluzzii table built above is col_c_average.
c_col_top = col_c_average.loc[((col_c_average["called_0"] > 0.9) &
                               (col_c_average["called_1"] > 0.9) &
                               (col_c_average["called_2"] > 0.9) &
                               (col_c_average["min"] > 0.8) &
                               (col_c_average["count"] >= 8)), "position"].map(int)

In [None]:
# Final gambiae 2Rc tag SNPs: averaged "called_*" values all above 0.9, averaged
# "min" above 0.8, and scored in at least 8 of the 10 iterations.
# Fix: the original read from `gam_average`, which this notebook never defines;
# the averaged gambiae table built above is gam_c_average.
c_gam_top = gam_c_average.loc[((gam_c_average["called_0"] > 0.9) &
                               (gam_c_average["called_1"] > 0.9) &
                               (gam_c_average["called_2"] > 0.9) &
                               (gam_c_average["min"] > 0.8) &
                               (gam_c_average["count"] >= 8)), "position"].map(int)

In [None]:
# Final 2Rd tag SNPs: averaged "called_*" values all above 0.9, averaged "min"
# above 0.8, and scored in at least 8 of the 10 iterations.
d_keep = ((d_average[["called_0", "called_1", "called_2"]] > 0.9).all(axis=1)
          & (d_average["min"] > 0.8)
          & (d_average["count"] >= 8))

d_top = d_average.loc[d_keep, "position"].map(int)

In [None]:
# Final 2Ru tag SNPs: averaged "called_*" values all above 0.9, averaged "min"
# above 0.8, and scored in at least 8 of the 10 iterations.
u_keep = ((u_average[["called_0", "called_1", "called_2"]] > 0.9).all(axis=1)
          & (u_average["min"] > 0.8)
          & (u_average["count"] >= 8))

u_top = u_average.loc[u_keep, "position"].map(int)

##### save the top SNPs to the desired location as in the example below

In [None]:
def _write_top_snps(top_snps, stem):
    """Write one Series of tag-SNP positions as a tab-separated file.

    The destination is `base + stem`, passed through ingenos.make_date_stamp
    together with the ".tsv" extension; the index is not written.
    """
    top_snps.to_csv(ingenos.make_date_stamp(base + stem, ".tsv"),
                    sep="\t", index=False)


# One file per inversion (two for 2Rc: coluzzii and gambiae), written in this order.
_write_top_snps(a_top, "/data/results/2La/comp/predictive_SNPs_train_set_0995")

_write_top_snps(j_top, "/data/results/2Rj/comp/predictive_SNPs_train_set_08")

_write_top_snps(b_top, "/data/results/2Rb/comp/predictive_SNPs_train_set_08")

_write_top_snps(c_col_top, "/data/results/2Rc/comp/col_predictive_SNPs_train_set_08")

_write_top_snps(c_gam_top, "/data/results/2Rc/comp/gam_predictive_SNPs_train_set_08")

_write_top_snps(d_top, "/data/results/2Rd/comp/predictive_SNPs_train_set_08")

_write_top_snps(u_top, "/data/results/2Ru/comp/predictive_SNPs_train_set_08")