In [None]:
import numpy as np
import pandas as pd
import allel
import re


In [None]:
# --- Notebook parameters -----------------------------------------------------
# Defaults point at files under tests/snakemake.

# Workflow working directory, data set name and sequencing platform.
wkdir = "../../../tests/snakemake/"
dataset = 'test-agvampir'
platform = 'illumina'

# Input files.
metadata_path = '../../../tests/snakemake/results/config/metadata.qcpass.tsv'
kdr_marker_snps_path = '../../../resources/ag-vampir/Kdr_marker_SNPs.csv'
vcf_path = f"../../../tests/snakemake/results/vcfs/targets/{dataset}.annot.vcf"

# Comma-separated metadata columns; split into a list further down.
cohort_cols = 'location,taxon'


In [None]:
import os
import sys
import warnings

# The ampseeker helpers are imported from <wkdir>/workflow/lib, so that
# directory has to be on sys.path before the import below.
lib_dir = os.path.join(wkdir, 'workflow/lib')
sys.path.append(lib_dir)
import ampseeker as amp

# Suppress every warning category from here on.
warnings.filterwarnings('ignore')


### *Kdr* origins and diplotype clustering

The *Vgsc* gene encodes a voltage-gated sodium channel which is the binding target of DDT and pyrethroid insecticides. Variants in this gene, referred to as *Kdr* (knockdown resistance), are associated with resistance to DDT and/or pyrethroids.

This notebook determines the origin of *Kdr* for each sample, and performs diplotype clustering to visualise the distribution of amino acids and haplotypes. 

Knockdown resistance mutations have arisen independently multiple times in different mosquito populations, and genetic backgrounds can help trace the origin and spread of resistance (Martinez-Torres et al., 1998; Ranson et al., 2000).

In [None]:
# Turn the comma-separated parameter into a list of column names.
# The isinstance guard keeps this cell re-runnable: after the first execution
# cohort_cols is already a list and has no .split(). Whitespace around names
# is stripped so 'location, taxon' is handled the same as 'location,taxon'.
if isinstance(cohort_cols, str):
    cohort_cols = [col.strip() for col in cohort_cols.split(",")]
# The first cohort column is the primary one (used later for leaf colouring).
cohort_col = cohort_cols[0]

# Marker SNP definitions: tab-separated, indexed by the file's second column.
hap_def = pd.read_csv(kdr_marker_snps_path, sep='\t', index_col=1)
# Strip everything up to and including the last ':' from each index label and
# keep the remainder as an integer position (labels presumably look like
# '<contig>:<position>' -- confirm against the resource file).
hap_def['variant_pos'] = hap_def.index.str.replace('.*:', '', regex=True).astype('int')


In [None]:
# Kdr background call functions extracted to workflow/lib/ag_vampir/kdr_analysis.py;
# bind them to notebook-level names so the cells below can call them unqualified.

# Entry points called directly in this notebook.
kdr_origin = amp.kdr_origin
get_single_gen_call = amp.get_single_gen_call

# Underscore-prefixed helpers, re-exported under their original names.
_get_single_gen_call_no_402 = amp._get_single_gen_call_no_402
_get_single_gen_call_with_402 = amp._get_single_gen_call_with_402
_kdr_gen_cleanup = amp._kdr_gen_cleanup
_F_kdr_origin_gen = amp._F_kdr_origin_gen
_S_kdr_origin_gen = amp._S_kdr_origin_gen
_402_kdr_origin_gen = amp._402_kdr_origin_gen


In [None]:
### Load metadata
# Read the tab-separated sample metadata; samples with no taxon recorded are
# labelled 'UNKN'.
metadata = (
    pd.read_csv(metadata_path, sep='\t')
    .assign(taxon=lambda frame: frame['taxon'].fillna('UNKN'))
)


In [None]:
### Get the genotype calls
# load_vcf also returns a metadata frame, which replaces the one loaded above.
geno, pos, contig, metadata, ref, alt, ann = amp.load_vcf(vcf_path, metadata=metadata, platform=platform)
samples = metadata.sample_id

### Filter the SNPs to just the ones useful for kdr origin analysis:
### variants on contig 2L whose position is listed in the marker definitions.
on_2L = contig == '2L'
at_marker_position = np.isin(pos, hap_def['variant_pos'])
which_snps = on_2L & at_marker_position

# Apply the same mask to every per-variant array so they stay aligned.
# (The right-hand side is evaluated in full before any name is rebound.)
snp_calls, pos, ref, alt = (
    geno[which_snps, :, :],
    pos[which_snps],
    ref[which_snps],
    alt[which_snps, :],
)


In [None]:
### Convert genotype calls to nucleotides
# Build a per-SNP allele lookup table: column 0 holds the reference allele,
# the following columns the alternate alleles, and the last column a '?'.
# A missing genotype call (-1) indexes the last column and so yields '?'.
n_snps = len(ref)
ref_column = np.reshape(ref, (n_snps, 1))
missing_column = np.full((n_snps, 1), '?')
snp_alleles = np.concatenate([ref_column, alt, missing_column], axis=1)

# Convert numeric calls to nucleotides: pair each SNP's row of the lookup
# table with that SNP's allele indices (broadcast over the last two axes).
snp_row_index = np.arange(snp_alleles.shape[0])[:, np.newaxis, np.newaxis]
snp_genotypes_3d = snp_alleles[snp_row_index, snp_calls]


def _join_sorted_alleles(alleles):
    """Join the alleles of one genotype alphabetically (so, eg, 'TA' becomes 'AT')."""
    return ''.join(np.sort(alleles))


snp_genotypes = np.apply_along_axis(_join_sorted_alleles, 2, snp_genotypes_3d)

# Store results in a data frame: one row per sample, one column per marker SNP.
# Re-index the marker definitions by position so SNP names can be looked up
# with the positions that survived the filter.
hap_def = hap_def.set_index('variant_pos', drop=False)
snp_names = hap_def.loc[pos, 'SNP name']
gen_df = pd.DataFrame(
    np.transpose(snp_genotypes),
    index=samples,
    columns=snp_names,
)


In [None]:
### Obtain kdr origin calls
# Call kdr_origin once per sample (one row of gen_df each) and stack the results.
per_sample_origins = [kdr_origin(gen_df.iloc[row]) for row in range(len(gen_df))]
kdr_origins = pd.concat(per_sample_origins)

# Reduce each row of calls to a single origin call.
kdr_origins['kdr_origin'] = kdr_origins.apply(get_single_gen_call, axis=1)


In [None]:
#### Merge kdr origins with metadata and write to file.
# Join on the sample id (index on both sides) and name the resulting index so
# the written file has a 'sample_id' header for its first column.
metadata_by_sample = metadata.set_index("sample_id")
kdr_origins_df = (
    kdr_origins
    .merge(metadata_by_sample, left_index=True, right_index=True)
    .rename_axis("sample_id")
)
kdr_origins_df.to_csv(f'{wkdir}/results/ag-vampir/kdr-origins/kdr_origins.tsv', sep='\t')


In [None]:
### Create a table where each row is a haplotype instead of a genotype (although these are genotype-based calls,
# so order within each sample will be random, so they can be used for, say, mapping, but not haplotype clustering). Write this table to file.
# Each sample's call is a comma-separated string; flatten these into one entry
# per haplotype. The index repeats every sample id twice, so each call is
# expected to hold exactly two comma-separated values.
haplotype_calls = [
    call
    for sample_call in kdr_origins['kdr_origin']
    for call in sample_call.split(',')
]
haplotype_sample_ids = np.repeat(kdr_origins.index, 2)
kdr_genhap_origins_df = pd.DataFrame({'kdr_origin': haplotype_calls}, index=haplotype_sample_ids)
kdr_genhap_origins_df = kdr_genhap_origins_df.merge(
    metadata.set_index("sample_id"), left_index=True, right_index=True
)
kdr_genhap_origins_df.to_csv(f'{wkdir}/results/ag-vampir/kdr-origins/kdr_genhap_origins.tsv', sep = '\t')


In [None]:
cols_keep = cohort_cols + ['kdr_origin']
# Count the number of occurrences of each origin call in each population.
# The frame is restricted to the cohort columns plus 'kdr_origin', so there is
# no spare value column for pivot_table to aggregate; counting group sizes
# directly does not depend on one. Rows are cohorts, columns are origin calls,
# and unobserved combinations are filled with 0.
pop_origin_counts = (
    kdr_genhap_origins_df[cols_keep]
    .groupby(cols_keep)
    .size()
    .unstack('kdr_origin', fill_value=0)
    .astype(int)
)
# The spreadsheet of counts is written before '?' is removed, so it includes
# the unknown-origin column.
pop_origin_counts.to_excel(f'{wkdir}/results/ag-vampir/kdr-origins/kdr_origin_counts.xlsx')
# A function to round a number up to n_signif significant figures
signif = amp.signif

# Drop the unknown-origin ("?") column, if present, so that the row totals and
# the frequencies below are computed over known origins only.
pop_origin_counts = pop_origin_counts.drop(columns='?', errors='ignore')

row_totals = pop_origin_counts.sum(axis = 1)
# Calculate origin frequencies. We exclude the "?" calls for this.
# A cohort with no known-origin calls has a row total of 0 and gets NaN.
pop_origin_freqs = pop_origin_counts.div(row_totals, axis = 0)
pop_origin_freqs.to_excel(f'{wkdir}/results/ag-vampir/kdr-origins/kdr_origin_freqs.xlsx')
# Round to 2 significant figures (for display only; the file holds full precision)
pop_origin_freqs = signif(pop_origin_freqs, 2)
print('Counts of origins:')
display(pop_origin_counts)
print('\n\nFrequencies of known origins:')
display(pop_origin_freqs)


#### Diplotype clustering

Diplotype clustering groups samples based on genetic similarity at the Vgsc locus, revealing evolutionary relationships at resistance loci and helping identify mutations associated with selective sweeps (Nagi et al., 2024).

In [None]:
import allel
import numpy as np
import pandas as pd
import plotly.express as px

# Plotting helpers from the ampseeker library, bound to notebook-level names.
plot_dendrogram = amp.plot_dendrogram
_dipclust_concat_subplots = amp._dipclust_concat_subplots


In [None]:
import json

from scipy.spatial.distance import squareform

# Sample metadata and the kdr origin calls written earlier in this notebook.
df_samples = pd.read_csv(metadata_path, sep="\t")
df_kdr = pd.read_csv(f'{wkdir}/results/ag-vampir/kdr-origins/kdr_origins.tsv', sep="\t", index_col=0)
df_kdr = df_kdr.reset_index().rename(columns={'level_0': 'sample_id'})

# NOTE: this rebinds vcf_path (and geno/pos/contig/ref/alt/ann) from the
# targets VCF used above to the amplicon VCF.
vcf_path = f"{wkdir}/results/vcfs/amplicons/{dataset}.annot.vcf"
geno, pos, contig, df_samples, ref, alt, ann = amp.load_vcf(vcf_path, metadata=df_samples, platform=platform)

# Colour maps for metadata columns, keyed by column name.
with open(f"{wkdir}/results/config/metadata_colours.json", 'r') as f:
    color_mapping = json.load(f)

bed_path = f"{wkdir}/config/ag-vampir.bed"
df_bed = amp.load_bed(bed_path)

# subset to VGSC SNPs: the span of all bed entries whose mutation name
# contains "Vgsc", padded by 200 bp on either side, on contig 2L
vgsc_mask = df_bed.eval('mutation.str.contains("Vgsc")').to_numpy()
df_vgsc = df_bed[vgsc_mask]
vgsc_start = df_vgsc['start'].min() - 200
vgsc_end = df_vgsc['end'].max() + 200
vgsc_mask = np.logical_and(contig == '2L', np.logical_and(pos >= vgsc_start, pos <= vgsc_end))
geno_vgsc = geno.compress(vgsc_mask, axis=0)
pos_vgsc = pos[vgsc_mask]

# remove invariant sites (pos_vgsc is filtered alongside geno_vgsc so the two
# stay aligned row for row)
is_seg = geno_vgsc.count_alleles().is_segregating()
geno_vgsc = geno_vgsc.compress(is_seg, axis=0)
pos_vgsc = pos_vgsc[is_seg]

# remove highly missing sites (more than 10% of samples uncalled)
max_site_missingness = 0.1
missing_mask = geno_vgsc.is_missing().mean(axis=1) > max_site_missingness
geno_vgsc = geno_vgsc.compress(~missing_mask, axis=0)
pos_vgsc = pos_vgsc[~missing_mask]

# distances
# Per-sample allele counts (alleles 0..3), rearranged to samples-first, then
# pairwise distances between samples expanded to a square matrix.
ac = allel.GenotypeArray(geno_vgsc).to_allele_counts(max_allele=3)
X = np.ascontiguousarray(np.swapaxes(ac.values, 0, 1))
dists = amp.multiallelic_diplotype_pdist(X, metric=amp.multiallelic_diplotype_mean_cityblock)
dist_matrix = squareform(dists)
df_dists = pd.DataFrame(dist_matrix, index=df_samples.index, columns=df_samples.index)
# Drop every sample that has a NaN distance to any other sample.
na_mask = df_dists.isna().any()
df_dists = df_dists.loc[~na_mask, ~na_mask]

# Attach the kdr origin call to each sample. A left merge keeps every row of
# df_samples in its original order, so positional masks such as na_mask (built
# before the merge) still line up; samples with no origin call get NaN.
# validate= raises if a sample id occurs more than once in df_kdr, which would
# otherwise silently duplicate rows.
df_samples = df_samples.reset_index().merge(
    df_kdr[['sample_id', 'kdr_origin']],
    on='sample_id',
    how='left',
    validate='many_to_one',
)


In [None]:
import scipy.spatial.distance

distance_metric = 'cityblock'
leaf_color = cohort_col

# plot_dendrogram takes the distances in condensed form, so convert the
# NaN-filtered square matrix back. The leaf metadata is restricted to the
# same samples that survived the NaN filter.
condensed_dists = scipy.spatial.distance.squareform(df_dists.values)
leaf_frame = df_samples[~na_mask.to_numpy()]
dendrogram_title = f"{dataset} | Vgsc diplotype clustering"

fig_dendro, leaf_data = plot_dendrogram(
    # clustering
    dist=condensed_dists,
    linkage_method="complete",
    count_sort=True,
    distance_sort=False,
    # figure
    render_mode="svg",
    width=800,
    height=500,
    title=dendrogram_title,
    template="simple_white",
    y_axis_title=f"Distance ({distance_metric})",
    y_axis_buffer=0.1,
    # branches
    line_width=0.6,
    line_color='black',
    # leaves
    marker_size=5,
    leaf_data=leaf_frame,
    leaf_hover_name="sample_id",
    leaf_hover_data=cohort_cols + ['kdr_origin'],
    leaf_color=leaf_color,
    leaf_symbol=None,
    leaf_y=-0.01,
    leaf_color_discrete_map=color_mapping[leaf_color],
    leaf_category_orders=None,
)


# Per-sample SNP table, restricted to the padded Vgsc window on 2L.
df_snps = pd.read_excel(f"{wkdir}/results/vcfs/targets/{dataset}-snps.xlsx")
in_vgsc_window = (
    (df_snps['CHROM'] == '2L')
    & (df_snps['POS'] >= vgsc_start)
    & (df_snps['POS'] <= vgsc_end)
)

# Attach the mutation name from the bed file by matching contig and end position.
bed_targets = df_bed.rename(columns={'end': 'POS', 'contig': 'CHROM'})[['CHROM', 'POS', 'mutation']]
df_vgsc_snps = bed_targets.merge(df_snps[in_vgsc_window], on=['CHROM', 'POS'])

# Index by mutation name and drop the first six remaining columns
# (presumably CHROM, POS and other non-sample columns -- confirm against the
# layout of the -snps.xlsx file).
df_vgsc_snps = df_vgsc_snps.set_index('mutation').iloc[:, 6:]

# Discard rows whose mutation name contains 'AIM' or 'tag'.
mutation_names = df_vgsc_snps.index
is_aim_or_tag = mutation_names.str.contains('AIM') | mutation_names.str.contains('tag')
df_vgsc_snps = df_vgsc_snps[~is_aim_or_tag]

# Order the sample columns like the dendrogram leaves and shorten the row labels.
df_vgsc_snps = df_vgsc_snps.loc[:, leaf_data.sample_id.to_list()]
df_vgsc_snps.index = df_vgsc_snps.index.str.replace("Vgsc_", "")


In [None]:
import plotly.graph_objects as go

kdr_leaf_data = leaf_data.copy()

# Sub-figures are stacked top to bottom; subplot_heights holds one pixel
# height per entry of `figures`.
figures = [fig_dendro]
subplot_heights = [300]
snp_row_height  = 20
width = 800

# kdr origin strip: one square marker per dendrogram leaf, coloured by origin call
n_leaves = kdr_leaf_data.shape[0]
kdr_fig = px.scatter(
    data_frame=kdr_leaf_data,
    x=np.arange(n_leaves),
    y=np.repeat(0, n_leaves),
    color='kdr_origin',
    hover_name='sample_id',
    symbol_sequence=['square'],
    template='simple_white',
)

# Put all origin traces in one legend group.
for trace in kdr_fig.data:
    trace.legendgroup = 'kdr_origin'

figures.append(kdr_fig)
subplot_heights.append(20)

# het bar
# Per-sample heterozygosity over the filtered Vgsc sites: het calls / called sites.
# The sample ids come from df_samples, the metadata loaded together with the
# amplicon VCF that geno_vgsc was taken from. (`samples` belongs to the
# targets VCF loaded earlier and need not match in content or order.)
n_het = geno_vgsc.is_het().sum(axis=0)
n_called = geno_vgsc.is_called().sum(axis=0)
df_het = pd.DataFrame(
    {"sample_id": df_samples["sample_id"].to_numpy(), "Sample Heterozygosity": n_het / n_called}
).set_index("sample_id")

# order according to dendrogram and transpose
df_het = df_het.loc[leaf_data.sample_id, :].T
het_trace = go.Heatmap(
    z=df_het,
    y=["Heterozygosity"],
    colorscale="Greys",
    showlegend=False,
    showscale=False,
)

figures.append(het_trace)
subplot_heights.append(30)

# SNP heatmap: rows are reversed so the first mutation is drawn at the top.
snp_trace = go.Heatmap(
    z=df_vgsc_snps[::-1].values,
    y=df_vgsc_snps[::-1].index.to_list(),
    colorscale="Greys",
    showlegend=False,
    showscale=False,
)

figures.append(snp_trace)
n_snp_rows = df_vgsc_snps.shape[0]
subplot_heights.append(snp_row_height * n_snp_rows)

height = sum(subplot_heights) + 50
fig = _dipclust_concat_subplots(
    figures=figures,
    width=width,
    height=height,
    row_heights=subplot_heights,
    title=f"{dataset} | Vgsc diplotype clustering",
    xaxis_range=(0, df_dists.shape[0]),
)

fig["layout"]["yaxis"]["title"] = f"Distance ({distance_metric})"

# The SNP heatmap is the last subplot; draw a grey line on every row boundary
# (y = -0.5, 0.5, ..., n_snp_rows - 0.5).
aa_idx = len(figures)
for boundary in range(n_snp_rows + 1):
    fig.add_hline(y=boundary - 0.5, line_width=1, line_color="grey", row=aa_idx, col=1)

fig['layout'][f'yaxis{aa_idx}']['title'] = 'Vgsc mutations'
fig.update_xaxes(showline = True, linecolor = 'grey', linewidth = 1, row = aa_idx, col = 1, mirror = True)
fig.update_yaxes(showline = True, linecolor = 'grey', linewidth = 1, row = aa_idx, col = 1, mirror = True)
fig.write_image(f"{wkdir}/results/kdr_dipclust.png", scale=2)
fig.show()
