In [None]:
import allel
import numpy as np
import pandas as pd 
import plotly.express as px

def load_vcf(vcf_path, metadata):
    """
    Load a VCF and return genotypes for the samples listed in `metadata`,
    with indel variants removed.

    Samples in the VCF whose ID is not found in `metadata.sampleID` are
    dropped (callers pass the QC-pass metadata table, which is how
    poor-quality samples get filtered out). Variants flagged by the VCF
    `INDEL` field are removed.

    Parameters
    ----------
    vcf_path : str
        Path to the VCF file to read.
    metadata : pandas.DataFrame
        Sample metadata; must have a `sampleID` column.

    Returns
    -------
    geno : allel.GenotypeArray
        Genotype calls for retained samples (axis 1) at non-indel
        variants (axis 0).
    pos : array
        `variants/POS` values of the retained variants.
    contig : array
        `variants/CHROM` values of the retained variants.
    metadata : pandas.DataFrame
        The input metadata indexed by `sampleID`, with rows selected and
        ordered to match the retained VCF samples (i.e. the columns of
        `geno`).
    """
    # load the vcf with every available field
    vcf = allel.read_vcf(vcf_path, fields="*")
    samples = vcf['samples']

    # keep only the VCF samples that are present in the metadata
    sample_mask = np.isin(samples, metadata.sampleID)
    samples = samples[sample_mask]

    # mask of variants that are not flagged as indels
    not_indel = ~vcf['variants/INDEL']

    # subset genotypes to retained samples (columns) and non-indels (rows)
    geno = allel.GenotypeArray(vcf['calldata/GT'])
    geno = geno.compress(sample_mask, axis=1).compress(not_indel, axis=0)
    pos = vcf['variants/POS'][not_indel]
    contig = vcf['variants/CHROM'][not_indel]

    # return metadata rows in the same order as the genotype columns
    metadata = metadata.set_index('sampleID')
    return geno, pos, contig, metadata.loc[samples, :]

In [None]:
# Notebook parameters.
# Dataset name; interpolated into the VCF path and used as the dendrogram title.
dataset = 'ampseq-vigg002'
# VCF for this dataset (annotated, going by the ".annot.vcf" suffix).
vcf_path = f"../../results/vcfs/targets/{dataset}.annot.vcf"
# Sample metadata table (QC-pass samples, going by the file name).
metadata_path = "../../results/config/metadata.qcpass.tsv"
# Comma-separated metadata column names; split into a list in a later cell.
cohort_cols = 'taxon,location'

## Diplotype clustering

In [None]:
# Turn the comma-separated `cohort_cols` string into a list of column names.
# The isinstance guard keeps this cell idempotent: re-running it after the
# conversion leaves the list untouched instead of raising AttributeError
# (a list has no `.split`).
if isinstance(cohort_cols, str):
    cohort_cols = cohort_cols.split(",")

In [None]:
import numba
from scipy.spatial.distance import squareform  # type: ignore

@numba.njit(parallel=True)
def multiallelic_diplotype_pdist(X, metric):
    """Condensed vector of pairwise distances between all diplotypes in X.

    N.B., X is assumed to hold genotype allele counts with axes ordered
    (n_samples, n_sites, n_alleles). A contiguous (C order) array gives the
    fastest computation.

    metric is called as metric(x, y) on the allele counts of two samples and
    returns the distance between them. It can be a numba jitted function.

    The result is a float32 vector with one entry per unordered pair of
    samples, laid out according to square_to_condensed.

    """
    n = X.shape[0]
    condensed = np.zeros((n * (n - 1)) // 2, dtype=np.float32)

    # Outer loop: first member of each pair.
    for first in range(n):
        a = X[first, :, :]

        # Inner (parallel) loop: second member, always a later sample.
        for second in numba.prange(first + 1, n):
            b = X[second, :, :]

            # Write this pair's distance into its slot of the condensed vector.
            condensed[square_to_condensed(first, second, n)] = metric(a, b)

    return condensed


@numba.njit
def square_to_condensed(i, j, n):
    """Map square-form coordinates (i, j) of an n-by-n distance matrix to the
    index of the same pair in the condensed form."""

    assert i != j, "no diagonal elements in condensed matrix"
    # Order the pair: `lo` is the smaller index, `hi` the larger.
    if i < j:
        lo, hi = i, j
    else:
        lo, hi = j, i
    return n * lo - lo * (lo + 1) // 2 + hi - 1 - lo


@numba.njit
def multiallelic_diplotype_mean_cityblock(x, y):
    """Mean cityblock distance between two diplotypes.

    x and y are genotype allele counts and should share the shape
    (n_sites, n_alleles).

    N.B., only sites where both individuals have a called genotype (at least
    one allele count above zero) contribute. The summed per-site distance is
    divided by the number of such sites, so missing sites are ignored. When
    no site is called in both individuals the result is NaN.

    """
    n_sites = x.shape[0]
    n_alleles = x.shape[1]
    total = np.float32(0)
    n_called = np.float32(0)

    # Walk over sites.
    for site in range(n_sites):
        x_seen = False
        y_seen = False
        site_dist = np.float32(0)

        # Walk over alleles at this site.
        for allele in range(n_alleles):
            cx = np.float32(x[site, allele])
            cy = np.float32(y[site, allele])

            # A positive count for any allele marks the genotype as called.
            if cx > 0:
                x_seen = True
            if cy > 0:
                y_seen = True

            # Cityblock contribution: absolute difference in allele count.
            site_dist += np.fabs(cx - cy)

        # Only sites called in both samples count towards the mean.
        if x_seen and y_seen:
            total += site_dist
            n_called += np.float32(1)

    # Average over the jointly-called sites, or NaN if there were none.
    if n_called > 0:
        mean_distance = total / n_called
    else:
        mean_distance = np.nan

    return mean_distance

In [None]:
# Read the sample metadata from the `metadata_path` parameter instead of a
# duplicated hard-coded literal, so that changing the parameter takes effect.
# NOTE(review): load_vcf accesses `sampleID` as a column, so the column used
# as the index here (column 0) must not be `sampleID` itself — confirm
# against the TSV layout.
df_samples = pd.read_csv(metadata_path, sep="\t", index_col=0)

# Load genotypes, positions and contigs; `df_samples` is replaced by the
# metadata subset that load_vcf returns, ordered to match the VCF samples.
geno, pos, contig, df_samples = load_vcf(vcf_path, metadata=df_samples)

In [None]:
# Convert genotype calls to allele counts (max_allele=3 — presumably alleles
# 0..3, i.e. reference plus up to three alternates; confirm against the
# scikit-allel documentation).
ac = allel.GenotypeArray(geno).to_allele_counts(max_allele=3)
# Swap the first two axes so that samples come first, giving the
# (n_samples, n_sites, n_alleles) layout that multiallelic_diplotype_pdist
# documents, and make the array C-contiguous as its docstring recommends.
X = np.ascontiguousarray(np.swapaxes(ac.values, 0, 1))

# Condensed vector of pairwise mean cityblock distances between samples.
dists = multiallelic_diplotype_pdist(X, metric=multiallelic_diplotype_mean_cityblock)

In [None]:
from malariagen_data.plotly_dendrogram import plot_dendrogram

In [None]:
# Label used only in the y-axis title below; the distances themselves are
# the precomputed `dists`.
distance_metric = 'cityblock'

# Draw a dendrogram from the pairwise distances, with one leaf per sample.
# `df_samples` was returned by load_vcf in VCF sample order, which is the
# same order as the sample axis used to compute `dists`.
fig, leaf_data = plot_dendrogram(
    dist=dists,
    linkage_method="complete",
    count_sort=True,
    distance_sort=False,
    render_mode="svg",
    width=800,
    height=500,
    title=dataset,
    line_width=0.5,
    line_color='black',
    marker_size=5,
    leaf_data=df_samples.reset_index(),
    leaf_hover_name="sampleID",
    leaf_hover_data=cohort_cols,
    leaf_color="taxon",
    leaf_symbol=None,
    leaf_y=-0.05,
    leaf_color_discrete_map=None,
    leaf_category_orders=None,
    template="simple_white",
    y_axis_title=f"Distance ({distance_metric})",
    y_axis_buffer=0.1,
)

fig.show()

### Diplotype clustering at target loci

In [None]:
# Region of interest — presumably in "contig:start-end" form; TODO confirm
# how it is consumed (it is not used in the cells visible here).
major_loci = '2L:2_000_000-3_000_000'

In [None]:
# Read the BED file as a tab-separated table with no header row, so columns
# get integer labels.
df_bed = pd.read_csv("../../config/ag-vampir.bed", sep="\t", header=None)
# Display the table as the cell output.
df_bed