In [None]:
import os
import sys

import plotly.express as px

# Put `workflow/lib` (resolved against the current working directory) on the
# module search path, then load the project's helper library from it.
sys.path.extend([os.path.join(os.getcwd(), 'workflow/lib')])
import ampseeker as amp


In [None]:
# Notebook parameters. These look like defaults that the workflow overrides
# at execution time (e.g. via parameter injection) -- TODO confirm.
dataset = "nomads16"                      # dataset name; used to build vcf_path
wkdir = "../../"                          # base directory for workflow/lib and results/config
platform = "nanopore"                     # forwarded to amp.load_variants
cohort_cols = "library_name"              # comma-separated metadata column name(s)
metadata_path = "../../results/config/metadata.qcpass.tsv"
vcf_path = f"../../results/vcfs/amplicons/{dataset}.annot.vcf"


In [None]:
import os
import sys

# Also add `<wkdir>/workflow/lib` to the module search path (`wkdir` is set in
# the parameters cell) before importing the project's helper library.
sys.path.extend([os.path.join(wkdir, 'workflow/lib')])
import ampseeker as amp


## Population structure

In this notebook, we run a principal component analysis (PCA) and build a neighbour-joining tree (NJT) on the amplicon sequencing variant data. For the PCA, we plot the variance explained by each principal component, followed by the first three principal components (PC1, PC2 and PC3), coloured by each cohort column.

In [None]:
import json

import pandas as pd  # required by pd.read_csv below; not imported anywhere else in the notebook

# `cohort_cols` is defined in the parameters cell as a comma-separated string.
# Only split it while it is still a string, so re-running this cell does not
# fail on the list produced by a previous run.
if isinstance(cohort_cols, str):
    cohort_cols = cohort_cols.split(",")
cohort_col = cohort_cols[0]  # first cohort column; passed to the NJT analysis

# Sample metadata (tab-separated).
metadata = pd.read_csv(metadata_path, sep="\t")

# Colour mapping from the JSON config under `wkdir`; os.path.join avoids the
# doubled separator that plain concatenation gives when `wkdir` ends in "/".
with open(os.path.join(wkdir, "results/config/metadata_colours.json"), 'r') as f:
    color_mapping = json.load(f)

# Load the variants from the VCF. `metadata` is rebound to the table returned
# by the loader, which is the one the PCA and later cells use.
geno, pos, contig, metadata, ref, alt, ann = amp.load_variants(
    vcf_path, metadata, platform=platform, filter_indel=True
)

# Four-component PCA: `df_pca` is the table handed to amp.plot_pca, and `model`
# exposes explained_variance_ratio_ for the variance-explained plot.
df_pca, model = amp.pca(geno=geno, metadata=metadata, n_components=4, missing_threshold=0.2)


#### Variance explained

The variance explained shows the proportion of the total variance in the dataset that is captured by each principal component. Higher values indicate more informative components. As a rule of thumb, once the variance explained by successive PCs flattens out, the remaining PCs are no longer informative.

In [None]:
# Bar chart of the explained-variance ratio of each principal component.
# update_layout returns the figure it was called on, so it can be chained.
fig = px.bar(
    model.explained_variance_ratio_,
    labels={
        "index": "Principal Component",
        "value": "Variance Explained",
    },
    width=600,
    height=250,
    template='simple_white',
).update_layout(showlegend=False)

fig.show()


### PCA

Principal Component Analysis (PCA) is a dimensionality reduction technique that transforms high-dimensional genetic data into a smaller set of uncorrelated variables (Reich et al., 2008). It helps visualize population structure and genetic relationships between samples.

In [None]:
# One PCA figure per cohort column, each coloured by that column.
for coh in cohort_cols:
    fig1 = amp.plot_pca(
        df_pca,
        x='PC1',
        y='PC2',
        z='PC3',
        dataset=dataset,
        cohort_columns=cohort_cols,
        colour_column=coh,
        color_mapping=color_mapping,
    )
    fig1.show()


## NJT

Neighbor-Joining Tree (NJT) is a clustering method that reconstructs evolutionary relationships between samples based on genetic distances (Saitou & Nei, 1987). It creates a tree where genetically similar samples cluster together.

In [None]:
# Run the NJT analysis on the loaded genotypes; the helper returns an iterable
# of figures, which are displayed one after another.
njt_figures = amp.run_njt_analysis(
    geno=geno,
    metadata=metadata,
    cohort_col=cohort_col,
    cohort_cols=cohort_cols,
    color_mapping=color_mapping,
    wkdir=wkdir,
)

for fig in njt_figures:
    fig.show()
