In [None]:
import os
import sys
import pandas as pd

sys.path.append(os.path.join(os.getcwd(), 'workflow/lib'))
import ampseeker as amp

# Expose the three ampseeker helpers used throughout this notebook under short,
# module-free names.
vcf_to_snp_dataframe, calculate_frequencies_cohort, plot_allele_frequencies = (
    amp.vcf_to_snp_dataframe,
    amp.calculate_frequencies_cohort,
    amp.plot_allele_frequencies,
)


In [None]:
# Notebook parameters. These are the defaults used when the notebook is run as-is;
# presumably the workflow overrides them at execution time — confirm.

# Run identity.
dataset = "afairy"
platform = "nanopore"

# Metadata column(s) to group samples by, as a comma-separated string.
cohort_cols = "cohort"

# Project root, relative to the notebook's location.
wkdir = "../.."

# Input files.
metadata_path = "../../results/config/metadata.qcpass.tsv"
bed_path = "../../config/afairy.bed"
vcf_path = f"../../results/vcfs/targets/{dataset}.annot.vcf"


In [None]:
# `os`, `sys` and `ampseeker` are imported a second time in this cell; here the
# library directory is resolved relative to the `wkdir` parameter instead of the
# current working directory.
import os
import sys

# The path must be appended before the import below so that
# <wkdir>/workflow/lib/ampseeker can be found.
sys.path.append(os.path.join(wkdir, 'workflow/lib'))
import ampseeker as amp


### Plotting allele frequencies

This page shows, for each cohort, the allele frequencies of the SNPs genotyped by the amplicon sequencing protocol. An allele frequency is the proportion of a specific genetic variant among the alleles sampled from a population.

In [None]:
# `cohort_cols` starts out as a comma-separated string; turn it into a list of
# column names. The isinstance guard keeps this cell re-runnable: on a second run
# the name already holds a list, which has no `.split`. Entries are stripped so
# that "a, b" does not yield a column name with a leading space.
if isinstance(cohort_cols, str):
    cohort_cols = [col.strip() for col in cohort_cols.split(",") if col.strip()]
if not cohort_cols:
    raise ValueError("cohort_cols must name at least one metadata column")

# Target regions (BED) and the tab-separated sample metadata used by later cells.
df_bed = amp.load_bed(bed_path)
metadata = pd.read_csv(metadata_path, sep="\t")


In [None]:
# Positions (BED `end` column) of the targets whose `mutation` label does not
# contain 'AIM'. The mask is built with na=False so a missing label counts as
# "not AIM"; without it a NaN in the mask makes the `~` negation raise.
non_aim_snps = df_bed.loc[
    ~df_bed["mutation"].str.contains("AIM", na=False), "end"
].to_numpy()

# Genotypes and per-SNP annotation for the target VCF.
snp_df, geno = vcf_to_snp_dataframe(vcf_path, metadata, platform=platform)

# Frequency threshold passed to calculate_frequencies_cohort as `af_filter`.
maf_threshold = 0.02

# One frequency table per cohort column; concatenated into the summary table later.
frq_dfs = []
for cohort_col in cohort_cols:

    freq_df = calculate_frequencies_cohort(
        snp_df=snp_df,
        metadata=metadata,
        geno=geno,
        cohort_col=cohort_col,
        af_filter=maf_threshold,
        missense_filter=False
    )

    # Order rows by contig, then position.
    freq_df['contig'] = pd.Categorical(freq_df['contig'])
    freq_df = freq_df.sort_values(by=['contig', 'pos'])
    # NOTE(review): this matches on position only, not contig — confirm target
    # positions are unique across contigs.
    freq_df = freq_df.query("pos in @non_aim_snps")
    # The table is recorded even when empty; only the plot is skipped below.
    frq_dfs.append(freq_df.reset_index(drop=True))

    if freq_df.empty:
        print(f"No variants found after filtering for cohort {cohort_col} at maf > {maf_threshold}")
        continue

    # Heatmap of the frequency columns only, saved to disk and shown inline.
    fig = plot_allele_frequencies(
        df=freq_df.filter(like='frq_'),
        cohort_col=cohort_col,
        colscale="Reds"
    )
    fig.write_image(f"{wkdir}/results/allele_frequencies_{cohort_col}.png", scale=2)
    fig.show()


#### SNP frequency summary table

This table summarizes allele frequencies across all cohorts.

In [None]:
# Raise pandas' display limits so the summary table below is shown without truncation
# (up to 200 rows and 100 columns).
pd.options.display.max_rows = 200
pd.options.display.max_columns = 100

# Stack the per-cohort-column frequency tables, write them out as TSV, and
# display the result as the cell output.
snp_df = pd.concat(frq_dfs)
snp_df.to_csv(f"{wkdir}/results/snp_frequencies_summary.tsv", sep="\t")

snp_df


#### Allele frequencies of any SNPs across amplicons

This heatmap shows the allele frequencies of missense mutations found anywhere within the amplicons, not only at the target SNPs. Missense mutations change the amino acid sequence and can therefore affect protein function. Only the first cohort column is used for this plot.

In [None]:
# NOTE: this cell rebinds `vcf_path`, `snp_df` and `geno` to the amplicon-wide VCF.
# Re-running the target-SNP cell afterwards without first re-running the
# parameters cell would therefore read the amplicon VCF instead.
vcf_path = f"{wkdir}/results/vcfs/amplicons/{dataset}.annot.vcf"
# Only the first cohort column is used for this plot.
cohort_col = cohort_cols[0]

snp_df, geno = vcf_to_snp_dataframe(vcf_path, metadata, platform=platform, filter_missing=0.4)

# Frequencies per cohort, restricted to missense variants.
snp_freq_df = calculate_frequencies_cohort(
    snp_df=snp_df,
    metadata=metadata,
    geno=geno,
    cohort_col=cohort_col,
    af_filter=0.01,
    missense_filter=True
)

# Keep only the frequency columns and remove the "frq_" text from their names.
snp_freq_df = snp_freq_df.filter(like='frq')
snp_freq_df.columns = snp_freq_df.columns.str.replace("frq_", "")

# Same guard as in the target-SNP cell: skip the plot when nothing passed the filters.
if snp_freq_df.empty:
    print(f"No missense variants found across amplicons for cohort {cohort_col} after filtering")
else:
    fig = plot_allele_frequencies(
        df=snp_freq_df.sort_index(),
        cohort_col=cohort_col
    )
    fig.show()
