In [None]:
import plotly.express as px
import allel
import numpy as np
import pandas as pd

In [None]:
# Notebook parameters. The names are read by the cells below, so keep them unchanged.

# Label for this run; it is interpolated into the heatmap title.
dataset = "gaard-sanger"

# Metadata column whose distinct values define the cohorts.
cohort_column = "location"

# Input files (paths are relative to the notebook's working directory).
metadata_path = "../../config/metadata.tsv"  # sample metadata (.xlsx, .tsv or .csv)
bed_path = "../../config/AgamDao.bed"  # headerless, tab-separated target table
vcf_path = "../../results/vcfs/targets/calvin2.annot.vcf"  # VCF holding the genotype calls

### Plotting allele frequencies

This page shows, for each cohort, the alternate-allele frequency of every SNP genotyped by the amplicon sequencing protocol (indels are excluded).

In [None]:
# Build `freq_df`: one row per (cohort, target SNP) with the reference and first-alternate
# allele frequencies, computed from the VCF genotypes and labelled with BED target IDs.

# Load the target table. The BED file is read without a header row and given five column
# names; 'contig' and 'pos' are the keys matched against the VCF CHROM/POS fields below.
bed_df = pd.read_csv(bed_path, sep="\t", header=None)
bed_df.columns = ['contig', 'start', 'pos', 'amplicon_id', 'target_id']

# load metadata, choosing the reader from the file extension
if metadata_path.endswith('.xlsx'):
    metadata = pd.read_excel(metadata_path, engine='openpyxl')
elif metadata_path.endswith('.tsv'):
    metadata = pd.read_csv(metadata_path, sep="\t")
elif metadata_path.endswith('.csv'):
    metadata = pd.read_csv(metadata_path, sep=",")
else:
    raise ValueError("Metadata file must be .xlsx, .tsv or .csv")

# load vcf and get genotypes and positions
vcf = allel.read_vcf(vcf_path, fields=['calldata/GT', 'variants/POS', 'variants/CHROM', 'variants/INFO'])
if vcf is None:
    # read_vcf returns None (rather than empty arrays) when the file has no variant records
    raise ValueError(f"No variants found in VCF: {vcf_path}")
geno = allel.GenotypeArray(vcf['calldata/GT'])
pos = vcf['variants/POS']
contig = vcf['variants/CHROM']
indel = vcf['variants/INDEL']  # raises KeyError if the VCF has no INDEL INFO field

# remove indels
# make dataframe of variant positions and merge with bed
# 'vcf_idx' remembers each variant's row in the genotype array, so that the genotypes can
# be subset to exactly the rows (and row order) that survive the merge. Filtering the
# genotypes by `~indel` alone would misalign them with `vcf_var_df` whenever a SNP is
# missing from the BED file or matches more than one BED row.
vcf_var_df = pd.DataFrame({'contig': contig, 'pos': pos, 'vcf_idx': np.arange(len(pos))})
vcf_var_df = vcf_var_df[~indel].merge(bed_df, on=['contig', 'pos'])
if vcf_var_df.empty:
    raise ValueError("No SNPs in the VCF match a (contig, pos) entry in the BED file")
geno = geno.take(vcf_var_df.pop('vcf_idx').to_numpy(), axis=0)

# get indices of each population
# NOTE(review): cohorts are assigned by metadata row position, i.e. this assumes that
# metadata rows are in the same order as the samples in the VCF - confirm, or align on
# sample ID.
pops = metadata[cohort_column].unique()
pop_idx = {pop: np.where(metadata[cohort_column] == pop)[0] for pop in pops}

# get allele counts for each population
# Force at least two allele columns (ref + alt) so that the `[:, 1]` lookup below is valid
# even when no alternate allele is called at any retained site.
max_allele = max(int(geno.max()), 1)
ac = geno.count_alleles_subpops(pop_idx, max_allele=max_allele)

# convert to frequencies (pop_dict maps cohort -> per-variant allele frequency array)
pop_dict = {pop: ac[pop].to_frequencies() for pop in pops}

# make dataframe of allele frequencies
# Column 0 is the reference allele and column 1 the first alternate allele; any further
# alternate alleles at multi-allelic sites are not reported.
freq_dfs = []
for pop in pops:
    df = pd.DataFrame({'cohort': pop,
                       'mutation': vcf_var_df['target_id'],
                       'ref': pop_dict[pop][:, 0],
                       'alt': pop_dict[pop][:, 1]})
    freq_dfs.append(df)

# concatenate dataframes
freq_df = pd.concat(freq_dfs)

In [None]:
# Reshape the long table into a mutation x cohort matrix of alt-allele frequencies,
# rounded to two decimals for the cell labels.
df = (
    freq_df
    .drop(columns='ref')
    .pivot(index='mutation', columns='cohort', values='alt')
    .round(2)
)

# Heatmap settings: colour range fixed to [0, 1] and each cell annotated with its value.
# NOTE(review): aspect=1 is not one of the documented 'equal'/'auto' values - confirm intent.
heatmap_kwargs = dict(
    img=df,
    zmin=0,
    zmax=1,
    width=400,
    height=1600,
    text_auto=True,
    aspect=1,
    color_continuous_scale="Reds",
    title=f"{dataset} allele frequencies",
)
fig = px.imshow(**heatmap_kwargs)

# Hide the colour bar.
fig.update_layout(coloraxis_showscale=False)

fig