In [None]:
import plotly.express as px
import allel
import numpy as np
import pandas as pd

In [57]:
# Notebook parameters. NOTE(review): the empty path defaults suggest these
# values are injected by an external runner before execution — confirm.
dataset = 'gaard-sanger'              # label used in the heatmap title
metadata_path = "config/metadata.tsv"  # tab-separated sample metadata table
cohort_column = 'location'            # metadata column whose values define the cohorts
bed_path = ""                         # tab-separated, header-less target table (BED-like); must be set before running
vcf_path = ""                         # VCF file holding the genotype calls; must be set before running

### Plotting allele frequencies

This page shows the per-cohort allele frequencies of the SNPs genotyped by the amplicon sequencing protocol.

In [84]:
# Load the amplicon target table, the sample metadata and the VCF calls, then
# build `freq_df`: one row per (cohort, target SNP) holding the frequency of
# the reference allele and of the first alternate allele.

# The target table has no header row; column names are assigned below. The
# contig column is read as text so that numeric-looking contig names still
# match the string CHROM values returned by scikit-allel in the merge.
bed_df = pd.read_csv(bed_path, sep="\t", header=None, dtype={0: str})
bed_df.columns = ['contig', 'start', 'pos', 'amplicon_id', 'target_id']
metadata = pd.read_csv(metadata_path, sep="\t")

vcf = allel.read_vcf(vcf_path)
if vcf is None:
    # scikit-allel returns None (not an empty dict) when the VCF has no variants.
    raise ValueError(f"No variants could be read from VCF: {vcf_path!r}")

geno = allel.GenotypeArray(vcf['calldata/GT'])
pos = vcf['variants/POS']
contig = vcf['variants/CHROM']

# Keep each variant's row number in the genotype array so that, after the
# inner merge drops non-target variants (or repeats a variant matching several
# targets), every target row can still be paired with its own frequencies.
vcf_var_df = pd.DataFrame({
    'contig': contig,
    'pos': pos,
    'variant_idx': np.arange(len(pos)),
})
vcf_var_df = vcf_var_df.merge(bed_df, on=['contig', 'pos'], how='inner')
variant_idx = vcf_var_df['variant_idx'].to_numpy()
target_ids = vcf_var_df['target_id'].to_numpy()

# NOTE(review): cohort membership uses metadata *row positions* as VCF sample
# indices, i.e. it assumes the metadata rows are in the same order as the VCF
# sample columns — confirm, or reorder metadata by vcf['samples'].
pops = metadata[cohort_column].unique()
cohort_sample_idx = {
    pop: np.where(metadata[cohort_column] == pop)[0]
    for pop in pops
}

# Force at least two allele columns (ref + first alt): otherwise a VCF in which
# no alternate allele is ever called yields a single-column count array and
# the `[:, 1]` lookup below fails.
ac = geno.count_alleles_subpops(cohort_sample_idx, max_allele=max(1, int(geno.max())))

# Per-cohort allele frequencies, one row per VCF variant.
pop_dict = {pop: ac[pop].to_frequencies() for pop in pops}

freq_df = []
for pop in pops:
    pop_freqs = pop_dict[pop]
    df = pd.DataFrame({'cohort': pop,
                       'mutation': target_ids,
                       'ref': pop_freqs[variant_idx, 0],
                       # Only the first alternate allele is reported.
                       'alt': pop_freqs[variant_idx, 1]})
    freq_df.append(df)

freq_df = pd.concat(freq_df)


invalid INFO header: '##INFO=<ID=VDB,Number=1,Type=Float,Description="Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)",Version="3">\n'



In [89]:
# Reshape the long frequency table into a mutation x cohort matrix of
# alternate-allele frequencies (rounded for the cell labels) and draw it as a
# heatmap. `values='alt'` already selects the column to spread, so the 'ref'
# column does not need to be dropped first.
df = freq_df.pivot(index='mutation', columns='cohort', values='alt').round(2)

fig = px.imshow(
    img=df,
    zmin=0,  # frequencies are bounded to [0, 1]; fix the colour range accordingly
    zmax=1,
    width=400,
    height=1600,
    text_auto=True,  # print each frequency inside its cell
    # Documented values are 'equal', 'auto' or None; 'auto' lets the cells
    # stretch to fill the fixed width/height instead of being forced square.
    aspect="auto",
    color_continuous_scale="Reds",
    title=f"{dataset} allele frequencies",
)
# The numbers are printed in the cells, so the colour bar is redundant.
fig.update_layout(coloraxis_showscale=False)

fig