In [None]:
import allel
import pandas as pd
import numpy as np
import plotly.express as px

def load_vcf(vcf_path, metadata):
    """
    Load a VCF, keep only the samples listed in the metadata, and drop indels.

    Parameters
    ----------
    vcf_path : str
        Path to the VCF file; passed straight to ``allel.read_vcf``.
    metadata : pandas.DataFrame
        Sample metadata. Only VCF samples whose name appears in its
        ``sampleID`` column are retained.

    Returns
    -------
    geno : allel.GenotypeArray
        Genotype calls restricted to the retained samples (axis 1) and to
        variants not flagged ``INDEL`` (axis 0).
    pos : numpy.ndarray
        ``variants/POS`` of the retained variants.
    contig : numpy.ndarray
        ``variants/CHROM`` of the retained variants.
    samples : numpy.ndarray
        Names of the retained samples, in VCF order, i.e. aligned with the
        columns of ``geno``.

    Raises
    ------
    ValueError
        If no variant records could be read from ``vcf_path``.
    """
    vcf = allel.read_vcf(vcf_path, fields='*')
    # read_vcf returns None (rather than an empty dict) when the file holds no
    # variant records; fail with a clear message instead of a TypeError below.
    if vcf is None:
        raise ValueError(f"No variants could be read from VCF: {vcf_path}")

    samples = vcf['samples']

    # Keep only samples present in the metadata table. Compare as strings so
    # that numeric-looking IDs parsed as integers by pandas still match the
    # (string) sample names from the VCF header.
    metadata_ids = metadata['sampleID'].astype(str).to_numpy()
    sample_mask = np.isin(samples, metadata_ids)

    geno = allel.GenotypeArray(vcf['calldata/GT']).compress(sample_mask, axis=1)
    pos = vcf['variants/POS']
    contig = vcf['variants/CHROM']

    # Drop variants carrying the INDEL INFO flag, keeping all per-variant
    # arrays aligned with the genotype rows.
    not_indel = ~vcf['variants/INDEL']
    geno = geno.compress(not_indel, axis=0)
    pos = pos[not_indel]
    contig = contig[not_indel]

    return geno, pos, contig, samples[sample_mask]

In [None]:
# Input files, given as relative paths.
vcf_path = "../../../results/vcfs/targets/ampseq-vigg01.annot.vcf"
metadata_path = "../../../results/config/metadata.qcpass.tsv"
bed_targets_path = "../../../config/ag-vampir.bed"

# Working directory (not referenced in the cells visible here).
wkdir = "../.."

# Comma-separated list; presumably metadata columns used to define cohorts
# in later cells -- TODO confirm.
cohort_cols = "taxon,location"

### Species ID

In [None]:
# Sample metadata (tab-separated); load_vcf reads its sampleID column.
metadata = pd.read_csv(metadata_path, sep="\t")

# Amplicon targets: a headerless, tab-separated BED file with five columns.
targets = pd.read_csv(bed_targets_path, sep="\t", header=None).set_axis(
    ['contig', 'start', 'end', 'amplicon', 'mutation'], axis=1
)

# Genotypes, positions, contigs and sample names, restricted to metadata samples.
geno, pos, contig, samples = load_vcf(vcf_path=vcf_path, metadata=metadata)

In [None]:
# Select the targets whose mutation label contains 'AIM'. na=False treats a
# missing label as "not an AIM" instead of breaking the boolean filter.
aim_targets = targets[targets['mutation'].str.contains('AIM', na=False)]

# Match variants on (contig, position) rather than position alone: the same
# coordinate can occur on more than one contig, which would otherwise pull
# unrelated sites into the AIM panel.
aim_sites = {
    (str(c), int(p)) for c, p in zip(aim_targets['contig'], aim_targets['end'])
}
aim_mask = np.fromiter(
    ((str(c), int(p)) in aim_sites for c, p in zip(contig, pos)),
    dtype=bool,
    count=len(pos),
)
aim_gn = geno.compress(aim_mask, axis=0)

# One row per AIM variant, one column per sample; values are alt-allele counts
# with -1 marking a missing call.
aim_df = (
    pd.DataFrame(aim_gn.to_n_alt(fill=-1))
    .assign(pos=pos[aim_mask], contig=contig[aim_mask])
    .set_index(['pos', 'contig'])
)

# Reorder rows by chromosome arm. Variants on contigs outside this list are
# dropped. The loop variable is deliberately not called `contig`, so it cannot
# be confused with the notebook-level `contig` array.
contig_order = ['2R', '2L', '3R', '3L', 'X']
contig_level = aim_df.index.get_level_values('contig')
aim_df = pd.concat([aim_df[contig_level == chrom] for chrom in contig_order])

aim_index = aim_df.reset_index()
aim_pos = aim_index['pos']
aim_contig = aim_index['contig']

x_label = [f"{c}:{p}" for c, p in zip(aim_contig, aim_pos)]

# Heatmap with samples on the y axis and AIM sites on the x axis.
fig = px.imshow(
    aim_df.values.T,
    x=x_label,
    y=samples,
    color_continuous_scale='blackbody_r',
    aspect='auto',
    title='gamb vs colu AIM genotypes',
    width=1000,
    height=800,
)
fig.show()

### Species assignments by cohorts