In [None]:
import plotly.express as px
import allel
import numpy as np
import pandas as pd

def load_vcf(vcf_path, metadata):
    """
    Load a VCF, keeping only samples listed in the metadata and non-indel variants.

    Parameters
    ----------
    vcf_path : str
        Path to the VCF file, passed to ``allel.read_vcf``.
    metadata : pandas.DataFrame
        Sample metadata with a ``sampleID`` column. Only VCF samples whose ID
        appears in that column are retained.

    Returns
    -------
    geno : allel.GenotypeArray
        Genotype calls restricted to the retained samples and non-indel variants.
    pos : numpy.ndarray
        ``variants/POS`` values of the retained variants.
    contig : numpy.ndarray
        ``variants/CHROM`` values of the retained variants.
    samples : numpy.ndarray
        IDs of the retained samples, in VCF order (same order as the sample
        axis of ``geno``).

    Raises
    ------
    ValueError
        If ``allel.read_vcf`` returns ``None``, i.e. no variant records were read.

    Notes
    -----
    The VCF must provide an ``INDEL`` INFO flag (read as ``variants/INDEL``);
    it is used to drop indel records.
    """
    # load every available field; INDEL is an INFO flag, not a default field
    vcf = allel.read_vcf(vcf_path, fields='*')
    if vcf is None:
        raise ValueError(f"No variant records could be read from VCF: {vcf_path}")

    samples = vcf['samples']

    # keep only samples that are present in the (QC-passed) metadata
    sample_mask = np.isin(samples, metadata.sampleID)
    geno = allel.GenotypeArray(vcf['calldata/GT']).compress(sample_mask, axis=1)

    # drop indel records from genotypes and from the per-variant arrays
    not_indel = ~vcf['variants/INDEL']
    geno = geno.compress(not_indel, axis=0)
    pos = vcf['variants/POS'][not_indel]
    contig = vcf['variants/CHROM'][not_indel]

    return geno, pos, contig, samples[sample_mask]

In [None]:
# --- Notebook parameters ---
# Input files
vcf_path = "../../results/vcfs/amplicons/ampseq-vigg-01.annot.vcf"
metadata_path = "../../results/config/metadata.qcpass.tsv"

# Dataset identifier, and the metadata column used to group samples into cohorts
dataset = 'vigg-01'
cohort_column = 'location'

### Estimating genetic diversity

This page calculates genetic diversity in individuals and cohorts. 

Calculating genetic diversity from ag-vampir amplicons is tricky: because so many of the amplicons target Vgsc, results will be biased by the presence of selective sweeps, and also by the ancestry-informative markers (AIMs) included in the panel.

In [None]:
# Load sample metadata; the reader is chosen from the file extension.
if metadata_path.endswith('.xlsx'):
    metadata = pd.read_excel(metadata_path, engine='openpyxl')
elif metadata_path.endswith('.tsv'):
    metadata = pd.read_csv(metadata_path, sep="\t")
elif metadata_path.endswith('.csv'):
    metadata = pd.read_csv(metadata_path, sep=",")
else:
    # list every accepted extension (.tsv is supported too) and name the file
    raise ValueError(
        f"Metadata file must be .xlsx, .tsv or .csv, got: {metadata_path}"
    )

# Genotypes, variant positions/contigs and sample IDs, restricted to samples
# present in the metadata and to non-indel variants.
geno, pos, contig, samples = load_vcf(vcf_path, metadata)

In [None]:
# Nucleotide diversity (pi) per sample and per cohort.
#
# Each retained variant is given a consecutive dummy coordinate instead of
# its genomic position, so the value is computed over the number of variant
# sites rather than over the genomic span between amplicons.
site_index = np.arange(len(pos))


def _pi_for_columns(columns):
    """Sequence diversity over the given sample columns of ``geno``."""
    ac = geno.take(columns, axis=1).count_alleles()
    return allel.sequence_diversity(ac=ac, pos=site_index)


# --- per sample ---
sample_pis = [_pi_for_columns([i]) for i in range(len(samples))]
sample_pi_df = pd.DataFrame({'sampleID': samples, 'pi': sample_pis})

# --- per cohort ---
# Map every genotype column to its cohort through the sample ID. Row positions
# in `metadata` must not be used as column indices: `geno` follows the VCF
# sample order and only holds samples found in both the VCF and the metadata.
cohort_by_sample = dict(zip(metadata['sampleID'], metadata[cohort_column]))
sample_cohorts = pd.Series(samples).map(cohort_by_sample)

coh_idxs = {}
for coh in metadata[cohort_column].dropna().unique():
    idx = np.flatnonzero((sample_cohorts == coh).to_numpy())
    if idx.size > 0:  # skip cohorts without any genotyped sample
        coh_idxs[coh] = idx
cohs = np.array(list(coh_idxs), dtype=object)

cohort_pis = [_pi_for_columns(coh_idxs[coh]) for coh in cohs]
cohort_pi_df = pd.DataFrame({'cohort': cohs, 'pi': cohort_pis})

#### By cohort

In [None]:
# Bar chart of diversity (pi) for each cohort
px.bar(
    cohort_pi_df,
    x='cohort',
    y='pi',
    template='simple_white',
    width=600,
    height=400,
)

In [None]:
# Per-sample diversity table (columns: sampleID, pi), displayed as the cell output
sample_pi_df