In [1]:
import allel
import numpy as np
import pandas as pd 
import plotly.express as px

def pca(metadata_path, vcf_path, n_components=10):
    """
    Load genotype data and run PCA.

    Parameters
    ----------
    metadata_path : str or file-like
        Tab-separated sample metadata table, read with ``pd.read_csv``.
    vcf_path : str
        Path to the VCF file whose ``calldata/GT`` field holds the genotypes.
    n_components : int, optional
        Number of principal components to compute (default 10).

    Returns
    -------
    pca_df : pandas.DataFrame
        The metadata table with one ``PC<k>`` column appended per component.
    model : object
        The fitted model returned by ``allel.pca``.
    """
    metadata = pd.read_csv(metadata_path, sep="\t")
    vcf = allel.read_vcf(vcf_path)

    geno = allel.GenotypeArray(vcf['calldata/GT'])
    # 2-D matrix of alternate-allele counts, indexed (variant, sample)
    gn_alt = geno.to_n_alt()

    print("removing any invariant sites")
    # a site is kept when at least one sample differs from the first sample
    loc_var = np.any(gn_alt != gn_alt[:, 0, np.newaxis], axis=1)
    gn_var = np.compress(loc_var, gn_alt, axis=0)

    coords, model = allel.pca(gn_var, n_components=n_components)

    # The sign of a principal component is arbitrary. Orient EVERY component
    # the same way by flipping those whose minimum has a larger magnitude
    # than their maximum (previously only the last component was checked).
    flip = np.abs(coords.min(axis=0)) > np.abs(coords.max(axis=0))
    coords[:, flip] *= -1

    pca_df = pd.DataFrame(
        coords,
        columns=[f"PC{pc + 1}" for pc in range(coords.shape[1])],
    )
    # NOTE(review): the concat aligns on the row index, i.e. positionally for
    # a freshly read table — this assumes metadata rows are in the same order
    # as the VCF samples; confirm against the pipeline that writes both files.
    pca_df = pd.concat([metadata, pca_df], axis=1)

    return pca_df, model

In [33]:
# Tab-separated sample metadata table.
metadata_path = "../../config/metadata.tsv"

# Dataset identifier; it selects which merged VCF is analysed.
dataset = 'gaard-agamdao'
vcf_path = f"../../results/vcfs/{dataset}.merged.vcf"

## PCA

In this notebook, we run a principal components analysis on the amplicon sequencing variant data, plotting PC1 v PC2 and PC3 v PC4, and the variance explained by the model.

In [44]:
# Fit the PCA once; the plotting cells below reuse both the coordinates
# table and the fitted model.
pca_df, model = pca(metadata_path=metadata_path, vcf_path=vcf_path)

removing any invariant sites



invalid INFO header: '##INFO=<ID=VDB,Number=1,Type=Float,Description="Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)",Version="3">\n'



### Variance explained

As a rule of thumb, PCs beyond the point where the variance explained per PC begins to flatten out (the "elbow" of the bar chart) are no longer informative.

In [46]:
# Label the bars PC1..PCn (rather than a bare 0-based index) and name both
# axes so the figure can be read on its own.
px.bar(
    x=[f"PC{i + 1}" for i in range(len(model.explained_variance_ratio_))],
    y=model.explained_variance_ratio_,
    labels={"x": "Principal component", "y": "Proportion of variance explained"},
    title=f"PCA variance explained {dataset}",
)

### PC1 v PC2

In [47]:
# First two components, coloured by taxon; hovering a point also shows its
# 'country' and 'location' metadata columns.
fig = px.scatter(
    data_frame=pca_df,
    x='PC1',
    y='PC2',
    color='taxon',
    hover_data=['country', 'location'],
    title=f"PCA {dataset}",
)

fig

### PC3 v PC4

In [48]:
# Third and fourth components, coloured by taxon; hovering a point also shows
# its 'country' and 'location' metadata columns.
fig = px.scatter(
    data_frame=pca_df,
    x='PC3',
    y='PC4',
    color='taxon',
    hover_data=['country', 'location'],
    title=f"PCA {dataset}",
)

fig