In [None]:
#!/usr/bin/env python3

"""
A script to perform PCA on the genotype data
"""
import sys
sys.path.insert(1, 'workflow/scripts/')
sys.path.insert(2, '../workflow/scripts/')
import rnaseqpoptools as rnaseqpop
import pandas as pd 
import plotly.express as px
import numpy as np

In [None]:
# Notebook parameters.
# The empty strings are placeholders; presumably the real values are injected
# by the snakemake workflow when the notebook is executed — TODO confirm the
# injection mechanism (e.g. a tagged "parameters" cell).

# Dataset name; used to build the per-contig VCF path and the plot titles.
dataset = ""
# Path handed to rnaseqpop.load_metadata.
metadata_path = ""
# YAML config file; its "contigs" entry lists the contigs to run PCA on.
config_path = "config/config.yaml"
# Forwarded to get_numbers_dict, readAndFilterVcf and pca.
ploidy = ""
# Forwarded to readAndFilterVcf as `qualflt`.
qualflt = ""
# Forwarded to readAndFilterVcf as `missingfltprop`.
missingprop = ""

In [None]:
import yaml

# Load the sample metadata and order the rows by species.
metadata = rnaseqpop.load_metadata(metadata_path)
metadata = metadata.sort_values(by='species')
numbers = rnaseqpop.get_numbers_dict(ploidy)

# The contigs to analyse are listed in the workflow config file.
with open(config_path) as params_file:
    config_params = yaml.safe_load(params_file)

contigs = config_params["contigs"]

# NOTE(review): coords, model and pca_df are rebound on every iteration, so
# the plotting cells further down only ever show the last contig in `contigs`.
for contig in contigs:

    # Read in and filter the VCF for this contig
    path = f"results/variantAnalysis/vcfs/{dataset}.{contig}.vcf.gz"
    vcf, geno, acsubpops, pos, alts, depth, snpeff, subpops, populations = rnaseqpop.readAndFilterVcf(
        path=path,
        contig=contig,
        samples=metadata,
        ploidy=ploidy,
        qualflt=qualflt,
        missingfltprop=missingprop,
    )

    #### Principal Components Analysis (PCA) ####
    # Map each sample index to the name of the subpopulation that lists it
    sample_to_subpop = {
        sample_index: name
        for name, inds in subpops.items()
        for sample_index in inds
    }

    # Store the mapping as a dataframe and derive one colour per subpopulation name
    treatment_indices = (
        pd.DataFrame.from_dict(sample_to_subpop, orient='index')
        .reset_index()
        .rename(columns={'index': 'sample_index', 0: "name"})
    )
    pop_colours = rnaseqpop.get_colour_dict(treatment_indices['name'], "viridis")

    # Run the PCA helper from the rnaseqpoptools module
    print(f"Performing PCA on {dataset} chromosome {contig}")
    coords, model = rnaseqpop.pca(geno, contig, ploidy, dataset, populations, metadata, pop_colours, prune=True, scaler=None)

    # NOTE(review): pd.concat(axis=1) aligns rows on index labels, and
    # `metadata` keeps its pre-sort labels after sort_values — confirm this
    # matches the index / row order of `coords`.
    pca_df = pd.concat([metadata, coords], axis=1)

# Principal components analysis

PCA, or Principal Component Analysis, is a technique used in data analysis to simplify and reveal patterns in high-dimensional data. It does this by transforming the original data into a new coordinate system, where the first axis (principal component) captures the most significant variation in the data, the second axis captures the second most significant variation, and so on. By reducing the dimensionality of the data, researchers can identify population structure and relationships between samples, helping them understand how populations are related and how they have evolved over time.

### Variance explained

As a rule of thumb, once the variance explained by successive PCs begins to flatten out, the remaining PCs are no longer informative.

In [None]:
# Proportion of variance explained by each principal component.
# `model` is rebound on each pass of the contig loop, so this shows whichever
# contig was processed last.
explained_variance = model.explained_variance_ratio_

# Label the bars PC1..PCn so the axis matches the PC1/PC2/... column names
# used by the scatter plots, instead of a zero-based positional index.
pc_labels = [f"PC{k}" for k in range(1, len(explained_variance) + 1)]

fig = px.bar(
    x=pc_labels,
    y=explained_variance,
    labels={
        "x": "Principal Component",
        "y": "Variance Explained",
    },
    title=f"PCA {dataset} | Variance explained",
    template='simple_white',
)
fig.update_layout(showlegend=False)

fig

### PC1 vs PC2

In [None]:
# Scatter of the first two principal components, coloured by treatment.
# pca_df is rebound on each pass of the contig loop, so this draws whichever
# contig was processed last.
pc12_kwargs = dict(
    x="PC1",
    y="PC2",
    color="treatment",
    hover_data=pca_df.columns,  # expose every metadata/PC column on hover
    title=f"PCA {dataset} | PC1 vs PC2",
    template="simple_white",
)
fig = px.scatter(pca_df, **pc12_kwargs)

fig

### PC3 vs PC4

In [None]:
# Scatter of the third and fourth principal components, coloured by treatment.
# pca_df is rebound on each pass of the contig loop, so this draws whichever
# contig was processed last.
pc34_kwargs = dict(
    x="PC3",
    y="PC4",
    color="treatment",
    hover_data=pca_df.columns,  # expose every metadata/PC column on hover
    title=f"PCA {dataset} | PC3 vs PC4",
    template="simple_white",
)
fig = px.scatter(pca_df, **pc34_kwargs)

fig