In [None]:
#!/usr/bin/env python3

"""
A notebook to calculate windowed genetic diversity statistics (nucleotide diversity Pi,
Watterson's Theta) and inbreeding coefficients from filtered VCFs, one contig at a time.
Currently not modularised further to reduce the repetition of loading and filtering VCFs (which is slow).
"""
import sys
# Make the project-local `rnaseqpoptools` module importable. Two relative locations are
# registered so the import resolves whether the working directory is the one containing
# `workflow/` or a directory one level below it.
sys.path.insert(1, 'workflow/scripts/')
sys.path.insert(2, '../workflow/scripts/')
import rnaseqpoptools as rnaseqpop

import pandas as pd
import numpy as np
import allel
from collections import defaultdict

In [None]:
# Read in parameters from snakemake
# NOTE(review): the values below are hard-coded defaults; presumably they are overridden
# when the workflow executes this notebook — confirm how parameters are injected.

# Dataset name; used as the prefix of the per-contig VCF file names.
dataset = "Ag_Busia"
# Sample metadata table, loaded with rnaseqpop.load_metadata.
metadata_path = "../../config/samples_oldnames.tsv"
# Passed to the VCF reader; the inbreeding coefficient is only computed when ploidy > 1.
ploidy = 10
# Workflow YAML config; the "contigs" entry lists the contigs to analyse.
config_path = "../../config/config.yaml"
# Passed as `qualflt` to rnaseqpop.readAndFilterVcf (presumably a minimum quality — confirm).
qualflt = 30
# Passed as `missingfltprop` to rnaseqpop.readAndFilterVcf (presumably a missingness
# proportion threshold — confirm the direction of the filter).
missingprop = 0.8

In [None]:
import yaml

# Load the sample metadata and order it by species.
metadata = rnaseqpop.load_metadata(metadata_path)
metadata = metadata.sort_values(by='species')
numbers = rnaseqpop.get_numbers_dict(ploidy)

# The workflow config supplies the list of contigs to analyse.
with open(config_path) as params_file:
    config_params = yaml.safe_load(params_file)

contigs = config_params["contigs"]

# Initialise dicts to store genetic diversity statistics, keyed by contig.
pi = {}
theta = {}
coefdictchrom = {}
# Per-window inbreeding coefficients for each population, accumulated across ALL contigs.
# This must live outside the contig loop: re-creating it per contig would leave only the
# last contig's windows available for the genome-wide average computed below.
allcoef = defaultdict(list)

for contig in contigs:
    # Read in and filter the VCF for this contig.
    path = f"results/variantAnalysis/vcfs/{dataset}.{contig}.vcf.gz"
    vcf, geno, acsubpops, pos, alts, depth, snpeff, subpops, populations = rnaseqpop.readAndFilterVcf(
        path=path,
        contig=contig,
        samples=metadata,
        ploidy=ploidy,
        qualflt=qualflt,
        missingfltprop=missingprop,
    )

    # Windowed diversity statistics (Pi, Watterson's Theta) in 20 kb windows.
    pi[contig] = rnaseqpop.windowedDiversity(geno=geno, pos=pos, subpops=subpops, statistic='pi', window_size=20_000)
    theta[contig] = rnaseqpop.windowedDiversity(geno=geno, pos=pos, subpops=subpops, statistic='theta', window_size=20_000)

    # The inbreeding coefficient is only computed when ploidy > 1.
    if ploidy > 1:
        coefdict = {}
        for pop in metadata['treatment'].unique():
            gn = geno.take(subpops[pop], axis=1)
            # Moving windows of 1000 variants with a step of 100 variants; each window
            # yields per-variant coefficients which are averaged (ignoring NaNs) per window.
            coef = allel.moving_statistic(gn, statistic=allel.inbreeding_coefficient, size=1000, step=100)
            coef = np.nanmean(coef, axis=1)
            coefdict[pop] = np.mean(coef)
            allcoef[pop].append(np.array(coef))
            print(f"{pop} | {contig} | Inbreeding Coef =", coefdict[pop], "\n")
        coefdictchrom[contig] = dict(coefdict)

# Concat contigs, get CIs for Pi and Theta and save to file
pi_df = rnaseqpop.diversity_ci_table(div_dict=pi, statistic='pi')
pi_df.to_csv("results/variantAnalysis/diversity/SequenceDiversity.tsv", sep="\t", index=True)
theta_df = rnaseqpop.diversity_ci_table(div_dict=theta, statistic='theta')
theta_df.to_csv("results/variantAnalysis/diversity/WattersonsTheta.tsv", sep="\t", index=True)

if ploidy > 1:
    # Per-contig mean inbreeding coefficients.
    # NOTE(review): flip_dict presumably swaps the nesting from contig -> population to
    # population -> contig before tabulating — confirm against rnaseqpoptools.
    coefdictchrom = rnaseqpop.flip_dict(coefdictchrom)
    pd.DataFrame.from_dict(coefdictchrom).to_csv("results/variantAnalysis/diversity/inbreedingCoef.tsv", sep="\t", index=True)

    # Get genome wide average stats: pool the windows of every contig for each population.
    # Contigs contribute different numbers of windows, so the per-contig arrays are
    # concatenated into one flat array before averaging.
    for pop in list(allcoef):
        allcoef[pop] = np.nanmean(np.concatenate(allcoef[pop]))

    coefdf = pd.DataFrame.from_dict(allcoef, orient='index', columns=['InbreedingCoefficient'])
    coefdf.to_csv("results/variantAnalysis/diversity/inbreedingCoef.mean.tsv", sep="\t", index=True)

# Genetic Diversity 

**Output Directory:** <span style="color:gray;font-weight:bold">*results/variantAnalysis/diversity/*</span>

**Rules**

<span style="color:gray;font-weight:bold">
    
* *variantAnalysis.smk*
    * SummaryStatistics  

</span>    
    
**Introduction** 

Genetic diversity is an important concept in population genetics, and it refers to the amount of genetic variation in a population. This variation can take many forms, including single nucleotide polymorphisms (SNPs), small insertions or deletions (INDELs), and larger structural changes and rearrangements. 

One measure of genetic diversity is nucleotide diversity (aka pi), which is the average number of nucleotide differences per site between two randomly chosen genomes within a population. Another measure is Watterson's theta, an estimate of the population-scaled mutation rate that is derived from the number of segregating (polymorphic) sites observed in a sample, corrected for sample size. These measures can provide valuable insights into the evolutionary history of a population.

In RNA-Seq-Pop, we calculate genetic diversity from the called SNPs using [scikit-allel](https://scikit-allel.readthedocs.io/en/stable/). As indel calling is unreliable from RNA-Seq data, we filter out any indel calls prior to variant analysis. 

### Pi

In [None]:
# Display the Pi table built by rnaseqpop.diversity_ci_table (also saved to SequenceDiversity.tsv).
pi_df

### Theta

In [None]:
# Display the Theta table built by rnaseqpop.diversity_ci_table (also saved to WattersonsTheta.tsv).
theta_df