In [None]:
#!/usr/bin/env python3

"""
A script to calculate windowed population genetic statistics (Fst, PBS).
"""
import sys
sys.path.insert(1, 'workflow/scripts/')
sys.path.insert(2, '../workflow/scripts/')
import rnaseqpoptools as rnaseqpop

import pandas as pd
import plotly.express as px
import numpy as np
import allel
from pathlib import Path

In [None]:
# Notebook parameters. The empty defaults are placeholders; they are
# presumably overridden when the workflow executes the notebook
# (NOTE(review): confirm the injection mechanism).

# Input locations
dataset = ""        # prefix of the per-contig VCF file names
metadata_path = ""  # sample metadata, read with rnaseqpop.load_metadata
config_path = ""    # workflow YAML config (contigs, contrasts, PBS settings)

# VCF filtering options forwarded to rnaseqpop.readAndFilterVcf
ploidy = ""
qualflt = ""
missingprop = ""

# Whether to compute the windowed population branch statistic as well as Fst
pbs = False

# Selection

**Output Directory:**  <span style="color:gray;font-weight:bold">*results/variantAnalysis/selection/*</span>

**Rules**  
    
<span style="color:gray;font-weight:bold">

* *variantAnalysis.smk*
    * PerGeneFstPBSDxyPi
    * WindowedFstPBS

</span>

**Introduction** 

Selection is a process in natural populations whereby certain genetic variants are more likely than others to be passed on to the next generation because they confer a fitness advantage.

One method of identifying selection is to calculate Fst between two populations. Fst is a measure of genetic differentiation. It is calculated as the ratio of the genetic variance between populations to the total genetic variance within and between populations. By calculating Fst, we can identify genes or regions of the genome showing high differentiation, which may indicate selection acting upon the locus.

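In RNA-Seq-Pop, we calculate Fst in windows and for each gene. The windowed analysis is shown below. When enabled in the configuration, the population branch statistic (PBS) is calculated in the same sliding windows.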

In [None]:
import yaml


def _touch_placeholder(path):
    """Create an empty file at *path*, creating missing parent directories first."""
    path = Path(path)
    # touch() raises FileNotFoundError if the directory does not exist yet,
    # e.g. when no plot has been written for this window size so far.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.touch()


# Load the workflow configuration (contigs, contrasts, PBS comparisons).
with open(config_path) as params_file:
    config_params = yaml.safe_load(params_file)

contigs = config_params["contigs"]

metadata = rnaseqpop.load_metadata(metadata_path)
metadata = metadata.sort_values(by='species')
numbers = rnaseqpop.get_numbers_dict(ploidy)

# Fst/PBS window definitions: window size and step are counted in SNPs.
windownames = ['1000snp_window', '2000snp_window', '5000snp_window']
windowsizes = [1000, 2000, 5000]
windowsteps = [500, 1000, 1000]

# Read in list of contrasts; each "<sus>_<res>" string becomes a [sus, res] pair.
comparisons = config_params["contrasts"]
comparisons = pd.DataFrame(comparisons, columns=['contrast'])
comparisons = comparisons.contrast.str.split("_", expand=True)
comparisons.columns = ['sus', 'res']
comparisons = comparisons.values.tolist()
pbscomps = config_params["VariantAnalysis"]['selection']['population-branch-statistic']['contrasts']

for contig in contigs:

    path = f"results/variantAnalysis/vcfs/{dataset}.{contig}.vcf.gz"
    vcf, geno, acsubpops, pos, alts, depth, snpeff, subpops, populations = rnaseqpop.readAndFilterVcf(
        path=path,
        contig=contig,
        samples=metadata,
        numbers=numbers,
        ploidy=ploidy,
        qualflt=qualflt,
        missingfltprop=missingprop,
    )

    # Window midpoints depend only on this contig's SNP positions and the
    # window settings, so compute them once here rather than once per
    # comparison. None marks windows larger than the number of SNPs available.
    n_snps = geno.shape[0]
    midpoints = {
        wname: (allel.moving_statistic(pos, np.median, size=size, step=step)
                if n_snps >= size else None)
        for wname, size, step in zip(windownames, windowsizes, windowsteps)
    }

    #### Fst in windows ####
    for sus, res in comparisons:
        name = sus + "_" + res
        cohortText = f"{sus} v {res}"
        print(f"Calculating Fst values in sliding windows for {name}\n")

        for wname, size, step in zip(windownames, windowsizes, windowsteps):
            midpoint = midpoints[wname]

            if midpoint is None:
                print(f"Skipping {wname} for {name} because there are not enough SNPs in {contig}.")
                print("Touching file to prevent snakemake from erroring out.")
                _touch_placeholder(f"results/variantAnalysis/selection/fst/{wname}/{name}.Fst.{contig}.svg")
                continue

            FstArray = allel.moving_hudson_fst(acsubpops[sus],
                                               acsubpops[res],
                                               size=size, step=step)

            cohortNoSpaceText = wname + "/" + name
            rnaseqpop.plotWindowed(statName="Fst",
                                   cohortText=cohortText,
                                   cohortNoSpaceText=cohortNoSpaceText,
                                   values=FstArray,
                                   midpoints=midpoint,
                                   colour='dodgerblue',
                                   prefix="results/variantAnalysis/selection/fst",
                                   contig=contig,
                                   ylim=1,
                                   save=True)

            # Interactive version of the same plot for display in the notebook.
            fig = px.line(y=FstArray, x=midpoint, title=f'Fst {cohortNoSpaceText} | {contig}')
            fig.show()

    #### Population Branch Statistic (PBS) in windows ####
    if pbs:
        for pbscomp in pbscomps:
            # Each PBS comparison is written "<pop1>_<pop2>_<outpop>".
            pop1, pop2, outpop = pbscomp.split("_")
            cohortText = f"(({pop1}, {pop2}), {outpop})"
            print(f"Calculating PBS values in sliding window for {pbscomp}\n")

            for wname, size, step in zip(windownames, windowsizes, windowsteps):
                midpoint = midpoints[wname]

                if midpoint is None:
                    print(f"Skipping {wname} for {pbscomp} because there are not enough SNPs in {contig}.")
                    print("Touching file to prevent snakemake from erroring out.")
                    _touch_placeholder(f"results/variantAnalysis/selection/pbs/{wname}/{pbscomp}.PBS.{contig}.svg")
                    continue

                pbsArray = allel.pbs(acsubpops[pop1],
                                     acsubpops[pop2],
                                     acsubpops[outpop],
                                     window_size=size, window_step=step, normed=True)

                cohortNoSpaceText = wname + "/" + pbscomp
                rnaseqpop.plotWindowed(statName="PBS",
                                       cohortText=cohortText,
                                       cohortNoSpaceText=cohortNoSpaceText,
                                       values=pbsArray,
                                       midpoints=midpoint,
                                       colour='dodgerblue',
                                       prefix="results/variantAnalysis/selection/pbs",
                                       contig=contig,
                                       ylim=0.5,
                                       save=True)

                # Interactive version of the same plot for display in the notebook.
                fig = px.line(y=pbsArray, x=midpoint, title=f'PBS {cohortNoSpaceText} | {contig}')
                fig.show()