In [None]:
#!/usr/bin/env python3

"""
Plots Karyotype data
"""
import sys
sys.path.insert(1, 'workflow/scripts/')
sys.path.insert(2, '../workflow/scripts/')

import rnaseqpoptools as rnaseqpop
import plotly.express as px
import pandas as pd 
import matplotlib.pyplot as plt

In [None]:
# Notebook parameters. The literals below are the defaults written in this cell;
# per the workflow they are expected to be supplied by snakemake.
dataset = "Ag_Busia"                                  # dataset label used in file names and plot titles
ploidy = 10                                           # divisor that turns a karyotype score into a frequency
config_path = "../../config/config.yaml"              # workflow configuration (YAML)
metadata_path = "../../config/samples_oldnames.tsv"   # sample metadata table

In [40]:
import yaml

# Parse the workflow configuration file into a plain Python structure
with open(config_path) as config_handle:
    config_params = yaml.safe_load(config_handle)

# Names of the inversions to report, as listed in the karyotype section of the config
karyotype_settings = config_params['variantAnalysis']['karyotype']
invs = karyotype_settings['inversions']

# Sample metadata loaded through the project helper, ordered by species
metadata = rnaseqpop.load_metadata(metadata_path).sort_values(by='species')

# Build one (sampleID, <inversion>_frequency) table per inversion.
karyo = {}
for inv in invs:
    karyo_path = f"results/karyotype/{inv}.{dataset}.karyo.txt"
    freq_col = f"{inv}_frequency"

    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    inv_df = pd.read_csv(karyo_path, sep=r"\s+")
    # NOTE(review): this integer-keyed rename only has an effect when the table is
    # read without a header row; with the default header handling used above, the
    # file's first line must already supply 'sampleID' and 'KaryoScore' - confirm
    # against the rule that produces this file.
    inv_df = inv_df.rename(columns={0:'sampleID', 1:'KaryoScore', 2:'n_SNPtags'})

    # Store the frequency directly under the name that is written to disk and
    # selected below, so the selection does not depend on a column left in the
    # file by a previous run.
    inv_df[freq_col] = inv_df['KaryoScore'] / ploidy

    # NOTE(review): this overwrites the table that was just read. Writing to a
    # separate output path would be safer, but downstream consumers of this path
    # are not visible from here.
    inv_df.to_csv(karyo_path, sep="\t")
    karyo[inv] = inv_df[['sampleID', freq_col]]

# Join the per-inversion tables on sampleID. Aligning on the sample index (instead
# of concatenating by position and de-duplicating transposed rows) keeps inversions
# whose frequencies happen to be identical across all samples.
karyo = pd.concat(
    [freqs.set_index("sampleID") for freqs in karyo.values()],
    axis=1,
).rename_axis("sampleID")

## transpose (inversions as rows, samples as columns) and round to 2 decimals
karyo = karyo.T.astype("float64").round(2)
rnaseqpop.plotRectangular(karyo, path="results/karyotype/karyoFreqs.svg" , cmap='mako_r', ylab='Inversion', figsize=[10,5])

# Average the per-sample frequencies within each treatment group
sample_freqs = karyo.T.reset_index()
# Attach each sample's treatment label from the metadata table
sample_freqs = sample_freqs.merge(metadata[['sampleID', 'treatment']])
treatment_means = sample_freqs.set_index('sampleID').groupby('treatment').mean()

# Inversions as rows, treatments as columns, rounded to 2 decimals
df = treatment_means.T.astype("float64").round(2)
rnaseqpop.plotRectangular(df, path="results/karyotype/karyoOverallFreqs.svg", ylab='Inversion', cbar=False, figsize=[8,4])

### Karyotyping

**Output Directory:** <span style="color:gray;font-weight:bold">*results/karyotype/*</span>

**Rules**  

<span style="color:gray;font-weight:bold">
    
* *variantAnalysis.smk*
    * Karyotype  
    
</span> 


**Introduction** 

Chromosomal inversions are a type of structural variation in which a segment of a chromosome is inverted relative to the normal ancestral arrangement. In *Anopheles gambiae*, chromosomal inversions have been extensively studied due to their role in the evolution and adaptation of the species. These inversions limit recombination in heterokaryotypic individuals, and so can act as barriers to gene flow between opposing karyotypes. The 2La inversion, which is approximately 21Mb long, has been associated with aridity tolerance, *Plasmodium* infection and insecticide resistance. Because the 2La inversion predates the speciation of the gambiae complex, it is the biggest driver of population structure within its breakpoints.

In *RNA-Seq-Pop*, we can estimate the frequency of chromosomal inversions in our samples, using [compkaryo](https://github.com/rrlove/compkaryo) and karyotype-tagging SNPs. These are SNPs which lie between the inversion breakpoints and show fixed differences between karyotypes, and so indicate which karyotype a sample carries.

**Results**

In [45]:
# Display options for the heatmap of `df`; the colour range is pinned to [0, 1]
heatmap_options = dict(
    zmin=0,
    zmax=1,
    width=600,
    height=400,
    text_auto=True,   # print each cell's value on the heatmap
    aspect=1,
    color_continuous_scale="Greens",
    title=f"{dataset} karyotype frequencies",
)
fig = px.imshow(img=df, **heatmap_options)
# Hide the colour bar
fig.update_layout(coloraxis_showscale=False)

fig