# Directive for creating a script for your notebook

The block here below is required at the top of each notebook that you want to create a script for. You will also need to edit the "settings.ini" file, to create a script (see [Coding in NBdev](https://dksund.sharepoint.com/:fl:/g/contentstorage/CSP_7c761ee7-b577-4e08-8517-bc82392bf65e/ETlSfUyArSNJhX8veMI_JQ8By1aXGHzDJkhotpfpXx4mmw?e=037EwH&nav=cz0lMkZjb250ZW50c3RvcmFnZSUyRkNTUF83Yzc2MWVlNy1iNTc3LTRlMDgtODUxNy1iYzgyMzkyYmY2NWUmZD1iJTIxNXg1MmZIZTFDRTZGRjd5Q09TdjJYblkwVlNiWXFYcE1yaHVrVmZqTVJUVEE4X1VwZjhTd1JxcjRNdmFrSmh2RCZmPTAxVlVLVzVWSlpLSjZVWkFGTkVORVlLN1pQUERCRDZKSVAmYz0lMkYmYT1Mb29wQXBwJnA9JTQwZmx1aWR4JTJGbG9vcC1wYWdlLWNvbnRhaW5lciZ4PSU3QiUyMnclMjIlM0ElMjJUMFJUVUh4a2EzTjFibVF1YzJoaGNtVndiMmx1ZEM1amIyMThZaUUxZURVeVpraGxNVU5GTmtaR04zbERUMU4yTWxodVdUQldVMkpaY1Zod1RYSm9kV3RXWm1wTlVsUlVRVGhmVlhCbU9GTjNVbkZ5TkUxMllXdEthSFpFZkRBeFZsVkxWelZXU1RJMVJsaFBNalkyUlZkQ1FqTTFRVmhKVTBkRFVVcFdXa1klM0QlMjIlMkMlMjJpJTIyJTNBJTIyNzRmNzM1ZmUtYzg4Ny00MjhhLWFkZmYtNTEyZTg2YmNmZmQzJTIyJTdE) 
(**Writing your own notebooks**) on loop for more details). Replace **some_string** with a name that makes sense for your notebook. 

In [None]:
# |default_exp Spyogenes_parser


# Libraries
Include all the libraries which should be used in this module. You can also import modules from other notebooks; here, we have imported the functions in the core notebook.

In [None]:
# |export

# standard libs
import os
import re

# Common to template
# add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`
import dotenv  # for loading config from .env files, https://pypi.org/project/python-dotenv/
import envyaml  # Allows to loads env vars into a yaml file, https://github.com/thesimj/envyaml
import fastcore  # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
from fastcore import (
    test,
)
from fastcore.script import (
    call_parse,
)  # for @call_parse, https://fastcore.fast.ai/script
import json  # for nicely printing json and yaml

#import functions from core module (optional, but most likely needed). 
from ssi_analysis_result_parsers import(
    core,
    blast_parser,
)
#from ssi_analysis_result_parsers.blast_parser import extract_presence_absence

# Project specific libraries
from pathlib import Path
import pandas
import numpy
import sys

In [None]:
# This block should never be exported (it carries no export directive).
# It changes the working directory to core.PROJECT_DIR so that python runs in the project dir
# (and not the nbs dir), and to initiate the package using pip.
os.chdir(core.PROJECT_DIR)

# Functions

Add your code here below. If your notebook will be used as a console-script, you need to add a "cli"-function, at the end (see [Coding in NBdev](https://dksund.sharepoint.com/:fl:/g/contentstorage/CSP_7c761ee7-b577-4e08-8517-bc82392bf65e/ETlSfUyArSNJhX8veMI_JQ8By1aXGHzDJkhotpfpXx4mmw?e=037EwH&nav=cz0lMkZjb250ZW50c3RvcmFnZSUyRkNTUF83Yzc2MWVlNy1iNTc3LTRlMDgtODUxNy1iYzgyMzkyYmY2NWUmZD1iJTIxNXg1MmZIZTFDRTZGRjd5Q09TdjJYblkwVlNiWXFYcE1yaHVrVmZqTVJUVEE4X1VwZjhTd1JxcjRNdmFrSmh2RCZmPTAxVlVLVzVWSlpLSjZVWkFGTkVORVlLN1pQUERCRDZKSVAmYz0lMkYmYT1Mb29wQXBwJnA9JTQwZmx1aWR4JTJGbG9vcC1wYWdlLWNvbnRhaW5lciZ4PSU3QiUyMnclMjIlM0ElMjJUMFJUVUh4a2EzTjFibVF1YzJoaGNtVndiMmx1ZEM1amIyMThZaUUxZURVeVpraGxNVU5GTmtaR04zbERUMU4yTWxodVdUQldVMkpaY1Zod1RYSm9kV3RXWm1wTlVsUlVRVGhmVlhCbU9GTjNVbkZ5TkUxMllXdEthSFpFZkRBeFZsVkxWelZXU1RJMVJsaFBNalkyUlZkQ1FqTTFRVmhKVTBkRFVVcFdXa1klM0QlMjIlMkMlMjJpJTIyJTNBJTIyNzRmNzM1ZmUtYzg4Ny00MjhhLWFkZmYtNTEyZTg2YmNmZmQzJTIyJTdE) 
(**Code execution** and **Input, output and options**) on loop for more details)

In [None]:
# | export


def _is_imperfect_hit(hit) -> bool:
    """Return True when a blast hit is shorter than its query or below 100% identity."""
    return hit["length"] < hit["qlen"] or hit["pident"] < 100


def _flagged_allele(hit) -> str:
    """Return the allele name (qseqid without its first 3 characters), suffixed with '*' if imperfect."""
    allele = hit["qseqid"][3:]
    return allele + "*" if _is_imperfect_hit(hit) else allele


def _assign_emm_gene(hit, gene: str, emm_typing_results: dict, notes: list) -> None:
    """Store the allele of `hit` under '<gene>_type'; imperfect hits get a '*' and an explanatory note."""
    allele = hit["qseqid"][3:]
    designation = "EMM" + allele
    if _is_imperfect_hit(hit):
        designation += "*"
        notes.append(
            f"{gene}{allele} with pident {round(hit['pident'], 2)} and length {hit['length']}/{hit['qlen']}"
        )
    emm_typing_results[f"{gene}_type"] = designation


def extract_emm_type(emm_blast_tsv: Path) -> dict:
    """
    Derive EMM, ENN and MRP type designations from a blast table of EMM and EMM-like genes.

    The input is read as a headerless, tab-separated table with the columns
    "qseqid sseqid pident length qlen qstart qend sstart send sseq evalue bitscore".
    Only hits with bitscore > 200 are considered. Hits are binned by their approximate
    start position on the subject (in units of 100 bases) and the hit with the highest
    bitscore is kept for each bin.

    If all remaining hits are on one subject sequence:
      - 1 hit: reported as EMM_type
      - 2 hits (ordered along the gene orientation): EMM_type then ENN_type, unless one
        of the main types is listed as belonging to an MRP+EMM operon, in which case
        they are redesignated as MRP_type and EMM_type
      - 3 hits: MRP_type, EMM_type and ENN_type in that order
      - more than 3 hits: no type is assigned and the alleles are listed in the notes
    If the hits are spread over several subject sequences no type is assigned and the
    alleles are listed in the notes.

    A designation is suffixed with "*" when the hit is shorter than the query or has less
    than 100% identity.

    Returns a dict with the keys "EMM_type", "ENN_type", "MRP_type" (each "-" when not
    assigned) and "emm_typing_notes".
    """
    emm_types_in_emm_plus_mrp_operons = []  ### to update
    mrp_types_in_emm_plus_mrp_operons = ["134", "156", "159", "164", "174", "205"]  ### to update
    emm_blast_tsv = Path(emm_blast_tsv)
    emm_typing_results = {"EMM_type": "-", "ENN_type": "-", "MRP_type": "-"}

    if not emm_blast_tsv.exists():
        emm_typing_results["emm_typing_notes"] = "No blast output found for EMM genes"
        return emm_typing_results
    try:
        blast_df = pandas.read_csv(emm_blast_tsv, sep="\t", header=None)
    except pandas.errors.EmptyDataError:
        emm_typing_results["emm_typing_notes"] = "Empty blast output, no EMM genes detected"
        return emm_typing_results

    notes = []
    blast_df.columns = "qseqid sseqid pident length qlen qstart qend sstart send sseq evalue bitscore".split(" ")
    blast_df["plen"] = blast_df["length"] / blast_df["qlen"] * 100
    # Project every hit back to where the query would start on the subject and bin to the
    # nearest 100 bases, so that overlapping hits of different alleles share one bin.
    leftmost_subject_pos = numpy.where(
        blast_df["sstart"] < blast_df["send"], blast_df["sstart"], blast_df["send"]
    )
    blast_df["extended_sstart"] = ((leftmost_subject_pos - blast_df["qstart"] + 1) / 100).round()
    blast_df = blast_df.query("bitscore > 200")

    if blast_df.empty:
        # Nothing passed the bitscore filter; skip grouping an empty table.
        emm_typing_results["emm_typing_notes"] = "No blast hits found for EMM genes"
        return emm_typing_results

    # Best-scoring hit per position bin.
    # NOTE(review): the bin ignores sseqid, so hits at the same offset on different
    # contigs collapse into one row - confirm whether this is intended.
    blast_df_unique = (
        blast_df.sort_values(by=["bitscore"], ascending=False).groupby("extended_sstart").first()
    )
    n_hits = blast_df_unique.shape[0]

    if n_hits == 0:
        notes.append("No blast hits found for EMM genes")
    elif len(set(blast_df_unique["sseqid"])) == 1:
        if n_hits == 1:
            _assign_emm_gene(blast_df_unique.iloc[0], "EMM", emm_typing_results, notes)
        else:
            # Order hits along the orientation of the first hit on the subject.
            first_hit = blast_df_unique.iloc[0]
            is_forward = bool(first_hit["sstart"] < first_hit["send"])
            blast_df_unique = blast_df_unique.sort_values(by=["sstart"], ascending=is_forward)

            if n_hits == 2:
                _assign_emm_gene(blast_df_unique.iloc[0], "EMM", emm_typing_results, notes)
                _assign_emm_gene(blast_df_unique.iloc[1], "ENN", emm_typing_results, notes)
                emm_maintype = blast_df_unique.iloc[0]["qseqid"][3:].split(".")[0]
                mrp_maintype = blast_df_unique.iloc[1]["qseqid"][3:].split(".")[0]
                if (
                    mrp_maintype in emm_types_in_emm_plus_mrp_operons
                    or emm_maintype in mrp_types_in_emm_plus_mrp_operons
                ):
                    # Shift designations: first hit becomes MRP, second hit becomes EMM.
                    # Notes already written for these hits keep their original gene label.
                    emm_typing_results["MRP_type"] = "EMM" + emm_typing_results["EMM_type"][3:]
                    emm_typing_results["EMM_type"] = "EMM" + emm_typing_results["ENN_type"][3:]
                    emm_typing_results["ENN_type"] = "-"
                    notes.append("EMM redesignated due to known MRP+EMM operon")
            elif n_hits == 3:
                _assign_emm_gene(blast_df_unique.iloc[0], "MRP", emm_typing_results, notes)
                _assign_emm_gene(blast_df_unique.iloc[1], "EMM", emm_typing_results, notes)
                _assign_emm_gene(blast_df_unique.iloc[2], "ENN", emm_typing_results, notes)
            else:
                # More hits than the MRP/EMM/ENN layout can hold: report instead of staying silent.
                alleles = [_flagged_allele(row) for _, row in blast_df_unique.iterrows()]
                notes.append(
                    "More than three EMM and EMM-like genes found on a single contig. Alleles found: "
                    + "/".join(alleles)
                )
    else:
        emm_genes = [_flagged_allele(row) for _, row in blast_df_unique.iterrows()]
        notes.append("EMM and EMM-like genes found on multiple contigs. Alleles found: " + "/".join(emm_genes))

    emm_typing_results["emm_typing_notes"] = ", ".join(notes)
    return emm_typing_results


class SpyogenesResults(core.PipelineResults):
    """Spyogenes analysis results built from EMM blast outputs; extends core.PipelineResults."""

    @classmethod
    def from_tool_paths(cls, emm_blast_tsv: Path, sample_name = None):
        """
        Alternative constructor for initializing results for a single sample.
        Summarises the EMM blast output at `emm_blast_tsv` and stores the result under `sample_name`.
        """
        per_sample = {sample_name: cls.summary(emm_blast_tsv=emm_blast_tsv)}
        return cls(per_sample)

    @classmethod
    def from_tool_paths_dict(cls, file_paths: dict):
        """
        Alternative constructor for initializing results for multiple samples.
        `file_paths` maps each sample name to a dict whose "emm_results" entry is the path
        to that sample's EMM blast output.
        """
        per_sample = {
            sample_name: cls.summary(emm_blast_tsv=Path(path_dict["emm_results"]))
            for sample_name, path_dict in file_paths.items()
        }
        return cls(per_sample)

    @classmethod
    def from_tool_paths_dataframe(cls, file_paths_df: pandas.DataFrame):
        """
        Alternative constructor for initializing results for multiple samples.
        `file_paths_df` is indexed by sample name and has an "emm_results" column holding
        the path to each sample's EMM blast output.
        """
        rows_by_sample = file_paths_df.to_dict(orient="index")
        per_sample = {
            sample_name: cls.summary(emm_blast_tsv=Path(path_dict["emm_results"]))
            for sample_name, path_dict in rows_by_sample.items()
        }
        return cls(per_sample)

    @classmethod
    def from_tool_paths_tsv(cls, tool_paths_tsv: Path):
        """
        Alternative constructor for initializing results for multiple samples.
        Reads a tab-separated file with a "sample_name" column (used as index) and passes
        the resulting DataFrame on to `from_tool_paths_dataframe`.
        """
        file_paths_df = pandas.read_csv(tool_paths_tsv, sep="\t").set_index("sample_name", drop=True)
        return cls.from_tool_paths_dataframe(file_paths_df)

    @staticmethod
    def summary(emm_blast_tsv: Path) -> dict:
        """Return the EMM typing results for one blast output, or an empty dict if there are none."""
        emm_results = extract_emm_type(emm_blast_tsv=emm_blast_tsv)
        return {} if emm_results is None else emm_results

    def __repr__(self):
        n_samples = len(self.results_df)
        n_variables = len(self.results_df.columns)
        return f"< Spyogenes analysis results object. {n_samples} samples with {n_variables} result variables > "




## TESTING


In [None]:



# --- extract_emm_type on a single blast output ---
emm_typing_dict = extract_emm_type(emm_blast_tsv="test_input/Spyogenes/emm_typing/test1.emm.blast.tsv")

assert emm_typing_dict["EMM_type"] == "EMM87.0"
assert emm_typing_dict["ENN_type"] == "EMM159.0*"
assert emm_typing_dict["MRP_type"] == "-"
assert emm_typing_dict["emm_typing_notes"] == "ENN159.0 with pident 97.78 and length 180/180"

# --- single-sample constructor ---
Spyogenes_results = SpyogenesResults.from_tool_paths(
    emm_blast_tsv="test_input/Spyogenes/emm_typing/test2.emm.blast.tsv",
    sample_name="test_sample",
)

assert list(Spyogenes_results.results_dict.keys())[0] == "test_sample"

# --- multi-sample constructor from a dict of paths ---
Spyogenes_results = SpyogenesResults.from_tool_paths_dict(
    file_paths={
        "sample_3": {"emm_results": "test_input/Spyogenes/emm_typing/test3.emm.blast.tsv"},
        "sample_4": {"emm_results": "test_input/Spyogenes/emm_typing/test4.emm.blast.tsv"},
    }
)

assert Spyogenes_results.results_dict["sample_3"]["EMM_type"] == "EMM77.0"
assert Spyogenes_results.results_dict["sample_3"]["ENN_type"] == "EMM159.0"
assert Spyogenes_results.results_dict["sample_3"]["MRP_type"] == "EMM141.3*"
assert Spyogenes_results.results_dict["sample_3"]["emm_typing_notes"] == "MRP141.3 with pident 97.22 and length 180/180"

assert Spyogenes_results.results_df["EMM_type"]["sample_4"] == "EMM13.0"
assert Spyogenes_results.results_df["ENN_type"]["sample_4"] == "EMM203.4*"
assert Spyogenes_results.results_df["MRP_type"]["sample_4"] == "EMM141.4"
assert Spyogenes_results.results_df["emm_typing_notes"]["sample_4"] == "ENN203.4 with pident 97.22 and length 180/180"

# --- multi-sample constructor from a tsv of paths ---
Spyogenes_results = SpyogenesResults.from_tool_paths_tsv(
    tool_paths_tsv="test_input/Spyogenes/batch_parser_file_paths.tsv"
)

# two hits redesignated as MRP + EMM
assert Spyogenes_results.results_dict["sample_5"]["EMM_type"] == "EMM81.0"
assert Spyogenes_results.results_dict["sample_5"]["ENN_type"] == "-"
assert Spyogenes_results.results_dict["sample_5"]["MRP_type"] == "EMM156.4*"
assert Spyogenes_results.results_dict["sample_5"]["emm_typing_notes"] == "EMM156.4 with pident 99.44 and length 180/180, EMM redesignated due to known MRP+EMM operon"

# hits spread over multiple contigs
assert Spyogenes_results.results_df["EMM_type"]["sample_7"] == "-"
assert Spyogenes_results.results_df["ENN_type"]["sample_7"] == "-"
assert Spyogenes_results.results_df["MRP_type"]["sample_7"] == "-"
assert Spyogenes_results.results_df["emm_typing_notes"]["sample_7"] == "EMM and EMM-like genes found on multiple contigs. Alleles found: 203.4*/28.0"

# empty blast output file
assert Spyogenes_results.results_df["EMM_type"]["sample_empty"] == "-"
assert Spyogenes_results.results_df["ENN_type"]["sample_empty"] == "-"
assert Spyogenes_results.results_df["MRP_type"]["sample_empty"] == "-"
assert Spyogenes_results.results_df["emm_typing_notes"]["sample_empty"] == "Empty blast output, no EMM genes detected"

# blast output file that does not exist
assert Spyogenes_results.results_df["EMM_type"]["sample_nonexist"] == "-"
assert Spyogenes_results.results_df["ENN_type"]["sample_nonexist"] == "-"
assert Spyogenes_results.results_df["MRP_type"]["sample_nonexist"] == "-"
assert Spyogenes_results.results_df["emm_typing_notes"]["sample_nonexist"] == "No blast output found for EMM genes"


In [None]:
# |export



@call_parse
def Spyogenes_parser(
    emm_blast_tsv: Path = None,  # Blast output from blasting EMM and emm-like genes
    output_file: Path = None,  # Path to output tsv
    sample_name: str = None,  # Sample name to store the results under
) -> None:
    """
    Parse the EMM blast output for a single sample and write the typing results to a tsv
    """
    results = SpyogenesResults.from_tool_paths(
        emm_blast_tsv=emm_blast_tsv,
        sample_name=sample_name,
    )
    results.write_tsv(output_file=output_file)

@call_parse
def Spyogenes_batch_parser(
    file_path_tsv: Path = None,  # Path to tsv containing file paths to the outputs from tools to be parsed. Must contain headers "sample_name" and "emm_results"
    output_file: Path = None,  # Path to output tsv
) -> None:
    """
    Parse the EMM blast outputs for all samples listed in a tsv and write the typing results to a tsv
    """
    results = SpyogenesResults.from_tool_paths_tsv(tool_paths_tsv=file_path_tsv)
    # Keyword argument, matching the call in Spyogenes_parser.
    results.write_tsv(output_file=output_file)



# Directive for ensuring that the code in your notebook gets executed as a script

The code-block here below is required to ensure that the code in the notebook is also transferred to the module (script), otherwise it will just be a notebook. See [Coding in NBdev](https://dksund.sharepoint.com/:fl:/g/contentstorage/CSP_7c761ee7-b577-4e08-8517-bc82392bf65e/ETlSfUyArSNJhX8veMI_JQ8By1aXGHzDJkhotpfpXx4mmw?e=037EwH&nav=cz0lMkZjb250ZW50c3RvcmFnZSUyRkNTUF83Yzc2MWVlNy1iNTc3LTRlMDgtODUxNy1iYzgyMzkyYmY2NWUmZD1iJTIxNXg1MmZIZTFDRTZGRjd5Q09TdjJYblkwVlNiWXFYcE1yaHVrVmZqTVJUVEE4X1VwZjhTd1JxcjRNdmFrSmh2RCZmPTAxVlVLVzVWSlpLSjZVWkFGTkVORVlLN1pQUERCRDZKSVAmYz0lMkYmYT1Mb29wQXBwJnA9JTQwZmx1aWR4JTJGbG9vcC1wYWdlLWNvbnRhaW5lciZ4PSU3QiUyMnclMjIlM0ElMjJUMFJUVUh4a2EzTjFibVF1YzJoaGNtVndiMmx1ZEM1amIyMThZaUUxZURVeVpraGxNVU5GTmtaR04zbERUMU4yTWxodVdUQldVMkpaY1Zod1RYSm9kV3RXWm1wTlVsUlVRVGhmVlhCbU9GTjNVbkZ5TkUxMllXdEthSFpFZkRBeFZsVkxWelZXU1RJMVJsaFBNalkyUlZkQ1FqTTFRVmhKVTBkRFVVcFdXa1klM0QlMjIlMkMlMjJpJTIyJTNBJTIyNzRmNzM1ZmUtYzg4Ny00MjhhLWFkZmYtNTEyZTg2YmNmZmQzJTIyJTdE) 
(**Writing your own notebooks**) on loop for more details.

In [None]:
# | hide
# This is included at the end so that, when you run through your notebook, the code is also
# transferred to the module and does not remain only in the notebook.
import nbdev

nbdev.nbdev_export()