# Directive for creating a script for your notebook

The block here below is required at the top of each notebook that you want to create a script for. You will also need to edit the "settings.ini" file, to create a script (see [Coding in NBdev](https://dksund.sharepoint.com/:fl:/g/contentstorage/CSP_7c761ee7-b577-4e08-8517-bc82392bf65e/ETlSfUyArSNJhX8veMI_JQ8By1aXGHzDJkhotpfpXx4mmw?e=037EwH&nav=cz0lMkZjb250ZW50c3RvcmFnZSUyRkNTUF83Yzc2MWVlNy1iNTc3LTRlMDgtODUxNy1iYzgyMzkyYmY2NWUmZD1iJTIxNXg1MmZIZTFDRTZGRjd5Q09TdjJYblkwVlNiWXFYcE1yaHVrVmZqTVJUVEE4X1VwZjhTd1JxcjRNdmFrSmh2RCZmPTAxVlVLVzVWSlpLSjZVWkFGTkVORVlLN1pQUERCRDZKSVAmYz0lMkYmYT1Mb29wQXBwJnA9JTQwZmx1aWR4JTJGbG9vcC1wYWdlLWNvbnRhaW5lciZ4PSU3QiUyMnclMjIlM0ElMjJUMFJUVUh4a2EzTjFibVF1YzJoaGNtVndiMmx1ZEM1amIyMThZaUUxZURVeVpraGxNVU5GTmtaR04zbERUMU4yTWxodVdUQldVMkpaY1Zod1RYSm9kV3RXWm1wTlVsUlVRVGhmVlhCbU9GTjNVbkZ5TkUxMllXdEthSFpFZkRBeFZsVkxWelZXU1RJMVJsaFBNalkyUlZkQ1FqTTFRVmhKVTBkRFVVcFdXa1klM0QlMjIlMkMlMjJpJTIyJTNBJTIyNzRmNzM1ZmUtYzg4Ny00MjhhLWFkZmYtNTEyZTg2YmNmZmQzJTIyJTdE) 
(**Writing your own notebooks**) on Loop for more details). Replace **some_string** in the `default_exp` directive below with a name that makes sense for your notebook (here, `Legionella_parser`).

In [None]:
# |default_exp Legionella_parser


# Libraries
Include all the libraries that are used in this module. You can also import modules from other notebooks; here, we have imported the `core` and `blast_parser` modules.

In [None]:
# |export

# standard libs
import os
import re

# Common to template
# add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`
import dotenv  # for loading config from .env files, https://pypi.org/project/python-dotenv/
import envyaml  # Allows to loads env vars into a yaml file, https://github.com/thesimj/envyaml
import fastcore  # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
from fastcore import (
    test,
)
from fastcore.script import (
    call_parse,
)  # for @call_parse, https://fastcore.fast.ai/script
import json  # for nicely printing json and yaml

#import functions from core module (optional, but most likely needed). 
from ssi_analysis_result_parsers import(
    core,
    blast_parser,
)
#from ssi_analysis_result_parsers.blast_parser import extract_presence_absence

# Project specific libraries
from pathlib import Path
import pandas
import sys

In [None]:
# This cell must never be exported (it carries no export directive).
# It switches the working directory to core.PROJECT_DIR so that python runs in the
# project dir (and not the nbs dir), e.g. for the relative paths used in later cells.
# NOTE(review): the template comment also mentioned initiating the package using pip;
# no such step is performed in this cell.
os.chdir(core.PROJECT_DIR)

# Functions

Add your code here below. If your notebook will be used as a console-script, you need to add a "cli"-function, at the end (see [Coding in NBdev](https://dksund.sharepoint.com/:fl:/g/contentstorage/CSP_7c761ee7-b577-4e08-8517-bc82392bf65e/ETlSfUyArSNJhX8veMI_JQ8By1aXGHzDJkhotpfpXx4mmw?e=037EwH&nav=cz0lMkZjb250ZW50c3RvcmFnZSUyRkNTUF83Yzc2MWVlNy1iNTc3LTRlMDgtODUxNy1iYzgyMzkyYmY2NWUmZD1iJTIxNXg1MmZIZTFDRTZGRjd5Q09TdjJYblkwVlNiWXFYcE1yaHVrVmZqTVJUVEE4X1VwZjhTd1JxcjRNdmFrSmh2RCZmPTAxVlVLVzVWSlpLSjZVWkFGTkVORVlLN1pQUERCRDZKSVAmYz0lMkYmYT1Mb29wQXBwJnA9JTQwZmx1aWR4JTJGbG9vcC1wYWdlLWNvbnRhaW5lciZ4PSU3QiUyMnclMjIlM0ElMjJUMFJUVUh4a2EzTjFibVF1YzJoaGNtVndiMmx1ZEM1amIyMThZaUUxZURVeVpraGxNVU5GTmtaR04zbERUMU4yTWxodVdUQldVMkpaY1Zod1RYSm9kV3RXWm1wTlVsUlVRVGhmVlhCbU9GTjNVbkZ5TkUxMllXdEthSFpFZkRBeFZsVkxWelZXU1RJMVJsaFBNalkyUlZkQ1FqTTFRVmhKVTBkRFVVcFdXa1klM0QlMjIlMkMlMjJpJTIyJTNBJTIyNzRmNzM1ZmUtYzg4Ny00MjhhLWFkZmYtNTEyZTg2YmNmZmQzJTIyJTdE) 
(**Code execution** and **Input, output and options**) on loop for more details)

In [None]:
# | export

def extract_legionella_sbt(legionella_sbt_results_tsv: Path) -> dict:
    """
    Return the results of the first sample in a Legionella SBT summary TSV.

    The file is read as a tab-separated table and indexed by its "sample"
    column; the remaining columns of the first row are returned as a
    ``{column_name: value}`` dictionary.

    Args:
        legionella_sbt_results_tsv: Path to the SBT summary TSV. The table
            must contain a "sample" column.

    Returns:
        Dictionary with the first sample's results, or ``None`` (after
        printing a message to stderr) when the file does not exist, is
        completely empty, or has a header but no sample rows.

    Raises:
        KeyError: If the table has no "sample" column.
    """
    if not os.path.exists(legionella_sbt_results_tsv):
        print(f"No Legionella SBT output found at {legionella_sbt_results_tsv}", file=sys.stderr)
        return None

    # Only read_csv can raise EmptyDataError (zero-byte / whitespace-only file),
    # so keep the try body limited to that call.
    try:
        df = pandas.read_csv(legionella_sbt_results_tsv, sep='\t')
    except pandas.errors.EmptyDataError:
        print(f"Legionella SBT output empty at {legionella_sbt_results_tsv}", file=sys.stderr)
        return None

    df.set_index("sample", inplace=True, drop=True)
    results_by_sample = df.to_dict(orient="index")

    # A file with a header line but no data rows parses to an empty table;
    # treat it like the other "no results" cases instead of failing on next().
    if not results_by_sample:
        print(f"Legionella SBT output at {legionella_sbt_results_tsv} contains no samples", file=sys.stderr)
        return None

    # Only the first sample row is reported.
    return next(iter(results_by_sample.values()))

def legionella_summary(legionella_sbt_results_tsv: Path, lag1_blast_tsv: Path) -> dict:
    """
    Build the combined Legionella result dictionary for a single sample.

    The SBT summary is parsed with ``extract_legionella_sbt`` and the lag-1
    BLAST output with ``blast_parser.extract_presence_absence`` (restricted to
    the gene name "lag-1", without hit strings or match statistics). Both
    dictionaries are then combined by ``core.update_results_dict``.

    Args:
        legionella_sbt_results_tsv: Path to the Legionella SBT summary TSV.
        lag1_blast_tsv: Path to the lag-1 BLAST output TSV.

    Returns:
        The combined results dictionary, or an empty dictionary when
        ``core.update_results_dict`` returns ``None``.
    """
    sbt = extract_legionella_sbt(legionella_sbt_results_tsv=legionella_sbt_results_tsv)
    lag1 = blast_parser.extract_presence_absence(
        blast_output_tsv=lag1_blast_tsv,
        hits_as_string=False,
        include_match_stats=False,
        gene_names=["lag-1"],
    )
    # NOTE(review): presumably the prefix is applied to SBT keys that collide
    # with BLAST keys - confirm against core.update_results_dict.
    merged = core.update_results_dict(sbt, lag1, old_duplicate_key_prefix="SBT: ")
    return {} if merged is None else merged


def legionella_batch_from_sheet(file_paths: dict, output_file: Path = None):
    """
    Summarise Legionella results for several samples and optionally write them to a TSV.

    Args:
        file_paths: Mapping of sample name to a dictionary holding the keys
            "sbt_results" and "lag1_blast_results" (paths to the tool outputs).
        output_file: When given, the per-sample results are written there as a
            tab-separated table with the sample names in a "sample_name" column.

    Returns:
        Dictionary mapping each sample name to its ``legionella_summary`` result.
    """
    summaries = {
        sample: legionella_summary(
            legionella_sbt_results_tsv=Path(paths["sbt_results"]),
            lag1_blast_tsv=Path(paths["lag1_blast_results"]),
        )
        for sample, paths in file_paths.items()
    }

    # Nothing to write: hand back the in-memory results only.
    if output_file is None:
        return summaries

    table = pandas.DataFrame.from_dict(summaries, orient="index")
    table = table.reset_index(names="sample_name")
    table.to_csv(output_file, sep="\t", index=False)
    return summaries



class LegionellaResults(core.PipelineResults):
    """
    Legionella analysis results (SBT summary combined with lag-1 BLAST presence/absence).

    Instances are created from a ``{sample_name: results_dict}`` mapping through
    the alternate constructors below.
    NOTE(review): attributes such as ``results_df`` and methods such as
    ``write_tsv`` are not defined here; presumably they come from
    ``core.PipelineResults`` - confirm against the core module.
    """

    @classmethod
    def from_tool_paths(cls, legionella_sbt_results_tsv: Path, lag1_blast_tsv: Path, sample_name = None):
        """
        Build a results object for a single sample from its tool output files.

        Args:
            legionella_sbt_results_tsv: Path to the Legionella SBT summary TSV.
            lag1_blast_tsv: Path to the lag-1 BLAST output TSV.
            sample_name: Key under which the sample's results are stored
                (``None`` when not given).
        """
        legionella_results = cls.legionella_summary(legionella_sbt_results_tsv=legionella_sbt_results_tsv,
                                                    lag1_blast_tsv=lag1_blast_tsv)
        return cls({sample_name: legionella_results})

    @classmethod
    def from_tool_paths_dict(cls, file_paths: dict):
        """
        Build a results object for several samples.

        Args:
            file_paths: Mapping of sample name to a dictionary holding the keys
                "sbt_results" and "lag1_blast_results" (paths to the tool outputs).
        """
        results_dict = {
            sample_name: cls.legionella_summary(
                legionella_sbt_results_tsv=Path(path_dict["sbt_results"]),
                lag1_blast_tsv=Path(path_dict["lag1_blast_results"]),
            )
            for sample_name, path_dict in file_paths.items()
        }
        return cls(results_dict)

    @classmethod
    def from_tool_paths_dataframe(cls, file_paths_df: pandas.DataFrame):
        """
        Build a results object from a dataframe of tool output paths.

        Args:
            file_paths_df: Dataframe indexed by sample name with the columns
                "sbt_results" and "lag1_blast_results".
        """
        # The row-wise dictionary has exactly the shape from_tool_paths_dict expects,
        # so reuse it instead of repeating the per-sample loop.
        return cls.from_tool_paths_dict(file_paths_df.to_dict(orient="index"))

    @classmethod
    def from_tool_paths_tsv(cls, tool_paths_tsv: Path):
        """
        Build a results object from a TSV listing the tool output paths per sample.

        Args:
            tool_paths_tsv: Path to a tab-separated file with the headers
                "sample_name", "sbt_results" and "lag1_blast_results".
        """
        file_paths_df = pandas.read_csv(tool_paths_tsv, sep='\t')
        file_paths_df.set_index("sample_name", inplace=True, drop=True)
        return cls.from_tool_paths_dataframe(file_paths_df)

    @staticmethod
    def legionella_summary(legionella_sbt_results_tsv: Path, lag1_blast_tsv: Path) -> dict:
        """
        Combine the SBT summary and the lag-1 BLAST presence/absence for one sample.

        Args:
            legionella_sbt_results_tsv: Path to the Legionella SBT summary TSV.
            lag1_blast_tsv: Path to the lag-1 BLAST output TSV.

        Returns:
            The combined results dictionary, or an empty dictionary when
            ``core.update_results_dict`` returns ``None``.
        """
        sbt_results_dict = extract_legionella_sbt(legionella_sbt_results_tsv=legionella_sbt_results_tsv)
        lag1_blast_dict = blast_parser.extract_presence_absence(blast_output_tsv=lag1_blast_tsv,
                                                                hits_as_string=False,
                                                                include_match_stats=False,
                                                                gene_names=["lag-1"])
        results_dict = core.update_results_dict(sbt_results_dict, lag1_blast_dict, old_duplicate_key_prefix="SBT: ")
        if results_dict is None:
            return {}
        return results_dict

    def __repr__(self):
        """Return a short summary with the number of samples and result variables."""
        return f"< Legionella analysis results object. {len(self.results_df)} samples with {len(self.results_df.columns)} result variables > "




## TESTING


In [None]:



# Load a previously written batch output and check its size and one known value.
# NOTE(review): from_results_tsv, len() support, results_dict and results_df are not
# defined in LegionellaResults in this notebook; presumably inherited from
# core.PipelineResults - confirm.
f = LegionellaResults.from_results_tsv("./test_output/test_batch_output.tsv")
assert(len(f) == 2)
assert(f.results_dict["sample_1"]["ST"] == 23)
print(f.results_df)



# Build results for two samples from a dictionary of tool output paths.
f = LegionellaResults.from_tool_paths_dict(file_paths=  {"sample_1": {"sbt_results": "test_input/Legionella/test.sbt.tsv", "lag1_blast_results": "test_input/Legionella/lag-1_blast.tsv"},
                                                            "sample_2": {"sbt_results": "test_input/Legionella/test2.sbt.tsv", "lag1_blast_results": "test_input/Legionella/lag-1_blast_2.tsv"}})



# Build results for a single sample; sample_name is omitted, so the results are keyed by None.
f = LegionellaResults.from_tool_paths(legionella_sbt_results_tsv="test_input/Legionella/test.sbt.tsv",
                                        lag1_blast_tsv="test_input/Legionella/lag-1_blast.tsv")

print(f)
print(f.results_dict)

# Build results from a TSV that lists the tool output paths per sample.
f = LegionellaResults.from_tool_paths_tsv(tool_paths_tsv="test_input/Legionella/batch_parser_file_paths.tsv")


print(f.results_df)


{'sample_1': {'ST': 23, 'flaA': 2, 'pilE': 3, 'asd': 9, 'mip': 10, 'mompS': 2, 'proA': 1, 'neuA': 6, 'notes': 'Exact ST match, Heterozygous mompS alleles, High confidence mompS allele call', 'lag-1': 1}, 'sample_2': {'ST': 182, 'flaA': 3, 'pilE': 4, 'asd': 1, 'mip': 3, 'mompS': 35, 'proA': 9, 'neuA': 11, 'notes': 'Exact ST match, Heterozygous mompS alleles, High confidence mompS allele call', 'lag-1': 0}}
           ST  flaA  pilE  asd  mip  mompS  proA  neuA  \
sample_1   23     2     3    9   10      2     1     6   
sample_2  182     3     4    1    3     35     9    11   

                                                      notes  lag-1  
sample_1  Exact ST match, Heterozygous mompS alleles, Hi...      1  
sample_2  Exact ST match, Heterozygous mompS alleles, Hi...      0  
Blast output file test_input/Legionella/lag-1_blast_2.tsv empty. Assuming 0 blast hits.
{'sample_1': {'ST': 23, 'flaA': 2, 'pilE': 3, 'asd': 9, 'mip': 10, 'mompS': 2, 'proA': 1, 'neuA': 6, 'notes': 'Exact ST m

No Legionella SBT output empty at test_input/empty_file.txt


In [None]:
# |export



@call_parse
def legionella_parser(
    legionella_sbt_file: Path = None,  # Path to the "*.sbt.tsv" output from the legionella_sbt program
    lag_1_blast_output: Path = None, #  Path to output from lag1_blast. Generated with blastn -query lag-1.fasta -subject assembly.fasta -outfmt "6 qseqid sseqid pident length qlen qstart qend sstart send sseq evalue bitscore"
    output_file: Path = None,  # Path to output tsv
    sample_name: str = None,  # Sample name used as the key for this sample's results
    config_file: str = None,  # config file to set env vars from
) -> None:
    """
    Parse the Legionella SBT and lag-1 BLAST results of a single sample and write them to a tsv file.
    """
    # config_file is accepted but currently unused; config loading is disabled:
    #config = core.get_config(config_file)  # Set env vars and get config variables
    legionella_results = LegionellaResults.from_tool_paths(legionella_sbt_results_tsv=legionella_sbt_file,
                                                           lag1_blast_tsv=lag_1_blast_output,
                                                           sample_name=sample_name)
    legionella_results.write_tsv(output_file=output_file)

@call_parse
def legionella_batch_parser(
    file_path_tsv: Path = None,  # Path to tsv containing file paths to the outputs from tools to be parsed. Must contain headers "sample_name", "sbt_results", and "lag1_blast_results"
    output_file: Path = None,  # Path to output tsv
    config_file: str = None,  # config file to set env vars from
) -> None:
    """
    Parse the Legionella SBT and lag-1 BLAST results of all samples listed in a tsv of file paths and write them to one tsv file.
    """
    # config_file is accepted but currently unused; config loading is disabled:
    #config = core.get_config(config_file)  # Set env vars and get config variables
    legionella_results = LegionellaResults.from_tool_paths_tsv(tool_paths_tsv=file_path_tsv)
    # Keyword form, matching the call in legionella_parser.
    legionella_results.write_tsv(output_file=output_file)



# Directive for ensuring that the code in your notebook gets executed as a script

The code-block here below is required to ensure that the code in the notebook is also transferred to the module (script), otherwise it will just be a notebook. See [Coding in NBdev](https://dksund.sharepoint.com/:fl:/g/contentstorage/CSP_7c761ee7-b577-4e08-8517-bc82392bf65e/ETlSfUyArSNJhX8veMI_JQ8By1aXGHzDJkhotpfpXx4mmw?e=037EwH&nav=cz0lMkZjb250ZW50c3RvcmFnZSUyRkNTUF83Yzc2MWVlNy1iNTc3LTRlMDgtODUxNy1iYzgyMzkyYmY2NWUmZD1iJTIxNXg1MmZIZTFDRTZGRjd5Q09TdjJYblkwVlNiWXFYcE1yaHVrVmZqTVJUVEE4X1VwZjhTd1JxcjRNdmFrSmh2RCZmPTAxVlVLVzVWSlpLSjZVWkFGTkVORVlLN1pQUERCRDZKSVAmYz0lMkYmYT1Mb29wQXBwJnA9JTQwZmx1aWR4JTJGbG9vcC1wYWdlLWNvbnRhaW5lciZ4PSU3QiUyMnclMjIlM0ElMjJUMFJUVUh4a2EzTjFibVF1YzJoaGNtVndiMmx1ZEM1amIyMThZaUUxZURVeVpraGxNVU5GTmtaR04zbERUMU4yTWxodVdUQldVMkpaY1Zod1RYSm9kV3RXWm1wTlVsUlVRVGhmVlhCbU9GTjNVbkZ5TkUxMllXdEthSFpFZkRBeFZsVkxWelZXU1RJMVJsaFBNalkyUlZkQ1FqTTFRVmhKVTBkRFVVcFdXa1klM0QlMjIlMkMlMjJpJTIyJTNBJTIyNzRmNzM1ZmUtYzg4Ny00MjhhLWFkZmYtNTEyZTg2YmNmZmQzJTIyJTdE) 
(**Writing your own notebooks**) on loop for more details.

In [None]:
# | hide
# Kept as the last cell so that running through the notebook also transfers the
# code to the module, instead of leaving it only in the notebook.
import nbdev

nbdev.nbdev_export()