## Run ecolityping
Functions that allow the user to run ecolityping.py

In [None]:
#|default_exp run_ecolityping
# This will create a package named bps_fbi_sp_ecoli/run_ecolityping.py

In [None]:
#|hide
# Nbdev requires these imports
import nbdev
from nbdev.showdoc import *

In [None]:
#|export
# Standard libs to be used in the notebook
import os
import re
import sys
import shutil
import subprocess
from pathlib import Path

# Common to nbdev template
# add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`
import dotenv # for loading config from .env files, https://pypi.org/project/python-dotenv/
import envyaml # Allows to loads env vars into a yaml file, https://github.com/thesimj/envyaml
import fastcore # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
import fastcore.script
from fastcore.script import call_parse


# Project specific libraries
from bps_fbi_sp_ecoli import core, helpers

In [None]:
#|hide
# Development aid: list the public names exposed by the two project modules
# imported above, to see which helpers are available in this notebook.
print(core.__all__)
print(helpers.__all__)

In [None]:
#|export
# Get and check requirements.
# `tools` lists the external programs this module depends on; the list is
# handed to `helpers.tools_are_present` at module level.
# NOTE(review): what happens when a tool is missing (exception vs. False
# return) is decided inside `helpers` - confirm there.
tools = ['kma']
helpers.tools_are_present(tools)

In [None]:
#|hide
# Development-only configuration: load the project's default config file and
# expose the `run_ecolityping` settings as notebook-level constants.
notebook_config = core.get_config(f"{core.PROJECT_DIR}/config/config.default.env")
#core.show_project_env_vars(notebook_config)

# Resolve the shared sections once instead of repeating the full key path.
_run_cfg = notebook_config["run_ecolityping"]
_input_cfg = _run_cfg["input"]
_output_cfg = _run_cfg["output"]

# Programs
KMA_PATH = _run_cfg["kma_path"]
# Inputs
DB_PATH = _run_cfg["db_path"]
DB_UPDATE = _run_cfg["db_update"]
INPUT_DIR = _input_cfg["dir"]
SAMPLE_SHEET = _input_cfg["sample_sheet"]
# Outputs
OUTPUT_DIR = _output_cfg["dir"]
COMMAND_FILE = _output_cfg["command_file"]

In [None]:
#|export
def ecolityping(sampleid: str, read1: Path, read2: Path, database_path: Path, kma_path: Path, output_dir:Path, update:bool) -> str:
    """Build the shell command that runs ecolityping.py on one sample.

    Nothing is executed here; the command is only assembled and returned.
    Relative paths are made absolute (relative to the current working
    directory) and the database path is handed to `helpers.check_db` before
    the command is built.

    Every argument is shell-quoted, so a sample id or path that contains
    spaces or shell metacharacters stays a single argument. Values made only
    of ordinary path characters are emitted unchanged.

    Args:
        sampleid: Sample identifier, passed as `-i`.
        read1: Path to the first reads file, passed as `-R1`.
        read2: Path to the second reads file, passed as `-R2`.
        database_path: Path to the database folder, passed as `-db`.
        kma_path: Path to the kma program, passed as `-k`.
        output_dir: Output directory, passed as `-o`.
        update: Value forwarded after `--update` (rendered with `str`, i.e.
            `True` / `False` for booleans).

    Returns:
        The complete command line as a single string.
    """
    # Local import: keeps the exported module's import block untouched.
    import shlex

    # Get the full paths of relative paths
    read1 = os.path.abspath(read1)
    read2 = os.path.abspath(read2)
    database_path = os.path.abspath(database_path)
    helpers.check_db(database_path)
    kma_path = os.path.abspath(kma_path)
    output_dir = os.path.abspath(output_dir)

    # Note the ecolityping command will create output folders if they don't exist. This includes parent directories.
    arguments = [
        "python3", f"{core.PACKAGE_DIR}/ecoli_fbi/ecolityping.py",
        "-i", sampleid,
        "-R1", read1,
        "-R2", read2,
        "-db", database_path,
        "-k", kma_path,
        "-o", output_dir,
        "--update", update,
    ]
    # The command is written to a command file and run later, so it has to be
    # a single, safely quoted string rather than an argument list.
    return " ".join(shlex.quote(str(argument)) for argument in arguments)

In [None]:
#|export
def generate_ecolityping_commands(input_dir: Path, sample_sheet: str, output_dir:Path, database_path:Path, kma_path: Path, update:bool) -> list:
    """Build one ecolityping command for every E. coli sample in a folder.

    The sample sheet is read with `helpers.process_sample_sheet`, the fastq
    files under `input_dir` are located with `helpers.find_fastqs`, and the
    two are combined with `helpers.add_fastqs_to_metadata`. Each resulting
    sample record is expected to provide the keys `SampleID`, `R1`, `R2` and
    `Organism`.

    Args:
        input_dir: Folder that contains the fastq files (and is passed to the
            sample sheet reader together with `sample_sheet`).
        sample_sheet: Name of the sample sheet file.
        output_dir: Output directory forwarded to `ecolityping`.
        database_path: Database folder forwarded to `ecolityping`.
        kma_path: Path to the kma program forwarded to `ecolityping`.
        update: Database update flag forwarded to `ecolityping`.

    Returns:
        A list of command strings, one per sample whose `Organism` is
        `'Ecoli'` or `'E. coli'`. Samples missing any of the required keys
        are skipped without a message.
    """
    metadata = helpers.process_sample_sheet(input_dir, sample_sheet)
    fastqs = helpers.find_fastqs(input_dir)
    metadata = helpers.add_fastqs_to_metadata(metadata, fastqs)

    ecoli_labels = ('Ecoli', 'E. coli')
    commands = []
    for sample_dict in metadata:
        # Guard only the key lookups: an incomplete record (e.g. a sample
        # without reads) is skipped on purpose, but a KeyError raised while
        # building the command itself must not be silently hidden.
        try:
            sampleid = sample_dict['SampleID']
            read1 = sample_dict['R1']
            read2 = sample_dict['R2']
            organism = sample_dict['Organism']
        except KeyError:
            continue
        if organism in ecoli_labels:
            commands.append(ecolityping(sampleid, read1, read2, database_path, kma_path, output_dir, update))
    return commands

In [None]:
# #|export
# commands = generate_ecolityping_commands(INPUT_DIR, SAMPLE_SHEET, OUTPUT_DIR, DB_PATH, KMA_PATH, DB_UPDATE)
# helpers.write_list_to_file(commands, COMMAND_FILE)
# for command in commands:
#     print(command)
#     !{command}
# #     helpers.notification(NOTIFICATIONS)

## Turn ecoli_fbi into a command line tool

In [None]:
#|export
# These are the command line options for FBI_run_ecolityping
# Add 'FBI_run_ecolityping' into settings.ini
@call_parse
def cli(
    input:Path = None, # Path to input folder that contains the .fastq.gz files
    sample_sheet:str = None, # Name of the sample_sheet file
    output:Path = None, # Path to the output directory
    db_path:Path = None, # Path to db folder that contains the indexed database files e.g., file.comp.b, file.fsa, file.index.db, file.length.b, file.name, file.seq.b
    db_update:bool = False, # Option to build/update the database
    kma_path:Path = None, # Path to kma program
    command_file:str = None, # Path to file to write commands to
    execute:bool = True, # Run commands in command file
    to_stdout:bool = False, # If true, will write to stdout instead of file, mutually exclusive with command_file
    overwrite:bool = False, # If true, will overwrite command_file if it exists
    config_file:str = None # Config file to overwrite default settings, arg parse values will override config file values
    ) -> None:
    """This program takes as input a folder containing fastq.gz files and a sample_sheet containing 
    \nSampleID	Organism	SupplyingLab
    \necoli1  Ecoli FBI
    \nand allows you to run ecolityping.py on all Ecoli samples.
    """
    # The signature comments above and this docstring are what the command
    # line parser shows as help text, so they are kept in user-facing wording.
    config = core.get_config(config_file) # Set env vars and get config variables
    settings = config["run_ecolityping"]

    # Valued options replace the config entry only when given on the command line.
    if input is not None:
        settings["input"]["dir"] = input
    if sample_sheet is not None:
        settings["input"]["sample_sheet"] = sample_sheet
    if output is not None:
        settings["output"]["dir"] = output
    if db_path is not None:
        settings["db_path"] = db_path
    if kma_path is not None:
        settings["kma_path"] = kma_path
    if command_file is not None:
        settings["output"]["command_file"] = command_file

    # Flags can only switch a setting on; when a flag is absent the value
    # from the config file is left as it is.
    if db_update is True:
        settings["db_update"] = True
    if to_stdout is True:
        settings["output"]["to_stdout"] = True
    if overwrite is True:
        settings["output"]["overwrite"] = True

    # Keyword arguments keep this call correct even if the parameter order of
    # generate_ecolityping_commands changes.
    commands = generate_ecolityping_commands(
        input_dir=settings["input"]["dir"],
        sample_sheet=settings["input"]["sample_sheet"],
        output_dir=settings["output"]["dir"],
        database_path=settings["db_path"],
        kma_path=settings["kma_path"],
        update=settings["db_update"],
    )
    helpers.write_list_to_file(
        commands,
        settings["output"]["command_file"],
        overwrite=settings["output"]["overwrite"],
        to_stdout=settings["output"]["to_stdout"]
    )

    # For running make sure command is present
    if execute and helpers.tools_are_present(["kma"]):
        helpers.execute_commands_from_file(settings["output"]["command_file"])
        #core.notification(config["demultiplex_runs"]["notification"] )


In [None]:
#|hide
#cli(config_file=f"{core.PROJECT_DIR}/config/config.default.env", execute=False)

In [None]:
#| hide
# Keep this as the last cell: running the notebook from top to bottom then
# also transfers the code to the associated Python package.
nbdev.nbdev_export()