In [None]:
# |default_exp core
# This exports the notebook to the module pingme/core.py

In [None]:
# |hide
import nbdev
from nbdev.showdoc import *  # ignore this Pylance warning in favor of following nbdev docs

For help with the Markdown language, see [this guide](https://www.markdownguide.org/basic-syntax/).

# Global static vars
These are used to modify the template for individual use cases

In [None]:
# |export
# The package name (PACKAGE_NAME) is needed by a few functions below; treat it as a static variable

import importlib
import importlib.util
import os
from pydantic_settings import BaseSettings

PACKAGE_NAME: str = "pingme"  # Make sure to adjust this to your package name
# Starts as False and is switched to True below when the package cannot be located
# or when running from the notebooks folder. This cell is exported, whereas the
# notebook-only dev mode section further down is not.
DEV_MODE: bool = False

# Directory of the importable package, stays None when the package is not found
PACKAGE_DIR = None
try:
    spec = importlib.util.find_spec(PACKAGE_NAME)
    # find_spec returns None for an unknown package, module_from_spec then raises
    # AttributeError, which is why both exception types mean "not installed" here
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    PACKAGE_DIR = os.path.dirname(module.__file__)
except (ImportError, AttributeError):
    DEV_MODE = True

PROJECT_DIR = os.getcwd()  # replaced by the parent folder when run from the notebooks folder
# Compare the last path component so a folder that merely ends in "nbs" (e.g. "my_nbs")
# is not mistaken for the notebooks folder
if os.path.basename(PROJECT_DIR) == "nbs":
    DEV_MODE = True
    PROJECT_DIR = os.path.dirname(PROJECT_DIR)


class Settings(BaseSettings):
    """
    Base settings class for the package, primarily to gain config_file for dev mode through pydantic

    Extends:
    BaseSettings
    """

    app_name: str = "PingMe"
    # Path to the config file, defaults to an empty string
    config_file: str = ""

    @classmethod
    def create(cls) -> "Settings":
        """
        Factory method to create settings based on environment

        Returns:
        Settings: in dev mode an instance whose config_file is `{PROJECT_DIR}/config/config.env`, otherwise an instance created without explicit arguments
        """
        if DEV_MODE:
            return cls(config_file=f"{PROJECT_DIR}/config/config.env")
        return cls()


# create() is a classmethod, so it is called on the class itself rather than on a throwaway instance
settings = Settings.create()

import logging
import os
import sys


# Set up logging
def setup_logging(log_level=None):
    """
    Configure logging for the application

    Installs a console handler (stdout) on the root logger and, in dev mode, a file handler
    writing to `{PROJECT_DIR}/logs/pingme.log`. Handlers installed by an earlier call are
    replaced instead of stacked, so calling this repeatedly (e.g. re-running a notebook cell)
    does not duplicate every log line.

    Args:
    log_level (int | str | None): level for the root logger, when None it is DEBUG in dev mode and INFO otherwise

    Returns:
    logging.Logger: the "pingme" logger
    """
    # Compare against None so an explicit level of 0 (logging.NOTSET) is honoured
    if log_level is None:
        log_level = logging.DEBUG if DEV_MODE else logging.INFO

    # Create formatter
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

    # Configure root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)

    # Drop the handlers a previous call installed; handlers added by anyone else are left alone
    for stale_handler in list(root_logger.handlers):
        if getattr(stale_handler, "_pingme_handler", False):
            root_logger.removeHandler(stale_handler)
            stale_handler.close()

    # Console handler
    new_handlers = [logging.StreamHandler(sys.stdout)]

    # File handler (dev mode only)
    if DEV_MODE:
        log_dir = os.path.join(PROJECT_DIR, "logs")
        os.makedirs(log_dir, exist_ok=True)
        new_handlers.append(logging.FileHandler(os.path.join(log_dir, "pingme.log")))

    for new_handler in new_handlers:
        new_handler.setFormatter(formatter)
        # Marker used above to recognise our own handlers on the next call
        new_handler._pingme_handler = True
        root_logger.addHandler(new_handler)

    # Create a logger for pingme
    return logging.getLogger("pingme")


# Initialize the package logger at import time, no level is passed so setup_logging picks it from DEV_MODE
logger = setup_logging()

# Dev mode
If you're developing this rather than running it, you'll have access to slightly different things. Notably, the nbdev functions are only for development and not for runtime. This matters for items such as the config, so we need to detect whether you are in dev mode and the code has to adjust accordingly. Notice that this section is not exported, so it will only work in the notebook and not in the module.

In [None]:
# This section uses nbdev functions so should not be exported as it's for dev purposes
import os

# Notebook-only override: in dev mode both directories are taken from the nbdev config
# (settings.ini), which is looked up with path=os.getcwd()
if DEV_MODE:
    PACKAGE_DIR = nbdev.config.get_config(cfg_name="settings.ini", path=os.getcwd())[
        "lib_path"
    ]  # the library is the package of course
    PROJECT_DIR = nbdev.config.get_config(
        cfg_name="settings.ini", path=os.getcwd()
    ).config_path  # the default location of nbdev config file (settings.ini)

# Core

 A module which contains common functions to be used by other modules. Those that exist in the template are meant to be common functions we can use against multiple packages.

#|hide

Notebook blocks starting with #|hide are not shown in the documentation and not exported to the python package. Blocks with #|export are exported to the python package. Blocks with neither are shown to the documentation but not exported to the python package.

## Libraries

Currently all libraries included are listed at the top and calls to them are also made in the block of code that uses them. This is for readability and the performance hit of the import is negligible.

In [None]:
# |export
# standard libs
import os
import sys

# Common to template
# add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`
import dotenv  # for loading config from .env files, https://pypi.org/project/python-dotenv/
import envyaml  # Allows to loads env vars into a yaml file, https://github.com/thesimj/envyaml
from fastcore import (
    test,
)
from fastcore.script import (
    call_parse,
)  # for @call_parse, https://fastcore.fast.ai/script

# Project specific libraries
import shutil  # using shell utilities
import pandas as pd
from pydantic import BaseModel, field_validator
from pathlib import Path
from typing import Any

## Config

Our config file holds all program and user specific variables. This is a good practice to follow as it allows us to easily change variables without having to change code. It also allows us to easily change variables based on the environment we are running in. For example, we may want to run a program in a test environment with a different database than we would in production.

Configuration is templated to rely on environment (ENV) variables. A default ENV config is provided in `./config/config.default.env` and more advanced data structures are supported in `./config/config.default.yaml`. The `.yaml` file is meant to represent what your program actually works with, and the `.env` file holds the options the user can change at run time.

Make sure you know the priority of variables and check on them when debugging your code. Also ensure that your yaml file is referenced appropriately in the `.env` file. 

When in use there's an expectation you'll have multiple config files for different use cases e.g. development, production environment for different paths, etc.

### set env variables
A helper function for getting your config values, this will set the environment variables with the provided `.env` values. If you're missing values it'll ensure they're loaded in with the defaults file.

In [None]:
# |export
def set_env_variables(config_path: str, overide_env_vars: bool = True) -> bool:
    """
    Load environment variables from the package defaults file and then from the given .env file

    The defaults file `{PACKAGE_DIR}/config/config.default.env` is loaded first and never overwrites
    variables that are already set. The file at config_path is loaded second, if it exists.

    Order of precedence with overide_env_vars=True: config_path file > existing environment variables > default values
    With overide_env_vars=False nothing that is already set is replaced, which includes values that were just loaded from the defaults file.
    When developing, making a change to the config will not be reflected until the environment is restarted

    Set the env vars first, this is needed for the yaml config to replace ENV variables
    NOTE: You need to adjust PACKAGE_NAME to your package name for this to work, as PACKAGE_DIR is derived from it.
    When the package is installed, such as through pypi or through pip install -e [.dev] for development, the config files inside the package are used as your default values.

    Args:
    config_path (str): path to the config file, a missing or empty path is skipped
    overide_env_vars (bool): if True, values from config_path overwrite existing env variables

    Returns:
    bool: False if the defaults file exists but could not be loaded, True otherwise
    """
    default_env_file = f"{PACKAGE_DIR}/config/config.default.env"

    # 1. set default values, dotenv.load_dotenv does not raise for a missing file so check explicitly
    if not os.path.isfile(default_env_file):
        # Reported but not fatal, the user supplied config file below is still applied
        print(f"Error: {default_env_file} does not exist", file=sys.stderr)
    else:
        try:
            dotenv.load_dotenv(default_env_file, override=False)
        except Exception as err:
            print(f"Error: could not load {default_env_file}: {err}", file=sys.stderr)
            return False

    # 2. set values from file, guarding against None so os.path.isfile does not raise
    if config_path and os.path.isfile(config_path):
        dotenv.load_dotenv(config_path, override=overide_env_vars)

    return True

### get config

When you run this function, assuming things are set up properly, you end up with a dict that matches your `.yaml` file. This file will have all the inputs for the package and settings of your program.

To do this it will use a `.env` config file, which has an associated yaml file defined with `CORE_YAML_CONFIG_FILE` in the `.env` file. And then use the `.env` file to load values into the associated `.yaml` file.

In [None]:
# |export
def get_config(config_path: str = None, overide_env_vars: bool = True) -> dict:
    """
    Set the environment from a .env file and return the yaml config it points to as a dictionary

    The yaml file is taken from the CORE_YAML_CONFIG_FILE environment variable, falling back to
    `{PACKAGE_DIR}/config/config.default.yaml` when that variable is not set.

    Args:
    config_path (str): The path to the config.env file, None is treated as an empty path
    overide_env_vars (bool): Passed on to set_env_variables, if True values from the config.env file overwrite existing env variables

    Returns:
    dict: The exported content of the yaml file as loaded by envyaml
    """
    env_file = "" if config_path is None else config_path

    # The environment has to be populated before the yaml file is read, as the yaml file location
    # itself comes from the environment
    set_env_variables(env_file, overide_env_vars)

    yaml_file = os.environ.get(
        "CORE_YAML_CONFIG_FILE", f"{PACKAGE_DIR}/config/config.default.yaml"
    )
    loaded: dict = envyaml.EnvYAML(yaml_file, strict=False).export()
    return loaded

### Variables

All the user input variables and machine adjustable variables should be in your config, which is a dict. Reference config.default.yaml for how to access your variables. Also note that with python dicts you can use `dict_variable.get("variable", default_value)` to ensure that you don't get a key error if the variable is not set.

In [None]:
# |export
# Load the config at import time, the .env file path is read from the CORE_CONFIG_FILE environment variable ("" when unset)
config = get_config(os.environ.get("CORE_CONFIG_FILE", ""))

### show project env vars
A helper function intended to only be used with debugging. It shows all your project specific environmental variables.

In [None]:
# |export


def show_project_env_vars(config: dict) -> None:
    """
    Show all the project variables, this is useful for debugging and seeing what is being set

    Prints `key=value` for every string key in config that starts with the prefix stored under
    config["CORE_PROJECT_VARIABLE_PREFIX"]. When that prefix is not present a note is written to
    stderr and nothing else is printed.

    Args:
    config (dict): The dictionary of all the variables

    Returns:
    None
    """
    # Look the prefix up once, a missing prefix should not crash a debugging helper
    prefix = config.get("CORE_PROJECT_VARIABLE_PREFIX")
    if prefix is None:
        print(
            "CORE_PROJECT_VARIABLE_PREFIX is not set in the config, nothing to show",
            file=sys.stderr,
        )
        return

    prefix = str(prefix)
    for k, v in config.items():
        # Keys are not guaranteed to be strings, skip anything that cannot carry the prefix
        if isinstance(k, str) and k.startswith(prefix):
            print(f"{k}={v}")

In [None]:
# |hide
# checking local vars: print the prefixed variables of the config loaded above
show_project_env_vars(config)

In [None]:
# |export
# import shutil # called at top
def tool_is_present(tool_name: str) -> bool:
    """
    Tell whether a tool can be found by shutil.which in the current environment

    Args:
    tool_name (str): The name of the tool to look for

    Returns:
    bool: True when shutil.which finds the tool, False when it returns None
    """
    resolved_path = shutil.which(tool_name)
    if resolved_path is None:
        return False
    return True

In [None]:
# |export
# import sys # for stderr, called at top
def tools_are_present(tool_names: list) -> bool:
    """
    Check a list of tools and report every one that is missing

    Each tool is checked with tool_is_present, a message is written to stderr for every tool
    that is not found. All tools are checked, the loop does not stop at the first missing one.

    Args:
    tool_names (list): The tools to check

    Returns:
    bool: True if every tool is present, False if at least one is missing
    """
    all_found: bool = True
    for name in tool_names:
        if tool_is_present(name):
            continue
        print(f"Tool {name} is not present in current environment", file=sys.stderr)
        all_found = False
    return all_found

## get_samplesheet
This function unifies the way we work with sample sheets, which for us are files containing a table of values, typically samples for batch processing. We approach it this way so all programs have batch processing in mind and work with the same data structure.

To make use of it we have a small sample_sheet yaml object which looks like
    
```yaml
sample_sheet:
    path: path/to/sample_sheet.tsv
    delimiter: '\t' # Optional, will assume , for csv and \t otherwise
    header: 0 # Optional, 0 indicates first row is header, None indicates no header
    columns: ['column1', 'column2', 'column3'] # Optional, if not provided all columns will be used
```

Make sure to add that to the relevant section in your config (it can appear multiple times if you're working with different sheets or different columns), then call the function on this object and it'll either report that something's wrong or return a pandas dataframe with the columns of interest.

This is an example of a common sample_sheet we work with. We will ingest the hash at the beginning so it doesn't affect column naming. Extra empty rows at the end are also stripped.
```tsv
#sample_id	file_path	metadata1	metadata2
Sample1	/path/to/sample1.fasta	value1	option1
Sample2	/path/to/sample2.fasta	value2	option2
Sample3	/path/to/sample3.fasta	value3	option1
Sample4	/path/to/sample4.fasta	value1	option2
Sample5	/path/to/sample5.fasta	value2	option1
```

In [None]:
# | export


# library imports handled at top
# import pandas as pd
# from pydantic import BaseModel, field_validator
# from pathlib import Path
# from typing import Any
class SamplesheetConfig(BaseModel):
    """
    Configuration class for loading a sample sheet into a pandas dataframe

    Extends:
    BaseModel

    Fields:
    path (str): path to the sample sheet, the file must exist
    delimiter (str): column delimiter, defaults to a tab
    header (int | None): row number of the header, 0 is the first row, None means the sheet has no header
    columns (list | None): columns to keep, None keeps all columns
    """

    path: str
    delimiter: str = "\t"
    # None is a documented value ("no header"), so it has to be allowed by the type
    header: int | None = 0
    columns: list | None = None

    # Custom validator to check if the file exists
    @field_validator("path")
    @classmethod
    def check_file_exists(cls, value: str) -> str:
        """Reject a path that does not point to an existing file"""
        if not Path(value).is_file():
            raise ValueError(f"The file at path '{value}' does not exist.")
        return value

    # Override __init__ to accept a dictionary directly, for backwards compatibility.
    # Keyword arguments are accepted as well and take precedence over the dictionary.
    def __init__(self, config: dict[str, Any] | None = None, **kwargs: Any):
        super().__init__(**{**(config or {}), **kwargs})  # Unpack dictionary internally


def get_samplesheet(samplesheet_config: "SamplesheetConfig") -> pd.DataFrame:
    """
    Load the sample sheet into a pandas dataframe
    If columns is not None then it will only load those columns
    If no delimiter was given explicitly and the path ends in .csv it is read with a comma, otherwise the configured delimiter (tab by default) is used

    Expected samplesheet_config:
    sample_sheet:
      path: path/to/sample_sheet.tsv
      delimiter: tab or comma # Optional, will assume , for csv and tab otherwise
      header: 0 # Optional, 0 indicates first row is header, None indicates no header
      columns: ['column1', 'column2', 'column3'] # Optional, if not provided all columns will be used

    Example sample sheet (tab separated), the leading # of the first header is removed:
    #sample_id  file_path               metadata1  metadata2
    Sample1     /path/to/sample1.fasta  value1     option1
    Sample2     /path/to/sample2.fasta  value2     option2

    Args:
    samplesheet_config (SamplesheetConfig): The configuration for loading the sample sheet

    Returns:
    pd.DataFrame: The sample sheet as a pandas dataframe, without rows that are entirely empty

    Raises:
    ValueError: if one of the requested columns is not in the sample sheet
    Exception: whatever pandas raises when the sheet cannot be parsed is re-raised unchanged
    """
    delimiter = samplesheet_config.delimiter
    # pydantic records which fields were passed explicitly, only fall back to the file
    # extension when the delimiter was left at its default
    fields_set = getattr(samplesheet_config, "model_fields_set", None)
    if (
        fields_set is not None
        and "delimiter" not in fields_set
        and str(samplesheet_config.path).lower().endswith(".csv")
    ):
        delimiter = ","

    try:
        df = pd.read_csv(
            samplesheet_config.path,
            delimiter=delimiter,
            header=samplesheet_config.header,
            comment=None,
        )
    except Exception:
        print(
            "Error: Could not load sample sheet into dataframe, you have a problem with your sample sheet or the configuration."
        )
        raise  # bare raise keeps the original traceback

    # When we have a header the first column may begin with a #, remove it for only that item.
    # Without a header the labels are integers, so only string labels are inspected.
    first_column = df.columns[0] if len(df.columns) else None
    if isinstance(first_column, str) and first_column.startswith("#"):
        df.columns = [first_column.lstrip("#"), *df.columns[1:]]

    if samplesheet_config.columns is not None:
        # Ensure the sample sheet has the correct columns
        missing = [col for col in samplesheet_config.columns if col not in df.columns]
        if missing:
            raise ValueError(
                f"Error: Sample sheet does not have the correct columns, missing: {missing}"
            )
        # also drop columns which are not needed
        df = df[samplesheet_config.columns]

    # Clean the df of any extra rows that can be caused by empty lines in the sample sheet
    df = df.dropna(how="all")
    return df

In [None]:
# Example: load {PROJECT_DIR}/input/example_samplesheet.tsv with all columns and print the dataframe
config_dict = {
    "path": f"{PROJECT_DIR}/input/example_samplesheet.tsv",
    "delimiter": "\t",
    "columns": None,
}

samplesheet_config = SamplesheetConfig(config_dict)

print(get_samplesheet(samplesheet_config))

The functions below are **not** templated and you should replace them with your own code. They're included as an example of how to code some functions with associated tests and how to make them work on the command line. It is best to code by creating a new notebook and then importing the functions of this one into it.

In [None]:
# |export


def hello_world(name: str = "Not given") -> str:
    """
    Build the hello world greeting for a name, a small example function for testing purposes

    Args:
    name (str): The name to put in the greeting, defaults to "Not given"

    Returns:
    str: The greeting
    """
    greeting = f"Hello World! My name is {name}"
    return greeting

This is a test written with fastcore.test; all fastcore tests are run automatically by nbdev_test as well as through GitHub Actions.

In [None]:
# The greeting has to contain the name that was passed in
test.test_eq("Hello World! My name is Kim", hello_world("Kim"))

The @call_parse will, with the settings.ini entry way, automatically transform your function into a command line tool. Comments of the functions will appear for help messages and the initial docstring will appear in the help as well. You can also define defaults for the arguments and should define a typehint to control inputs. The function will likely have to resolve variables with ENV vars and config files. The recommended way to do this is to assume variables passed here are a higher priority.

In [None]:
# |export


# from fastcore.script import call_parse # called at top, with settings.ini it will let you call it from the command line
@call_parse
def cli(
    name: str,  # Your name
    config_file: str = None,  # config file to set env vars from
):
    """
    This will print Hello World! with your name

    Args:
    name (str): Your name, when given it takes priority over the name in the config
    config_file (str): The path to the config file, passed to get_config as is
    """
    # Set env vars and get config variables
    cfg = get_config(config_file)
    example_input = cfg["example"]["input"]

    # A name passed on the command line wins over the configured one
    if name is not None:
        example_input["name"] = name

    print(hello_world(example_input["name"]))

Test the function with potentially variable input to confirm output

In [None]:
# Expects example.input.name of the loaded config to be "Kim"
test.test_eq(
    "Hello World! My name is Kim", hello_world(config["example"]["input"]["name"])
)
# cli prints the greeting and returns nothing
test.test_eq(None, cli("Kim"))

In [None]:
# | export
import subprocess


def create_bash_script(script_path, script_content, slurm_params=None):
    """
    Create a bash script with the given content.

    Without slurm_params the content is written as is. With slurm_params the script starts with
    `#!/bin/bash`, followed by one `#SBATCH --key=value` line per parameter and then the content.

    Args:
    script_path (str): Path of the script to write, an existing file is overwritten
    script_content (str | None): Content of the script, with slurm_params None means "reuse the content of the existing file at script_path"
    slurm_params (dict | None): Slurm options written as #SBATCH headers, e.g. {"time": "01:00:00"}

    Returns:
    None

    Raises:
    FileNotFoundError: if script_content is None, slurm_params is given and script_path does not exist
    """
    if slurm_params:
        # Only fall back to the file on disk when no content was passed, the passed content
        # must not be silently replaced by whatever happens to be in the file
        if script_content is None:
            with open(script_path, "r") as script_file:
                script_content = script_file.read()

        # Add slurm headers to the script
        sbatch_lines = "\n".join(
            f"#SBATCH --{key}={value}" for key, value in slurm_params.items()
        )
        script_content = "#!/bin/bash\n" + sbatch_lines + "\n" + script_content

    with open(script_path, "w") as script_file:
        script_file.write(script_content)


def execute_job(script_path, use_slurm=False, slurm_params=None):
    """
    Run a bash script, either directly with bash or by submitting it with sbatch.

    The outcome is only printed: the stripped stdout on a zero return code, the stripped stderr otherwise.

    Args:
    script_path (str): Path to the script to run
    use_slurm (bool): If True the script is submitted with sbatch, otherwise it is run with bash
    slurm_params (dict | None): Options added to the sbatch command as --key=value, ignored when running locally

    Returns:
    None
    """
    if use_slurm:
        # sbatch [--key=value ...] script
        command = ["sbatch"]
        if slurm_params:
            command.extend(f"--{key}={value}" for key, value in slurm_params.items())
        command.append(script_path)
    else:
        command = ["bash", script_path]

    outcome = subprocess.run(command, capture_output=True, text=True)
    succeeded = outcome.returncode == 0

    if use_slurm and succeeded:
        print(f"Job submitted to Slurm: {outcome.stdout.strip()}")
    elif use_slurm:
        print(f"Error submitting to Slurm: {outcome.stderr.strip()}")
    elif succeeded:
        print(f"Script executed locally: {outcome.stdout.strip()}")
    else:
        print(f"Error executing script locally: {outcome.stderr.strip()}")

In [None]:
def submit_job(script_path, dependency=None) -> str | None:
    """
    Submit a Slurm job with optional dependencies.

    Args:
    script_path (str): Path to the script to submit.
    dependency (str): Job ID to depend on (submitted as afterok), a complete `--dependency=...` flag is passed through unchanged.

    Returns:
    str: Job ID if successful, None otherwise. The job ID is the last whitespace separated field of the sbatch output.
    """
    command = ["sbatch"]
    if dependency:
        dependency = str(dependency)
        # Accept a bare job ID as well as an already formatted flag, so the flag is never
        # prefixed twice
        if dependency.startswith("--dependency="):
            command.append(dependency)
        else:
            command.append(f"--dependency=afterok:{dependency}")
    command.append(script_path)

    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except OSError as err:
        # e.g. sbatch is not installed, report it the same way as a failed submission
        print(f"Error submitting job: {err}")
        return None

    if result.returncode != 0:
        print(f"Error submitting job: {result.stderr.strip()}")
        return None

    # Extract job ID from output, guarding against sbatch printing nothing
    output_fields = result.stdout.split()
    if not output_fields:
        print("Error submitting job: sbatch returned no job ID")
        return None

    job_id = output_fields[-1]
    print(f"Job submitted: {job_id}")
    return job_id

In [None]:
def submit_workflow(scripts) -> str | None:
    """
    Submit a series of scripts with dependencies.

    Every script is submitted with the previous job as its dependency. When a submission fails the
    remaining scripts are not submitted, as they would otherwise start without waiting for the
    steps before them.

    Args:
    scripts: List of script paths to submit, they will be submitted in order with previous job as a dependency.

    Returns:
    str: Job ID of the last submitted job if successful, None if scripts is empty or a submission failed.
    """
    previous_job_id = None
    for script in scripts:
        # submit_job builds the --dependency flag itself, so only the job ID is handed over
        job_id = submit_job(script, previous_job_id)
        if job_id is None:
            print(f"Workflow stopped: {script} was not submitted, remaining scripts are skipped")
            return None
        previous_job_id = job_id
    return previous_job_id

In [None]:
# | hide
# Kept as the last cell so that running through the notebook also transfers the code to the associated python package
import nbdev

nbdev.nbdev_export()