diff --git a/docs/snakefiles/rules.rst b/docs/snakefiles/rules.rst index e1f2f0076..fb195befc 100644 --- a/docs/snakefiles/rules.rst +++ b/docs/snakefiles/rules.rst @@ -909,6 +909,124 @@ If your script uses any of these packages, you do not need to ``use`` them in yo .. |json_typegen| replace:: ``json_typegen`` .. _json_typegen: https://github.com/evestera/json_typegen + +Bash +~~~~ + +Bash scripts work much the same as the other script languages above, but with some important differences. Access to the +rule's directives is provided through the use of `associative arrays <arrays_>`_ - **requiring Bash version 4.0 or greater**. +One "limitation" of associative arrays is they cannot be nested. As such, the following rule directives are found in a separate +variable, named as ``snakemake_<directive>``: + +* ``input`` +* ``output`` +* ``log`` +* ``wildcards`` +* ``resources`` +* ``params`` +* ``config`` + +Access to the ``input`` directive is facilitated through the bash associative array named ``snakemake_input``. The +remaining directives can be found in the variable ``snakemake``. + +.. sidebar:: Note + + As arrays cannot be nested in Bash, use of python's ``dict`` in directives is not supported. So, adding a ``params`` key of ``data={"foo": "bar"}`` will not be reflected - ``${snakemake_params[data]}`` actually only returns ``"foo"``. + +Bash Example 1 +^^^^^^^^^^^^^^ + +.. code-block:: python + + rule align: + input: + "{sample}.fq", + reference="ref.fa", + output: + "{sample}.sam" + params: + opts="-a -x map-ont", + threads: 4 + log: + "align/{sample}.log" + conda: + "envs/align.yaml" + script: + "scripts/align.sh" + + + +``align.sh`` + +.. 
code-block:: bash + + #!/usr/bin/env bash + + echo "Aligning sample ${snakemake_wildcards[sample]} with minimap2" 2> "${snakemake_log[0]}" + + minimap2 ${snakemake_params[opts]} -t ${snakemake[threads]} "${snakemake_input[reference]}" \ + "${snakemake_input[0]}" > "${snakemake_output[0]}" 2>> "${snakemake_log[0]}" + + +If you don't add a shebang, the default ``#!/usr/bin/env bash`` will be inserted for you. Documentation on +associative arrays can be found `here <arrays_>`_. + +You may also have noticed the mixed use of double-quotes when accessing some variables. It is generally good practice in +Bash to double-quote variables for which you want to `prevent word splitting <split_>`_; generally, you will want to +double-quote any variable that could contain a file name. However, `in some cases <exception_>`_, word splitting *is* desired, +such as ``${snakemake_params[opts]}`` in the above example. + +Bash Example 2 +^^^^^^^^^^^^^^ + +.. code-block:: python + + rule align: + input: + reads=["{sample}_R1.fq", "{sample}_R2.fq"], + reference="ref.fa", + output: + "{sample}.sam" + params: + opts="-M", + threads: 4 + log: + "align/{sample}.log" + conda: + "envs/align.yaml" + script: + "scripts/align.sh" + + +In this example, the ``input`` variable ``reads``, which is a python list, actually gets stored as a space-separated string +in Bash because, you guessed it, you can't nest arrays in Bash! So in order to access the individual members, we turn the +string into an array; allowing us to access individual elements of the list/array. See `this stackoverflow question <so_>`_ for other solutions. + +``align.sh`` + +.. 
code-block:: bash + + #!/usr/bin/env bash + + exec 2> "${snakemake_log[0]}" # send all stderr from this script to the log file + + reads=(${snakemake_input[reads]}) # don't double-quote this - we want word splitting + + r1="${reads[0]}" + r2="${reads[1]}" + + bwa index "${snakemake_input[reference]}" + bwa mem ${snakemake_params[opts]} -t ${snakemake[threads]} \ + "${snakemake_input[reference]}" "$r1" "$r2" > "${snakemake_output[0]}" + +If, in the above example, the fastq reads were not in a named variable, but were instead just a list, they would be available +as ``"${snakemake_input[0]}"`` and ``"${snakemake_input[1]}"``. + +.. _arrays: https://www.gnu.org/software/bash/manual/html_node/Arrays.html#Arrays +.. _split: https://github.com/koalaman/shellcheck/wiki/SC2046 +.. _exception: https://github.com/koalaman/shellcheck/wiki/SC2046#exceptions +.. _so: https://stackoverflow.com/q/1469849/5299417 + ---- For technical reasons, scripts are executed in ``.snakemake/scripts``. The original script directory is available as ``scriptdir`` in the ``snakemake`` object. 
diff --git a/snakemake/script.py b/snakemake/script.py index dff4726dd..a38de294a 100644 --- a/snakemake/script.py +++ b/snakemake/script.py @@ -6,6 +6,8 @@ import inspect import itertools import os +from collections.abc import Iterable + from snakemake import sourcecache from snakemake.sourcecache import ( LocalSourceFile, @@ -22,7 +24,7 @@ import re from abc import ABC, abstractmethod from pathlib import Path -from typing import Tuple, Pattern, Union, Optional +from typing import Tuple, Pattern, Union, Optional, List from urllib.request import urlopen, pathname2url from urllib.error import URLError @@ -310,6 +312,78 @@ def encode_namedlist(cls, namedlist): return source +class BashEncoder: + """bash docs for associative arrays - https://www.gnu.org/software/bash/manual/html_node/Arrays.html#Arrays""" + + def __init__( + self, + namedlists: List[str] = None, + dicts: List[str] = None, + prefix: str = "snakemake", + ): + """namedlists is a list of strings indicating the snakemake object's member + variables which are encoded as Namedlist. + dicts is a list of strings indicating the snakemake object's member variables + that are encoded as dictionaries. 
+ Prefix is the prefix for the bash variable name(s) e.g., snakemake_input + """ + if dicts is None: + dicts = [] + if namedlists is None: + namedlists = [] + self.namedlists = namedlists + self.dicts = dicts + self.prefix = prefix + + def encode_snakemake(self, smk: Snakemake) -> str: + """Turn a snakemake object into a collection of bash associative arrays""" + arrays = [] + main_aa = dict() + for var in vars(smk): + val = getattr(smk, var) + if var in self.namedlists: + aa = f"{self.prefix}_{var.strip('_').lower()}={self.encode_namedlist(val)}" + arrays.append(aa) + elif var in self.dicts: + aa = f"{self.prefix}_{var.strip('_').lower()}={self.dict_to_aa(val)}" + arrays.append(aa) + else: + main_aa[var] = val + + arrays.append(f"{self.prefix}={self.dict_to_aa(main_aa)}") + return "\n".join(arrays) + + @staticmethod + def dict_to_aa(d: dict) -> str: + """Converts a dictionary to an associative array""" + s = "( " + for k, v in d.items(): + s += f'[{k}]="{v}" ' + + s += ")" + return s + + @classmethod + def encode_namedlist(cls, named_list) -> str: + """Convert a namedlist into a bash associative array + This produces the array component of the variable. + e.g. 
( [var1]=val1 [var2]=val2 ) + to make it a correct bash associative array, you need to name it with + name= + """ + aa = "(" + + for i, (name, val) in enumerate(named_list._allitems()): + if isinstance(val, Iterable) and not isinstance(val, str): + val = " ".join(val) + aa += f' [{i}]="{val}"' + if name is not None: + aa += f' [{name}]="{val}"' + + aa += " )" + return aa + + class ScriptBase(ABC): editable = False @@ -424,7 +498,7 @@ def _execute_cmd(self, cmd, **kwargs): singularity_args=self.singularity_args, resources=self.resources, threads=self.threads, - **kwargs + **kwargs, ) @@ -1229,6 +1303,95 @@ def _strip_code_block_manifest(src: str) -> Tuple[str, str]: return "", src +class BashScript(ScriptBase): + @staticmethod + def generate_preamble( + path, + source, + basedir, + input_, + output, + params, + wildcards, + threads, + resources, + log, + config, + rulename, + conda_env, + container_img, + singularity_args, + env_modules, + bench_record, + jobid, + bench_iteration, + cleanup_scripts, + shadow_dir, + is_local, + ) -> str: + snakemake = Snakemake( + input_=input_, + output=output, + params=params, + wildcards=wildcards, + threads=threads, + resources=resources, + log=log, + config=config, + rulename=rulename, + bench_iteration=bench_iteration, + scriptdir=path.get_basedir().get_path_or_uri(), + ) + + namedlists = ["input", "output", "log", "resources", "wildcards", "params"] + dicts = ["config"] + encoder = BashEncoder(namedlists=namedlists, dicts=dicts) + preamble = encoder.encode_snakemake(snakemake) + return preamble + + def get_preamble(self): + preamble = BashScript.generate_preamble( + path=self.path, + source=self.source, + basedir=self.basedir, + input_=self.input, + output=self.output, + params=self.params, + wildcards=self.wildcards, + threads=self.threads, + resources=self.resources, + log=self.log, + config=self.config, + rulename=self.rulename, + conda_env=self.conda_env, + container_img=self.container_img, + 
singularity_args=self.singularity_args, + env_modules=self.env_modules, + bench_record=self.bench_record, + jobid=self.jobid, + bench_iteration=self.bench_iteration, + cleanup_scripts=self.cleanup_scripts, + shadow_dir=self.shadow_dir, + is_local=self.is_local, + ) + return preamble + + def write_script(self, preamble, fd): + content = self.combine_preamble_and_source(preamble) + fd.write(content.encode()) + + def combine_preamble_and_source(self, preamble: str): + rgx = re.compile(r"^#![^\[].*?(\r\n|\n)") + shebang, source = strip_re(rgx, self.source) + if not shebang: + shebang = r"#!/usr/bin/env bash" + + return "\n".join([shebang, preamble, source]) + + def execute_script(self, fname, edit=False): + self._execute_cmd("bash {fname:q}", fname=fname) + + def strip_re(regex: Pattern, s: str) -> Tuple[str, str]: """Strip a substring matching a regex from a string and return the stripped part and the remainder of the original string. @@ -1289,6 +1452,8 @@ def get_language(source_file, source): language = "julia" elif filename.endswith(".rs"): language = "rust" + elif filename.endswith(".sh"): + language = "bash" # detect kernel language for Jupyter Notebooks if language == "jupyter": @@ -1344,6 +1509,7 @@ def script( "rmarkdown": RMarkdown, "julia": JuliaScript, "rust": RustScript, + "bash": BashScript, }.get(language, None) if exec_class is None: raise ValueError( diff --git a/tests/test_script.py b/tests/test_script.py index 93103f1aa..cf429ab25 100644 --- a/tests/test_script.py +++ b/tests/test_script.py @@ -1,6 +1,7 @@ from textwrap import dedent -from snakemake.script import RustScript +from snakemake.io import InputFiles +from snakemake.script import RustScript, BashEncoder class TestRustScriptExtractManifest: @@ -556,3 +557,30 @@ def test_code_block_manifest_with_outer_line_doc_comment(self): ) assert remaining_src == expected_remaining_src + + +class TestBashEncoder: + def test_named_list_one_named_one_str(self): + """InputFiles is a subclass of 
snakemake.io.NamedInput + ierate over input and store each with the integer index - i.e 0, 1, 2 + then use input.items() to iterate over the named files and store them as named also + check how this works with named things being lists + """ + named_list = InputFiles(["test.in", "named.in"]) + named_list._set_name("named", 1) + + actual = BashEncoder.encode_namedlist(named_list) + expected = r"""( [0]="test.in" [1]="named.in" [named]="named.in" )""" + + assert actual == expected + + def test_named_list_named_is_list(self): + """Named lists that are lists of files become a space-separated string as you + can't nest arrays in bash""" + named_list = InputFiles(["test1.in", ["test2.in", "named.in"]]) + named_list._set_name("named", 1) + + actual = BashEncoder.encode_namedlist(named_list) + expected = r"""( [0]="test1.in" [1]="test2.in named.in" [named]="test2.in named.in" )""" + + assert actual == expected diff --git a/tests/test_script/Snakefile b/tests/test_script/Snakefile index 8d767f773..e0500d355 100644 --- a/tests/test_script/Snakefile +++ b/tests/test_script/Snakefile @@ -10,6 +10,7 @@ rule all: "rust.out", "rust-manifest.out", "rust-outer-line-doc.out", + "bash.out", rule: input: @@ -96,3 +97,23 @@ rule: "envs/rust.yaml" script: "scripts/test-outer-line-doc.rs" + + +rule bash: + input: + "test2.in", + named_input="test.in", + output: + "bash.out" + params: + integer=123, + string="foo", + d={"a": "foo"} + resources: + mem_mb=1024 + log: + "bash.log" + conda: + "envs/bash.yaml" + script: + "scripts/test.sh" diff --git a/tests/test_script/envs/bash.yaml b/tests/test_script/envs/bash.yaml new file mode 100644 index 000000000..11b962fbb --- /dev/null +++ b/tests/test_script/envs/bash.yaml @@ -0,0 +1,4 @@ +channels: + - conda-forge +dependencies: + - bash >=4.0 diff --git a/tests/test_script/expected-results/bash.out b/tests/test_script/expected-results/bash.out new file mode 100644 index 000000000..123202edf --- /dev/null +++ 
b/tests/test_script/expected-results/bash.out @@ -0,0 +1,3 @@ +The first input file is test2.in +The named input file is test.in +The requested number of threads is 1 diff --git a/tests/test_script/scripts/test.sh b/tests/test_script/scripts/test.sh new file mode 100644 index 000000000..6f4f60ede --- /dev/null +++ b/tests/test_script/scripts/test.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +echo "The first input file is ${snakemake_input[0]}" > "${snakemake_output[0]}" 2> "${snakemake_log[0]}" +echo "The named input file is ${snakemake_input[named]}" >> "${snakemake_output[0]}" 2>> "${snakemake_log[0]}" +echo "The requested number of threads is ${snakemake[threads]}" >> "${snakemake_output[0]}" 2>> "${snakemake_log[0]}" +