Merge pull request #6 from cokelaer/dev

refactorise to remove modules, include resources
sequana · Aug 12, 2022 · 0d7fdf6 · 0d7fdf6
2 parents 4d0ff06 + c1bf859
commit 0d7fdf6
Show file tree

Hide file tree

Showing 7 changed files with 122 additions and 60 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -3,17 +3,20 @@ name: Tests
 on:
   push:
     branches:
-      - master
+      - main
+      - dev
   pull_request:
     branches-ignore: []
+  schedule:
+    - cron: '0 0 * * SUN'
 
 jobs:
   build-linux:
     runs-on: ubuntu-latest
     strategy:
       max-parallel: 5
       matrix:
-        python: [3.7,3.8]
+        python: [3.7,3.8, 3.9]
       fail-fast: false
 
 
@@ -35,12 +38,11 @@ jobs:
       run: |
         # $CONDA is an environment variable pointing to the root of the miniconda directory
         echo $CONDA/bin >> $GITHUB_PATH
-        conda update ruamel_yaml
+        #conda update ruamel_yaml
 
     - name: conda
       run: |
-        conda install -c conda-forge mamba --quiet
-        mamba install -c bioconda -c conda-forge --quiet -y fastqc falco graphviz
+        conda install -c conda-forge -c bioconda --quiet -y python=${{ matrix.python }} fastqc falco graphviz
 
     - name: Install dependencies
       run: |

diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
@@ -8,9 +8,9 @@ on:
 jobs:
   build-n-publish:
     name: Build and publish to PyPI and TestPyPI
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-20.04
     steps:
-    - uses: actions/checkout@master
+    - uses: actions/checkout@main
     - name: Set up Python 3.7
       uses: actions/setup-python@v1
       with:
@@ -26,14 +26,14 @@ jobs:
           python setup.py sdist
 
     - name: Publish distribution to Test PyPI
-      uses: pypa/gh-action-pypi-publish@master
+      uses: pypa/gh-action-pypi-publish@release/v1
       with:
         user: __token__
         password: ${{ secrets.TEST_PYPI_API_TOKEN }}
         repository_url: https://test.pypi.org/legacy/
     - name: Publish distribution to PyPI
       if: startsWith(github.ref, 'refs/tags')
-      uses: pypa/gh-action-pypi-publish@master
+      uses: pypa/gh-action-pypi-publish@release/v1
       with:
         user: __token__
         password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/sequana_pipelines/fastqc/config.yaml b/sequana_pipelines/fastqc/config.yaml
@@ -11,6 +11,34 @@
 input_directory: '.'
 input_pattern: '*fastq.gz'
 
+################################################################################
+# extra_prefixes_to_strip
+#
+# In most cases, the input_directory and input_pattern (and input_read_tag for paired analysis)
+# allow us to retrieve unique sample names. We automatically remove common prefixes for you.
+#
+# Sometimes, some prefixes are not common to all samples but still need to be removed
+# to get unique sample names or better output. You can provide extra prefixes to be removed
+# by uncommenting and filling the field extra_prefixes_to_strip.
+#
+# For instance, if you have files called prefix.mess.A.fastq.gz and prefix.B.fastq.gz
+# 'prefix.' will be removed automatically because it is common, but not 'mess'. 
+# Provide those prefixes in left-to-right order: ['prefix', 'mess'] or ['prefix.mess']
+#
+# extra_prefixes_to_strip: []
+
+
+################################################################################
+# sample_pattern
+#
+# You may have trailing words that are in sample names but not wanted. Consider
+# the filename A_mess.fastq.gz, you can get rid of _mess if it appears in all 
+# samples using a pattern as follows: '{sample}_mess.fastq.gz'
+#
+# uncomment and fill to use this option. Be aware that prefixes are not removed
+# if you use sample_pattern
+#
+# sample_pattern: '{sample}_mess.fastq.gz'
 
 ##############################################################################
 # general section
@@ -28,10 +56,13 @@ general:
 fastqc:
     options: ''
     threads: 4
-
+    resources:
+        mem: 8G
 falco:
     options: ''
     threads: 4
+    resources:
+        mem: 8G
 
 ##############################################################################
 #
@@ -45,6 +76,11 @@ multiqc:
     options: -p -f
     input_directory: "."
     modules: fastqc    # falco is not set; the fastqc module works for falco
-    config_file:
+    config_file: 
+    resources:
+        mem: 8G
 
+plotting_and_stats:
+    resources:
+        mem: 8G
 
diff --git a/sequana_pipelines/fastqc/fastqc.rules b/sequana_pipelines/fastqc/fastqc.rules
@@ -19,35 +19,28 @@ import pandas as pd
 from sequana.utils.datatables_js import DataTable
 from sequana.utils.tree import HTMLDirectory
 
-from sequana_pipetools import PipelineManagerGeneric
+from sequana_pipetools import PipelineManager
 from sequana_pipetools import snaketools as sm
 
 # This must be defined before the include
 configfile: "config.yaml"
 
-# A convenient manager
-def func(filename):
-    return filename.split("/")[-1].split('.', 1)[0]
+manager = PipelineManager("fastqc", config)
 
-manager = PipelineManagerGeneric("fastqc", config, sample_func=func)
-
-# the manager can figure out whether data is paired or not but with input
-# sam/bam it is not for sure, so we need some additional simple code here:
-# we can try to figure out whether input data is paired.
-PAIRED = False
-
-# do we have illumina paired data with tag _R1_
+# This is just for information. Not used in the pipeline but only for the HTML report
+# do we have illumina paired data with tag _R1_ ?
 R1 = [1 for x in manager.samples.values() if "_R1_" in x.split("/")[-1]]
 R2 = [1 for x in manager.samples.values() if "_R2_" in x.split("/")[-1]]
 
+PAIRED = False
 if len(R1) == len(R2) and len(R1) != 0:
     PAIRED = True
 else:
     R1 = [1 for x in manager.samples.values() if "_1." in x.split("/")[-1]]
     R2 = [1 for x in manager.samples.values() if "_2." in x.split("/")[-1]]
     if len(R1) == len(R2) and len(R1) != 0:
         PAIRED = True
-manager.paired = PAIRED
+manager._paired = PAIRED
 
 
 # Some sanity checks
@@ -73,19 +66,40 @@ if 'general' in config and 'method_choice' in config['general'] and \
 
     METHOD = "falco"
 
-    __falco__input = manager.getrawdata()
-    __falco__ouptut = "samples/{sample}/summary.txt"
-    include: sm.modules["falco"]
-    __qc_done__ = expand(__falco__ouptut, sample=manager.samples)
+    rule falco:
+        input: manager.getrawdata()
+        output: "samples/{sample}/summary.txt"
+        log:
+            "samples/{sample}/falco.log"
+        threads:
+            config['falco']['threads']
+        params:
+            options=config['falco']['options'],
+            working_directory="samples/{sample}"
+        resources:
+            **config['falco']['resources']
+        wrapper:
+            "main/wrappers/falco"
+    __multiqc__input = expand("samples/{sample}/summary.txt", sample=manager.samples)
 
 else:
     METHOD = "fastqc"
-    __fastqc__input = manager.getrawdata()
-    __fastqc__output = "samples/{sample}/fastqc.done"
-    __fastqc__log = "samples/{sample}/fastqc.log"
-    __fastqc__wkdir = "samples/{sample}"
-    include: sm.modules["fastqc"]
-    __qc_done__ = expand(__fastqc__output, sample=manager.samples)
+
+    rule fastqc:
+        input: manager.getrawdata()
+        output: "samples/{sample}/fastqc.done"
+        log:
+            "samples/{sample}/fastqc.log"
+        threads:
+            config['fastqc']['threads']
+        params:
+            options=config['fastqc']['options'],
+            working_directory="samples/{sample}"
+        resources:
+            **config['fastqc']['resources']
+        wrapper:
+            "main/wrappers/fastqc"
+    __multiqc__input = expand("samples/{sample}/fastqc.done", sample=manager.samples)
 
 
 # define a list of files for the md5sum
@@ -130,43 +144,40 @@ comments += f"""<br><b><a href="https://github.com/sequana/sequana_pipetools">Se
 # Multiqc rule
 if config['multiqc']['do']:
 
-    if METHOD == "falco":
-        __multiqc__input = expand("samples/{sample}/summary.txt", sample=manager.samples)
-    else:
-        __multiqc__input = expand(__qc_done__, sample=manager.samples)
     # do not specify fastqc itself alone, otherwise it fails (feb 2020)
-
     config['multiqc']['options'] = config["multiqc"]["options"] + f" --comment '{comments}'"
 
 
-    __multiqc__output = "multiqc/multiqc_report.html"
     rule multiqc:
-       input:
+        input:
             __multiqc__input
-       output:
-           __multiqc__output
-       params:
+        output:
+            "multiqc/multiqc_report.html"
+        params:
            options=config['multiqc']['options'],
            input_directory=config['multiqc']['input_directory'],
            config_file=config['multiqc']['config_file'],
            modules=config['multiqc']['modules']
-       log:
+        log:
            "multiqc/multiqc.log"
-       wrapper:
+        resources:
+            **config["multiqc"]["resources"]
+        wrapper:
            "main/wrappers/multiqc"
 
 
 # ====================================================================== rulegraph 
 sequana_rulegraph_mapper = {}
 if config['multiqc']['do']:
-    sequana_rulegraph_mapper["multiqc"] = f"../{__multiqc__output}"
+    sequana_rulegraph_mapper["multiqc"] = "../multiqc/multiqc_report.html"
 include: sm.modules['rulegraph']
 
 
-
 rule plotting_and_stats:
-    input: __qc_done__
+    input: __multiqc__input  # per-sample QC outputs of the selected METHOD (falco writes summary.txt, not falco.done)
     output: "outputs/summary.png", "outputs/summary.json"
+    resources:
+        **config["plotting_and_stats"]["resources"]  # this rule's own config section, not multiqc's
     run:
         import glob
         from sequana.fastqc import FastQC
@@ -229,7 +240,7 @@ onsuccess:
     manager.teardown()
 
     if config['multiqc']['do']:
-        manager.clean_multiqc(__multiqc__output)
+        manager.clean_multiqc("multiqc/multiqc_report.html")
 
     # Now, the main HTML report
 

diff --git a/sequana_pipelines/fastqc/schema.yaml b/sequana_pipelines/fastqc/schema.yaml
@@ -20,6 +20,9 @@ mapping:
             "threads":
                 type: int
                 required: True
+            "resources":
+                type: any
+                required: true
 
     "multiqc":
         type: map
@@ -34,6 +37,9 @@ mapping:
                 type: str
             "input_directory":
                 type: str
+            "resources":
+                type: any
+                required: true
     "general":
         type: map
         mapping:
@@ -51,3 +57,13 @@ mapping:
             "threads":
                 type: int
                 required: True
+            "resources":
+                type: any
+                required: true
+
+    "plotting_and_stats":
+        type: map
+        mapping:
+            "resources":
+                type: any
+                required: true
diff --git a/setup.py b/setup.py
@@ -6,8 +6,8 @@
 import subprocess
 
 _MAJOR               = 1
-_MINOR               = 4
-_MICRO               = 2
+_MINOR               = 5
+_MICRO               = 0
 version              = '%d.%d.%d' % (_MAJOR, _MINOR, _MICRO)
 release              = '%d.%d' % (_MAJOR, _MINOR)
 
@@ -68,22 +68,19 @@ def run(self):
     classifiers      = metainfo['classifiers'],
 
     # package installation
-    packages = ["sequana_pipelines.fastqc",
-        'sequana_pipelines.fastqc.data' ],
+    packages = ["sequana_pipelines.fastqc"],
 
     install_requires = open("requirements.txt").read(),
 
     # This is recursive include of data files
     exclude_package_data = {"": ["__pycache__"]},
     package_data = {
-        '': ['*.yaml', "*.rules", "*.json", "requirements.txt", "*png"],
-        'sequana_pipelines.fastqc.data' : ['*.*'], 
+        '': ['*.yaml', "*.rules", "*.json", "requirements.txt", "*png", "*yml", "*smk"]
         },
 
     zip_safe=False,
 
     entry_points = {'console_scripts':[
-        'sequana_pipelines_fastqc=sequana_pipelines.fastqc.main:main',
         'sequana_fastqc=sequana_pipelines.fastqc.main:main']
     }
 

diff --git a/test/test_main.py b/test/test_main.py
@@ -12,7 +12,7 @@
 # 
 def test_standalone_subprocess():
     directory = tempfile.TemporaryDirectory()
-    cmd = "sequana_pipelines_fastqc --input-directory {} "
+    cmd = "sequana_fastqc --input-directory {} "
     cmd += "--working-directory {} --run-mode local --force"
     cmd = cmd.format(sharedir, directory.name)
     subprocess.call(cmd.split())
@@ -31,7 +31,7 @@ def test_full():
     with tempfile.TemporaryDirectory() as directory:
         wk = directory
 
-        cmd = "sequana_pipelines_fastqc --input-directory {} "
+        cmd = "sequana_fastqc --input-directory {} "
         cmd += "--working-directory {} --run-mode local --force"
         cmd = cmd.format(sharedir, wk)
         subprocess.call(cmd.split())
@@ -46,5 +46,5 @@ def test_full():
         assert os.path.exists(wk + "/multiqc/multiqc_report.html")
 
 def test_version():
-    cmd = "sequana_pipelines_fastqc --version"
+    cmd = "sequana_fastqc --version"
     subprocess.call(cmd.split())