Skip to content

Commit

Permalink
Merge pull request #6 from cokelaer/dev
Browse files Browse the repository at this point in the history
refactorise to remove modules, include resources
  • Loading branch information
cokelaer committed Aug 12, 2022
2 parents 4d0ff06 + c1bf859 commit 0d7fdf6
Show file tree
Hide file tree
Showing 7 changed files with 122 additions and 60 deletions.
12 changes: 7 additions & 5 deletions .github/workflows/main.yml
Expand Up @@ -3,17 +3,20 @@ name: Tests
on:
push:
branches:
- master
- main
- dev
pull_request:
branches-ignore: []
schedule:
- cron: '0 0 * * SUN'

jobs:
build-linux:
runs-on: ubuntu-latest
strategy:
max-parallel: 5
matrix:
python: [3.7,3.8]
python: [3.7,3.8, 3.9]
fail-fast: false


Expand All @@ -35,12 +38,11 @@ jobs:
run: |
# $CONDA is an environment variable pointing to the root of the miniconda directory
echo $CONDA/bin >> $GITHUB_PATH
conda update ruamel_yaml
#conda update ruamel_yaml
- name: conda
run: |
conda install -c conda-forge mamba --quiet
mamba install -c bioconda -c conda-forge --quiet -y fastqc falco graphviz
conda install -c conda-forge -c bioconda --quiet -y python=${{ matrix.python }} fastqc falco graphviz
- name: Install dependencies
run: |
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/pypi.yml
Expand Up @@ -8,9 +8,9 @@ on:
jobs:
build-n-publish:
name: Build and publish to PyPI and TestPyPI
runs-on: ubuntu-18.04
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@master
- uses: actions/checkout@main
- name: Set up Python 3.7
uses: actions/setup-python@v1
with:
Expand All @@ -26,14 +26,14 @@ jobs:
python setup.py sdist
- name: Publish distribution to Test PyPI
uses: pypa/gh-action-pypi-publish@master
uses: pypa/gh-action-pypi-publish@release/v1
with:
user: __token__
password: ${{ secrets.TEST_PYPI_API_TOKEN }}
repository_url: https://test.pypi.org/legacy/
- name: Publish distribution to PyPI
if: startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@master
uses: pypa/gh-action-pypi-publish@release/v1
with:
user: __token__
password: ${{ secrets.PYPI_API_TOKEN }}
40 changes: 38 additions & 2 deletions sequana_pipelines/fastqc/config.yaml
Expand Up @@ -11,6 +11,34 @@
input_directory: '.'
input_pattern: '*fastq.gz'

################################################################################
# extra_prefixes_to_strip
#
# In most cases, the input_directory and input_pattern (and input_read_tag for paired analysis)
# allow us to retrieve unique sample names. We automatically remove common prefixes for you.
#
# Sometimes, some prefixes are not common to all samples but still need to be removed
# to get unique sample names or better output. You can provide extra prefixes to be removed
# by uncommenting and filling the field extra_prefixes_to_strip.
#
# For instance, if you have files called prefix.mess.A.fastq.gz and prefix.B.fastq.gz
# 'prefix.' will be removed automatically because it is common, but not 'mess'.
# Use those prefixes in left-to-right order: ['prefix', 'mess'] or ['prefix.mess']
#
# extra_prefixes_to_strip: []


################################################################################
# sample_pattern
#
# You may have trailing words that are in sample names but not wanted. Consider
# the filename A_mess.fastq.gz, you can get rid of _mess if it appears in all
# samples using a pattern as follows: '{sample}_mess.fastq.gz'
#
# uncomment and fill to use this option. Be aware that prefixes are not removed
# if you use sample_pattern
#
# sample_pattern: '{sample}_mess.fastq.gz'

##############################################################################
# general section
Expand All @@ -28,10 +56,13 @@ general:
fastqc:
options: ''
threads: 4

resources:
mem: 8G
falco:
options: ''
threads: 4
resources:
mem: 8G

##############################################################################
#
Expand All @@ -45,6 +76,11 @@ multiqc:
options: -p -f
input_directory: "."
modules: fastqc # falco is not set; the fastqc module works for falco
config_file:
config_file:
resources:
mem: 8G

plotting_and_stats:
resources:
mem: 8G

89 changes: 50 additions & 39 deletions sequana_pipelines/fastqc/fastqc.rules
Expand Up @@ -19,35 +19,28 @@ import pandas as pd
from sequana.utils.datatables_js import DataTable
from sequana.utils.tree import HTMLDirectory

from sequana_pipetools import PipelineManagerGeneric
from sequana_pipetools import PipelineManager
from sequana_pipetools import snaketools as sm

# This must be defined before the include
configfile: "config.yaml"

# A convenient manager
def func(filename):
return filename.split("/")[-1].split('.', 1)[0]
manager = PipelineManager("fastqc", config)

manager = PipelineManagerGeneric("fastqc", config, sample_func=func)

# the manager can figure out whether data is paired or not but with input
# sam/bam it is not for sure, so we need some additional simple code here:
# we can try to figure out whether input data is paired.
PAIRED = False

# do we have illumina paired data with tag _R1_
# This is just for information. It is not used in the pipeline, only for the HTML report.
# do we have illumina paired data with tag _R1_ ?
R1 = [1 for x in manager.samples.values() if "_R1_" in x.split("/")[-1]]
R2 = [1 for x in manager.samples.values() if "_R2_" in x.split("/")[-1]]

PAIRED = False
if len(R1) == len(R2) and len(R1) != 0:
PAIRED = True
else:
R1 = [1 for x in manager.samples.values() if "_1." in x.split("/")[-1]]
R2 = [1 for x in manager.samples.values() if "_2." in x.split("/")[-1]]
if len(R1) == len(R2) and len(R1) != 0:
PAIRED = True
manager.paired = PAIRED
manager._paired = PAIRED


# Some sanity checks
Expand All @@ -73,19 +66,40 @@ if 'general' in config and 'method_choice' in config['general'] and \

METHOD = "falco"

__falco__input = manager.getrawdata()
__falco__ouptut = "samples/{sample}/summary.txt"
include: sm.modules["falco"]
__qc_done__ = expand(__falco__ouptut, sample=manager.samples)
rule falco:
input: manager.getrawdata()
output: "samples/{sample}/summary.txt"
log:
"samples/{sample}/falco.log"
threads:
config['falco']['threads']
params:
options=config['falco']['options'],
working_directory="samples/{sample}"
resources:
**config['falco']['resources']
wrapper:
"main/wrappers/falco"
__multiqc__input = expand("samples/{sample}/summary.txt", sample=manager.samples)

else:
METHOD = "fastqc"
__fastqc__input = manager.getrawdata()
__fastqc__output = "samples/{sample}/fastqc.done"
__fastqc__log = "samples/{sample}/fastqc.log"
__fastqc__wkdir = "samples/{sample}"
include: sm.modules["fastqc"]
__qc_done__ = expand(__fastqc__output, sample=manager.samples)

rule fastqc:
input: manager.getrawdata()
output: "samples/{sample}/fastqc.done"
log:
"samples/{sample}/fastqc.log"
threads:
config['fastqc']['threads']
params:
options=config['fastqc']['options'],
working_directory="samples/{sample}"
resources:
**config['fastqc']['resources']
wrapper:
"main/wrappers/fastqc"
__multiqc__input = expand("samples/{sample}/fastqc.done", sample=manager.samples)


# define a list of files for the md5sum
Expand Down Expand Up @@ -130,43 +144,40 @@ comments += f"""<br><b><a href="https://github.com/sequana/sequana_pipetools">Se
# Multiqc rule
if config['multiqc']['do']:

if METHOD == "falco":
__multiqc__input = expand("samples/{sample}/summary.txt", sample=manager.samples)
else:
__multiqc__input = expand(__qc_done__, sample=manager.samples)
# do not specify fastqc itself alone, otherwise it fails (feb 2020)

config['multiqc']['options'] = config["multiqc"]["options"] + f" --comment '{comments}'"


__multiqc__output = "multiqc/multiqc_report.html"
rule multiqc:
input:
input:
__multiqc__input
output:
__multiqc__output
params:
output:
"multiqc/multiqc_report.html"
params:
options=config['multiqc']['options'],
input_directory=config['multiqc']['input_directory'],
config_file=config['multiqc']['config_file'],
modules=config['multiqc']['modules']
log:
log:
"multiqc/multiqc.log"
wrapper:
resources:
**config["multiqc"]["resources"]
wrapper:
"main/wrappers/multiqc"


# ====================================================================== rulegraph
sequana_rulegraph_mapper = {}
if config['multiqc']['do']:
sequana_rulegraph_mapper["multiqc"] = f"../{__multiqc__output}"
sequana_rulegraph_mapper["multiqc"] = "../multiqc/multiqc_report.html"
include: sm.modules['rulegraph']



rule plotting_and_stats:
input: __qc_done__
input: expand("samples/{sample}/" + f"{METHOD}.done", sample=manager.samples)
output: "outputs/summary.png", "outputs/summary.json"
resources:
**config["multiqc"]["resources"]
run:
import glob
from sequana.fastqc import FastQC
Expand Down Expand Up @@ -229,7 +240,7 @@ onsuccess:
manager.teardown()

if config['multiqc']['do']:
manager.clean_multiqc(__multiqc__output)
manager.clean_multiqc("multiqc/multiqc_report.html")

# Now, the main HTML report

Expand Down
16 changes: 16 additions & 0 deletions sequana_pipelines/fastqc/schema.yaml
Expand Up @@ -20,6 +20,9 @@ mapping:
"threads":
type: int
required: True
"resources":
type: any
required: true

"multiqc":
type: map
Expand All @@ -34,6 +37,9 @@ mapping:
type: str
"input_directory":
type: str
"resources":
type: any
required: true
"general":
type: map
mapping:
Expand All @@ -51,3 +57,13 @@ mapping:
"threads":
type: int
required: True
"resources":
type: any
required: true

"plotting_and_stats":
type: map
mapping:
"resources":
type: any
required: true
11 changes: 4 additions & 7 deletions setup.py
Expand Up @@ -6,8 +6,8 @@
import subprocess

_MAJOR = 1
_MINOR = 4
_MICRO = 2
_MINOR = 5
_MICRO = 0
version = '%d.%d.%d' % (_MAJOR, _MINOR, _MICRO)
release = '%d.%d' % (_MAJOR, _MINOR)

Expand Down Expand Up @@ -68,22 +68,19 @@ def run(self):
classifiers = metainfo['classifiers'],

# package installation
packages = ["sequana_pipelines.fastqc",
'sequana_pipelines.fastqc.data' ],
packages = ["sequana_pipelines.fastqc"],

install_requires = open("requirements.txt").read(),

# This is recursive include of data files
exclude_package_data = {"": ["__pycache__"]},
package_data = {
'': ['*.yaml', "*.rules", "*.json", "requirements.txt", "*png"],
'sequana_pipelines.fastqc.data' : ['*.*'],
'': ['*.yaml', "*.rules", "*.json", "requirements.txt", "*png", "*yml", "*smk"]
},

zip_safe=False,

entry_points = {'console_scripts':[
'sequana_pipelines_fastqc=sequana_pipelines.fastqc.main:main',
'sequana_fastqc=sequana_pipelines.fastqc.main:main']
}

Expand Down
6 changes: 3 additions & 3 deletions test/test_main.py
Expand Up @@ -12,7 +12,7 @@
#
def test_standalone_subprocess():
directory = tempfile.TemporaryDirectory()
cmd = "sequana_pipelines_fastqc --input-directory {} "
cmd = "sequana_fastqc --input-directory {} "
cmd += "--working-directory {} --run-mode local --force"
cmd = cmd.format(sharedir, directory.name)
subprocess.call(cmd.split())
Expand All @@ -31,7 +31,7 @@ def test_full():
with tempfile.TemporaryDirectory() as directory:
wk = directory

cmd = "sequana_pipelines_fastqc --input-directory {} "
cmd = "sequana_fastqc --input-directory {} "
cmd += "--working-directory {} --run-mode local --force"
cmd = cmd.format(sharedir, wk)
subprocess.call(cmd.split())
Expand All @@ -46,5 +46,5 @@ def test_full():
assert os.path.exists(wk + "/multiqc/multiqc_report.html")

def test_version():
cmd = "sequana_pipelines_fastqc --version"
cmd = "sequana_fastqc --version"
subprocess.call(cmd.split())

0 comments on commit 0d7fdf6

Please sign in to comment.