From d4a97a9a5a85cc36b45a0563dbd9a5793242c766 Mon Sep 17 00:00:00 2001
From: Thomas Cokelaer
Date: Wed, 1 Apr 2026 11:28:52 +0200
Subject: [PATCH] Update rules to use get_shell
---
.github/workflows/main.yml | 4 +-
README.rst | 218 +++++++++++---
pyproject.toml | 12 +-
sequana_pipelines/chipseq/__init__.py | 10 +-
sequana_pipelines/chipseq/chipseq.rules | 367 +++++++++++++++---------
sequana_pipelines/chipseq/config.yaml | 13 +-
sequana_pipelines/chipseq/schema.yaml | 5 +-
test/README | 5 +
8 files changed, 441 insertions(+), 193 deletions(-)
create mode 100644 test/README
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index f9ac3a7..e66b267 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -17,7 +17,7 @@ jobs:
strategy:
max-parallel: 5
matrix:
- python: [3.8, 3.9, '3.10']
+ python: ['3.10', '3.11', '3.12']
fail-fast: false
@@ -29,7 +29,7 @@ jobs:
sudo apt-get install libopenblas-dev # for scipy
- name: checkout git repo
- uses: actions/checkout@v2
+ uses: actions/checkout@v4
- name: conda/mamba
uses: mamba-org/setup-micromamba@v1
diff --git a/README.rst b/README.rst
index 7a11d46..5ec9de2 100644
--- a/README.rst
+++ b/README.rst
@@ -2,35 +2,81 @@
.. image:: https://badge.fury.io/py/sequana-chipseq.svg
:target: https://pypi.python.org/pypi/sequana_chipseq
+.. image:: https://github.com/sequana/chipseq/actions/workflows/main.yml/badge.svg
+ :target: https://github.com/sequana/chipseq/actions/workflows/main.yml
+
+.. image:: https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue.svg
+ :target: https://pypi.python.org/pypi/sequana
+ :alt: Python 3.10 | 3.11 | 3.12
+
.. image:: http://joss.theoj.org/papers/10.21105/joss.00352/status.svg
:target: http://joss.theoj.org/papers/10.21105/joss.00352
:alt: JOSS (journal of open source software) DOI
-.. image:: https://github.com/sequana/chipseq/actions/workflows/main.yml/badge.svg
- :target: https://github.com/sequana/chipseq/actions/
-
+This is the **chipseq** pipeline from the `Sequana <https://sequana.readthedocs.io>`_ project.
-.. image:: https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C3.10-blue.svg
- :target: https://pypi.python.org/pypi/sequana
- :alt: Python 3.8 | 3.9 | 3.10
+:Overview: ChIP-seq pipeline from raw reads to peaks, IDR statistics, and functional annotation
+:Input: Paired or single-end FastQ files and a CSV experimental design file
+:Output: HTML summary report, narrow/broad peak files, IDR statistics, bigwig tracks, and annotation tables
+:Status: Production
+:Citation: Cokelaer et al, (2017), 'Sequana': a Set of Snakemake NGS pipelines, Journal of Open Source Software, 2(16), 352, JOSS DOI https://doi.org/10.21105/joss.00352
-This is is the **chipseq** pipeline from the `Sequana `_ project
+.. image:: sequana_pipelines/chipseq/dag.png
+ :width: 100%
-:Overview: ChIP-seq pipeline to detect peaks using IDR statistics
-:Input: Set of fastq files and a design file
-:Output: HTML reports and various plots and annotation files
-:Status: production
-:Citation: Cokelaer et al, (2017), ‘Sequana’: a Set of Snakemake NGS pipelines, Journal of Open Source Software, 2(16), 352, JOSS DOI doi:10.21105/joss.00352
+.. image:: sequana_pipelines/chipseq/dag_complete.png
+ :width: 100%
Installation
~~~~~~~~~~~~
-Just install this package using Python **pip** software::
+::
pip install sequana_chipseq --upgrade
+You will also need the third-party tools listed under Requirements below.
+
+
+Quick Start
+~~~~~~~~~~~
+
+**1. Prepare a design file** ``design.csv``::
+
+ type,condition,replicat,sample_name
+ IP,EXP1,1,IP_EXP1_rep1
+ IP,EXP1,2,IP_EXP1_rep2
+ Input,EXP1,1,Input_EXP1
+
+- ``type`` must be ``IP`` (immunoprecipitated) or ``Input`` (control).
+- ``sample_name`` must match the prefix of the corresponding FastQ file
+ (e.g. ``IP_EXP1_rep1`` matches ``IP_EXP1_rep1_R1_.fastq.gz``).
+- At least two IP replicates per condition are required for IDR analysis.
+
+**2. Prepare a genome directory** named after the genome, containing:
+
+- ``.fa`` — reference genome FASTA
+- ``.gff`` or ``.gff3`` — gene annotation
+
+Example::
+
+ ecoli_MG1655/
+ ├── ecoli_MG1655.fa
+ └── ecoli_MG1655.gff
+
+**3. Set up the pipeline**::
+
+ sequana_chipseq \
+ --input-directory DATAPATH \
+ --genome-directory /path/to/ecoli_MG1655 \
+ --design-file design.csv
+
+**4. Run the pipeline**::
+
+ cd chipseq
+ sh chipseq.sh
+
Usage
~~~~~
@@ -38,45 +84,115 @@ Usage
::
sequana_chipseq --help
- sequana_chipseq --input-directory DATAPATH
-This creates a directory with the pipeline and configuration file. You will then need
-to execute the pipeline::
+Key pipeline-specific options:
+
+``--genome-directory``
+ Path to the genome directory (must contain ``.fa`` and ``.gff``).
+
+``--design-file``
+ CSV experimental design file (see Quick Start above).
+
+``--aligner-choice``
+ Aligner to use. Currently only ``bowtie2`` is supported.
+
+``--blacklist-file``
+ BED3 file of genomic regions to exclude from analysis (tab-separated:
+ chromosome, start, end).
+
+``--genome-size``
+ Effective genome size for macs3 peak calling. Automatically computed from
+ the FASTA file if not provided; override with a plain integer.
+
+``--do-fingerprints``
+ Enable ``plotFingerprint`` QC to assess ChIP enrichment quality.
+
+Run on a SLURM cluster::
cd chipseq
- sh chipseq.sh # for a local run
+ sbatch chipseq.sh
-This launch a snakemake pipeline. If you are familiar with snakemake, you can
-retrieve the pipeline itself and its configuration files and then execute the pipeline yourself with specific parameters::
+Or drive Snakemake directly::
- snakemake -s chipseq.rules -c config.yaml --cores 4 --stats stats.txt
+ snakemake -s chipseq.rules --cores 4 --stats stats.txt
-Or use `sequanix `_ interface.
-Requirements
-~~~~~~~~~~~~
+Usage with Apptainer
+~~~~~~~~~~~~~~~~~~~~~
-This pipeline requires the following executable(s):
+Run every tool inside pre-built containers — no local tool installation needed::
-- idr This python package is not on pypi. Manual installation is required. Instructions are here:
-https://github.com/nboley/idr but we also provide a singularity in https://damona.readthedocs.io
+ sequana_chipseq \
+ --input-directory DATAPATH \
+ --genome-directory /path/to/genome \
+ --design-file design.csv \
+ --use-apptainer
-.. image:: https://raw.githubusercontent.com/sequana/chipseq/main/sequana_pipelines/chipseq/dag.png
-.. image:: https://raw.githubusercontent.com/sequana/chipseq/main/sequana_pipelines/chipseq/dag_complete.png
+Store images in a shared location to avoid re-downloading::
+ sequana_chipseq ... --use-apptainer --apptainer-prefix ~/.sequana/apptainers
-Details
-~~~~~~~~~
+Then run as usual::
+
+ cd chipseq
+ sh chipseq.sh
-This pipeline runs **chipseq** in parallel on the input fastq files (paired or not).
-A brief sequana summary report is also produced.
+Requirements
+~~~~~~~~~~~~
-Rules and configuration details
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The following tools must be available (install via conda/bioconda)::
+
+ mamba env create -f environment.yml
+
+- **bowtie2** — read alignment
+- **fastp** — adapter trimming and quality filtering
+- **fastqc** — per-read quality control
+- **samtools** — BAM sorting, indexing, and flagstat
+- **deeptools** — fingerprint QC (``plotFingerprint``) and coverage correlation (``multiBigwigSummary``)
+- **ucsc-bedgraphtobigwig** — bedGraph to bigWig conversion
+- **macs3** — narrow and broad peak calling
+- **homer** — peak annotation (``annotatePeaks.pl``)
+- **idr** — Irreproducibility Discovery Rate between replicates
+- **multiqc** — aggregated QC report
+
+
+Pipeline overview
+~~~~~~~~~~~~~~~~~
+
+1. **Trimming** — fastp removes low-quality reads and adapters.
+2. **QC** — FastQC on raw and cleaned reads.
+3. **Alignment** — bowtie2 maps reads to the reference genome.
+4. **[Optional] Mark duplicates** — Picard marks PCR duplicates.
+5. **[Optional] Blacklist removal** — bedtools removes artefact-prone regions.
+6. **bigwig** — per-sample coverage tracks for genome browsers (e.g. IGV).
+7. **[Optional] Fingerprints** — plotFingerprint QC to assess ChIP enrichment.
+8. **Phantom peak** — strand cross-correlation analysis (NSC, RSC, Qtag scores).
+9. **Peak calling** — macs3 detects narrow and broad peaks for each IP vs Input pair.
+10. **FRiP** — Fraction of Reads in Peaks per sample and comparison.
+11. **IDR** — Irreproducibility Discovery Rate on true replicates, pseudo-replicates, and self-pseudo-replicates.
+12. **Annotation** — homer annotates peaks relative to genomic features.
+13. **MultiQC** — aggregated QC across all samples.
+14. **HTML report** — summary with phantom peaks, FRiP plots, IDR tables, and annotation plots.
+
+
+Configuration
+~~~~~~~~~~~~~
+
+Here is the `latest documented configuration file `_.
+Key sections:
+
+- ``general`` — aligner choice and genome directory path
+- ``fastp`` — trimming options (length, quality, adapters)
+- ``fastqc`` — FastQC options and threads
+- ``bowtie2_mapping`` / ``bowtie2_index`` — mapping options, threads, memory
+- ``macs3`` — peak calling parameters (genome size, bandwidth, q-value, broad cutoff)
+- ``idr`` — IDR thresholds, rank metric, number of pseudo-replicates
+- ``fingerprints`` — enable/disable and number of bins
+- ``mark_duplicates`` — enable/disable PCR duplicate marking
+- ``remove_blacklist`` — enable/disable and path to BED blacklist
+- ``multiqc`` — MultiQC options
-Here is the `latest documented configuration file `_
-to be used with the pipeline. Each rule used in the pipeline may have a section in the configuration file.
Changelog
~~~~~~~~~
@@ -84,12 +200,36 @@ Changelog
========= ====================================================================
Version Description
========= ====================================================================
-0.11.0 * switch to click and new sequana_pipetools
-0.10.0 * Fix design in case of samples that starts with same prefix
+0.12.0 * Fix ``plot_FRiP``: was iterating over all comparisons inside each
+ rule invocation causing ``FileNotFoundError`` in parallel runs;
+ now processes only its own wildcard
+ * Fix IDR rules (``idr_NT``, ``self_pseudo_replicate_idr``,
+ ``pseudo_replicate_idr``): IDR exits non-zero on sparse data;
+ added ``|| true`` + conditional ``mv`` so the pipeline continues
+ and downstream Python rules handle empty results gracefully
+          (e.g. when macs3 finds no peaks and Homer returns an empty DataFrame)
+ * Fix ``fastp`` rule: use ``input.fastq`` / ``output.r1`` /
+ ``output.r2`` to match the sequana-wrappers fastp shell interface;
+ split into paired/single-end branches
+ * Add ``log:`` directives and stderr redirection to rules that were
+ missing them: ``phantom_align``, ``chrom_sizes``, ``fingerprints``,
+ ``bam_to_bed``, ``bed_to_bigwig``, ``pseudo_replicate_idr``
+ * Update ``sequana_tools`` container to ``26.1.14``
+ * Update CI: Python 3.10/3.11/3.12; ``actions/checkout@v4``
+0.11.0 * Switch to click and new sequana_pipetools
+0.10.0 * Fix design in case of samples that start with the same prefix
* Include final IDR plots and tables
* Fix containers and wrappers in the config file
* Better HTML report
0.9.1 * Fix requirements and setup.py (remove wrong idr package)
-0.9.0 * use latest wrappers and apptainer (for rulegraph)
+0.9.0 * Use latest wrappers and apptainer (for rulegraph)
0.8.0 **First release.**
========= ====================================================================
+
+
+Contribute & Code of Conduct
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To contribute to this project, please take a look at the
+`Contributing Guidelines `_ first. Please note that this project is released with a
+`Code of Conduct `_. By contributing to this project, you agree to abide by its terms.
diff --git a/pyproject.toml b/pyproject.toml
index eb78260..543752f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,13 +4,13 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "sequana-chipseq"
-version = "0.11.0"
+version = "0.12.0"
description = "A ChIP-seq pipeline from raw reads to peaks"
authors = ["Sequana Team"]
license = "BSD-3"
-repository = "https://github.com/sequana/rnaseq"
+repository = "https://github.com/sequana/chipseq"
readme = "README.rst"
-keywords = ["snakemake, sequana, RNAseq, RNADiff, differential analysis"]
+keywords = ["snakemake, sequana, ChIP-seq, differential analysis"]
classifiers = [
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Education",
@@ -19,9 +19,9 @@ classifiers = [
"Intended Audience :: Science/Research",
"License :: OSI Approved :: BSD License",
"Operating System :: POSIX :: Linux",
- "Programming Language :: Python :: 3.8",
- "Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Scientific/Engineering :: Bio-Informatics",
"Topic :: Scientific/Engineering :: Information Analysis",
@@ -33,7 +33,7 @@ packages = [
[tool.poetry.dependencies]
-python = ">=3.8,<4.0"
+python = ">=3.10,<4.0"
sequana = ">=0.16.0"
sequana_pipetools = ">=0.16.4"
click-completion = "^0.5.2"
diff --git a/sequana_pipelines/chipseq/__init__.py b/sequana_pipelines/chipseq/__init__.py
index b46a933..66f4043 100644
--- a/sequana_pipelines/chipseq/__init__.py
+++ b/sequana_pipelines/chipseq/__init__.py
@@ -1,6 +1,6 @@
-import pkg_resources
-try:
- version = pkg_resources.require("sequana_chipseq")[0].version
-except:
- version = ">=0.8.0"
+from importlib.metadata import PackageNotFoundError, version
+try:
+ version = version("sequana-chipseq")
+except PackageNotFoundError:
+ version = "unknown"
diff --git a/sequana_pipelines/chipseq/chipseq.rules b/sequana_pipelines/chipseq/chipseq.rules
index 05c5d06..9bc2a1f 100644
--- a/sequana_pipelines/chipseq/chipseq.rules
+++ b/sequana_pipelines/chipseq/chipseq.rules
@@ -81,14 +81,14 @@ def get_control_macs3(wildcards):
def get_idr_NT_input(wildcards):
NT_inputs = cc.get_idr_NT_inputs()[wildcards.condition]
- return (f"macs3/{wildcards.broad_narrow}/{x}_peaks.{wildcards.broad_narrow}Peak" for x in NT_inputs)
+ return [f"macs3/{wildcards.broad_narrow}/{x}_peaks.{wildcards.broad_narrow}Peak" for x in NT_inputs]
# ============================================================================ bowtie2 index
if manager.config.general.aligner == "bowtie2":
- if manager.config['bowtie2_mapping']['genome_size_larger_than_4gb']:
+ if config['bowtie2_mapping']['genome_size_larger_than_4gb']:
bt2_ext = "bt2l"
else:
bt2_ext = "bt2"
@@ -134,8 +134,8 @@ if manager.config.general.aligner == "bowtie2":
config['apptainers']['sequana_tools']
resources:
**config['bowtie2_index']['resources']
- wrapper:
- f"{manager.wrappers}/wrappers/bowtie2/build"
+ shell:
+ manager.get_shell("bowtie2/build", "v1")
# ================================================================== fastqc raw data
@@ -156,8 +156,8 @@ if not config['fastqc']['skip_fastqc_raw']:
config['apptainers']['fastqc']
log:
"{sample}/fastqc_raw/fastqc.log"
- wrapper:
- f"{manager.wrappers}/wrappers/fastqc"
+ shell:
+ manager.get_shell("fastqc/run", "v1")
expected_output.extend(expand("{sample}/fastqc_raw/fastqc.done", sample=manager.samples))
@@ -168,30 +168,53 @@ if not config['fastqc']['skip_fastqc_raw']:
if manager.config.trimming.do is False:
__clean_fastq__output = manager.getrawdata()
+elif manager.paired:
+ __clean_fastq__output = [
+ "{sample}/fastp/{sample}_R1_.fastp.fastq.gz",
+ "{sample}/fastp/{sample}_R2_.fastp.fastq.gz",
+ ]
+
+ rule fastp:
+ input:
+ fastq=manager.getrawdata()
+ output:
+ r1="{sample}/fastp/{sample}_R1_.fastp.fastq.gz",
+ r2="{sample}/fastp/{sample}_R2_.fastp.fastq.gz",
+ html="{sample}/fastp/fastp_{sample}.html",
+ json="{sample}/fastp/fastp_{sample}.json",
+ log:
+ "{sample}/fastp/{sample}.log"
+ params:
+ options=config['fastp']["options"],
+ adapters=config["fastp"]["adapters"]
+ threads:
+ config["fastp"].get("threads", 4)
+ container:
+ config['apptainers']['fastp']
+ shell:
+ manager.get_shell("fastp/run", "v1")
+
else:
__clean_fastq__output = ["{sample}/fastp/{sample}_R1_.fastp.fastq.gz"]
- if manager.paired:
- __clean_fastq__output += ["{sample}/fastp/{sample}_R2_.fastp.fastq.gz"]
-
rule fastp:
- input:
- sample=manager.getrawdata()
- output:
- trimmed=__clean_fastq__output,
- html="{sample}/fastp/fastp_{sample}.html",
- json="{sample}/fastp/fastp_{sample}.json", # must be named fastp
- log:
- "{sample}/fastp/{sample}.log"
- params:
- options=config['fastp']["options"],
- adapters=config["fastp"]["adapters"]
- threads:
- config["fastp"].get("threads", 4)
- container:
- config['apptainers']['fastp']
- wrapper:
- f"{manager.wrappers}/wrappers/fastp"
+ input:
+ fastq=manager.getrawdata()
+ output:
+ r1="{sample}/fastp/{sample}_R1_.fastp.fastq.gz",
+ html="{sample}/fastp/fastp_{sample}.html",
+ json="{sample}/fastp/fastp_{sample}.json",
+ log:
+ "{sample}/fastp/{sample}.log"
+ params:
+ options=config['fastp']["options"],
+ adapters=config["fastp"]["adapters"]
+ threads:
+ config["fastp"].get("threads", 4)
+ container:
+ config['apptainers']['fastp']
+ shell:
+ manager.get_shell("fastp/run", "v1")
@@ -212,8 +235,8 @@ rule fastqc_clean:
**config["fastqc"]['resources']
container:
config['apptainers']['fastqc']
- wrapper:
- f"{manager.wrappers}/wrappers/fastqc"
+ shell:
+ manager.get_shell("fastqc/run", "v1")
expected_output.extend(expand("{sample}/fastqc_clean/fastqc.done", sample=manager.samples))
@@ -245,8 +268,8 @@ if manager.config.general.aligner == "bowtie2":
config["apptainers"]["sequana_tools"]
resources:
**config['bowtie2_mapping']['resources']
- wrapper:
- f"{manager.wrappers}/wrappers/bowtie2/align"
+ shell:
+ manager.get_shell("bowtie2/align", "v1")
# ===================================================== mark duplicates
@@ -254,31 +277,31 @@ if manager.config.general.aligner == "bowtie2":
# Mark duplicates
rule mark_duplicates:
input:
- f"{{sample}}/{aligner}/{{sample}}.sorted.bam"
+ bam = f"{{sample}}/{aligner}/{{sample}}.sorted.bam"
output:
bam = "{sample}/mark_duplicates/{sample}.sorted.bam",
metrics = "{sample}/mark_duplicates/{sample}.sorted.metrics",
log:
- out = "{sample}/mark_duplicates/log.out",
- err = "{sample}/mark_duplicates/log.err"
+ "{sample}/mark_duplicates/mark_dup.log"
params:
remove_dup = "false",
- tmpdir = "{sample}/mark_duplicates/tmp"
+ tmpdir = "{sample}/mark_duplicates/tmp",
+ options = config['mark_duplicates'].get('options', '')
threads:
config['mark_duplicates']['threads']
container:
config["apptainers"]["sequana_tools"]
resources:
**config['mark_duplicates']['resources']
- wrapper:
- f"{manager.wrappers}/wrappers/mark_duplicates"
+ shell:
+ manager.get_shell("mark_duplicates/run", "v1")
# ===================================================== blacklist
rule remove_blacklist:
input:
- bam="{sample}/mark_duplicates/{sample}.sorted.bam",
+ bam=get_final_bam_location("{sample}"),
blacklist=config["remove_blacklist"]["blacklist_file"]
output:
bam="{sample}/blacklist/{sample}.sorted.bam"
@@ -311,7 +334,7 @@ rule plotCorrelation_npz:
config["apptainers"]["sequana_tools"]
shell:
"""
- multiBigwigSummary bins -b {input} -o {output} -bs {params.bins} > {log}
+ multiBigwigSummary bins -b {input} -o {output} -bs {params.bins} >{log} 2>&1
"""
@@ -323,12 +346,12 @@ rule plotCorrelation:
params:
zMin=0
log:
- "logs/plotCorrelation.log"
+ "logs/plotCorrelation_plot.log"
container:
config["apptainers"]["sequana_tools"]
shell:
"""
- plotCorrelation --corData {input} --corMethod pearson -p heatmap --plotFile {output} --zMin {params.zMin} 1>>{log} 2>>{log}
+ plotCorrelation --corData {input} --corMethod pearson -p heatmap --plotFile {output} --zMin {params.zMin} >{log} 2>&1
"""
@@ -361,13 +384,15 @@ rule phantom_align:
f"{{sample}}/{aligner}/{{sample}}.sorted.bam"
output:
align=f"{{sample}}/{aligner}/{{sample}}.sorted.align"
+ log:
+ f"{{sample}}/{aligner}/{{sample}}.phantom_align.log"
container:
config["apptainers"]["samtools"]
shell:
"""
# Converting the SAM to alignment
# note that for awk, we escape the braces by doubling them
- samtools view -F 0x0204 {input} | awk 'BEGIN {{FS="\\t";OFS="\\t"}} {{if (and($2,16) > 0) {{print $3,($4-1),($4-1+length($10)),"N","1000","-"}} else {{print $3,($4-1),($4-1+length($10)),"N","1000","+"}}}}' - > {output.align}
+ samtools view -F 0x0204 {input} | awk 'BEGIN {{FS="\\t";OFS="\\t"}} {{if (and($2,16) > 0) {{print $3,($4-1),($4-1+length($10)),"N","1000","-"}} else {{print $3,($4-1),($4-1+length($10)),"N","1000","+"}}}}' - > {output.align} 2>{log}
"""
@@ -403,11 +428,13 @@ rule chrom_sizes:
fasta_file
output:
"chrom.sizes"
+ log:
+ "logs/chrom_sizes.log"
container:
config["apptainers"]["sequana_tools"]
shell:
"""
- samtools faidx {input}
+ samtools faidx {input} >{log} 2>&1
cut -f 1,2 {input[0]}.fai > {output[0]}
"""
@@ -426,6 +453,8 @@ rule fingerprints:
raw="fingerprints/plotFingerprint.raw.txt",
metrics="fingerprints/plotFingerprint.qcmetrics.txt",
pdf="fingerprints/plotFingerprint.pdf"
+ log:
+ "fingerprints/plotFingerprint.log"
params:
labels=" ".join(sorted(manager.samples)),
bins=config['fingerprints']['bins']
@@ -434,8 +463,7 @@ rule fingerprints:
config['apptainers']['sequana_tools']
shell:
"""
- plotFingerprint --bamfiles {input} --plotFile {output.pdf} --labels {params.labels} --outRawCounts {output.raw} --outQualityMetrics {output.metrics} --skipZeros --numberOfProcessors {threads} --numberOfSamples {params.bins}
-
+ plotFingerprint --bamfiles {input} --plotFile {output.pdf} --labels {params.labels} --outRawCounts {output.raw} --outQualityMetrics {output.metrics} --skipZeros --numberOfProcessors {threads} --numberOfSamples {params.bins} >{log} 2>&1
"""
if config['fingerprints']['do']:
@@ -455,13 +483,15 @@ rule bam_to_bed:
get_final_bam_location("{sample}")
output:
bedgraph=temp("{sample}/bigwig/{sample}.bedGraph"),
+ log:
+ "{sample}/bigwig/{sample}.bam_to_bed.log"
params:
paired = " -pc " if manager.paired else " "
container:
config['apptainers']['sequana_tools']
shell:
"""
- genomeCoverageBed -ibam {input[0]} -bg -scale 1 {params.paired} | sort -T '.' -k1,1 -k2,2n > {output.bedgraph}
+ genomeCoverageBed -ibam {input[0]} -bg -scale 1 {params.paired} | sort -T '.' -k1,1 -k2,2n > {output.bedgraph} 2>{log}
"""
@@ -472,11 +502,13 @@ rule bed_to_bigwig:
chrom_sizes="chrom.sizes"
output:
bigwig="{sample}/bigwig/{sample}.bigwig"
+ log:
+ "{sample}/bigwig/{sample}.bed_to_bigwig.log"
container:
config['apptainers']['ucsc']
shell:
"""
- bedGraphToBigWig {input.bedgraph} {input.chrom_sizes} {output.bigwig}
+ bedGraphToBigWig {input.bedgraph} {input.chrom_sizes} {output.bigwig} >{log} 2>&1
"""
@@ -495,11 +527,12 @@ rule macs3:
paired = manager.paired,
prefix = "{comp}",
qvalue = config['macs3']['qvalue'],
- mode = "{broad_narrow}"
+ mode = "{broad_narrow}",
+ outdir = "macs3/{broad_narrow}"
log:
"macs3/{broad_narrow}/{comp}_out.log"
- wrapper:
- f"{manager.wrappers}/wrappers/macs3"
+ shell:
+ manager.get_shell("macs3/run", "v1")
# ===================================================== naive consensus summary
@@ -548,13 +581,13 @@ rule FRiP:
results[bamfile] = {'count': count, 'FRiP': FRiP, 'in_peaks': in_peaks}
with open(output[0], "w") as fout:
- fout.write("bamfile,count,in_peaks, FRiP,comparison\n")
+ fout.write("bamfile,count,in_peaks,FRiP,comparison\n")
for k, v in results.items():
frip = v['FRiP']
count = v['count']
in_peaks = v['in_peaks']
comparison = wildcards.comparison
- fout.write(f"{k},{count},{in_peaks},{frip}, {comparison}\n")
+ fout.write(f"{k},{count},{in_peaks},{frip},{comparison}\n")
# ===================================================== plot FRiP
@@ -563,11 +596,10 @@ rule plot_FRiP:
output: "FRiP/{broad_narrow}/{comparison}_FRiP.png"
run:
from sequana.frip import FRiP
- for k in design.keys():
- c = FRiP(f"macs3/{wildcards.broad_narrow}/{k}_FRiP.txt") #, config["general"]["design_file"])
- c.plot()
- from pylab import savefig
- savefig(f"FRiP/{wildcards.broad_narrow}/{k}_FRiP.png")
+ from pylab import savefig
+ c = FRiP(input[0])
+ c.plot()
+ savefig(output[0])
@@ -635,10 +667,13 @@ if config['idr']['do']:
# ============================================ true replicates ====
# =========================================================================
rule idr_NT:
- input:
+ input:
get_idr_NT_input
- output:
+ output:
"IDR/{broad_narrow}/{condition}.csv"
+ log:
+ out="IDR/{broad_narrow}/{condition}.out",
+ err="IDR/{broad_narrow}/{condition}.err"
params:
rank=config['idr']['rank'],
soft_idr_threshold=config['idr']['soft_idr_threshold']
@@ -646,15 +681,22 @@ if config['idr']['do']:
config['apptainers']['idr']
shell:
"""
- idr --samples {input} --input-file-type {wildcards.broad_narrow}Peak --rank {params.rank} --output-file IDR/{wildcards.broad_narrow}/{wildcards.condition} --plot --soft-idr-threshold {params.soft_idr_threshold}
-
- mv IDR/{wildcards.broad_narrow}/{wildcards.condition} IDR/{wildcards.broad_narrow}/{wildcards.condition}.csv
- """
+ idr --samples {input} --input-file-type {wildcards.broad_narrow}Peak --rank {params.rank} \
+ --output-file IDR/{wildcards.broad_narrow}/{wildcards.condition} \
+ --plot --soft-idr-threshold {params.soft_idr_threshold} \
+ 1>{log.out} 2>{log.err} || true
+ if [ -f "IDR/{wildcards.broad_narrow}/{wildcards.condition}" ]; then
+ mv IDR/{wildcards.broad_narrow}/{wildcards.condition} {output}
+ else
+ touch {output}
+ fi
+ """
rule plot_idr:
input:
"IDR/{broad_narrow}/{condition}.csv"
output:
+ image_idr_vs_peaks="IDR/{broad_narrow}/{condition}_idr_vs_peaks.png",
image_ranks="IDR/{broad_narrow}/{condition}_ranks.png",
image_scores="IDR/{broad_narrow}/{condition}_scores.png",
image_rank_idr="IDR/{broad_narrow}/{condition}_rank_idr.png",
@@ -664,6 +706,7 @@ if config['idr']['do']:
from sequana.idr import IDR
idr = IDR(input[0])
if len(idr.df):
+ idr.plot_idr_vs_peaks(output.image_idr_vs_peaks, savefig=True)
idr.plot_ranks(output.image_ranks, savefig=True)
idr.plot_scores(output.image_scores, savefig=True)
idr.plot_rank_vs_idr_score(output.image_rank_idr, savefig=True)
@@ -751,11 +794,12 @@ if config['idr']['do'] and config['idr']['self_pseudo_replicates']:
paired = manager.paired,
prefix = "{condition}_rep{rep}_rep{rep2}_SPR{N}",
qvalue = config['macs3']['qvalue'],
- mode = "{broad_narrow}"
+ mode = "{broad_narrow}",
+ outdir = "SPR_IDR/macs3/{broad_narrow}"
log:
"SPR_IDR/macs3/{broad_narrow}/{condition}_rep{rep}_rep{rep2}_SPR{N}.out"
- wrapper:
- f"{manager.wrappers}/wrappers/macs3"
+ shell:
+ manager.get_shell("macs3/run", "v1")
rule self_pseudo_replicate_idr:
@@ -774,9 +818,15 @@ if config['idr']['do'] and config['idr']['self_pseudo_replicates']:
err="SPR_IDR/idr/{broad_narrow}/{condition}_rep{rep}_SPR{N}.err"
shell:
"""
- idr --samples {input} --input-file-type {wildcards.broad_narrow}Peak --rank {params.rank} --output-file SPR_IDR/idr/{wildcards.broad_narrow}/{wildcards.condition}_rep{wildcards.rep}_SPR{wildcards.N} --plot --soft-idr-threshold {params.soft_idr_threshold} 1>{log.out} 2>{log.err}
- mv SPR_IDR/idr/{wildcards.broad_narrow}/{wildcards.condition}_rep{wildcards.rep}_SPR{wildcards.N} {output}
- """
+ idr --samples {input} --input-file-type {wildcards.broad_narrow}Peak --rank {params.rank} \
+ --output-file SPR_IDR/idr/{wildcards.broad_narrow}/{wildcards.condition}_rep{wildcards.rep}_SPR{wildcards.N} \
+ --plot --soft-idr-threshold {params.soft_idr_threshold} 1>{log.out} 2>{log.err} || true
+ if [ -f "SPR_IDR/idr/{wildcards.broad_narrow}/{wildcards.condition}_rep{wildcards.rep}_SPR{wildcards.N}" ]; then
+ mv SPR_IDR/idr/{wildcards.broad_narrow}/{wildcards.condition}_rep{wildcards.rep}_SPR{wildcards.N} {output}
+ else
+ touch {output}
+ fi
+ """
expected_output += expand("SPR_IDR/idr/{broad_narrow}/{condition}_rep{rep}_SPR{N}.csv", broad_narrow=['broad','narrow'], condition=cc.conditions, N=range(1, config['idr'].get('self_pseudo_replicates',1)+1), rep=[1,2])
@@ -837,11 +887,12 @@ if config['idr']['do'] and config['idr']['pseudo_replicates']:
paired = manager.paired,
prefix = "{condition}_{rep}_PR{N}",
qvalue = config['macs3']['qvalue'],
- mode = "{broad_narrow}"
+ mode = "{broad_narrow}",
+ outdir = "PR_IDR/macs3/{broad_narrow}"
log:
"PR_IDR/macs3/{broad_narrow}/{condition}_{rep}_PR{N}.log"
- wrapper:
- f"{manager.wrappers}/wrappers/macs3"
+ shell:
+ manager.get_shell("macs3/run", "v1")
rule pseudo_replicate_idr:
@@ -850,6 +901,9 @@ if config['idr']['do'] and config['idr']['pseudo_replicates']:
"PR_IDR/macs3/{broad_narrow}/{condition}_rep2_PR{N}_peaks.{broad_narrow}Peak"]
output:
"PR_IDR/idr/{broad_narrow}/{condition}_PR{N}.csv"
+ log:
+ out="PR_IDR/idr/{broad_narrow}/{condition}_PR{N}.out",
+ err="PR_IDR/idr/{broad_narrow}/{condition}_PR{N}.err"
params:
rank=config['idr']['rank'],
soft_idr_threshold=config['idr']['soft_idr_threshold']
@@ -857,9 +911,15 @@ if config['idr']['do'] and config['idr']['pseudo_replicates']:
config['apptainers']['idr']
shell:
"""
- idr --samples {input} --input-file-type {wildcards.broad_narrow}Peak --rank {params.rank} --output-file PR_IDR/idr/{wildcards.broad_narrow}/{wildcards.condition}_PR{wildcards.N} --plot --soft-idr-threshold {params.soft_idr_threshold}
- mv PR_IDR/idr/{wildcards.broad_narrow}/{wildcards.condition}_PR{wildcards.N} {output}
- """
+ idr --samples {input} --input-file-type {wildcards.broad_narrow}Peak --rank {params.rank} \
+ --output-file PR_IDR/idr/{wildcards.broad_narrow}/{wildcards.condition}_PR{wildcards.N} \
+ --plot --soft-idr-threshold {params.soft_idr_threshold} 1>{log.out} 2>{log.err} || true
+ if [ -f "PR_IDR/idr/{wildcards.broad_narrow}/{wildcards.condition}_PR{wildcards.N}" ]; then
+ mv PR_IDR/idr/{wildcards.broad_narrow}/{wildcards.condition}_PR{wildcards.N} {output}
+ else
+ touch {output}
+ fi
+ """
expected_output += expand("PR_IDR/idr/{broad_narrow}/{condition}_PR{N}.csv", broad_narrow=['broad','narrow'], condition=cc.conditions, N=range(1, config['idr'].get('pseudo_replicates',1)+1))
@@ -974,7 +1034,7 @@ if config['idr']['do']:
shell:
"""
annotatePeaks.pl {input.macs3} {input.fasta} -gid -gff {input.gff} -cpu {threads} 1>{output.annotations} 2>{log}
- annotatePeaks.pl {input.macs3_significative_only} {input.fasta} -gid -gff {input.gff} -cpu {threads} 1>{output.annotations_significative_only} 2>{log}
+ annotatePeaks.pl {input.macs3_significative_only} {input.fasta} -gid -gff {input.gff} -cpu {threads} 1>{output.annotations_significative_only} 2>>{log}
"""
expected_output += expand( "annotate_peaks_idr/{broad_narrow}/{comp}.txt",
@@ -1040,7 +1100,7 @@ if config['idr']['do']:
config['apptainers']['sequana_tools']
shell:
"""
- featureCounts -FSAF -O --fracOverlap 0.2 -T {threads} {params.paired} -a {input.saf} -o {output} {input.bamfiles} 1>{log} 2>{log}
+ featureCounts -FSAF -O --fracOverlap 0.2 -T {threads} {params.paired} -a {input.saf} -o {output} {input.bamfiles} >{log} 2>&1
"""
expected_output += expand("featureCounts/{broad_narrow}/{comp}.feature_counts.out", broad_narrow=['broad', 'narrow'], comp=cc.get_idr_NT_inputs().keys())
@@ -1074,12 +1134,12 @@ if config['idr']['do']:
f, axes = pylab.subplots(1,2, figsize=(10,6))
axes[0].bar(X, narrow['N'], label='Peaks found in common')
- axes[0].bar(X, narrow['Ns'], label='Significative peaks found in common')
+ axes[0].bar(X, narrow['Ns'], label='Significant peaks found in common')
pylab.sca(axes[0])
pylab.xticks(X, labels)
axes[1].bar(X, broad['N'], label='Peaks found in common')
- axes[1].bar(X, broad['Ns'], label='Significative peaks found in common')
+ axes[1].bar(X, broad['Ns'], label='Significant peaks found in common')
axes[1].legend(loc='upper right')
pylab.sca(axes[1])
pylab.xticks(X, labels)
@@ -1103,7 +1163,7 @@ rule igv_list:
from sequana_pipelines.chipseq.tools import ChIPExpDesign
c = ChIPExpDesign(input.design)
conditions = c.conditions
- colors = ["blue", "red", "yellow", "green"]
+ colors = ["blue", "red", "green", "orange", "purple", "cyan", "magenta", "brown"] * 4
with open(output[0], "w") as fout:
for i, condition in enumerate(c.conditions):
for sample in c.df.query("condition==@condition").sample_name:
@@ -1195,8 +1255,8 @@ rule multiqc:
config["apptainers"]["multiqc"]
log:
"multiqc/multiqc.log"
- wrapper:
- f"{manager.wrappers}/wrappers/multiqc"
+ shell:
+ manager.get_shell("multiqc/run", "v1")
os.makedirs("rulegraph", exist_ok=True)
@@ -1210,8 +1270,8 @@ rule rulegraph:
params:
mapper = {"multiqc": "../multiqc/multiqc_report.html"},
configname = "config.yaml"
- wrapper:
- f"{manager.wrappers}/wrappers/rulegraph"
+ run:
+ manager.get_run("rulegraph/run", "v1")(input, output, params)
rule dot2svg:
@@ -1222,7 +1282,7 @@ rule dot2svg:
container:
config['apptainers']['graphviz']
shell:
- """dot -Tsvg {input} -o {output}"""
+ manager.get_shell("graphviz/dot2svg", "v1")
# Those rules takes a couple of seconds so no need for a cluster
@@ -1264,15 +1324,25 @@ onsuccess:
intro = f"""
Overview
- The ChIP-seq pipeline from Sequana maps the IP and Input reads on the provided reference ({fasta_file}).
- Peaks are then extracted with macs3 tool and IDR statistics are provided with replicates.
+ The ChIP-seq pipeline from Sequana maps IP and Input reads to the provided reference
+ ({fasta_file}). Peaks are called with macs3 (narrow and broad) and
+ IDR statistics are computed across replicates to identify reproducible peaks.
- A multiqc report is available, where various QCs and mapping quality plots are also complementary to the following plots.
+
A MultiQC report is also available,
+ providing complementary QC and mapping-quality plots across all samples.
- Data structure
- Each sample has its own directory with sub-directories for each processing step including cleanup (fastp), fastqc (fastqc and fastqc_clean), mapping (e.g. bowtie2/), creation of bigwig file (bigwig/) and phantom peak detection. Single Peak analysis is available in a global diretory called macs3/ and annotations of the single peaks are in annotate_peaks/. IDR peak analysis is also available and stored in IDR/ ; annotations of IDR peaks are in annotate_peaks_idr. Additional directoy calld PR_IDR and SPR_IDR are summarised in this report and may not be distributed.
+
Output structure
+ Each sample has its own directory containing sub-directories for each processing
+ step: read trimming (fastp/), quality control (fastqc_raw/
+ and fastqc_clean/), alignment (bowtie2/), coverage tracks
+ (bigwig/), and phantom peak analysis (phantom/).
+ Single-replicate peak files are in macs3/ (narrow and broad) with
+ annotations in annotate_peaks/. IDR peak analysis is in IDR/
+ with annotations in annotate_peaks_idr/. Pseudo-replicate IDR working
+ directories (PR_IDR/ and SPR_IDR/) are summarised in this
+ report and may be safely removed after the analysis is complete.
@@ -1281,16 +1351,27 @@ onsuccess:
intro += """
QC plots
Phantom peaks
- Phantom peaks are found in poor quality samples. A way to detect their presence is to compute the strand cross-correlation from the BAM files. Ideally, you should obtain a peak at a strand-shift of 100-200bp. However, phantom peaks may be found around position 0. A metric called RSC is the ratio between the data peak and the phantom peak. It is expected to be larger than 1. Another metric is called NSC and correspond to the normalised strand cross-correlation, which is the main peak cross-correlation divided by the minimum CC. A value of 1 means no enrichment; generally we expect value >1.1 although it really depends on your experiment. The RSC metric is more relevant and use to tag the quality of the data:
-
+
Phantom peaks appear in poor-quality ChIP-seq samples. They can be detected by
+ computing the strand cross-correlation from the BAM files. In a good experiment,
+ the cross-correlation profile shows a clear peak at the fragment length
+ (typically 100–200 bp). Phantom peaks appear near position 0 (the read length)
+ and indicate poor enrichment.
+ Two summary metrics are reported:
+
+ - NSC (Normalised Strand Cross-correlation): ratio of the peak
+ cross-correlation to the background cross-correlation. A value of 1 means no
+ enrichment; values > 1.1 are generally considered acceptable.
+ - RSC (Relative Strand Cross-correlation): ratio of the fragment-length peak
+ to the phantom peak. Values > 1 indicate good enrichment. RSC is used to
+ assign a quality tag:
+
-- tag of -2 correspond to RSC<0.25; (bad quality)
-- tag of -1 corresponds to RSC in [0.25,0.5[
-- tag of 0 corresponds to RSC in [0.5,1[
-- tag of 1 correspond to RSC in [1,1.5[
-- tag of 2 is attributed to quality run with RSC>=1.5 (good Quality)
+- Qtag −2: RSC < 0.25 (very poor quality)
+- Qtag −1: RSC in [0.25, 0.5)
+- Qtag 0: RSC in [0.5, 1)
+- Qtag 1: RSC in [1, 1.5)
+- Qtag 2: RSC ≥ 1.5 (good quality)
-
"""
df = pd.DataFrame([
@@ -1314,7 +1395,7 @@ onsuccess:
intro +='"
files = glob.glob("*/phantom/phantom.png")
@@ -1325,9 +1406,14 @@ onsuccess:
intro += """
FRiP
-
FRiP is the fraction of reads found in the detected peaks. When we detect peaks, we have two options: narrow or broad. Besides,
- one need to decide upon the sample(s) where peaks have to be detected as compare to sample(s) considered as noise. Consequently, the combination can be quite large depending on your experimental design. However, in a given ChiP-seq experiment, we expect the FRiP to be larger in IP samples, and lower in Input/Control samples. Replicates should also have similar behaviours. In the following figures, we show the number of reads in peaks versus the FRiP. This two values should be correlated. Moreover, the IP samples should cluster togther and control should cluster together (though this may depend on the conditions). Note that in the following plots, we show all samples but for peaks detected in specific conditions (e.g. we show the FRiP calculated on 10 samples but from peaks computed using 1 IP and 1 control).
"""
- intro +='
'
+
FRiP (Fraction of Reads in Peaks) measures the proportion of reads that fall
+ within detected peaks. Peaks can be narrow or broad, and are always called for a
+ specific IP versus its matched control. In a well-enriched ChIP-seq experiment,
+ IP samples should show higher FRiP values than their Input/control counterparts,
+ and replicates within the same condition should behave consistently. Each plot below
+ shows all samples evaluated against peaks called from one specific IP–control
+ comparison.
"""
+ intro +='
'
files = glob.glob("FRiP/*/*.png")
captions = ["/".join(filename.split("/")[1:]) for filename in files]
for filename, caption in zip(files, captions):
@@ -1337,15 +1423,20 @@ onsuccess:
intro+="""
Results
-
Design experimental
-
Here below are the samples names with their type (IP for immuno-precipitated), condition and replicate. The sample name is derived from the filename.
+
Experimental design
+
The table below lists all samples with their type (IP for immunoprecipitated, or Input for control), condition, and replicate number. The sample name is derived from the input filename prefix.
"""
intro += df2html(cc.df)
intro += """
Single peak detection
-
The following files contain the detected peaks in various combinations of IP samples versus controls. We only report a given IP sample versus its control in the same condition. Moreover, if there are two controls, there are combined. So we will not report e.g. IP sample 1 versus Control1 and then IP sample1 versus Control2 but rather IP sample 1 versus Control 1 and 2. However, if there are several replicates for a given IP condition, we report all combinations (replicate 1 versus its control, replicate 2 versus its control and replicate1+2 versus its control). The format of the CSV output file is made of 10 columns, which is the output of the software macs. Columns are: chromosome name, start position, end position, name, score, ?, fold enrichment, -log 10pvalue, -log10 qvalue, and_submit from start and length (for narrow case only).
-
+
The following files contain peaks detected for each IP–control comparison within the same condition.
+ If there are multiple controls for a condition, they are combined into a single control track.
+ If there are several IP replicates for a condition, peaks are called for each replicate individually
+ and also for the pooled replicates. Both narrow and broad peak calls are reported.
+ Peak files are in macs3 format (10 columns for narrow peaks, 9 for broad):
+ chromosome, start, end, name, score, strand, fold-enrichment, −log10(p-value),
+ −log10(q-value), and summit offset from start (narrow only).
"""
keys = sorted(cc.get_IP_versus_control().keys())
@@ -1356,10 +1447,10 @@ onsuccess:
df = pd.DataFrame({'narrow': narrow, 'narrow_links':narrow_links,
'broad': broad, 'broad_links':broad_links}, index=keys)
intro += df2html(df, dom='rt', show_index=True) + "
"
- intro += "
Here below, we summarize the number of peaks per type (narrow and broad) across all combinaisons of valid IPs versus controls.
"
+ intro += "
The plots below summarize the number of peaks per type (narrow and broad) across all combinations of IP samples versus their matched controls.
"
- intro += SequanaReport.png_to_embedded_png("peak_narrow_counts", "outputs/peak_narrow_counts.png", style="width:45%; height:40%")
- intro += SequanaReport.png_to_embedded_png("peak_broad_counts", "outputs/peak_broad_counts.png", style="width:45%; height:40%")
+ intro += SequanaReport.png_to_embedded_png("peak_narrow_counts", "outputs/peak_narrow_counts.png", style="width:48%; max-width:600px")
+ intro += SequanaReport.png_to_embedded_png("peak_broad_counts", "outputs/peak_broad_counts.png", style="width:48%; max-width:600px")
for key in keys:
@@ -1373,7 +1464,7 @@ onsuccess:
# ===================================================== single peak annotation plots
intro += """
Single peak annotation
-
"""
+
"""
files = glob.glob("annotate_peaks/narrow/*png") + glob.glob("annotate_peaks/broad/*png")
captions = files
@@ -1386,11 +1477,13 @@ onsuccess:
intro += "
Naive peak consensus (overlap replicates)
"
for cond in cc.get_idr_NT_inputs().keys():
for mode in ['narrow', 'broad']:
- intro += SequanaReport.png_to_embedded_png(f"peak_{mode}_{cond}_venn", f"macs3/{mode}/{cond}_venn.png", style="width:30%; height:40%")
+ intro += SequanaReport.png_to_embedded_png(f"peak_{mode}_{cond}_venn", f"macs3/{mode}/{cond}_venn.png", style="width:30%; max-width:400px")
# ===================================================== IDR
- intro += """
IDR peak detection
Overview
The figure below shows the number of significant peaks found in common and with significant IDR (orange) as compared to the common ones. IDRs are computed only when 2 replicates are available. """
+ intro += """
IDR peak detection
Overview
+
The figures below show the number of significant peaks found in common across replicates (blue) and the subset
+ passing the IDR threshold (orange). IDR is computed only when at least 2 replicates are available.
"""
if config['idr']['do']:
@@ -1402,7 +1495,7 @@ onsuccess:
# all data
intro += f"
Case replicate {key} ({peak_type})
"
- idr = IDR(f"IDR/narrow/{key}.csv")
+ idr = IDR(f"IDR/{peak_type}/{key}.csv")
df = idr.df
intro += f"
A total of {len(df)} peaks were found after IDR analysis and {idr.N_significant_peaks} are found with an IDR<0.05
"
intro += df2html(df, dom='Bfrtip')
@@ -1410,7 +1503,7 @@ onsuccess:
intro +='
"
files = [f"IDR/{peak_type}/{key}_idr_vs_peaks.png",
@@ -1424,13 +1517,13 @@ onsuccess:
"log10 scores of each replicates (black are kept)",
#"venn diagram common peaks",
"compared ranks in both replicates"]
- intro += 'IDR plots here below are standard plots for Chip-seq IDR analysis.'
+ intro += '
Standard IDR diagnostic plots for ChIP-seq analysis are shown below.
'
for filename, caption in zip(files, captions):
intro += '

'.format(filename, caption)
intro +="
"
- # significative only
- intro += f"
Case replicate {key} ({peak_type}) significative only
"
+ # significant only
+ intro += f"
Case replicate {key} ({peak_type}) — significant peaks only (IDR < 0.05)
"
idr = IDR(f"IDR/{peak_type}/{key}_significative.csv")
df = idr.df
intro += df2html(df, dom='Bfrtip')
@@ -1438,16 +1531,25 @@ onsuccess:
# Pseudo replicates analysis
if config['idr']['do'] and config['idr']['pseudo_replicates']:
intro += """
Pseudo replicates analysis
"""
- intro += """
The number of peaks found in the Pseudo replicates analysis should be in agreement with the True replicates. IDR analysis on the pooled pseudo-replicates should results in a number of peaks that are similar (within a factor of 2), which indicates that there are truly good replicates. Black dots here below indicates the true number of peaks found in the IDR analysis. boxplot are the number of peaks found with the pseudo replicates . We filtered out the peak with IDR>0.05
"""
+ intro += """
The number of peaks found in the pseudo-replicate analysis should agree with the true-replicate IDR.
+ IDR on pooled pseudo-replicates should yield a peak count within a factor of 2 of the true-replicate IDR,
+ which indicates robust replicates. Black dots show the true-replicate IDR peak count;
+ boxplots show the distribution across pseudo-replicates. Peaks with IDR > 0.05 are excluded.
"""
intro += SequanaReport.png_to_embedded_png("IDR_NT_narrow",
- "outputs/IDR_NT_vs_PR_narrow.png")
+ "outputs/IDR_NT_vs_PR_narrow.png", style="width:48%; max-width:600px")
intro += SequanaReport.png_to_embedded_png("IDR_NT_broad",
- "outputs/IDR_NT_vs_PR_broad.png")
+ "outputs/IDR_NT_vs_PR_broad.png", style="width:48%; max-width:600px")
if config['idr']['do'] and config['idr']['self_pseudo_replicates']:
# Self pseudo replicates analysis
- intro += """
Self pseudo replicates analysis
We created pseudo replicates for each replicates (in each condition) by randomly splitting the reads of replicate 1 (and replicate 2). The IDR analysis on the first self-replicates data should result in a number of peaks that are similar (within a factor of 2) to the second self-replicates. If not, the replicates are probably not good replicates. Here is a table with the ratio of the number of IDR peaks found in replicate 1 and 2 for the different conditions (both narrow and broad cases)
"""
+ intro += """
Self pseudo replicates analysis
+
For each replicate, reads are randomly split into two equal halves (self-pseudo-replicates).
+ IDR is then run on each pair of self-pseudo-replicates independently. The peak counts from
+ the two halves should agree within a factor of 2; a higher ratio suggests the replicate
+ itself is of poor quality. The table below shows the IDR peak counts for self-pseudo-replicate
+ 1 and 2, and their ratio, for each condition and peak type (narrow/broad).
+ Ratios above 2 are highlighted in red.
"""
@@ -1479,7 +1581,7 @@ onsuccess:
if ratio > 2:
intro += f'
{ratio} | '
else:
- intro += f'
{ratio} | '
+ intro += f'
{ratio} | '
intro += ""
intro +=""""""
@@ -1487,7 +1589,7 @@ onsuccess:
# ========================================================== IDR peak annotation
if config["idr"]["do"]:
intro += """
IDR peak annotation
-
"""
+
"""
files = glob.glob("annotate_peaks_idr/narrow/*png") + glob.glob("annotate_peaks_idr/broad/*png")
captions = files
@@ -1500,20 +1602,21 @@ onsuccess:
for key in sorted(cc.conditions):
from sequana.homer import Homer
h = Homer(f"annotate_peaks_idr/{peak_type}/{key}_significative.txt")
- del h.df['ID']
- intro += f"
Case replicate {key} ({peak_type}) significative only
"
+ if 'ID' in h.df.columns:
+ del h.df['ID']
+ intro += f"
Case replicate {key} ({peak_type}) — significant peaks only (IDR < 0.05)
"
intro += df2html(h.df, dom='Bfrtip', show_index=False)
# ========================================================== Final information
intro += """
Final information
"""
- intro += """
All alignments can be found in sample_name/bowtie2/*sorted.bam. Those files can be loaded in genome browser such as IGV. Yet, it may be slow, and bigwig files are provided for you in sample_name/bigwig.
-An IGV XML file is provided (igv.xml) that preloads the bigwig and individual peaks (on replicates).
-
-
-
-IDR files are to be found in IDR directory (narrow and broad) where CSV files can be found.
-There are the same as those in this HTML reports in the section "IDR peak detection". Note that they
-contains both significant and non significant peaks
+ intro += """
All alignments are in <sample>/bowtie2/*.sorted.bam. These files can be
+loaded in a genome browser such as IGV, though bigwig files in <sample>/bigwig/
+are recommended for faster browsing. An IGV session file (igv.xml) is provided that
+preloads all bigwig tracks and per-replicate peak files.
+
+
IDR results are in the IDR/ directory (separate narrow/ and broad/
+subdirectories) as CSV files. These correspond to the tables shown in the “IDR peak detection”
+section above and contain both significant and non-significant peaks.
"""
diff --git a/sequana_pipelines/chipseq/config.yaml b/sequana_pipelines/chipseq/config.yaml
index 1c18744..b4e5f35 100644
--- a/sequana_pipelines/chipseq/config.yaml
+++ b/sequana_pipelines/chipseq/config.yaml
@@ -7,24 +7,23 @@
# If input_directory provided, use it otherwise if input_pattern provided,
# use it, otherwise use input_samples.
# ============================================================================
-sequana_wrappers: "v23.12.5"
-
-input_directory: ""
+input_directory: ""
input_readtag: _R[12]_
input_pattern: '*fastq.gz'
+exclude_pattern: ~
# =========================================== Sections for the users
apptainers:
- graphviz: "https://zenodo.org/record/7928262/files/graphviz_7.0.5.img"
- sequana_tools: https://zenodo.org/record/7341710/files/sequana_tools_0.14.5.img
+ graphviz: https://zenodo.org/record/7928262/files/graphviz_7.0.5.img
+ sequana_tools: https://zenodo.org/record/18257162/files/sequana_tools_26.1.14.img
fastqc: https://zenodo.org/record/7015004/files/fastqc_0.11.9-py3.img
fastp: https://zenodo.org/record/7319782/files/fastp_0.23.2.img
phantompeak: https://zenodo.org/record/7301453/files/phantompeakqualtools_1.2.2.img
homer: https://zenodo.org/record/7305501/files/homer_4.11.0.img
ucsc: https://zenodo.org/record/10011490/files/ucsc_3.7.7.img
samtools: https://zenodo.org/record/7437898/files/samtools_1.16.1.img
- idr: ''
- multiqc: "https://zenodo.org/record/10205070/files/multiqc_1.16.0.img"
+ idr: https://zenodo.org/record/7305443/files/idr_2.0.3.img
+ multiqc: https://zenodo.org/record/10205070/files/multiqc_1.16.0.img
#############################################################################
diff --git a/sequana_pipelines/chipseq/schema.yaml b/sequana_pipelines/chipseq/schema.yaml
index b9bab87..ab7c26a 100644
--- a/sequana_pipelines/chipseq/schema.yaml
+++ b/sequana_pipelines/chipseq/schema.yaml
@@ -1,8 +1,6 @@
type: map
mapping:
- "sequana_wrappers":
- type: str
"input_directory":
type: str
required: True
@@ -12,6 +10,9 @@ mapping:
"input_pattern":
type: str
required: True
+ "exclude_pattern":
+ type: str
+ required: False
"apptainers":
type: any
diff --git a/test/README b/test/README
new file mode 100644
index 0000000..7b59392
--- /dev/null
+++ b/test/README
@@ -0,0 +1,5 @@
+sequana_chipseq --input-directory data/ \
+ --genome-directory data/ecoli_MG1655/ \
+ --design-file data/design.csv --monitor \
+ --force --apptainer-prefix ~/images
+