diff --git a/bio/pyroe/makeunspliceunspliced/environment.yaml b/bio/pyroe/makeunspliceunspliced/environment.yaml new file mode 100644 index 0000000000..9fdb85a503 --- /dev/null +++ b/bio/pyroe/makeunspliceunspliced/environment.yaml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - pyroe=0.9.1 + - bedtools=2.30.0 \ No newline at end of file diff --git a/bio/pyroe/makeunspliceunspliced/meta.yaml b/bio/pyroe/makeunspliceunspliced/meta.yaml new file mode 100644 index 0000000000..6617a120be --- /dev/null +++ b/bio/pyroe/makeunspliceunspliced/meta.yaml @@ -0,0 +1,19 @@ +name: pyroe make-spliced+unspliced +url: https://pyroe.readthedocs.io/en/latest/building_splici_index.html#preparing-a-spliced-unspliced-transcriptome-reference +description: > + Build spliceu reference files for Alevin-fry. The spliceu (spliced + unspliced) transcriptome reference is a reference in which the unspliced transcripts of each gene represent the entire genomic interval of that gene. +author: + - Thibault Dayris +input: + - gtf: Path to the genome annotation (GTF formatted) + - fasta: Path to the genome sequence (Fasta formatted) + - spliced: Optional path to additional spliced sequences (Fasta formatted) + - unspliced: Optional path to additional unspliced sequences (Fasta formatted) +output: + - fasta: Path to spliced+unspliced sequences (Fasta formatted) + - gene_id_to_name: Path to a TSV formatted text file containing gene_id <-> gene_name correspondence + - t2g_3col: Path to a TSV formatted text file containing the transcript_id <-> gene_name <-> splicing status correspondence + - t2g: Path to a TSV formatted text file containing the transcript_id <-> gene_name correspondence + - g2g: Path to a TSV formatted text file containing the gene_id <-> gene_name correspondence +params: + - extra: Optional parameters to be passed to pyroe \ No newline at end of file diff --git a/bio/pyroe/makeunspliceunspliced/test/Snakefile b/bio/pyroe/makeunspliceunspliced/test/Snakefile new file mode 100644 index 
0000000000..7b0ffdd046 --- /dev/null +++ b/bio/pyroe/makeunspliceunspliced/test/Snakefile @@ -0,0 +1,20 @@ + +rule test_pyroe_makesplicedunspliced: + input: + fasta="genome.fasta", + gtf="annotation.gtf", + spliced="extra_spliced.fasta", # Optional path to additional spliced sequences (FASTA) + unspliced="extra_unspliced.fasta", # Optional path to additional unspliced sequences (FASTA) + output: + gene_id_to_name="gene_id_to_name.tsv", + fasta="spliceu.fa", + g2g="spliceu_g2g.tsv", + t2g_3col="spliceu_t2g_3col.tsv", + t2g="spliceu_t2g.tsv", + threads: 1 + log: + "logs/pyroe.log", + params: + extra="", # Optional parameters + wrapper: + "master/bio/pyroe/makeunspliceunspliced/" diff --git a/bio/pyroe/makeunspliceunspliced/test/annotation.gtf b/bio/pyroe/makeunspliceunspliced/test/annotation.gtf new file mode 100644 index 0000000000..361012dfde --- /dev/null +++ b/bio/pyroe/makeunspliceunspliced/test/annotation.gtf @@ -0,0 +1,27 @@ +##gff-version 2 +##source-version rtracklayer 1.52.1 +##date 2021-09-14 +chr1 rtracklayer exon 1 2 . + . gene_id "g1"; gene_name "g1"; transcript_id "tx1.1"; exon_id "E1" +chr1 rtracklayer exon 36 45 . + . gene_id "g1"; gene_name "g1"; transcript_id "tx1.1"; exon_id "E2" +chr1 rtracklayer exon 71 80 . + . gene_id "g1"; gene_name "g1"; transcript_id "tx1.1"; exon_id "E3" +chr1 rtracklayer exon 46 55 . + . gene_id "g1"; gene_name "g1"; transcript_id "tx1.2"; exon_id "E4" +chr1 rtracklayer exon 91 100 . + . gene_id "g1"; gene_name "g1"; transcript_id "tx1.2"; exon_id "E5" +chr1 rtracklayer exon 121 130 . + . gene_id "g1"; gene_name "g1"; transcript_id "tx1.3"; exon_id "E6" +chr1 rtracklayer exon 156 160 . + . gene_id "g1"; gene_name "g1"; transcript_id "tx1.3"; exon_id "E7" +chr1 rtracklayer exon 191 200 . + . gene_id "g1"; gene_name "g1"; transcript_id "tx1.3"; exon_id "E8" +chr1 rtracklayer transcript 1 80 . + . gene_id "g1"; gene_name "g1"; transcript_id "tx1.1"; +chr1 rtracklayer transcript 46 100 . + . 
gene_id "g1"; gene_name "g1"; transcript_id "tx1.2"; +chr1 rtracklayer transcript 121 200 . + . gene_id "g1"; gene_name "g1"; transcript_id "tx1.3"; +chr1 rtracklayer gene 1 200 . + . gene_id "g1"; gene_name "g1"; +chr2 rtracklayer exon 1 2 . - . gene_id "g2"; gene_name "g2"; transcript_id "tx2.1"; exon_id "E9" +chr2 rtracklayer exon 36 45 . - . gene_id "g2"; gene_name "g2"; transcript_id "tx2.1"; exon_id "E10" +chr2 rtracklayer exon 71 80 . - . gene_id "g2"; gene_name "g2"; transcript_id "tx2.1"; exon_id "E11" +chr2 rtracklayer exon 46 55 . - . gene_id "g2"; gene_name "g2"; transcript_id "tx2.2"; exon_id "E12" +chr2 rtracklayer exon 91 100 . - . gene_id "g2"; gene_name "g2"; transcript_id "tx2.2"; exon_id "E13" +chr2 rtracklayer exon 121 130 . - . gene_id "g2"; gene_name "g2"; transcript_id "tx2.3"; exon_id "E14" +chr2 rtracklayer exon 156 160 . - . gene_id "g2"; gene_name "g2"; transcript_id "tx2.3"; exon_id "E15" +chr2 rtracklayer exon 191 200 . - . gene_id "g2"; gene_name "g2"; transcript_id "tx2.3"; exon_id "E16" +chr2 rtracklayer transcript 1 80 . - . gene_id "g2"; gene_name "g2"; transcript_id "tx2.1"; +chr2 rtracklayer transcript 46 100 . - . gene_id "g2"; gene_name "g2"; transcript_id "tx2.2"; +chr2 rtracklayer transcript 121 200 . - . gene_id "g2"; gene_name "g2"; transcript_id "tx2.3"; +chr2 rtracklayer gene 1 200 . - . 
gene_id "g2"; gene_name "g2"; \ No newline at end of file diff --git a/bio/pyroe/makeunspliceunspliced/test/extra_spliced.fasta b/bio/pyroe/makeunspliceunspliced/test/extra_spliced.fasta new file mode 100644 index 0000000000..2e7752f0b1 --- /dev/null +++ b/bio/pyroe/makeunspliceunspliced/test/extra_spliced.fasta @@ -0,0 +1,2 @@ +>ExtraSpliced +ATATATATATATATATATATATATATATATATATATATAT \ No newline at end of file diff --git a/bio/pyroe/makeunspliceunspliced/test/extra_unspliced.fasta b/bio/pyroe/makeunspliceunspliced/test/extra_unspliced.fasta new file mode 100644 index 0000000000..3920e14797 --- /dev/null +++ b/bio/pyroe/makeunspliceunspliced/test/extra_unspliced.fasta @@ -0,0 +1,2 @@ +>ExtraUnspliced +CGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCG \ No newline at end of file diff --git a/bio/pyroe/makeunspliceunspliced/test/genome.fasta b/bio/pyroe/makeunspliceunspliced/test/genome.fasta new file mode 100644 index 0000000000..c9d91c3893 --- /dev/null +++ b/bio/pyroe/makeunspliceunspliced/test/genome.fasta @@ -0,0 +1,8 @@ +>chr1 +TTAACATTCGCTGGGGGAGATGACGAGACTAGCCGCCGCGTGGTCCTGCCGCATTATACGTGTTCAAGCGCCTACGTGGG +TTGGGCAACCCGTGCCTATGGAGGCATGGACAAATTAGGTTCAACTTCAGCTACGTACGAGACCTAGAGGTAATAAGGGT +ATTTTACTCGGAGCATGTTTCAGTACGAACGTTAGATATC +>chr2 +CTATCGAAGTGGAATCTTGAAGAGCCCATCGGTTAAGGTCTCTCCAATGTCCAGCCTATTCTATGGCACGGCAGACCCGT +TGTGCATCCACAGTGATAACTTACTTGGGCTCTTAATAGAGGAGTGTTGCCATTTTATCGGCTTGCACTCCAATTAGCAC +CAAGTGCCGTTATTGGGGTATTGCACTCATCAATAGCGTG \ No newline at end of file diff --git a/bio/pyroe/makeunspliceunspliced/test/genome.fasta.fai b/bio/pyroe/makeunspliceunspliced/test/genome.fasta.fai new file mode 100644 index 0000000000..72cbfa2c91 --- /dev/null +++ b/bio/pyroe/makeunspliceunspliced/test/genome.fasta.fai @@ -0,0 +1,4 @@ +chr1 + 203 7 81 82 +chr2 + 203 220 81 82 \ No newline at end of file diff --git a/bio/pyroe/makeunspliceunspliced/wrapper.py b/bio/pyroe/makeunspliceunspliced/wrapper.py new file mode 100644 index 0000000000..26a49a16e4 --- /dev/null +++ 
b/bio/pyroe/makeunspliceunspliced/wrapper.py
@@ -0,0 +1,55 @@
+__author__ = "Thibault Dayris"
+__copyright__ = "Copyright 2023, Thibault Dayris"
+__email__ = "thibault.dayris@gustaveroussy.fr"
+__license__ = "MIT"
+
+
+from shlex import quote
+from tempfile import TemporaryDirectory
+
+from snakemake.shell import shell
+
+
+# Append to the log file: several shell commands below write to it in turn.
+log = snakemake.log_fmt_shell(stdout=True, stderr=True, append=True)
+extra = snakemake.params.get("extra", "")
+
+# Optional FASTA inputs. Paths are shell-quoted so that file names containing
+# spaces or shell metacharacters reach pyroe as a single argument. The whole
+# option stays an empty string when the input is absent, so nothing is passed.
+spliced = snakemake.input.get("spliced", "")
+if spliced:
+    spliced = "--extra-spliced " + quote(str(spliced))
+
+unspliced = snakemake.input.get("unspliced", "")
+if unspliced:
+    unspliced = "--extra-unspliced " + quote(str(unspliced))
+
+# Optional output key of the rule -> file name this wrapper collects from the
+# pyroe output directory. Order matches the order in which files are moved.
+pyroe_outputs = {
+    "fasta": "spliceu.fa",
+    "gene_id_to_name": "gene_id_to_name.tsv",
+    "t2g_3col": "spliceu_t2g_3col.tsv",
+    "t2g": "spliceu_t2g.tsv",
+    "g2g": "spliceu_g2g.tsv",
+}
+
+
+# pyroe receives a directory as its last argument, so it is run against a
+# temporary directory and only the requested files are moved out of it.
+with TemporaryDirectory() as tempdir:
+    shell(
+        "pyroe make-spliced+unspliced "
+        "{extra} {spliced} "
+        "{unspliced} "
+        "{snakemake.input.fasta:q} "
+        "{snakemake.input.gtf:q} "
+        "{tempdir:q} "
+        "{log}"
+    )
+
+    for output_key, filename in pyroe_outputs.items():
+        destination = snakemake.output.get(output_key)
+        if destination:
+            shell("mv --verbose {tempdir:q}/{filename} {destination:q} {log}")
diff --git a/test.py b/test.py
index 998b2f13a8..4f3c8f27f2 100644
--- a/test.py
+++ b/test.py
@@ -1184,6 +1184,19 @@ def test_art_profiler_illumina():
         ],
     )
 
+@skip_if_not_modified
+def test_pyroe_makesplicedunspliced():
+    run(
+        "bio/pyroe/makeunspliceunspliced/",
+        [
+            "snakemake",
+            "--cores",
+            "1",
+            "--use-conda",
+            "-F",
+            "spliceu.fa",
+        ]
+    )
 
 @skip_if_not_modified
 def test_pyroe_makesplicedintronic():
@@ -1197,7 +1210,8 @@ def test_pyroe_makesplicedintronic():
"--use-conda", "-F", ], - ) + ) + @skip_if_not_modified