From 1dc4666c93962f4b81db8f72cd3b38220dc5f257 Mon Sep 17 00:00:00 2001 From: sharkLoc Date: Wed, 3 Apr 2024 16:29:17 +0800 Subject: [PATCH] version 0.3.14 --- Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 2 +- src/command.rs | 223 ++++++++++++++++++++++++++----------------------- src/main.rs | 32 +++---- src/remove.rs | 33 +++++--- src/rename.rs | 41 +++++++-- src/trimfq.rs | 6 ++ 8 files changed, 203 insertions(+), 138 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4ec5a07..5608eec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -876,7 +876,7 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "fqkit" -version = "0.3.13" +version = "0.3.14" dependencies = [ "anyhow", "bio", diff --git a/Cargo.toml b/Cargo.toml index 0099300..2c733f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fqkit" -version = "0.3.13" +version = "0.3.14" edition = "2021" authors = ["sharkLoc "] rust-version = "1.65.0" diff --git a/README.md b/README.md index 95d0520..d865a72 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ cargo install --git https://github.com/sharkLoc/fqkit.git ```bash FqKit -- A simple and cross-platform program for fastq file manipulation -Version: 0.3.13 +Version: 0.3.14 Authors: sharkLoc Source code: https://github.com/sharkLoc/fqkit.git diff --git a/src/command.rs b/src/command.rs index e35aab3..08614a5 100644 --- a/src/command.rs +++ b/src/command.rs @@ -4,7 +4,7 @@ use clap::{Parser,value_parser}; #[command( name = "FqKit", author = "sharkLoc", - version = "0.3.13", + version = "0.3.14", about = "A simple and cross-platform program for fastq file manipulation", long_about = None, next_line_help = false, @@ -32,15 +32,15 @@ pub struct Args { /// set gzip/bzip2/xz compression level 1 (compress faster) - 9 (compress better) for gzip/bzip2/xz output file, /// just work with option -o/--out #[arg(long = "compress-level", default_value_t = 6, global = true, - value_parser = value_parser!(u32).range(1..=9), value_name = "int", + value_parser = value_parser!(u32).range(1..=9), value_name = "INT", help_heading = Some("Global Arguments") )] pub compression_level: u32, /// if file name specified, write log message to this file, or write to stderr - #[arg(long = "log", global = true, help_heading = Some("Global Arguments"), value_name = "str")] + #[arg(long = "log", global = true, help_heading = Some("Global Arguments"), value_name = "STR")] pub logfile: Option, /// control verbosity of logging, possible values: {error, warn, info, debug, trace} - #[arg(short = 'v', long = "verbosity", global = true, value_name = "str", + #[arg(short = 'v', long = "verbosity", global = true, value_name = "STR", default_value_t = String::from("debug"), help_heading = Some("Global Arguments") )] @@ -59,10 +59,10 @@ pub enum Subcli { /// input fastq file, or read from stdin input: Option, /// print first N fastq records - #[arg(short = 'n', long = "num", default_value_t = 10, value_name = "int")] + #[arg(short = 'n', long = "num", default_value_t = 10, value_name = "INT")] num: usize, /// output fastq file name or write to stdout, files ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out", value_name = "str")] + #[arg(short = 'o', long = "out", value_name = "STR")] out: Option, }, /// get last N records from fastq file @@ -70,25 +70,25 @@ pub enum Subcli { /// input fastq file, or read from stdin input: Option, /// print last N fastq records - #[arg(short = 'n', long = "num", default_value_t = 10, value_name = "int")] + #[arg(short = 'n', long = "num", default_value_t = 10, value_name = "INT")] num: usize, /// output fastq file name or write to stdout, files ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out", value_name = "str")] + #[arg(short = 'o', long = "out", value_name = "STR")] out: Option, }, /// concat fastq files from different lanes concat { /// input read1 list file, one fastq file per line - #[arg(short = 'i', long = "input1", value_name = "str")] + #[arg(short = 'i', long = "input1", value_name = "STR")] read1: String, /// input read2 list file, one fastq file per line - #[arg(short = 'I', long = "input2", value_name = "str")] + #[arg(short = 'I', long = "input2", value_name = "STR")] read2: String, /// read1 output file name, files ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out1", value_name = "str")] + #[arg(short = 'o', long = "out1", value_name = "STR")] out1: String, /// read2 output file name, files ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'O', long = "out2", value_name = "str")] + #[arg(short = 'O', long = "out2", value_name = "STR")] out2: String, }, /// subsample sequences from big fastq file. @@ -97,31 +97,31 @@ pub enum Subcli { /// input fastq file, or read from stdin input: Option, /// set rand seed. - #[arg(short = 's', long = "seed", default_value_t = 69, value_name = "int")] + #[arg(short = 's', long = "seed", default_value_t = 69, value_name = "INT")] seed: u64, /// subseq number - #[arg(short = 'n', long = "num", value_name = "int")] + #[arg(short = 'n', long = "num", value_name = "INT")] num: usize, /// read files twice to reduce much memory but cost more time #[arg(short = 'r', long = "rdc", help_heading = Some("FLAGS"))] rdc: bool, /// fastq output file name or write to stdout, files ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out", value_name = "str")] + #[arg(short = 'o', long = "out", value_name = "STR")] out: Option, }, /// select pair-end reads by read id select { /// input read1 fastq file - #[arg(short = '1', long = "read1", value_name = "str")] + #[arg(short = '1', long = "read1", value_name = "STR")] read1: String, /// input read2 fastq file - #[arg(short = '2', long = "read2", value_name = "str")] + #[arg(short = '2', long = "read2", value_name = "STR")] read2: String, /// output selected forward(read1) fastq file name, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short='f', long = "out1", value_name = "str")] + #[arg(short='f', long = "out1", value_name = "STR")] out1: String, /// output selected resverse(read2) fastq file name, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short='r', long = "out2", value_name = "str")] + #[arg(short='r', long = "out2", value_name = "STR")] out2: String, }, /// trim fastq reads by position @@ -129,28 +129,31 @@ pub enum Subcli { /// input fastq file, or read from stdin input: Option, /// trim int bp from left - #[arg(short, long, default_value_t = 0, value_name = "int")] + #[arg(short, long, default_value_t = 0, value_name = "INT")] left: usize, /// trim int bp from right - #[arg(short, long, default_value_t = 0, value_name = "int")] + #[arg(short, long, default_value_t = 0, value_name = "INT")] right: usize, + /// after trimming, reads shorter than INT are discarded + #[arg(short = 'd', long = "discard", default_value_t = 0, value_name = "INT")] + len: usize, /// fastq output file name or write to stdout, files ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out", value_name = "str")] + #[arg(short = 'o', long = "out", value_name = "STR")] out: Option, }, /// a simple filter for pair end fastq sqeuence filter { /// input read1 fastq file - #[arg(short = '1', long = "read1", value_name = "str")] + #[arg(short = '1', long = "read1", value_name = "STR")] read1: String, /// input read2 fastq file - #[arg(short = '2', long = "read2", value_name = "str")] + #[arg(short = '2', long = "read2", value_name = "STR")] read2: String, /// if one read number of N base is more then N base limit, then this read pair is discarded. - #[arg(short = 'n', long = "n-limit", default_value_t=5, value_name = "int")] + #[arg(short = 'n', long = "n-limit", default_value_t=5, value_name = "INT")] nbase: usize, /// reads shorter than length required will be discarded - #[arg(short = 'l', long = "length", default_value_t=30, value_name = "int")] + #[arg(short = 'l', long = "length", default_value_t=30, value_name = "INT")] length: usize, /// the complexity is defined as the percentage of base that is different from its next base (base[i] != base[i+1]), ///a 51-bp sequence, with 3 bases that is different from its next base @@ -158,30 +161,30 @@ pub enum Subcli { ///the threshold for low complexity filter (0~100). 30 is recommended, which means 30% complexity is required. #[arg(short = 'y', long = "complexity", default_value_t = 0, value_parser = value_parser!(u32).range(0..=100), - verbatim_doc_comment, value_name = "int" + verbatim_doc_comment, value_name = "INT" )] complexity: u32, /// if one read's average quality score < average qual, then this read pair is discarded, ///eg. Q20 error 0.01, Q30 error 0.001, averaging the probability of error is 0.0055 => Q value 22.59637 - #[arg(short = 'Q', long = "average_qual", default_value_t = 20, verbatim_doc_comment, value_name = "int")] + #[arg(short = 'Q', long = "average_qual", default_value_t = 20, verbatim_doc_comment, value_name = "INT")] average_qual: u8, ///phred score 33 or 64 - #[arg(short = 'p', long = "phred", default_value_t = 33, value_name = "int")] + #[arg(short = 'p', long = "phred", default_value_t = 33, value_name = "INT")] phred: u8, /// the number of reads in the chunk on each thread - #[arg(short, long, default_value_t = 5000, value_name = "int")] + #[arg(short, long, default_value_t = 5000, value_name = "INT")] chunk: usize, /// number of additional worker threads to use - #[arg(short='@', long="thread", default_value_t = 4, value_name = "int")] + #[arg(short='@', long="thread", default_value_t = 4, value_name = "INT")] thread: usize, /// specify the file to store reads(interleaved) that cannot pass the filters, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short='u', long = "failed", value_name = "str")] + #[arg(short='u', long = "failed", value_name = "STR")] failed: String, /// output pass filtered forward(read1) fastq file name, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short='f', long = "out1", value_name = "str")] + #[arg(short='f', long = "out1", value_name = "STR")] out1: String, /// output pass filtered resverse(read2) fastq file name, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short='r', long = "out2", value_name = "str")] + #[arg(short='r', long = "out2", value_name = "STR")] out2: String, }, /// print fastq records in a range @@ -189,13 +192,13 @@ pub enum Subcli { /// input fastq file, or read from stdin input: Option, /// skip first int read records - #[arg(short = 's', long = "skip", default_value_t = 0, value_name = "int")] + #[arg(short = 's', long = "skip", default_value_t = 0, value_name = "INT")] skip: usize, /// take int read records - #[arg(short = 't', long = "take", value_name = "int")] + #[arg(short = 't', long = "take", value_name = "INT")] take: usize, /// fastq output file name or write to stdout, files ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out", value_name = "str")] + #[arg(short = 'o', long = "out", value_name = "STR")] out: Option, }, @@ -205,20 +208,20 @@ pub enum Subcli { input: Option, /// specify pattern/motif, regular expression supported, e.g., -p "ATC{2,}" or -p "ATCCG" ///for multiple motifs, -p "TTAGGG|CCCTAA" - #[arg(short = 'p', long = "pattern",verbatim_doc_comment, value_name = "str")] + #[arg(short = 'p', long = "pattern",verbatim_doc_comment, value_name = "STR")] pat: String, /// if specified, enable case insensitive matching for the entire pattern #[arg(short = 'i', long ="ignore-case", help_heading = Some("FLAGS"))] case: bool, /// the number of reads in the chunk on each thread - #[arg(short, long, default_value_t = 5000, value_name = "int")] + #[arg(short, long, default_value_t = 5000, value_name = "INT")] chunk: usize, /// number of additional worker threads to use - #[arg(short='@', long="thread", default_value_t = 1, value_name = "int")] + #[arg(short='@', long="thread", default_value_t = 1, value_name = "INT")] thread: usize, /// output contain pattern/motif reads result fastq file or write to stdout, ///file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out", verbatim_doc_comment, value_name = "str")] + #[arg(short = 'o', long = "out", verbatim_doc_comment, value_name = "STR")] out: Option, }, /// grep fastq sequence by read id or full name @@ -226,14 +229,14 @@ pub enum Subcli { /// input fastq file, or read from stdin input: Option, /// read name list file, one name per line and without read name prefix "@" - #[arg(short = 'i', long = "id-list", value_name = "str")] + #[arg(short = 'i', long = "id-list", value_name = "STR")] ids: String, /// if specified, match read by full name instead of just id #[arg(short = 'f', long = "full-name", help_heading = Some("FLAGS"))] full: bool, /// output matched reads result in fastq file or write to stdout, ///file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out", verbatim_doc_comment, value_name = "str")] + #[arg(short = 'o', long = "out", verbatim_doc_comment, value_name = "STR")] out: Option, }, /// summary for fastq format file @@ -242,13 +245,13 @@ pub enum Subcli { /// input fastq file, or read from stdin input: Option, ///phred score 33 or 64 - #[arg(short = 'p', long = "phred", default_value_t = 33, value_name = "int")] + #[arg(short = 'p', long = "phred", default_value_t = 33, value_name = "INT")] phred: u8, /// specify a name for summary output file - #[arg(short='s',long="sumy",default_value_t=String::from("summary.txt"), value_name = "str")] + #[arg(short='s',long="sumy",default_value_t=String::from("summary.txt"), value_name = "STR")] sum: String, /// if not specified, cycle result write to stdout - #[arg(short = 'c', long = "cycle", value_name = "str")] + #[arg(short = 'c', long = "cycle", value_name = "STR")] cyc: Option, }, /// shuffle fastq sequences @@ -257,10 +260,10 @@ pub enum Subcli { /// input fastq file, or read from stdin input: Option, /// set rand seed. - #[arg(short = 's', long = "seed", default_value_t = 69, value_name = "int")] + #[arg(short = 's', long = "seed", default_value_t = 69, value_name = "INT")] seed: u64, /// output file name or write to stdout, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out", value_name = "str")] + #[arg(short = 'o', long = "out", value_name = "STR")] out: Option, }, /// report the number sequences and bases @@ -268,13 +271,13 @@ pub enum Subcli { /// input fastq file, or read from stdin input: Option, /// the number of reads in the chunk on each thread - #[arg(short, long, default_value_t = 5000, value_name = "int")] + #[arg(short, long, default_value_t = 5000, value_name = "INT")] chunk: usize, /// number of additional worker threads to use - #[arg(short='@', long="thread", default_value_t = 3, value_name = "int")] + #[arg(short='@', long="thread", default_value_t = 3, value_name = "INT")] thread: usize, /// output file name or write to stdout, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out", value_name = "str")] + #[arg(short = 'o', long = "out", value_name = "STR")] out: Option, }, @@ -283,16 +286,16 @@ pub enum Subcli { /// input fastq file, or read from stdin input: Option, ///set sliding window step size - #[arg(short = 'w', long = "window", default_value_t = 10, value_name = "int")] + #[arg(short = 'w', long = "window", default_value_t = 10, value_name = "INT")] window: usize, ///set sliding window step size - #[arg(short = 's', long = "step", default_value_t = 5, value_name = "int")] + #[arg(short = 's', long = "step", default_value_t = 5, value_name = "INT")] step: usize, /// suffix added to the sequence ID - #[arg(short = 'S', long = "suffidx", default_value_t = String::from("_slide"), value_name = "str")] + #[arg(short = 'S', long = "suffidx", default_value_t = String::from("_slide"), value_name = "STR")] suffix: String, /// output file name or write to stdout, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out", value_name = "str")] + #[arg(short = 'o', long = "out", value_name = "STR")] out: Option, }, /// sort fastq file by name/seq/gc/length @@ -316,31 +319,31 @@ pub enum Subcli { #[arg(short = 'r', long = "reverse", help_heading = Some("FLAGS"))] reverse: bool, /// output file name or write to stdout, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out", value_name = "str")] + #[arg(short = 'o', long = "out", value_name = "STR")] out: Option, }, /// line plot for A T G C N percentage in read position plot { /// input cycle result data: fqkit stats cycle output - #[arg(short = 'd', long = "data", value_name = "str")] + #[arg(short = 'd', long = "data", value_name = "STR")] data: String, /// if specified, show line plot in terminal #[arg( short = 's', long ="show-terminal", help_heading = Some("FLAGS"))] show: bool, /// output base figure prefix name - #[arg(short='p', long="prefix", default_value_t=String::from("base_plot"), value_name = "str")] + #[arg(short='p', long="prefix", default_value_t=String::from("base_plot"), value_name = "STR")] prefix: String, /// set output figure width - #[arg(short = 'W', long = "width", default_value_t = 960, value_name = "int")] + #[arg(short = 'W', long = "width", default_value_t = 960, value_name = "INT")] width: usize, /// set output figure height - #[arg(short = 'H', long = "height", default_value_t = 540, value_name = "int")] + #[arg(short = 'H', long = "height", default_value_t = 540, value_name = "INT")] height: usize, /// set max ylim (0~100) #[arg(short = 'y', long = "ylim", default_value_t = 50.0, value_name = "float")] ylim: f32, /// figure type 'png' or 'svg' - #[arg(short='t', long="types", default_value_t=String::from("png"), value_name = "str")] + #[arg(short='t', long="types", default_value_t=String::from("png"), value_name = "STR")] types: String, }, /// translate fastq to fasta @@ -351,31 +354,31 @@ pub enum Subcli { #[arg(short='r', long="remove", help_heading = Some("FLAGS"))] remove: bool, /// output file name or write to stdout, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out", value_name = "str")] + #[arg(short = 'o', long = "out", value_name = "STR")] out: Option, }, /// converts a fastq file to an unaligned SAM file fq2sam { /// input fastq file - #[arg(short = '1', long = "read1", value_name = "str")] + #[arg(short = '1', long = "read1", value_name = "STR")] r1: String, /// input fastq file for the second read of paired end data - #[arg(short = '2', long = "read2", help_heading = Some("Optional Arguments"), value_name = "str")] + #[arg(short = '2', long = "read2", help_heading = Some("Optional Arguments"), value_name = "STR")] r2: Option, /// sample name to insert into the read group header - #[arg(short = 's', long = "sample-name" ,value_name = "str")] + #[arg(short = 's', long = "sample-name" ,value_name = "STR")] sm: String, /// read group name, default: A - #[arg(short = 'r', long = "read-group-name", help_heading = Some("Optional Arguments") ,value_name = "str")] + #[arg(short = 'r', long = "read-group-name", help_heading = Some("Optional Arguments") ,value_name = "STR")] rg: Option, /// the library name to place into the LB attribute in the read group header - #[arg(short = 'l', long = "library-name", help_heading = Some("Optional Arguments") ,value_name = "str")] + #[arg(short = 'l', long = "library-name", help_heading = Some("Optional Arguments") ,value_name = "STR")] lb: Option, /// the platform type (e.g. ILLUMINA, SOLID) to insert into the read group header - #[arg(short = 'p', long = "platform", help_heading = Some("Optional Arguments") ,value_name = "str")] + #[arg(short = 'p', long = "platform", help_heading = Some("Optional Arguments") ,value_name = "STR")] pl: Option, /// output file name or write to stdout, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out" ,value_name = "str")] + #[arg(short = 'o', long = "out" ,value_name = "STR")] out: Option, }, /// converts the fastq file quality scores @@ -389,7 +392,7 @@ pub enum Subcli { #[arg(long = "to64", help_heading = Some("FLAGS"))] to64: bool, /// output file name or write to stdout, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out" ,value_name = "str")] + #[arg(short = 'o', long = "out" ,value_name = "STR")] out: Option, }, @@ -400,36 +403,37 @@ pub enum Subcli { input: Option, /// filed number, id:1, sequence:2, symbol:4, quality:8 ///eg. output id, sequence and quality value: 1 + 2 + 8 == 11 , - #[arg(short = 'f', long = "field", default_value_t = 3, verbatim_doc_comment ,value_name = "int")] + #[arg(short = 'f', long = "field", default_value_t = 3, verbatim_doc_comment ,value_name = "INT")] flag: u8, /// output seprater, can be ",", ";", #[arg(short = 's', long = "sep", default_value_t='\t' ,value_name = "char")] sep: char, /// output file name or write to stdout, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out" ,value_name = "str")] + #[arg(short = 'o', long = "out" ,value_name = "STR")] out: Option, }, - /// perform demultiplex for pair-end fastq reads + /// perform demultiplex for pair-end fastq reads + #[command(visible_alias = "demux")] barcode { /// input read1 fastq file - #[arg(short = '1', long = "read1" ,value_name = "str")] + #[arg(short = '1', long = "read1" ,value_name = "STR")] read1: String, /// input read2 fastq file - #[arg(short = '2', long = "read2" ,value_name = "str")] + #[arg(short = '2', long = "read2" ,value_name = "STR")] read2: String, /// barcode list file, format eg: ///ATGCAGTG sample1 ///TGCAGTAC sample2 - #[arg(short = 'b', long = "barcode", verbatim_doc_comment ,value_name = "str")] + #[arg(short = 'b', long = "barcode", verbatim_doc_comment ,value_name = "STR")] bar: String, /// barcode position mode, 1:left, 2:right - #[arg(short = 'm', long = "mode", default_value_t = 2 ,value_name = "int")] + #[arg(short = 'm', long = "mode", default_value_t = 2 ,value_name = "INT")] mode: usize, /// barcode reverse complement #[arg(short = 'r', long = "rev_comp", help_heading = Some("FLAGS"))] trans: bool, /// barcode mismatch base count - #[arg(short = 'e', long = "error", default_value_t = 0 ,value_name = "int")] + #[arg(short = 'e', long = "error", default_value_t = 0 ,value_name = "INT")] mismatch: usize, /// if specified, output gzip compressed file #[arg(short = 'z', long = "gzip", help_heading = Some("FLAGS"))] @@ -441,7 +445,7 @@ pub enum Subcli { #[arg(short = 'x', long = "xz", help_heading = Some("FLAGS"))] xz: bool, /// fastq file output dir. - #[arg(short = 'o', long = "outdir", default_value_t = String::from(".") ,value_name = "str")] + #[arg(short = 'o', long = "outdir", default_value_t = String::from(".") ,value_name = "STR")] outdir: String, }, /// check the validity of a fastq record @@ -456,7 +460,7 @@ pub enum Subcli { #[arg(short = 's', long = "save", help_heading = Some("FLAGS"))] save: bool, /// output file name or write to stdout, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out" ,value_name = "str")] + #[arg(short = 'o', long = "out" ,value_name = "STR")] out: Option, }, /// remove reads by read name. @@ -464,14 +468,17 @@ pub enum Subcli { /// input fastq file, or read from stdin input: Option, /// output file name or write to stdout, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out" ,value_name = "str")] + #[arg(short = 'o', long = "out" ,value_name = "STR")] out: Option, /// read name list file, one name per line and without read name prefix "@" - #[arg(short = 'n', long = "name" ,value_name = "str")] + #[arg(short = 'n', long = "name" ,value_name = "STR")] name: String, /// save removed reads in read name list - #[arg(short = 's', long = "save",default_value_t=String::from("rm.fq.gz") ,value_name = "str")] + #[arg(short = 's', long = "save",default_value_t=String::from("rm.fq.gz") ,value_name = "STR")] save: String, + /// if set, do not output removed reads + #[arg(short = 'r', long = "remove")] + rm: bool, }, /// rename sequence id in fastq file rename { @@ -481,10 +488,16 @@ pub enum Subcli { #[arg(short = 'k', long = "keep", help_heading = Some("FLAGS"))] keep: bool, /// set new id prefix for sequence - #[arg(short = 'p', long = "prefix" ,value_name = "str")] + #[arg(short = 'p', long = "prefix" ,value_name = "STR")] prefix: Option, + /// if specified, add a label before/after read id + #[arg(short = 'l', long = "label", value_name = "STR")] + label: Option, + /// if set, label before read id + #[arg(short = 'b', long = "before")] + before: bool, /// output fastq file name, or write to stdout, file name ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out" ,value_name = "str")] + #[arg(short = 'o', long = "out" ,value_name = "STR")] output: Option, }, /// get a reverse-complement of fastq file. @@ -496,31 +509,31 @@ pub enum Subcli { #[arg(short = 'r', long = "reverse", help_heading = Some("FLAGS"))] rev: bool, /// output file name or write to stdout, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out" ,value_name = "str")] + #[arg(short = 'o', long = "out" ,value_name = "STR")] out: Option, }, /// split interleaved fastq file split { /// input fastq file, or read from stdin - #[arg(short = 'i', long = "input" ,value_name = "str")] + #[arg(short = 'i', long = "input" ,value_name = "STR")] input: Option, /// output fastq file prefix name - #[arg(short = 'p', long = "prefix" ,value_name = "str")] + #[arg(short = 'p', long = "prefix" ,value_name = "STR")] pre: String, /// fastq file outdir - #[arg(short = 'o', long = "out", default_value_t = String::from(".") ,value_name = "str")] + #[arg(short = 'o', long = "out", default_value_t = String::from(".") ,value_name = "STR")] out: String, }, /// merge PE reads as interleaved fastq file merge { /// input read1 fastq file. - #[arg(short = '1', long = "read1" ,value_name = "str")] + #[arg(short = '1', long = "read1" ,value_name = "STR")] read1: String, /// input read2 fastq file. - #[arg(short = '2', long = "read2" ,value_name = "str")] + #[arg(short = '2', long = "read2" ,value_name = "STR")] read2: String, /// output interleaved fastq file name, eg. result.fq.bz2 - #[arg(short = 'o', long = "out", default_value_t = String::from("interleaved.fq.gz") ,value_name = "str")] + #[arg(short = 'o', long = "out", default_value_t = String::from("interleaved.fq.gz") ,value_name = "STR")] out: String, }, /// convert any low quality base to 'N' or other chars @@ -528,16 +541,16 @@ pub enum Subcli { /// input fastq file, or read from stdin input: Option, ///phred score 33 or 64 - #[arg(short = 'p', long = "phred", default_value_t = 33 ,value_name = "int")] + #[arg(short = 'p', long = "phred", default_value_t = 33 ,value_name = "INT")] phred: u8, /// low quality - #[arg(short = 'l', long = "low-quality",default_value_t = 5 ,value_name = "int")] + #[arg(short = 'l', long = "low-quality",default_value_t = 5 ,value_name = "INT")] low: u8, /// mask low quality ( <= low quality) base with this char #[arg(short = 'c', long = "char", default_value_t = 'N' ,value_name = "char")] chars: char, /// output file name or write to stdout, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out" ,value_name = "str")] + #[arg(short = 'o', long = "out" ,value_name = "STR")] out: Option, }, /// split fastq file by records number @@ -545,7 +558,7 @@ pub enum Subcli { /// input fastq file, or read from stdin input: Option, /// set record number for each mini fastq file - #[arg(short = 'n', long = "num", default_value_t = 200000 ,value_name = "int")] + #[arg(short = 'n', long = "num", default_value_t = 200000 ,value_name = "INT")] num: usize, /// if specified, output gzip compressed file #[arg(short = 'z', long = "gzip", help_heading = Some("FLAGS"))] @@ -557,7 +570,7 @@ pub enum Subcli { #[arg(short = 'x', long = "xz", help_heading = Some("FLAGS"))] xz: bool, /// output prefix name - #[arg(short = 'p', long = "prefix", default_value_t = String::from("sub") ,value_name = "str")] + #[arg(short = 'p', long = "prefix", default_value_t = String::from("sub") ,value_name = "STR")] name: String, }, /// get GC content result and plot @@ -565,25 +578,25 @@ pub enum Subcli { /// input fastq file, or read from stdin input: Option, /// output GC contnet result file name - #[arg(short = 'o', long = "out" ,value_name = "str")] + #[arg(short = 'o', long = "out" ,value_name = "STR")] output: Option, /// if specified, show histogram graphs in terminal #[arg( short = 's', long ="show-terminal", help_heading = Some("FLAGS"))] show: bool, /// output base figure prefix name - #[arg(short='p', long="prefix", default_value_t=String::from("gc_plot") ,value_name = "str")] + #[arg(short='p', long="prefix", default_value_t=String::from("gc_plot") ,value_name = "STR")] prefix: String, /// set output figure width - #[arg(short = 'W', long = "width", default_value_t = 960 ,value_name = "int")] + #[arg(short = 'W', long = "width", default_value_t = 960 ,value_name = "INT")] width: usize, /// set output figure height - #[arg(short = 'H', long = "height", default_value_t = 540 ,value_name = "int")] + #[arg(short = 'H', long = "height", default_value_t = 540 ,value_name = "INT")] height: usize, /// set max ylim (0~100) - #[arg(short = 'y', long = "ylim", default_value_t = 15 ,value_name = "int")] + #[arg(short = 'y', long = "ylim", default_value_t = 15 ,value_name = "INT")] ylim: usize, /// figure type 'png' or 'svg' - #[arg(short='t', long="types", default_value_t=String::from("png") ,value_name = "str")] + #[arg(short='t', long="types", default_value_t=String::from("png") ,value_name = "STR")] types: String, }, /// get reads length count @@ -592,7 +605,7 @@ pub enum Subcli { /// input fastq file, or read from stdin input: Option, /// output file name or write to stdout, file ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out" ,value_name = "str")] + #[arg(short = 'o', long = "out" ,value_name = "STR")] out: Option, }, /// view fastq file page by page @@ -600,7 +613,7 @@ pub enum Subcli { /// input fastq file input: Option, /// output reads page by page, file name ending in .gz/.bz2/.xz will be compressed automatically - #[arg(short = 'o', long = "out" ,value_name = "str")] + #[arg(short = 'o', long = "out" ,value_name = "STR")] out: Option, } } diff --git a/src/main.rs b/src/main.rs index 2ef8065..3938243 100644 --- a/src/main.rs +++ b/src/main.rs @@ -144,18 +144,18 @@ fn main() -> Result<(), Error> { Subcli::select { read1, read2, out1, out2 } => { select_pe_fastq(&read1, &read2, &out1, &out2, arg.compression_level)?; } - Subcli::trim { input, left, right, out } => { + Subcli::trim { input, left, right, len, out } => { if let Some(input) = input { if let Some(out) = out { - trim_fq(&Some(input.as_str()), left, right, &Some(out.as_str()), arg.compression_level)?; + trim_fq(&Some(input.as_str()), left, right, len, &Some(out.as_str()), arg.compression_level)?; } else { - trim_fq(&Some(input.as_str()), left, right, &None, arg.compression_level)?; + trim_fq(&Some(input.as_str()), left, right, len, &None, arg.compression_level)?; } } else { if let Some(out) = out { - trim_fq(&None, left, right, &Some(out.as_str()), arg.compression_level)?; + trim_fq(&None, left, right, len, &Some(out.as_str()), arg.compression_level)?; } else { - trim_fq(&None, left, right, &None, arg.compression_level)?; + trim_fq(&None, left, right, len, &None, arg.compression_level)?; } } } @@ -367,33 +367,35 @@ fn main() -> Result<(), Error> { Subcli::concat { read1, read2, out1, out2 } => { concat_fqstq_lane(&read1, &read2, &out1, &out2, arg.compression_level)?; } - Subcli::remove { input, out, name , save} => { + Subcli::remove { input, out, name , save, rm} => { if let Some(input) = input { if let Some(out) = out { - remove_read(&Some(&input), &Some(&out) ,&name, &save, arg.compression_level)?; + remove_read(&Some(&input), &Some(&out) ,&name, &save, rm, arg.compression_level)?; } else { - remove_read(&Some(&input), &None ,&name, &save, arg.compression_level)?; + remove_read(&Some(&input), &None ,&name, &save, rm, arg.compression_level)?; } } else { if let Some(out) = out { - remove_read(&None, &Some(&out) ,&name, &save, arg.compression_level)?; + remove_read(&None, &Some(&out) ,&name, &save, rm, arg.compression_level)?; } else { - remove_read(&None, &None ,&name, &save, arg.compression_level)?; + remove_read(&None, &None ,&name, &save, rm, arg.compression_level)?; } } } - Subcli::rename { input, keep, prefix, output } => { + Subcli::rename { input, keep, prefix, label, before, output } => { + let x = label.unwrap_or_default(); + let label = Some(x.as_str()); if let Some(input) =input { if let Some(output) = output { - rename_fastq(&Some(&input), keep, prefix, &Some(&output), arg.compression_level)?; + rename_fastq(&Some(&input), keep, prefix, label, before, &Some(&output), arg.compression_level)?; } else { - rename_fastq(&Some(&input), keep, prefix, &None, arg.compression_level)?; + rename_fastq(&Some(&input), keep, prefix, label, before, &None, arg.compression_level)?; } } else { if let Some(output) = output { - rename_fastq(&None, keep, prefix, &Some(&output), arg.compression_level)?; + rename_fastq(&None, keep, prefix, label, before, &Some(&output), arg.compression_level)?; } else { - rename_fastq(&None, keep, prefix, &None, arg.compression_level)?; + rename_fastq(&None, keep, prefix, label, before, &None, arg.compression_level)?; } } } diff --git a/src/remove.rs b/src/remove.rs index 20fd20b..a462cb4 100644 --- a/src/remove.rs +++ b/src/remove.rs @@ -10,6 +10,7 @@ pub fn remove_read( out: &Option<&str>, name: &str, save: &str, + rm: bool, compression_level: u32, ) -> Result<(),Error> { if let Some(file) = file { @@ -18,7 +19,9 @@ pub fn remove_read( info!("reading reads from stdin"); } info!("reading reads id form file: {}", name); - info!("removed reads in file: {}", save); + if !rm { + info!("removed reads in file: {}", save); + } let start = Instant::now(); let mut ids = vec![]; @@ -34,16 +37,26 @@ pub fn remove_read( let fq_reader = fastq::Reader::new(file_reader(file)?); let mut fq_writer = fastq::Writer::new(file_writer(out, compression_level)?); - let mut rm_writer = fastq::Writer::new(file_writer(&Some(&save), compression_level)?); - for rec in fq_reader.records().flatten() { - if !ids.contains(&rec.id().to_string()) { - fq_writer.write(rec.id(), rec.desc(), rec.seq(), rec.qual())?; - } else { - rm_writer.write(rec.id(), rec.desc(), rec.seq(), rec.qual())?; - } + + if rm { + for rec in fq_reader.records().flatten() { + if !ids.contains(&rec.id().to_string()) { + fq_writer.write(rec.id(), rec.desc(), rec.seq(), rec.qual())?; + } + } + fq_writer.flush()?; + } else { + let mut rm_writer = fastq::Writer::new(file_writer(&Some(&save), compression_level)?); + for rec in fq_reader.records().flatten() { + if !ids.contains(&rec.id().to_string()) { + fq_writer.write(rec.id(), rec.desc(), rec.seq(), rec.qual())?; + } else { + rm_writer.write(rec.id(), rec.desc(), rec.seq(), rec.qual())?; + } + } + fq_writer.flush()?; + rm_writer.flush()?; } - fq_writer.flush()?; - rm_writer.flush()?; info!("time elapsed is: {:?}",start.elapsed()); Ok(()) diff --git a/src/rename.rs b/src/rename.rs index fec0fd1..cff02a9 100644 --- a/src/rename.rs +++ b/src/rename.rs @@ -8,7 +8,9 @@ use crate::utils::*; pub fn rename_fastq( input: &Option<&str>, keep: bool, - prefix: Option, //&str, + prefix: Option, + label: Option<&str>, + before: bool, output: &Option<&str>, compression_level: u32, ) -> Result<()> { @@ -18,7 +20,7 @@ pub fn rename_fastq( } else { info!("reading from stdin"); } - + let fp = fastq::Reader::new(file_reader(input)?); let mut fo = fastq::Writer::new(file_writer(output, compression_level)?); let mut n: usize = 0; @@ -26,7 +28,22 @@ pub fn rename_fastq( if let Some(pre) = prefix { for rec in fp.records().flatten() { n += 1; - let newid = format!("{}{}",pre,n); + /*let newid = match label { + Some(x) => { + if before { + format!("{}{}{}",x,pre,n) + } else { + format!("{}{}{}",pre,x,n) + } + }, + None => { format!("{}{}",pre,n) } + };*/ + let newid = if before { + format!("{}{}{}",label.unwrap_or_default(),pre,n) + } else { + format!("{}{}{}",pre,label.unwrap_or_default(),n) + }; + let record = if keep { Record::with_attrs(&newid, rec.desc(), rec.seq(),rec.qual()) } else { @@ -38,10 +55,24 @@ pub fn rename_fastq( } else { for rec in fp.records().flatten() { n += 1; + /*let newid = if let Some(x) = label { + if before { + format!("{}{}",x,rec.id()) + } else { + format!("{}{}",rec.id(),x) + } + } else { + format!("{}",rec.id()) + };*/ + let newid = if before { + format!("{}{}",label.unwrap_or_default(),rec.id()) + } else { + format!("{}{}",rec.id(),label.unwrap_or_default()) + }; let record = if keep { - Record::with_attrs(rec.id(), rec.desc(), rec.seq(),rec.qual()) + Record::with_attrs(&newid, rec.desc(), rec.seq(),rec.qual()) } else { - Record::with_attrs(rec.id(), None, rec.seq(),rec.qual()) + Record::with_attrs(&newid, None, rec.seq(),rec.qual()) }; fo.write_record(&record)?; } diff --git a/src/trimfq.rs b/src/trimfq.rs index b8b9070..8fd2190 100644 --- a/src/trimfq.rs +++ b/src/trimfq.rs @@ -9,6 +9,7 @@ pub fn trim_fq( file: &Option<&str>, left: usize, right: usize, + len: usize, out: &Option<&str>, compression_level: u32, ) -> Result<()> { @@ -22,15 +23,20 @@ pub fn trim_fq( let length = right + left; let fq_reader = fastq::Reader::new(file_reader(file)?); let mut fq_writer = fastq::Writer::new(file_writer(out, compression_level)?); + for (idx,rec) in fq_reader.records().flatten().enumerate() { let rlen = rec.seq().len(); if left >= rlen || right>= rlen || length >= rlen { warn!("read: {} in order {} is short than {} , skip", rec.id(), idx+1, length); continue; } + let end = rlen - right; let seq = &rec.seq()[left..end]; let qual = &rec.qual()[left..end]; + if seq.len() < len { + continue; + } fq_writer.write(rec.id(), rec.desc(), seq, qual)?; } fq_writer.flush()?;