From fb57d954b0f02cad71225324ba54d677a1396b7b Mon Sep 17 00:00:00 2001 From: Shintaro Katayama Date: Sun, 8 Jan 2017 19:05:24 +0100 Subject: [PATCH] update strt:alignment with refactoring of Bio::Gadget#grep_command --- lib/bio/gadget.rb | 7 +- lib/bio/gadget/fq1l.rb | 4 +- lib/bio/gadget/strt.rb | 89 ++++++++++++-------- lib/bio/gadget/strt/prepare_transcriptome.rb | 6 +- 4 files changed, 64 insertions(+), 42 deletions(-) diff --git a/lib/bio/gadget.rb b/lib/bio/gadget.rb index 5296378..6a06a37 100644 --- a/lib/bio/gadget.rb +++ b/lib/bio/gadget.rb @@ -110,7 +110,7 @@ def get_fifo(prefix, suffix, cleanup=true) fifo end - def grep_command(options) + def grep_command "#{options.grep_prefix}grep" end @@ -136,6 +136,11 @@ def pipeline(*cmds) def sort_command(options) "#{options.coreutils_prefix}sort#{buffer_size_option}#{options.key?(:parallel) ? ' --parallel='+options.parallel.to_s : ''} --compress-program=pigz" end + + def sh(cmd) + system cmd + raise "Fail at process #{$?.pid}; #{$?}; #{cmd}" unless $?.success? + end def tee_command(options) "#{options.coreutils_prefix}tee" diff --git a/lib/bio/gadget/fq1l.rb b/lib/bio/gadget/fq1l.rb index d68c956..cd7f2ba 100644 --- a/lib/bio/gadget/fq1l.rb +++ b/lib/bio/gadget/fq1l.rb @@ -178,7 +178,7 @@ def exclude_duplicate def match_3end(pattern) exit unless STDIN.wait # PCRE was faster than BRE and ERE in GNU grep 2.25 - system "#{grep_command(options)}#{options.invert_match ? ' -v' : ''} -P -e '^[^\\t]+\\t[^\\t]*#{pattern}\\t'" + system "#{grep_command}#{options.invert_match ? ' -v' : ''} -P -e '^[^\\t]+\\t[^\\t]*#{pattern}\\t'" exit $?.to_i == 0 || $?.to_i == 1 ? 0 : $?.to_i end @@ -192,7 +192,7 @@ def match_3end(pattern) def match_5end(pattern) exit unless STDIN.wait # PCRE was faster than BRE and ERE in GNU grep 2.25 - system "#{grep_command(options)} #{options.invert_match ? '-v' : ''} -P -e '^[^\\t]+\\t#{pattern}'" + system "#{grep_command}#{options.invert_match ? ' -v' : ''} -P -e '^[^\\t]+\\t#{pattern}'" exit $?.to_i == 0 || $?.to_i == 1 ? 0 : $?.to_i end diff --git a/lib/bio/gadget/strt.rb b/lib/bio/gadget/strt.rb index b204100..b501e82 100644 --- a/lib/bio/gadget/strt.rb +++ b/lib/bio/gadget/strt.rb @@ -19,35 +19,35 @@ class Strt < Bio::Gadget # strt:alignment - desc 'alignment INDEX FQGZDIR BAMDIR BEDDIR', 'Align reads to reference' + desc 'alignment REFDIR SEQDIR MAPDIR', 'Align reads to reference' long_desc <<-DESC -Align STRT reads (*.fq.gz files at FQGZDIR) to a reference (INDEX/ref.*.ht2). The alignments will be at BAMDIR/*.bam, and per-base 5'-end counts will be at BEDDIR/*.bed.gz. +Align STRT reads (*.fq.gz files at SEQDIR) to a reference (REFDIR/ref.*.ht2). The alignments will be at MAPDIR/*.bam, and per-base 5'-end counts will be at MAPDIR/*.bed.gz. DESC method_option *OPT_BUFFER_SIZE method_option *OPT_COREUTILS_PREFIX + method_option *OPT_GREP_PREFIX method_option *OPT_PARALLEL - def alignment(index0, indir0, bamdir0, beddir0) - - index = File.expand_path(index0) - bamdir = File.expand_path(bamdir0) - beddir = File.expand_path(beddir0) + def alignment(refdir, seqdir, mapdir) - Dir.glob("#{File.expand_path(indir0)}/*.fq.gz").each do |fqgz| - - STDERR.puts "#{`date`.strip}: Align #{fqgz}..." - - tmpbam = get_temporary_path('strt.alignment', 'bam') + Dir.glob("#{File.expand_path(seqdir)}/*.fq.gz").each do |fqgz| base = File.basename(fqgz, '.fq.gz') - bam = "#{bamdir}/#{base}.bam" - - pipeline("hisat2 --rna-strandness F --dta-cufflinks -p #{options.parallel} -x #{index}/ref -U #{fqgz}", - "samtools view -S -b -@ #{options.parallel} - > #{tmpbam}") - system "samtools sort -f -@ #{options.parallel} #{tmpbam} #{bam}" or exit $?.exitstatus - system "samtools index #{bam}" or exit $?.exitstatus - pipeline("strt count_per_base#{buffer_size_option}#{coreutils_prefix_option(options)}#{parallel_option(options)}#{bam}", - "pigz -c > #{beddir}/#{base}.bed.gz") + STDERR.puts "#{`date`.strip}: Align #{base}..." + bam = "#{mapdir}/#{base}.bam" + pipeline( + "hisat2 --rna-strandness F --dta-cufflinks -p #{options.parallel} -x #{refdir}/ref -U #{fqgz}", + "#{grep_command} -v -E 'NH:i:([2-9][0-9]*|1[0-9]+)'", + "samtools sort -@ #{options.parallel} -o #{bam}") + sh "samtools index #{bam}" + end + + STDERR.puts "#{`date`.strip}: Count from all alignments." + Parallel.map(Dir.glob("#{File.expand_path(mapdir)}/*.bam"), + in_threads: options.parallel) do |bam| + pipeline( + "strt count_per_base#{buffer_size_option}#{coreutils_prefix_option(options)}#{parallel_option(options)} #{bam}", + "pigz -c > #{mapdir}/#{File.basename(bam, '.bam')}.bed.gz") end end @@ -124,7 +124,7 @@ def call_allele(csv, bamdir, refdir) # strt:check_samples - desc 'check_sampels CSV SEQDIR BAMDIR BEDDIR REFDIR', 'Check samples' + desc 'check_samples CSV SEQDIR BAMDIR BEDDIR REFDIR', 'Check samples' long_desc <<-DESC Check samples in a design CSV by counting.' DESC @@ -176,12 +176,6 @@ def check_samples(csv, seqdir, bamdir, beddir, refdir) *count_commands).to_i end - tmp = Array.new - samples.each do |row| - tmp << (row["MAPPED_READS"] - row["RIBOSOME_READS"] - row["SPIKEIN_READS"]) / row["SPIKEIN_READS"].to_f - end - samples["RELATIVE_POLYA_RNAS"] = tmp - samples["SPIKEIN_5END_READS"] = Parallel.map(bases, in_threads: options.parallel) do |base| pipeline_readline("bedtools intersect -nonamecheck -u -s -a #{beddir}/#{base}.bed.gz -b #{refdir}/spikein_5end.bed.gz", @@ -194,6 +188,12 @@ def check_samples(csv, seqdir, bamdir, beddir, refdir) end samples["SPIKEIN_5END_RATE"] = tmp + tmp = Array.new + samples.each do |row| + tmp << (row["MAPPED_READS"] - row["RIBOSOME_READS"] - row["SPIKEIN_READS"]) / row["SPIKEIN_5END_READS"].to_f + end + samples["RELATIVE_POLYA_RNAS"] = tmp + samples["CODING_READS"] = Parallel.map(bases, in_threads: options.parallel) do |base| pipeline_readline("bedtools intersect -nonamecheck -u -s -a #{beddir}/#{base}.bed.gz -b #{refdir}/transcriptome.coding_whole.bed.gz", @@ -214,7 +214,7 @@ def check_samples(csv, seqdir, bamdir, beddir, refdir) tmp = Array.new samples.each do |row| - tmp << row["CODING_5END_READS"].to_f / row["SPIKEIN_READS"] + tmp << row["CODING_5END_READS"].to_f / row["SPIKEIN_5END_READS"] end samples["RELATIVE_MRNAS"] = tmp @@ -235,14 +235,31 @@ def check_samples(csv, seqdir, bamdir, beddir, refdir) def count_per_base(bam) - pipeline("samtools view -h #{bam}", - "grep -v -E 'NH:i:([2-9][0-9]*|1[0-9]+)'", - "samtools view -S -u -b -", - "bedtools bamtobed -i stdin", - "ruby -F'\t' -anle 'puts [$F[0], $F[5]==\"+\" ? $F[1] : $F[2].to_i-1, $F[5]==\"+\" ? $F[1].to_i+1 : $F[2], $F[5]].join(\"\t\")'", - "#{sort_command(options)} -t '\t' -k 1,1 -k 2,2n", - "#{uniq_command(options)} -c", - "ruby -anle 'puts ($F[1..3]+[\"#{File.basename(bam, '.bam')}\", $F[0], $F[4]]).join(\"\t\")'") + pipeline( + "bedtools bamtobed -i #{bam}", + "ruby -F'\t' -anle 'puts [$F[0], $F[5]==\"+\" ? $F[1] : $F[2].to_i-1, $F[5]==\"+\" ? $F[1].to_i+1 : $F[2], $F[5]].join(\"\t\")'", + "#{sort_command(options)} -t '\t' -k 1,1 -k 2,2n", + "#{uniq_command(options)} -c", + "ruby -anle 'puts ($F[1..3]+[\"#{File.basename(bam, '.bam')}\", $F[0], $F[4]]).join(\"\t\")'") + + end + + # strt:count_per_region + + desc 'count_per_region COUNT REGION [REGION ...]', + 'Count reads per region' + long_desc <<-DESC +Count reads per region. Read counts, which are summerized by a BED-format COUNT, within regions, which are defined by a BED-format REGION, are summed by the region names. +DESC + + method_option *OPT_COREUTILS_PREFIX + + def count_per_region(count, region0, *regions0) + + pipeline( + "bedtools intersect -nonamecheck -s -wa -wb -a #{count} -b #{([region0]+regions0).join(' ')}", + "#{cut_command(options)} -f 5,11", + "ruby -F'\t' -e 'n2c={}; while gets; c,n=$_.strip.split /\\t/; n2c[n]=(n2c.key?(n) ? n2c[n] : 0)+c.to_i; end; puts \"name,count\"; n2c.each {|n,c| puts \"\#{n},\#{c}\"}'") end diff --git a/lib/bio/gadget/strt/prepare_transcriptome.rb b/lib/bio/gadget/strt/prepare_transcriptome.rb index 254a89c..8c930a5 100644 --- a/lib/bio/gadget/strt/prepare_transcriptome.rb +++ b/lib/bio/gadget/strt/prepare_transcriptome.rb @@ -66,7 +66,7 @@ def hg38(dir0) regex_exon_number = /exon_number (\d+)/ Open3.pipeline_r( "unpigz -c #{gtf}", - "#{grep_command(options)} '\tstart_codon\t'") do |fp, threads| + "#{grep_command} '\tstart_codon\t'") do |fp, threads| fp.each do |line| cols = line.rstrip.split /\t/ atgs[regex_transcript_id.match(cols[8]).to_a[1]] = @@ -89,7 +89,7 @@ def hg38(dir0) fp_other_promoter = open_bed_w("#{dir}/transcriptome.other_promoter.bed") Open3.pipeline_r( "unpigz -c #{gtf}", - "#{grep_command(options)} -E '\t(exon|transcript)\t'") do |fp, threads| + "#{grep_command} -E '\t(exon|transcript)\t'") do |fp, threads| fp.each do |line| cols = line.rstrip.split /\t/ ann = cols[8] @@ -166,7 +166,7 @@ def hg38(dir0) genePred = "#{dir}/hg38_refGene.txt" pipelin("unpigz -c #{gtf}", - "#{grep_command(options)} 'tag \"basic\"'", + "#{grep_command} 'tag \"basic\"'", "gtfToGenePred -geneNameAsName2 -genePredExt stdin #{genePred}") system "retrieve_seq_from_fasta.pl --format refGene --seqfile #{dir}/ref.fa --outfile #{dir}/hg38_refGeneMrna.fa #{genePred}" or exit $?.exitstatus