From fb57d954b0f02cad71225324ba54d677a1396b7b Mon Sep 17 00:00:00 2001
From: Shintaro Katayama <shintaro.katayama@gmail.com>
Date: Sun, 8 Jan 2017 19:05:24 +0100
Subject: [PATCH] update strt:alignment with refactoring of
 Bio::Gadget#grep_command

---
 lib/bio/gadget.rb                            |  7 +-
 lib/bio/gadget/fq1l.rb                       |  4 +-
 lib/bio/gadget/strt.rb                       | 89 ++++++++++++--------
 lib/bio/gadget/strt/prepare_transcriptome.rb |  6 +-
 4 files changed, 64 insertions(+), 42 deletions(-)

diff --git a/lib/bio/gadget.rb b/lib/bio/gadget.rb
index 5296378..6a06a37 100644
--- a/lib/bio/gadget.rb
+++ b/lib/bio/gadget.rb
@@ -110,7 +110,7 @@ def get_fifo(prefix, suffix, cleanup=true)
         fifo
       end
       
-      def grep_command(options)
+      def grep_command
         "#{options.grep_prefix}grep"
       end
 
@@ -136,6 +136,11 @@ def pipeline(*cmds)
       def sort_command(options)
         "#{options.coreutils_prefix}sort#{buffer_size_option}#{options.key?(:parallel) ? ' --parallel='+options.parallel.to_s : ''} --compress-program=pigz"
       end
+
+      def sh(cmd)
+        system cmd
+        raise "Fail at process #{$?.pid}; #{$?}; #{cmd}" unless $?.success?
+      end
       
       def tee_command(options)
         "#{options.coreutils_prefix}tee"
diff --git a/lib/bio/gadget/fq1l.rb b/lib/bio/gadget/fq1l.rb
index d68c956..cd7f2ba 100644
--- a/lib/bio/gadget/fq1l.rb
+++ b/lib/bio/gadget/fq1l.rb
@@ -178,7 +178,7 @@ def exclude_duplicate
       def match_3end(pattern)
         exit unless STDIN.wait
         # PCRE was faster than BRE and ERE in GNU grep 2.25
-        system "#{grep_command(options)}#{options.invert_match ? ' -v' : ''} -P -e '^[^\\t]+\\t[^\\t]*#{pattern}\\t'"
+        system "#{grep_command}#{options.invert_match ? ' -v' : ''} -P -e '^[^\\t]+\\t[^\\t]*#{pattern}\\t'"
         exit $?.to_i == 0 || $?.to_i == 1 ? 0 : $?.to_i
       end
       
@@ -192,7 +192,7 @@ def match_3end(pattern)
       def match_5end(pattern)
         exit unless STDIN.wait
         # PCRE was faster than BRE and ERE in GNU grep 2.25
-        system "#{grep_command(options)} #{options.invert_match ? '-v' : ''} -P -e '^[^\\t]+\\t#{pattern}'"
+        system "#{grep_command}#{options.invert_match ? ' -v' : ''} -P -e '^[^\\t]+\\t#{pattern}'"
         exit $?.to_i == 0 || $?.to_i == 1 ? 0 : $?.to_i
       end
 
diff --git a/lib/bio/gadget/strt.rb b/lib/bio/gadget/strt.rb
index b204100..b501e82 100644
--- a/lib/bio/gadget/strt.rb
+++ b/lib/bio/gadget/strt.rb
@@ -19,35 +19,35 @@ class Strt < Bio::Gadget
 
       # strt:alignment
 
-      desc 'alignment INDEX FQGZDIR BAMDIR BEDDIR', 'Align reads to reference'
+      desc 'alignment REFDIR SEQDIR MAPDIR', 'Align reads to reference'
       long_desc <<-DESC
-Align STRT reads (*.fq.gz files at FQGZDIR) to a reference (INDEX/ref.*.ht2). The alignments will be at BAMDIR/*.bam, and per-base 5'-end counts will be at BEDDIR/*.bed.gz.
+Align STRT reads (*.fq.gz files at SEQDIR) to a reference (REFDIR/ref.*.ht2). The alignments will be at MAPDIR/*.bam, and per-base 5'-end counts will be at MAPDIR/*.bed.gz.
 DESC
 
       method_option *OPT_BUFFER_SIZE
       method_option *OPT_COREUTILS_PREFIX
+      method_option *OPT_GREP_PREFIX
       method_option *OPT_PARALLEL
 
-      def alignment(index0, indir0, bamdir0, beddir0)
-        
-        index = File.expand_path(index0)
-        bamdir = File.expand_path(bamdir0)
-        beddir = File.expand_path(beddir0)
+      def alignment(refdir, seqdir, mapdir)
         
-        Dir.glob("#{File.expand_path(indir0)}/*.fq.gz").each do |fqgz|
-          
-          STDERR.puts "#{`date`.strip}: Align #{fqgz}..."
-
-          tmpbam = get_temporary_path('strt.alignment', 'bam')
+        Dir.glob("#{File.expand_path(seqdir)}/*.fq.gz").each do |fqgz|
           base = File.basename(fqgz, '.fq.gz')
-          bam = "#{bamdir}/#{base}.bam"
-          
-          pipeline("hisat2 --rna-strandness F --dta-cufflinks -p #{options.parallel} -x #{index}/ref -U #{fqgz}",
-                   "samtools view -S -b -@ #{options.parallel} - > #{tmpbam}")
-          system "samtools sort -f -@ #{options.parallel} #{tmpbam} #{bam}" or exit $?.exitstatus
-          system "samtools index #{bam}" or exit $?.exitstatus
-          pipeline("strt count_per_base#{buffer_size_option}#{coreutils_prefix_option(options)}#{parallel_option(options)}#{bam}",
-                   "pigz -c > #{beddir}/#{base}.bed.gz")
+          STDERR.puts "#{`date`.strip}: Align #{base}..."
+          bam = "#{mapdir}/#{base}.bam"
+          pipeline(
+            "hisat2 --rna-strandness F --dta-cufflinks -p #{options.parallel} -x #{refdir}/ref -U #{fqgz}",
+            "#{grep_command} -v -E 'NH:i:([2-9][0-9]*|1[0-9]+)'",
+            "samtools sort -@ #{options.parallel} -o #{bam}")
+          sh "samtools index #{bam}"
+        end
+
+        STDERR.puts "#{`date`.strip}: Count from all alignments."
+        Parallel.map(Dir.glob("#{File.expand_path(mapdir)}/*.bam"),
+                     in_threads: options.parallel) do |bam|
+          pipeline(
+            "strt count_per_base#{buffer_size_option}#{coreutils_prefix_option(options)}#{parallel_option(options)} #{bam}",
+            "pigz -c > #{mapdir}/#{File.basename(bam, '.bam')}.bed.gz")
         end
         
       end
@@ -124,7 +124,7 @@ def call_allele(csv, bamdir, refdir)
 
       # strt:check_samples
 
-      desc 'check_sampels CSV SEQDIR BAMDIR BEDDIR REFDIR', 'Check samples'
+      desc 'check_samples CSV SEQDIR BAMDIR BEDDIR REFDIR', 'Check samples'
 long_desc <<-DESC
 Check samples in a design CSV by counting.'
 DESC
@@ -176,12 +176,6 @@ def check_samples(csv, seqdir, bamdir, beddir, refdir)
                             *count_commands).to_i
         end
 
-        tmp = Array.new
-        samples.each do |row|
-          tmp << (row["MAPPED_READS"] - row["RIBOSOME_READS"] - row["SPIKEIN_READS"]) / row["SPIKEIN_READS"].to_f
-        end
-        samples["RELATIVE_POLYA_RNAS"] = tmp
-
         samples["SPIKEIN_5END_READS"] =
           Parallel.map(bases, in_threads: options.parallel) do |base|
           pipeline_readline("bedtools intersect -nonamecheck -u -s -a #{beddir}/#{base}.bed.gz -b #{refdir}/spikein_5end.bed.gz",
@@ -194,6 +188,12 @@ def check_samples(csv, seqdir, bamdir, beddir, refdir)
         end
         samples["SPIKEIN_5END_RATE"] = tmp
 
+        tmp = Array.new
+        samples.each do |row|
+          tmp << (row["MAPPED_READS"] - row["RIBOSOME_READS"] - row["SPIKEIN_READS"]) / row["SPIKEIN_5END_READS"].to_f
+        end
+        samples["RELATIVE_POLYA_RNAS"] = tmp
+
         samples["CODING_READS"] =
           Parallel.map(bases, in_threads: options.parallel) do |base|
           pipeline_readline("bedtools intersect -nonamecheck -u -s -a #{beddir}/#{base}.bed.gz -b #{refdir}/transcriptome.coding_whole.bed.gz",
@@ -214,7 +214,7 @@ def check_samples(csv, seqdir, bamdir, beddir, refdir)
 
         tmp = Array.new
         samples.each do |row|
-          tmp << row["CODING_5END_READS"].to_f / row["SPIKEIN_READS"]
+          tmp << row["CODING_5END_READS"].to_f / row["SPIKEIN_5END_READS"]
         end
         samples["RELATIVE_MRNAS"] = tmp
 
@@ -235,14 +235,31 @@ def check_samples(csv, seqdir, bamdir, beddir, refdir)
 
       def count_per_base(bam)
 
-        pipeline("samtools view -h #{bam}",
-                 "grep -v -E 'NH:i:([2-9][0-9]*|1[0-9]+)'",
-                 "samtools view -S -u -b -",
-                 "bedtools bamtobed -i stdin",
-                 "ruby -F'\t' -anle 'puts [$F[0], $F[5]==\"+\" ? $F[1] : $F[2].to_i-1, $F[5]==\"+\" ? $F[1].to_i+1 : $F[2], $F[5]].join(\"\t\")'",
-                 "#{sort_command(options)} -t '\t' -k 1,1 -k 2,2n",
-                 "#{uniq_command(options)} -c",
-                 "ruby -anle 'puts ($F[1..3]+[\"#{File.basename(bam, '.bam')}\", $F[0], $F[4]]).join(\"\t\")'")
+        pipeline(
+          "bedtools bamtobed -i #{bam}",
+          "ruby -F'\t' -anle 'puts [$F[0], $F[5]==\"+\" ? $F[1] : $F[2].to_i-1, $F[5]==\"+\" ? $F[1].to_i+1 : $F[2], $F[5]].join(\"\t\")'",
+          "#{sort_command(options)} -t '\t' -k 1,1 -k 2,2n",
+          "#{uniq_command(options)} -c",
+          "ruby -anle 'puts ($F[1..3]+[\"#{File.basename(bam, '.bam')}\", $F[0], $F[4]]).join(\"\t\")'")
+        
+      end
+
+      # strt:count_per_region
+
+      desc 'count_per_region COUNT REGION [REGION ...]',
+           'Count reads per region'
+      long_desc <<-DESC
+Count reads per region. Read counts, which are summerized by a BED-format COUNT, within regions, which are defined by a BED-format REGION, are summed by the region names.
+DESC
+      
+      method_option *OPT_COREUTILS_PREFIX
+
+      def count_per_region(count, region0, *regions0)
+
+        pipeline(
+          "bedtools intersect -nonamecheck -s -wa -wb -a #{count} -b #{([region0]+regions0).join(' ')}",
+          "#{cut_command(options)} -f 5,11",
+          "ruby -F'\t' -e 'n2c={}; while gets; c,n=$_.strip.split /\\t/; n2c[n]=(n2c.key?(n) ? n2c[n] : 0)+c.to_i; end; puts \"name,count\"; n2c.each {|n,c| puts \"\#{n},\#{c}\"}'")
         
       end
       
diff --git a/lib/bio/gadget/strt/prepare_transcriptome.rb b/lib/bio/gadget/strt/prepare_transcriptome.rb
index 254a89c..8c930a5 100644
--- a/lib/bio/gadget/strt/prepare_transcriptome.rb
+++ b/lib/bio/gadget/strt/prepare_transcriptome.rb
@@ -66,7 +66,7 @@ def hg38(dir0)
           regex_exon_number = /exon_number (\d+)/
           Open3.pipeline_r(
             "unpigz -c #{gtf}",
-            "#{grep_command(options)} '\tstart_codon\t'") do |fp, threads|
+            "#{grep_command} '\tstart_codon\t'") do |fp, threads|
             fp.each do |line|
               cols = line.rstrip.split /\t/
               atgs[regex_transcript_id.match(cols[8]).to_a[1]] =
@@ -89,7 +89,7 @@ def hg38(dir0)
           fp_other_promoter = open_bed_w("#{dir}/transcriptome.other_promoter.bed")
           Open3.pipeline_r(
             "unpigz -c #{gtf}",
-            "#{grep_command(options)} -E '\t(exon|transcript)\t'") do |fp, threads|
+            "#{grep_command} -E '\t(exon|transcript)\t'") do |fp, threads|
             fp.each do |line|
               cols = line.rstrip.split /\t/
               ann = cols[8]
@@ -166,7 +166,7 @@ def hg38(dir0)
 
           genePred = "#{dir}/hg38_refGene.txt"
           pipelin("unpigz -c #{gtf}",
-                  "#{grep_command(options)} 'tag \"basic\"'",
+                  "#{grep_command} 'tag \"basic\"'",
                   "gtfToGenePred -geneNameAsName2 -genePredExt stdin #{genePred}")
           system "retrieve_seq_from_fasta.pl --format refGene --seqfile #{dir}/ref.fa --outfile #{dir}/hg38_refGeneMrna.fa #{genePred}" or exit $?.exitstatus