Skip to content

Commit

Permalink
update strt:alignment with refactoring of Bio::Gadget#grep_command
Browse files Browse the repository at this point in the history
  • Loading branch information
shka committed Jan 8, 2017
1 parent 6c82d0f commit fb57d95
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 42 deletions.
7 changes: 6 additions & 1 deletion lib/bio/gadget.rb
Expand Up @@ -110,7 +110,7 @@ def get_fifo(prefix, suffix, cleanup=true)
fifo
end

def grep_command(options)
def grep_command
"#{options.grep_prefix}grep"
end

Expand All @@ -136,6 +136,11 @@ def pipeline(*cmds)
def sort_command(options)
"#{options.coreutils_prefix}sort#{buffer_size_option}#{options.key?(:parallel) ? ' --parallel='+options.parallel.to_s : ''} --compress-program=pigz"
end

def sh(cmd)
system cmd
raise "Fail at process #{$?.pid}; #{$?}; #{cmd}" unless $?.success?
end

def tee_command(options)
"#{options.coreutils_prefix}tee"
Expand Down
4 changes: 2 additions & 2 deletions lib/bio/gadget/fq1l.rb
Expand Up @@ -178,7 +178,7 @@ def exclude_duplicate
def match_3end(pattern)
exit unless STDIN.wait
# PCRE was faster than BRE and ERE in GNU grep 2.25
system "#{grep_command(options)}#{options.invert_match ? ' -v' : ''} -P -e '^[^\\t]+\\t[^\\t]*#{pattern}\\t'"
system "#{grep_command}#{options.invert_match ? ' -v' : ''} -P -e '^[^\\t]+\\t[^\\t]*#{pattern}\\t'"
exit $?.to_i == 0 || $?.to_i == 1 ? 0 : $?.to_i
end

Expand All @@ -192,7 +192,7 @@ def match_3end(pattern)
def match_5end(pattern)
exit unless STDIN.wait
# PCRE was faster than BRE and ERE in GNU grep 2.25
system "#{grep_command(options)} #{options.invert_match ? '-v' : ''} -P -e '^[^\\t]+\\t#{pattern}'"
system "#{grep_command}#{options.invert_match ? ' -v' : ''} -P -e '^[^\\t]+\\t#{pattern}'"
exit $?.to_i == 0 || $?.to_i == 1 ? 0 : $?.to_i
end

Expand Down
89 changes: 53 additions & 36 deletions lib/bio/gadget/strt.rb
Expand Up @@ -19,35 +19,35 @@ class Strt < Bio::Gadget

# strt:alignment

desc 'alignment INDEX FQGZDIR BAMDIR BEDDIR', 'Align reads to reference'
desc 'alignment REFDIR SEQDIR MAPDIR', 'Align reads to reference'
long_desc <<-DESC
Align STRT reads (*.fq.gz files at FQGZDIR) to a reference (INDEX/ref.*.ht2). The alignments will be at BAMDIR/*.bam, and per-base 5'-end counts will be at BEDDIR/*.bed.gz.
Align STRT reads (*.fq.gz files at SEQDIR) to a reference (REFDIR/ref.*.ht2). The alignments will be at MAPDIR/*.bam, and per-base 5'-end counts will be at MAPDIR/*.bed.gz.
DESC

method_option *OPT_BUFFER_SIZE
method_option *OPT_COREUTILS_PREFIX
method_option *OPT_GREP_PREFIX
method_option *OPT_PARALLEL

def alignment(index0, indir0, bamdir0, beddir0)

index = File.expand_path(index0)
bamdir = File.expand_path(bamdir0)
beddir = File.expand_path(beddir0)
def alignment(refdir, seqdir, mapdir)

Dir.glob("#{File.expand_path(indir0)}/*.fq.gz").each do |fqgz|

STDERR.puts "#{`date`.strip}: Align #{fqgz}..."

tmpbam = get_temporary_path('strt.alignment', 'bam')
Dir.glob("#{File.expand_path(seqdir)}/*.fq.gz").each do |fqgz|
base = File.basename(fqgz, '.fq.gz')
bam = "#{bamdir}/#{base}.bam"

pipeline("hisat2 --rna-strandness F --dta-cufflinks -p #{options.parallel} -x #{index}/ref -U #{fqgz}",
"samtools view -S -b -@ #{options.parallel} - > #{tmpbam}")
system "samtools sort -f -@ #{options.parallel} #{tmpbam} #{bam}" or exit $?.exitstatus
system "samtools index #{bam}" or exit $?.exitstatus
pipeline("strt count_per_base#{buffer_size_option}#{coreutils_prefix_option(options)}#{parallel_option(options)}#{bam}",
"pigz -c > #{beddir}/#{base}.bed.gz")
STDERR.puts "#{`date`.strip}: Align #{base}..."
bam = "#{mapdir}/#{base}.bam"
pipeline(
"hisat2 --rna-strandness F --dta-cufflinks -p #{options.parallel} -x #{refdir}/ref -U #{fqgz}",
"#{grep_command} -v -E 'NH:i:([2-9][0-9]*|1[0-9]+)'",
"samtools sort -@ #{options.parallel} -o #{bam}")
sh "samtools index #{bam}"
end

STDERR.puts "#{`date`.strip}: Count from all alignments."
Parallel.map(Dir.glob("#{File.expand_path(mapdir)}/*.bam"),
in_threads: options.parallel) do |bam|
pipeline(
"strt count_per_base#{buffer_size_option}#{coreutils_prefix_option(options)}#{parallel_option(options)} #{bam}",
"pigz -c > #{mapdir}/#{File.basename(bam, '.bam')}.bed.gz")
end

end
Expand Down Expand Up @@ -124,7 +124,7 @@ def call_allele(csv, bamdir, refdir)

# strt:check_samples

desc 'check_sampels CSV SEQDIR BAMDIR BEDDIR REFDIR', 'Check samples'
desc 'check_samples CSV SEQDIR BAMDIR BEDDIR REFDIR', 'Check samples'
long_desc <<-DESC
Check samples in a design CSV by counting.'
DESC
Expand Down Expand Up @@ -176,12 +176,6 @@ def check_samples(csv, seqdir, bamdir, beddir, refdir)
*count_commands).to_i
end

tmp = Array.new
samples.each do |row|
tmp << (row["MAPPED_READS"] - row["RIBOSOME_READS"] - row["SPIKEIN_READS"]) / row["SPIKEIN_READS"].to_f
end
samples["RELATIVE_POLYA_RNAS"] = tmp

samples["SPIKEIN_5END_READS"] =
Parallel.map(bases, in_threads: options.parallel) do |base|
pipeline_readline("bedtools intersect -nonamecheck -u -s -a #{beddir}/#{base}.bed.gz -b #{refdir}/spikein_5end.bed.gz",
Expand All @@ -194,6 +188,12 @@ def check_samples(csv, seqdir, bamdir, beddir, refdir)
end
samples["SPIKEIN_5END_RATE"] = tmp

tmp = Array.new
samples.each do |row|
tmp << (row["MAPPED_READS"] - row["RIBOSOME_READS"] - row["SPIKEIN_READS"]) / row["SPIKEIN_5END_READS"].to_f
end
samples["RELATIVE_POLYA_RNAS"] = tmp

samples["CODING_READS"] =
Parallel.map(bases, in_threads: options.parallel) do |base|
pipeline_readline("bedtools intersect -nonamecheck -u -s -a #{beddir}/#{base}.bed.gz -b #{refdir}/transcriptome.coding_whole.bed.gz",
Expand All @@ -214,7 +214,7 @@ def check_samples(csv, seqdir, bamdir, beddir, refdir)

tmp = Array.new
samples.each do |row|
tmp << row["CODING_5END_READS"].to_f / row["SPIKEIN_READS"]
tmp << row["CODING_5END_READS"].to_f / row["SPIKEIN_5END_READS"]
end
samples["RELATIVE_MRNAS"] = tmp

Expand All @@ -235,14 +235,31 @@ def check_samples(csv, seqdir, bamdir, beddir, refdir)

def count_per_base(bam)

pipeline("samtools view -h #{bam}",
"grep -v -E 'NH:i:([2-9][0-9]*|1[0-9]+)'",
"samtools view -S -u -b -",
"bedtools bamtobed -i stdin",
"ruby -F'\t' -anle 'puts [$F[0], $F[5]==\"+\" ? $F[1] : $F[2].to_i-1, $F[5]==\"+\" ? $F[1].to_i+1 : $F[2], $F[5]].join(\"\t\")'",
"#{sort_command(options)} -t '\t' -k 1,1 -k 2,2n",
"#{uniq_command(options)} -c",
"ruby -anle 'puts ($F[1..3]+[\"#{File.basename(bam, '.bam')}\", $F[0], $F[4]]).join(\"\t\")'")
pipeline(
"bedtools bamtobed -i #{bam}",
"ruby -F'\t' -anle 'puts [$F[0], $F[5]==\"+\" ? $F[1] : $F[2].to_i-1, $F[5]==\"+\" ? $F[1].to_i+1 : $F[2], $F[5]].join(\"\t\")'",
"#{sort_command(options)} -t '\t' -k 1,1 -k 2,2n",
"#{uniq_command(options)} -c",
"ruby -anle 'puts ($F[1..3]+[\"#{File.basename(bam, '.bam')}\", $F[0], $F[4]]).join(\"\t\")'")

end

# strt:count_per_region

desc 'count_per_region COUNT REGION [REGION ...]',
'Count reads per region'
long_desc <<-DESC
Count reads per region. Read counts, which are summerized by a BED-format COUNT, within regions, which are defined by a BED-format REGION, are summed by the region names.
DESC

method_option *OPT_COREUTILS_PREFIX

def count_per_region(count, region0, *regions0)

pipeline(
"bedtools intersect -nonamecheck -s -wa -wb -a #{count} -b #{([region0]+regions0).join(' ')}",
"#{cut_command(options)} -f 5,11",
"ruby -F'\t' -e 'n2c={}; while gets; c,n=$_.strip.split /\\t/; n2c[n]=(n2c.key?(n) ? n2c[n] : 0)+c.to_i; end; puts \"name,count\"; n2c.each {|n,c| puts \"\#{n},\#{c}\"}'")

end

Expand Down
6 changes: 3 additions & 3 deletions lib/bio/gadget/strt/prepare_transcriptome.rb
Expand Up @@ -66,7 +66,7 @@ def hg38(dir0)
regex_exon_number = /exon_number (\d+)/
Open3.pipeline_r(
"unpigz -c #{gtf}",
"#{grep_command(options)} '\tstart_codon\t'") do |fp, threads|
"#{grep_command} '\tstart_codon\t'") do |fp, threads|
fp.each do |line|
cols = line.rstrip.split /\t/
atgs[regex_transcript_id.match(cols[8]).to_a[1]] =
Expand All @@ -89,7 +89,7 @@ def hg38(dir0)
fp_other_promoter = open_bed_w("#{dir}/transcriptome.other_promoter.bed")
Open3.pipeline_r(
"unpigz -c #{gtf}",
"#{grep_command(options)} -E '\t(exon|transcript)\t'") do |fp, threads|
"#{grep_command} -E '\t(exon|transcript)\t'") do |fp, threads|
fp.each do |line|
cols = line.rstrip.split /\t/
ann = cols[8]
Expand Down Expand Up @@ -166,7 +166,7 @@ def hg38(dir0)

genePred = "#{dir}/hg38_refGene.txt"
pipelin("unpigz -c #{gtf}",
"#{grep_command(options)} 'tag \"basic\"'",
"#{grep_command} 'tag \"basic\"'",
"gtfToGenePred -geneNameAsName2 -genePredExt stdin #{genePred}")
system "retrieve_seq_from_fasta.pl --format refGene --seqfile #{dir}/ref.fa --outfile #{dir}/hg38_refGeneMrna.fa #{genePred}" or exit $?.exitstatus

Expand Down

0 comments on commit fb57d95

Please sign in to comment.