Skip to content

Commit

Permalink
Merge branch 'back-from-biotope'
Browse files Browse the repository at this point in the history
  • Loading branch information
pveber committed Nov 19, 2020
2 parents 40d82a0 + 09565be commit 3e39f98
Show file tree
Hide file tree
Showing 103 changed files with 6,712 additions and 4,430 deletions.
1 change: 1 addition & 0 deletions bistro-bio.opam
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ depends: [
"dune" {>= "1.11"}
"biocaml"
"biotk"
"tyxml"
]
build: [
["dune" "subst"] {pinned}
Expand Down
2 changes: 1 addition & 1 deletion dune-project
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,4 @@ This library provides wrappers for popular tools for genomics, transcriptomics
and phylogeny, as well as custom tools to help piping data from one tool to the
other.
")
(depends biocaml biotk))
(depends biocaml biotk tyxml))
43 changes: 43 additions & 0 deletions lib/bio/ChIPQC.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
open Core_kernel
open Bistro
open Bistro.Shell_dsl
open Formats

type 'a sample = {
id : string ;
tissue : string ;
factor : string ;
replicate : string ;
bam : [`indexed_bam] directory ;
peaks : (#bed3 as 'a) file ;
}

let img = [ docker_image ~account:"pveber" ~name:"bioconductor" ~tag:"3.8" () ]

let sample_sheet samples =
let header = string "SampleID,Tissue,Factor,Replicate,bamReads,Peaks" in
let line { id ; tissue ; factor ; replicate ; bam ; peaks } =
seq ~sep:"," [ string id ; string tissue ; string factor ; string replicate ; dep (Samtools.indexed_bam_to_bam bam) ; dep peaks ]
in
seq ~sep:"\n" (header :: List.map ~f:line samples)

let rscript sample_sheet =
seq ~sep:"\n" [
string "library(ChIPQC)" ;
seq ~sep:"" [
string {|samples = read.csv("|} ;
file_dump sample_sheet ;
string {|")|} ;
] ;
string "experiment = ChIPQC(samples)" ;
seq ~sep:"" [
string {|ChIPQCreport(experiment,facet=F,reportFolder="|} ;
dest ;
string {|")|} ;
]
]

let run samples =
Workflow.shell ~descr:"ChIPQC" ~img [
cmd "Rscript" [ file_dump (rscript (sample_sheet samples)) ] ;
]
15 changes: 15 additions & 0 deletions lib/bio/ChIPQC.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
open Bistro
open Formats

type 'a sample = {
id : string ;
tissue : string ;
factor : string ;
replicate : string ;
bam : [`indexed_bam] directory ;
peaks : (#bed3 as 'a) file ;
}

val run : 'a sample list -> [`ChIPQC] directory
(** Beware: doesn't work with only one sample (see
https://support.bioconductor.org/p/84754/) *)
224 changes: 224 additions & 0 deletions lib/bio/DESeq2.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
open Core_kernel
open Bistro
open Bistro.Shell_dsl

class type table = object
inherit tsv
method header : [`yes]
end

type output =
<
comparison_summary : table file ;
comparisons : ((string * string * string) * table file) list ;
effect_table : table file ;
normalized_counts : table file ;
sample_clustering : svg file ;
sample_pca : svg file ;
directory : [`DESeq2] directory ;
>

let img = [ docker_image ~account:"pveber" ~name:"bioconductor" ~tag:"3.3" () ]

let wrapper_script = {|
library(DESeq2)
library(gplots)
library(RColorBrewer)

### DATA PROCESSING
loadCounts <- function(sample_files) {
loadFile <- function(fn) {
d <- read.table(fn,header=F,sep='\t')
d[!grepl("^__",d[,1]), 2] #remove HTSEQ sum counts
}
sapply(sample_files,loadFile)
}

loadIds <- function(sample_files) {
d <- read.table(sample_files[1],header=F,sep='\t')
d[!grepl("^__",d[,1]), 1]
}

differentialAnalysis <- function(counts, description) {
DESeq(DESeqDataSetFromMatrix(countData=counts,
colData=description,
design=as.formula(paste("~", paste(colnames(description),collapse=" + ")))),
fitType='local')
}

my.summary.results <- function(object) {
alpha <- 0.1
notallzero <- sum(object$baseMean > 0)
up <- sum(object$padj < alpha & object$log2FoldChange > 0,
na.rm = TRUE)
down <- sum(object$padj < alpha & object$log2FoldChange <
0, na.rm = TRUE)
filt <- sum(!is.na(object$pvalue) & is.na(object$padj))
outlier <- sum(object$baseMean > 0 & is.na(object$pvalue))
c(notallzero, up, down, filt, outlier)
}

### OUTPUT
outputForAllComparisons <- function(description, outdir, factor_names, ids, dds) {
recap <- data.frame(gene = ids)
stats <- data.frame(comparison = character(0),
expressed = integer(0),
up = integer(0),
down = integer(0),
filt = integer(0),
outlier = integer(0), stringsAsFactors=F)
for(f in factor_names) {
l <- unique(description[,f])
for(i in 1:length(l))
for(j in if(i+1 > length(l)) c() else (i + 1):length(l)) {
label <- paste0(f,"_",l[i],"_",l[j])

res <- results(dds,contrast=c(f,as.character(l[i]),as.character(l[j])))

fn <- paste0(outdir,"/results_",label,".tsv")
write.table(cbind(data.frame(id = ids), res),file=fn,row.names=F,sep='\t',quote=F)

recap[,paste0(label,"_l2fc")] <- res[,"log2FoldChange"]
recap[,paste0(label,"_padj")] <- res[,"padj"]

stats[dim(stats)[1]+1,] <- c(label,my.summary.results(res))

svg(paste0(outdir,"/MA_plot_",label,".svg"), width=7, height=3.5)
plotMA(res,main=label)
dev.off()
}
}

write.table(recap, file = paste0(outdir,"/recap.tsv"),row.names=F,sep='\t',quote=F)
write.table(stats, file = paste0(outdir,"/summary.tsv"),row.names=F,sep='\t',quote=F)
}


generalPlots <- function(description, outdir, factor_names, ids, dds) {
rld <- rlog(dds)
rldMat <- assay(rld)
rldDist <- dist(t(rldMat))
mat <- as.matrix(rldDist)
rownames(mat) <- colnames(mat) <- apply(description, 1, paste, collapse = ":")
hc <- hclust(rldDist)
hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)

svg(paste0(outdir,"/sample_clustering.svg"))
heatmap.2(mat, Rowv = as.dendrogram(hc), symm = TRUE, trace = "none", col = rev(hmcol), margin = c(13,13))
dev.off()

svg(paste0(outdir,"/sample_pca.svg"))
print(plotPCA(rld, intgroup = factor_names))
dev.off()

counts <- cbind(ids,as.data.frame(counts(dds,normalized=T)))
write.table(counts, file = paste0(outdir,"/normalized_counts.tsv"),row.names=F,sep='\t',quote=F,col.names=F)
}

main <- function(outdir, factor_names, sample_files, conditions) {
description <- as.data.frame(conditions)
colnames(description) <- factor_names
rownames(description) <- NULL
ids <- loadIds(sample_files)
counts <- loadCounts(sample_files)
dds <- differentialAnalysis(counts, description)
system(paste("mkdir -p", outdir))
outputForAllComparisons(description, outdir, factor_names, ids, dds)
generalPlots(description, outdir, factor_names, ids, dds)
}
|}

let app fn args =
seq ~sep:"" [
string fn ; string "(" ;
seq ~sep:"," args ;
string ")" ;
]

let app' fn args =
let arg (k, v) = seq ~sep:"=" [ string k ; v ] in
seq ~sep:"" [
string fn ; string "(" ;
list arg ~sep:"," args ;
string ")" ;
]

let script factors samples =
seq ~sep:"\n" [
string wrapper_script ;
app "main" [
quote dest ~using:'"' ;
app "c" (List.map ~f:(string % quote ~using:'"') factors) ;
app "c" (List.map ~f:(snd % dep % quote ~using:'"') samples) ;
app' "matrix" [
"data", app "c" (List.map ~f:fst samples
|> List.concat
|> List.map ~f:(string % quote ~using:'"')) ;
"ncol", int (List.length factors) ;
"byrow", string "T" ;
]
]
]


let wrapper factors samples =
Workflow.shell ~descr:"deseq2.wrapper" ~img [
cmd "Rscript" [ file_dump (script factors samples) ] ;
]

(*
remove duplicates *and* keep original order
not tail-recursive and quadratic complexity
*)
let unique xs =
let rec aux seen = function
| [] -> []
| h :: t ->
if List.mem seen h ~equal:Poly.( = ) then
aux seen t
else
h :: aux (h :: seen) t
in
aux [] xs

let rec fold_2sets xs ~init ~f =
match xs with
| [] -> init
| h :: t ->
let next = List.fold_left t ~init ~f:(fun accu g -> f accu h g) in
fold_2sets t ~init:next ~f

let factor_levels factor_names conditions =
List.fold_right
conditions
~init:(List.map factor_names ~f:(fun _ -> []))
~f:(fun cond accu -> List.map2_exn cond accu ~f:(fun c cs -> c :: cs))
|> List.map ~f:unique

let comparisons factor_names conditions =
let factor_levels = factor_levels factor_names conditions in
List.map2_exn factor_names factor_levels ~f:(fun name levels ->
fold_2sets levels ~init:[] ~f:(fun accu l1 l2 ->
(name, l1, l2) :: accu
)
|> List.rev
)
|> List.concat


let main_effects factors samples =
let o = wrapper factors samples in
let sel p = Workflow.select o p in
object
method sample_clustering = sel [ "sample_clustering.svg" ]
method sample_pca = sel [ "sample_pca.svg" ]
method normalized_counts = sel [ "normalized_counts.tsv" ]
method comparison_summary = sel [ "summary.tsv" ]
method effect_table = sel [ "recap.tsv" ]
method comparisons =
let conditions = List.map samples ~f:fst in
List.map (comparisons factors conditions) ~f:(fun ((name, l1, l2) as comp) ->
comp, sel [ sprintf "results_%s_%s_%s.tsv" name l1 l2 ]
)
method directory = o
end
24 changes: 24 additions & 0 deletions lib/bio/DESeq2.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
open Bistro

val img : container_image list

class type table = object
inherit tsv
method header : [`yes]
end

type output =
<
comparison_summary : table file ;
comparisons : ((string * string * string) * table file) list ;
effect_table : table file ;
normalized_counts : table file ;
sample_clustering : svg file ;
sample_pca : svg file ;
directory : [`DESeq2] directory ;
>

val main_effects :
string list ->
(string list * #Htseq.count_tsv file) list ->
output
37 changes: 37 additions & 0 deletions lib/bio/FastQC.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
open Bistro
open Bistro.Shell_dsl

type report = [`fastQC] directory

let img = [ docker_image ~account:"pveber" ~name:"fastqc" ~tag:"0.11.8" () ]

module Cmd = struct
let fastqc x = [
mkdir_p dest ;
cmd "fastqc" [
seq ~sep:"" [ string "--outdir=" ; dest ] ;
(
match x with
| `fq fq -> dep fq
| `fq_gz fq_gz -> Bistro_unix.Cmd.gzdep fq_gz
)
] ;
and_list [
cd dest ;
cmd "unzip" [ string "*_fastqc.zip" ] ;
cmd "mv" [ string "*_fastqc/*" ; string "." ]
] ;
]
end

let fastqc fq = Workflow.shell ~descr:"fastQC" ~img (Cmd.fastqc (`fq fq))

let fastqc_gz fq_gz = Workflow.shell ~descr:"fastQC" ~img (Cmd.fastqc (`fq_gz fq_gz))

let html_report x = Workflow.select x ["fastqc_report.html"]

let per_base_quality x =
Workflow.select x ["Images" ; "per_base_quality.png"]

let per_base_sequence_content x =
Workflow.select x ["Images" ; "per_base_sequence_content.png"]
10 changes: 10 additions & 0 deletions lib/bio/FastQC.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
open Bistro
open Formats

type report = [`fastQC] directory

val fastqc : #fastq file -> report
val fastqc_gz : #fastq gz file -> report
val html_report : report -> html file
val per_base_quality : report -> png file
val per_base_sequence_content : report -> png file
11 changes: 11 additions & 0 deletions lib/bio/SE_or_PE.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
type 'a t =
| Single_end of 'a
| Paired_end of 'a * 'a

let map x ~f = match x with
| Single_end x -> Single_end (f x)
| Paired_end (x, y) -> Paired_end (f x, f y)

let fst = function
| Single_end x
| Paired_end (x, _) -> x
6 changes: 6 additions & 0 deletions lib/bio/SE_or_PE.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
type 'a t =
| Single_end of 'a
| Paired_end of 'a * 'a

val map : 'a t -> f:('a -> 'b) -> 'b t
val fst : 'a t -> 'a

0 comments on commit 3e39f98

Please sign in to comment.