Skip to content

Commit

Permalink
update algorithm for first full benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
stemangiola committed Aug 10, 2021
1 parent 345a2af commit f58cd58
Show file tree
Hide file tree
Showing 8 changed files with 9,865 additions and 680 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ dev/*rda
dev/*pdf
dev/dplyr-master/*
dev/reproducible_example_beta_binomial_truncated
src/stanExports*
20 changes: 20 additions & 0 deletions dev/benchmark_code/calculate_auc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
library(tidyverse)
library(magrittr)
library(bayestestR)

# Read arguments
args = commandArgs(trailingOnly=TRUE)

input_file = args[1]
output_file = args[2]

readRDS(input_file) %>%
group_by(name, slope, n_samples , n_cell_type, max_cell_counts_per_sample , add_outliers, probs) %>%
summarise(mean_FP_rate = mean(FP_rate), mean_TP_rate = mean(TP_rate)) %>%
ungroup() %>%
arrange(mean_FP_rate) %>%
nest(data = -c(name, slope, n_samples , n_cell_type, max_cell_counts_per_sample , add_outliers)) %>%
mutate(auc = map_dbl(data, ~ bayestestR::auc(.x$mean_FP_rate , .x$mean_TP_rate))) %>%
select(-data) %>%
saveRDS(output_file)

39 changes: 35 additions & 4 deletions dev/benchmark_code/create_makefile.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ commands_df =
unite("file_prefix" , c(slope, n_samples, n_cell_type, max_cell_counts_per_sample, add_outliers), remove = FALSE) %>%
mutate(input_file = sprintf("input_%s.rds", file_prefix)) %>%
mutate(output_file = sprintf("output_%s_%s.rds", file_prefix, method)) %>%
mutate(parsed_file = glue("parsed_{file_prefix}.rds")) %>%
mutate(auc_file = glue("auc_{file_prefix}.rds")) %>%
mutate(create_input_command = glue("{results_directory}{input_file}:\n{tab}Rscript {code_directory}create_input.R {slope} {n_samples} {n_cell_type} {max_cell_counts_per_sample} {add_outliers} {results_directory}{input_file}")) %>%
mutate(estimate_command = glue("{results_directory}{output_file}:\n{tab}Rscript {code_directory}estimate_{method}.R {results_directory}{input_file} {results_directory}{output_file} {add_outliers}"))
mutate(estimate_command = glue("{results_directory}{output_file}:{results_directory}{input_file}\n{tab}Rscript {code_directory}estimate_{method}.R {results_directory}{input_file} {results_directory}{output_file} {add_outliers}"))


# Create input
Expand All @@ -35,10 +37,39 @@ commands_df =
commands_df %>% distinct(create_input_command) %>% pull(create_input_command)
) %>%

# Estimate
c("CATEGORY=estimate_sccomp\nMEMORY=30024\nCORES=2\nWALL_TIME=36000") %>%
# Estimate sccomp
c("CATEGORY=estimate_sccomp\nMEMORY=60024\nCORES=16") %>%
c(
commands_df %>% distinct(estimate_command) %>% pull(estimate_command)
commands_df %>% filter(method %in% c("sccomp", "dirichletMultinomial")) %>% distinct(estimate_command) %>% pull(estimate_command)
) %>%

# Estimate sccomp
c("CATEGORY=estimate_rest\nMEMORY=10024\nCORES=4\nWALL_TIME=1500") %>%
c(
commands_df %>% filter(method %in% c("rest")) %>% distinct(estimate_command) %>% pull(estimate_command)
) %>%

# Calculate significance
c("CATEGORY=significance\nMEMORY=10024\nCORES=4\nWALL_TIME=1500") %>%
c(

commands_df %>%
pivot_wider(names_from = method, values_from = c(input_file, output_file, estimate_command)) %>%
mutate(parse_estimates_command = glue("{results_directory}{parsed_file}:{results_directory}{output_file_sccomp} {results_directory}{output_file_dirichletMultinomial} {results_directory}{output_file_rest}\n{tab}Rscript {code_directory}parse_estimates.R {slope} {n_samples} {n_cell_type} {max_cell_counts_per_sample} {add_outliers} {results_directory}{output_file_sccomp} {results_directory}{output_file_dirichletMultinomial} {results_directory}{output_file_rest} {results_directory}{parsed_file}")) %>%
distinct(parse_estimates_command) %>%
pull(parse_estimates_command)
) %>%

# Calculate AUC
c("CATEGORY=auc\nMEMORY=10024\nCORES=4\nWALL_TIME=1500") %>%
c(

commands_df %>%
select(-c(input_file, output_file, estimate_command)) %>%
distinct() %>%
mutate(auc_command = glue("{results_directory}{auc_file}:{results_directory}{parsed_file}\n{tab}Rscript {code_directory}calculate_auc.R {results_directory}{parsed_file} {results_directory}{auc_file}")) %>%
distinct(auc_command) %>%
pull(auc_command)
) %>%


Expand Down
1,615 changes: 1,615 additions & 0 deletions dev/benchmark_code/makefile.makeflow

Large diffs are not rendered by default.

8,639 changes: 8,025 additions & 614 deletions dev/benchmark_code/makefile.makeflow.makeflowlog

Large diffs are not rendered by default.

88 changes: 88 additions & 0 deletions dev/benchmark_code/parse_estimates.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
library(sccomp)
library(tidyverse)
library(magrittr)
library(tidybulk)

# Read arguments
args = commandArgs(trailingOnly=TRUE)

estimate_file_1 = args[6]
estimate_file_2 = args[7]
estimate_file_3 = args[8]
output_file = args[9]

probs = seq(0, 0.1,length.out = 50) %>% c(seq(0.1, 1,length.out = 50))

# Import files
readRDS(estimate_file_1) %>%
left_join(readRDS(estimate_file_2)) %>%
left_join(readRDS(estimate_file_3)) %>%

{print(99); (.)} %>%

# Add probs
dplyr::mutate(probs = map(run, ~ !!probs)) %>%
unnest((probs) ) %>%



# Calculate sgnificance
mutate(hypothesis_edger = map2(results_edger, (probs) , ~ .x %>% arrange(FDR) %>% mutate(positive = FDR<=.y) %>% mutate(trend = logFC))) %>%
mutate(hypothesis_voom = map2(results_voom, (probs) , ~.x %>% arrange(adj.P.Val) %>% mutate(positive = adj.P.Val<=.y) %>% mutate(trend = logFC))) %>%
mutate(hypothesis_speckle = map2(results_speckle, (probs) , ~ .x %>% arrange(FDR) %>% mutate(positive = FDR<=.y) %>% mutate(trend = -Tstatistic ))) %>%
mutate(hypothesis_sccomp = map2(
results_sccomp, (probs) ,
~ .x %>%
arrange(false_discovery_rate) %>%
mutate(positive = false_discovery_rate<=.y) %>%
mutate(trend = .median_type )
)) %>%
mutate(hypothesis_DirichletMultinomial = map2(
results_DirichletMultinomial , (probs) ,
~ .x %>%
arrange(false_discovery_rate) %>%
mutate(positive = false_discovery_rate<=.y) %>%
mutate(trend = .median_type )
)) %>%

# Clean
dplyr::select(-contains("results")) %>%

# Calculate accuracy
pivot_longer(contains("hypothesis"),names_prefix = "hypothesis_" ) %>%
mutate(accuracy_df = map2(
data, value ,
~ left_join(

.x %>% unnest(coefficients) %>% dplyr::select(cell_type, beta_1) %>% distinct %>% mutate(cell_type = as.character(cell_type)),
.y %>% dplyr::select(cell_type, positive, trend),
by="cell_type"

)

)) %>%
mutate(TP = map_int(accuracy_df, ~ .x %>% filter(positive & (beta_1 != 0) & (beta_1 * trend)>0) %>% nrow())) %>%
mutate(FP = map_int(accuracy_df, ~ .x %>% filter(

# Positive when not
(positive & (beta_1 == 0)) |

# Positive when yes but wrong direction
(positive & (beta_1 != 0) & (beta_1 * trend)<0)
) %>% nrow())) %>%
mutate(total_true_positive = 6, total_true_negative = 24-6) %>%
mutate(FP_rate = FP/total_true_negative) %>%
mutate(TP_rate = TP/total_true_positive) %>%


mutate(
slope = as.numeric(args[1]),
n_samples = as.integer(args[2]),
n_cell_type = as.integer(args[3]),
max_cell_counts_per_sample = as.integer(args[4]),
add_outliers = as.integer(args[5])
) %>%

# Save
saveRDS(output_file)

10 changes: 10 additions & 0 deletions dev/benchmark_code/plot_results.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@



dir("dev/benchmark_results/", pattern = "auc", full.names = TRUE) %>%
map_dfr(~ .x %>% readRDS()) %>%
nest(data = -c(add_outliers, max_cell_counts_per_sample)) %>%
pull(data) %>% .[[1]] %>%
ggplot(aes(slope, auc, color=name)) +
geom_line() +
facet_grid(n_samples ~ n_cell_type)
133 changes: 71 additions & 62 deletions dev/simulate_data_stefano.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@ library(limma)
library(DirichletReg)
library(MGLM)



# Read arguments
args = commandArgs(trailingOnly=TRUE)
slope = as.numeric(args[1])
n_samples = as.integer(args[2])
max_cell_counts_per_sample = as.integer(args[3])
output_file = args[4]

# exposures = counts_obj %>% group_by(sample) %>% summarise(s=sum(count)) %>% pull(s) %>% sort %>% head(-1)
beta_0 = readRDS("dev/beta_0.rds")

Expand Down Expand Up @@ -150,61 +159,61 @@ benchmark =
)
))

saveRDS(benchmark, output_file)


benchmark %>%
pull(data) %>%
.[[1]] %>%
group_by(sample) %>%
mutate(proportion = (.value+1)/sum(.value+1)) %>%
ungroup(sample) %>%
ggplot(aes(factor(type), proportion, fill = cell_type %in% 1:4)) +
geom_boxplot() +
geom_jitter(size=0.5) +
scale_fill_manual(values = c("white", "#E2D379")) +
facet_wrap(~ as.integer(cell_type), ncol=5) +
theme_bw() +
theme(
strip.background =element_rect(fill="white", color="white"),
legend.position = "none",
)

ggsave(
"dev/example_boxplot_no_outliers.pdf",
units = c("mm"),
width = 183/2 ,
height = 183/2,
limitsize = FALSE
)


probs = seq(0, 0.1,length.out = 50) %>% c(seq(0.1, 1,length.out = 50))

benchmark_hypothesis =
benchmark %>%
dplyr::mutate(probs = map(run, ~ !!probs)) %>%
unnest((probs) ) %>%
mutate(hypothesis_edger = map2(results_edger, (probs) , ~ .x %>% arrange(FDR) %>% mutate(positive = FDR<.y) %>% mutate(trend = logFC))) %>%
mutate(hypothesis_voom = map2(results_voom, (probs) , ~.x %>% arrange(adj.P.Val) %>% mutate(positive = adj.P.Val<.y) %>% mutate(trend = logFC))) %>%
mutate(hypothesis_speckle = map2(results_speckle, (probs) , ~ .x %>% arrange(FDR) %>% mutate(positive = FDR<.y) %>% mutate(trend = -Tstatistic ))) %>%
mutate(hypothesis_sccomp = map2(
results_sccomp, (probs) ,
~ .x %>%
arrange(false_discovery_rate) %>%
mutate(positive = false_discovery_rate<.y) %>%
mutate(trend = .median_type )
)) %>%
mutate(hypothesis_DirichletMultinomial = map2(
results_DirichletMultinomial , (probs) ,
~ .x %>%
arrange(false_discovery_rate) %>%
mutate(positive = false_discovery_rate<(.y)) %>%
mutate(trend = .median_type )
)) %>%
dplyr::select(-contains("results"))



# benchmark %>%
# pull(data) %>%
# .[[1]] %>%
# group_by(sample) %>%
# mutate(proportion = (.value+1)/sum(.value+1)) %>%
# ungroup(sample) %>%
# ggplot(aes(factor(type), proportion, fill = cell_type %in% 1:4)) +
# geom_boxplot() +
# geom_jitter(size=0.5) +
# scale_fill_manual(values = c("white", "#E2D379")) +
# facet_wrap(~ as.integer(cell_type), ncol=5) +
# theme_bw() +
# theme(
# strip.background =element_rect(fill="white", color="white"),
# legend.position = "none",
# )
#
# ggsave(
# "dev/example_boxplot_no_outliers.pdf",
# units = c("mm"),
# width = 183/2 ,
# height = 183/2,
# limitsize = FALSE
# )
#
#
# probs = seq(0, 0.1,length.out = 50) %>% c(seq(0.1, 1,length.out = 50))
#
# benchmark_hypothesis =
# benchmark %>%
# dplyr::mutate(probs = map(run, ~ !!probs)) %>%
# unnest((probs) ) %>%
# mutate(hypothesis_edger = map2(results_edger, (probs) , ~ .x %>% arrange(FDR) %>% mutate(positive = FDR<.y) %>% mutate(trend = logFC))) %>%
# mutate(hypothesis_voom = map2(results_voom, (probs) , ~.x %>% arrange(adj.P.Val) %>% mutate(positive = adj.P.Val<.y) %>% mutate(trend = logFC))) %>%
# mutate(hypothesis_speckle = map2(results_speckle, (probs) , ~ .x %>% arrange(FDR) %>% mutate(positive = FDR<.y) %>% mutate(trend = -Tstatistic ))) %>%
# mutate(hypothesis_sccomp = map2(
# results_sccomp, (probs) ,
# ~ .x %>%
# arrange(false_discovery_rate) %>%
# mutate(positive = false_discovery_rate<.y) %>%
# mutate(trend = .median_type )
# )) %>%
# mutate(hypothesis_DirichletMultinomial = map2(
# results_DirichletMultinomial , (probs) ,
# ~ .x %>%
# arrange(false_discovery_rate) %>%
# mutate(positive = false_discovery_rate<(.y)) %>%
# mutate(trend = .median_type )
# )) %>%
# dplyr::select(-contains("results"))
#
#
#
benchmark_hypothesis %>%
dplyr::select(-contains("results")) %>%
pivot_longer(contains("hypothesis"),names_prefix = "hypothesis_" ) %>%
Expand Down Expand Up @@ -246,15 +255,15 @@ benchmark_hypothesis %>%
theme_bw() +
theme(legend.position = "bottom")

saveRDS(benchmark, "dev/benchmark_slope0.4_samples10_celltypes10_noOutliers_max1000exposure_50replicates.rds")
# "dev/benchmark_slope0.4_samples10_celltypes10_noOutliers_max1000exposure_50replicates.rds")


ggsave(
"dev/roc_slope0.4_samples10_celltypes10_noOutliers_max1000exposure_50replicates.pdf",
units = c("mm"),
width = 183/2 ,
height = 183/2,
limitsize = FALSE
)
# ggsave(
# "dev/roc_slope0.4_samples10_celltypes10_noOutliers_max1000exposure_50replicates.pdf",
# units = c("mm"),
# width = 183/2 ,
# height = 183/2,
# limitsize = FALSE
# )


0 comments on commit f58cd58

Please sign in to comment.