update algorithm for first full benchmark

MangiolaLaboratory · Aug 10, 2021 · f58cd58 · f58cd58
1 parent 345a2af
commit f58cd58
Show file tree

Hide file tree

Showing 8 changed files with 9,865 additions and 680 deletions.
diff --git a/.gitignore b/.gitignore
@@ -13,3 +13,4 @@ dev/*rda
 dev/*pdf
 dev/dplyr-master/*
 dev/reproducible_example_beta_binomial_truncated
+src/stanExports*
diff --git a/dev/benchmark_code/calculate_auc.R b/dev/benchmark_code/calculate_auc.R
@@ -0,0 +1,20 @@
+library(tidyverse)
+library(magrittr)
+library(bayestestR)
+
+# Read arguments
+args = commandArgs(trailingOnly=TRUE)
+
+input_file = args[1]
+output_file = args[2]
+
+readRDS(input_file) %>%
+  group_by(name,  slope, n_samples , n_cell_type,  max_cell_counts_per_sample , add_outliers, probs) %>%
+  summarise(mean_FP_rate = mean(FP_rate), mean_TP_rate = mean(TP_rate)) %>%
+  ungroup() %>%
+  arrange(mean_FP_rate) %>%
+  nest(data = -c(name,  slope, n_samples , n_cell_type,  max_cell_counts_per_sample , add_outliers)) %>%
+  mutate(auc = map_dbl(data, ~ bayestestR::auc(.x$mean_FP_rate , .x$mean_TP_rate))) %>%
+  select(-data) %>%
+  saveRDS(output_file)
+
diff --git a/dev/benchmark_code/create_makefile.R b/dev/benchmark_code/create_makefile.R
@@ -25,8 +25,10 @@ commands_df =
   unite("file_prefix" , c(slope, n_samples, n_cell_type, max_cell_counts_per_sample, add_outliers), remove = FALSE) %>%
   mutate(input_file = sprintf("input_%s.rds", file_prefix)) %>%
   mutate(output_file = sprintf("output_%s_%s.rds", file_prefix, method)) %>%
+  mutate(parsed_file = glue("parsed_{file_prefix}.rds")) %>%
+  mutate(auc_file = glue("auc_{file_prefix}.rds")) %>%
   mutate(create_input_command = glue("{results_directory}{input_file}:\n{tab}Rscript {code_directory}create_input.R {slope} {n_samples} {n_cell_type} {max_cell_counts_per_sample} {add_outliers} {results_directory}{input_file}")) %>%
-  mutate(estimate_command = glue("{results_directory}{output_file}:\n{tab}Rscript {code_directory}estimate_{method}.R {results_directory}{input_file} {results_directory}{output_file} {add_outliers}"))
+  mutate(estimate_command = glue("{results_directory}{output_file}:{results_directory}{input_file}\n{tab}Rscript {code_directory}estimate_{method}.R {results_directory}{input_file} {results_directory}{output_file} {add_outliers}"))
 
 
 # Create input
@@ -35,10 +37,39 @@ commands_df =
     commands_df %>% distinct(create_input_command) %>% pull(create_input_command)
   ) %>%
 
-  # Estimate
-  c("CATEGORY=estimate_sccomp\nMEMORY=30024\nCORES=2\nWALL_TIME=36000") %>%
+  # Estimate sccomp
+  c("CATEGORY=estimate_sccomp\nMEMORY=60024\nCORES=16") %>%
   c(
-    commands_df %>% distinct(estimate_command) %>% pull(estimate_command)
+    commands_df %>% filter(method %in% c("sccomp", "dirichletMultinomial")) %>%  distinct(estimate_command) %>% pull(estimate_command)
+  ) %>%
+
+  # Estimate sccomp
+  c("CATEGORY=estimate_rest\nMEMORY=10024\nCORES=4\nWALL_TIME=1500") %>%
+  c(
+    commands_df %>% filter(method %in% c("rest")) %>% distinct(estimate_command) %>% pull(estimate_command)
+  ) %>%
+
+  # Calculate significance
+  c("CATEGORY=significance\nMEMORY=10024\nCORES=4\nWALL_TIME=1500") %>%
+  c(
+
+    commands_df %>%
+      pivot_wider(names_from = method, values_from = c(input_file, output_file, estimate_command)) %>%
+      mutate(parse_estimates_command = glue("{results_directory}{parsed_file}:{results_directory}{output_file_sccomp} {results_directory}{output_file_dirichletMultinomial} {results_directory}{output_file_rest}\n{tab}Rscript {code_directory}parse_estimates.R {slope} {n_samples} {n_cell_type} {max_cell_counts_per_sample} {add_outliers} {results_directory}{output_file_sccomp} {results_directory}{output_file_dirichletMultinomial} {results_directory}{output_file_rest} {results_directory}{parsed_file}")) %>%
+      distinct(parse_estimates_command) %>%
+      pull(parse_estimates_command)
+  ) %>%
+
+  # Calculate AUC
+  c("CATEGORY=auc\nMEMORY=10024\nCORES=4\nWALL_TIME=1500") %>%
+  c(
+
+    commands_df %>%
+      select(-c(input_file, output_file, estimate_command)) %>%
+      distinct() %>%
+      mutate(auc_command = glue("{results_directory}{auc_file}:{results_directory}{parsed_file}\n{tab}Rscript {code_directory}calculate_auc.R {results_directory}{parsed_file} {results_directory}{auc_file}")) %>%
+      distinct(auc_command) %>%
+      pull(auc_command)
   ) %>%
 
 

diff --git a/dev/benchmark_code/makefile.makeflow b/dev/benchmark_code/makefile.makeflow
diff --git a/dev/benchmark_code/makefile.makeflow.makeflowlog b/dev/benchmark_code/makefile.makeflow.makeflowlog
diff --git a/dev/benchmark_code/parse_estimates.R b/dev/benchmark_code/parse_estimates.R
@@ -0,0 +1,88 @@
+library(sccomp)
+library(tidyverse)
+library(magrittr)
+library(tidybulk)
+
+# Read arguments
+args = commandArgs(trailingOnly=TRUE)
+
+estimate_file_1 = args[6]
+estimate_file_2 = args[7]
+estimate_file_3 = args[8]
+output_file = args[9]
+
+probs = seq(0, 0.1,length.out = 50) %>% c(seq(0.1, 1,length.out = 50))
+
+# Import files
+readRDS(estimate_file_1) %>%
+  left_join(readRDS(estimate_file_2)) %>%
+  left_join(readRDS(estimate_file_3))  %>%
+
+  {print(99); (.)} %>%
+
+  # Add probs
+  dplyr::mutate(probs  = map(run, ~ !!probs)) %>%
+  unnest((probs) ) %>%
+
+
+
+  # Calculate sgnificance
+  mutate(hypothesis_edger = map2(results_edger, (probs) , ~ .x %>% arrange(FDR) %>% mutate(positive = FDR<=.y) %>% mutate(trend = logFC))) %>%
+  mutate(hypothesis_voom = map2(results_voom, (probs) , ~.x %>% arrange(adj.P.Val) %>% mutate(positive = adj.P.Val<=.y) %>% mutate(trend = logFC))) %>%
+  mutate(hypothesis_speckle = map2(results_speckle, (probs) , ~ .x %>% arrange(FDR) %>% mutate(positive = FDR<=.y) %>% mutate(trend = -Tstatistic    ))) %>%
+  mutate(hypothesis_sccomp = map2(
+    results_sccomp, (probs) ,
+    ~ .x  %>%
+      arrange(false_discovery_rate) %>%
+      mutate(positive = false_discovery_rate<=.y) %>%
+      mutate(trend = .median_type )
+  )) %>%
+  mutate(hypothesis_DirichletMultinomial  = map2(
+    results_DirichletMultinomial , (probs) ,
+    ~ .x  %>%
+      arrange(false_discovery_rate) %>%
+      mutate(positive = false_discovery_rate<=.y) %>%
+      mutate(trend = .median_type )
+  )) %>%
+
+  # Clean
+  dplyr::select(-contains("results")) %>%
+
+  # Calculate accuracy
+  pivot_longer(contains("hypothesis"),names_prefix = "hypothesis_" ) %>%
+  mutate(accuracy_df = map2(
+    data, value             ,
+    ~ left_join(
+
+      .x %>% unnest(coefficients) %>% dplyr::select(cell_type, beta_1) %>% distinct %>% mutate(cell_type = as.character(cell_type)),
+      .y %>% dplyr::select(cell_type, positive, trend),
+      by="cell_type"
+
+    )
+
+  )) %>%
+  mutate(TP = map_int(accuracy_df, ~ .x %>% filter(positive & (beta_1 != 0) & (beta_1 * trend)>0) %>% nrow())) %>%
+  mutate(FP = map_int(accuracy_df, ~ .x %>% filter(
+
+    # Positive when not
+    (positive & (beta_1 == 0)) |
+
+      # Positive when yes but wrong direction
+      (positive & (beta_1 != 0) & (beta_1 * trend)<0)
+  ) %>% nrow())) %>%
+  mutate(total_true_positive = 6, total_true_negative = 24-6) %>%
+  mutate(FP_rate = FP/total_true_negative) %>%
+  mutate(TP_rate = TP/total_true_positive) %>%
+
+
+  mutate(
+    slope = as.numeric(args[1]),
+    n_samples = as.integer(args[2]),
+    n_cell_type = as.integer(args[3]),
+    max_cell_counts_per_sample = as.integer(args[4]),
+    add_outliers = as.integer(args[5])
+  ) %>%
+
+  # Save
+  saveRDS(output_file)
+
diff --git a/dev/benchmark_code/plot_results.R b/dev/benchmark_code/plot_results.R
@@ -0,0 +1,10 @@
+
+
+
+dir("dev/benchmark_results/", pattern = "auc", full.names = TRUE) %>%
+  map_dfr(~ .x %>% readRDS()) %>%
+  nest(data = -c(add_outliers, max_cell_counts_per_sample)) %>%
+  pull(data) %>% .[[1]] %>%
+  ggplot(aes(slope, auc, color=name)) +
+  geom_line() +
+  facet_grid(n_samples ~ n_cell_type)
diff --git a/dev/simulate_data_stefano.R b/dev/simulate_data_stefano.R
@@ -8,6 +8,15 @@ library(limma)
 library(DirichletReg)
 library(MGLM)
 
+
+
+# Read arguments
+args = commandArgs(trailingOnly=TRUE)
+slope = as.numeric(args[1])
+n_samples = as.integer(args[2])
+max_cell_counts_per_sample = as.integer(args[3])
+output_file = args[4]
+
 # exposures = counts_obj %>% group_by(sample) %>% summarise(s=sum(count)) %>% pull(s) %>% sort %>% head(-1)
 beta_0 = readRDS("dev/beta_0.rds")
 
@@ -150,61 +159,61 @@ benchmark =
       )
   ))
 
+saveRDS(benchmark, output_file)
 
-
-benchmark %>%
-  pull(data) %>%
-  .[[1]] %>%
-  group_by(sample) %>%
-  mutate(proportion = (.value+1)/sum(.value+1)) %>%
-  ungroup(sample) %>%
-  ggplot(aes(factor(type), proportion, fill = cell_type %in% 1:4)) +
-  geom_boxplot() +
-  geom_jitter(size=0.5) +
-  scale_fill_manual(values = c("white", "#E2D379")) +
-  facet_wrap(~ as.integer(cell_type), ncol=5) +
-  theme_bw() +
-  theme(
-    strip.background =element_rect(fill="white", color="white"),
-    legend.position = "none",
-  )
-
-ggsave(
-  "dev/example_boxplot_no_outliers.pdf",
-  units = c("mm"),
-  width = 183/2 ,
-  height = 183/2,
-  limitsize = FALSE
-)
-
-
-probs = seq(0, 0.1,length.out = 50) %>% c(seq(0.1, 1,length.out = 50))
-
-benchmark_hypothesis =
-  benchmark %>%
-  dplyr::mutate(probs  = map(run, ~ !!probs)) %>%
-  unnest((probs) ) %>%
-  mutate(hypothesis_edger = map2(results_edger, (probs) , ~ .x %>% arrange(FDR) %>% mutate(positive = FDR<.y) %>% mutate(trend = logFC))) %>%
-  mutate(hypothesis_voom = map2(results_voom, (probs) , ~.x %>% arrange(adj.P.Val) %>% mutate(positive = adj.P.Val<.y) %>% mutate(trend = logFC))) %>%
-  mutate(hypothesis_speckle = map2(results_speckle, (probs) , ~ .x %>% arrange(FDR) %>% mutate(positive = FDR<.y) %>% mutate(trend = -Tstatistic    ))) %>%
-  mutate(hypothesis_sccomp = map2(
-    results_sccomp, (probs) ,
-    ~ .x  %>%
-      arrange(false_discovery_rate) %>%
-      mutate(positive = false_discovery_rate<.y) %>%
-      mutate(trend = .median_type )
-  )) %>%
-  mutate(hypothesis_DirichletMultinomial  = map2(
-    results_DirichletMultinomial , (probs) ,
-    ~ .x  %>%
-      arrange(false_discovery_rate) %>%
-      mutate(positive = false_discovery_rate<(.y)) %>%
-      mutate(trend = .median_type )
-  )) %>%
-  dplyr::select(-contains("results"))
-
-
-
+# benchmark %>%
+#   pull(data) %>%
+#   .[[1]] %>%
+#   group_by(sample) %>%
+#   mutate(proportion = (.value+1)/sum(.value+1)) %>%
+#   ungroup(sample) %>%
+#   ggplot(aes(factor(type), proportion, fill = cell_type %in% 1:4)) +
+#   geom_boxplot() +
+#   geom_jitter(size=0.5) +
+#   scale_fill_manual(values = c("white", "#E2D379")) +
+#   facet_wrap(~ as.integer(cell_type), ncol=5) +
+#   theme_bw() +
+#   theme(
+#     strip.background =element_rect(fill="white", color="white"),
+#     legend.position = "none",
+#   )
+#
+# ggsave(
+#   "dev/example_boxplot_no_outliers.pdf",
+#   units = c("mm"),
+#   width = 183/2 ,
+#   height = 183/2,
+#   limitsize = FALSE
+# )
+#
+#
+# probs = seq(0, 0.1,length.out = 50) %>% c(seq(0.1, 1,length.out = 50))
+#
+# benchmark_hypothesis =
+#   benchmark %>%
+#   dplyr::mutate(probs  = map(run, ~ !!probs)) %>%
+#   unnest((probs) ) %>%
+#   mutate(hypothesis_edger = map2(results_edger, (probs) , ~ .x %>% arrange(FDR) %>% mutate(positive = FDR<.y) %>% mutate(trend = logFC))) %>%
+#   mutate(hypothesis_voom = map2(results_voom, (probs) , ~.x %>% arrange(adj.P.Val) %>% mutate(positive = adj.P.Val<.y) %>% mutate(trend = logFC))) %>%
+#   mutate(hypothesis_speckle = map2(results_speckle, (probs) , ~ .x %>% arrange(FDR) %>% mutate(positive = FDR<.y) %>% mutate(trend = -Tstatistic    ))) %>%
+#   mutate(hypothesis_sccomp = map2(
+#     results_sccomp, (probs) ,
+#     ~ .x  %>%
+#       arrange(false_discovery_rate) %>%
+#       mutate(positive = false_discovery_rate<.y) %>%
+#       mutate(trend = .median_type )
+#   )) %>%
+#   mutate(hypothesis_DirichletMultinomial  = map2(
+#     results_DirichletMultinomial , (probs) ,
+#     ~ .x  %>%
+#       arrange(false_discovery_rate) %>%
+#       mutate(positive = false_discovery_rate<(.y)) %>%
+#       mutate(trend = .median_type )
+#   )) %>%
+#   dplyr::select(-contains("results"))
+#
+#
+#
 benchmark_hypothesis %>%
   dplyr::select(-contains("results")) %>%
   pivot_longer(contains("hypothesis"),names_prefix = "hypothesis_" ) %>%
@@ -246,15 +255,15 @@ benchmark_hypothesis %>%
   theme_bw() +
   theme(legend.position = "bottom")
 
-saveRDS(benchmark, "dev/benchmark_slope0.4_samples10_celltypes10_noOutliers_max1000exposure_50replicates.rds")
+# "dev/benchmark_slope0.4_samples10_celltypes10_noOutliers_max1000exposure_50replicates.rds")
 
 
-ggsave(
-  "dev/roc_slope0.4_samples10_celltypes10_noOutliers_max1000exposure_50replicates.pdf",
-  units = c("mm"),
-  width = 183/2 ,
-  height = 183/2,
-  limitsize = FALSE
-)
+# ggsave(
+#   "dev/roc_slope0.4_samples10_celltypes10_noOutliers_max1000exposure_50replicates.pdf",
+#   units = c("mm"),
+#   width = 183/2 ,
+#   height = 183/2,
+#   limitsize = FALSE
+# )