# In [None]:
## Loading libraries
suppressPackageStartupMessages({
  library(dplyr)
  library(data.table)
  library(ggplot2)
  library(ggrepel)
  library(scales)
})

## Step 1: Define the benchmark directory and files
benchmark_dir <- "genopred/output/reference/benchmarks"

# Benchmark files produced by the pipeline run, listed in pipeline order.
# The per-chromosome target-formatting files (1 to 22) are generated rather
# than spelled out one by one.
benchmark_files <- c(
  "ancestry_inference_i-ukb.txt",
  "ancestry_reporter-ukb.txt",
  sprintf("format_target_i-ukb-%d.txt", 1:22),
  "ldsc-T2D_SUZUKI.txt",
  "pc_projection_i-ukb-TRANS.txt",
  "prep_pgs_dbslmm_i-T2D_SUZUKI.txt",
  "prep_pgs_lassosum_i-T2D_SUZUKI.txt",
  "prep_pgs_megaprs_i-T2D_SUZUKI.txt",
  "prep_pgs_prscsx_i-T2D.txt",
  "prep_pgs_prscs_i-T2D_SUZUKI.txt",
  "prep_pgs_ptclump_i-T2D_SUZUKI.txt",
  "prep_pgs_quickprs_i-T2D_SUZUKI.txt",
  "ref_pca_i-TRANS.txt",
  "ref_pgs.txt",
  "sample_report_i-ukb.txt",
  "sumstat_prep_i-T2D_SUZUKI.txt",
  "target_pgs_i-ukb-TRANS.txt"
)

## Step 2: Function to parse benchmark files

# Classify a benchmark task name into a pipeline category and subcategory.
#
# task_name: file name of the benchmark with the ".txt" extension removed.
# Returns a list with character elements `category` and `subcategory`.
# Rules are tested in the order written and the first match wins; a name that
# matches no rule is labelled "Other"/"Other".
.categorise_benchmark_task <- function(task_name) {
  # All patterns are literal substrings, so match them as fixed strings.
  has <- function(pattern) grepl(pattern, task_name, fixed = TRUE)

  if (has("format_target")) {
    list(category = "Target QC", subcategory = "Target Formatting")
  } else if (has("ancestry_inference")) {
    list(category = "Ancestry Inference", subcategory = "Ancestry Inference")
  } else if (has("ancestry_reporter")) {
    list(category = "Ancestry Inference", subcategory = "Ancestry Reporting")
  } else if (has("sample_report")) {
    list(category = "Target QC", subcategory = "Sample Reporting")
  } else if (has("ref_pca")) {
    list(category = "Ancestry Inference", subcategory = "Reference PCA")
  } else if (has("pc_projection")) {
    list(category = "Ancestry Inference", subcategory = "PCA Projection")
  } else if (has("ldsc") || has("sumstat_prep")) {
    list(category = "GWAS QC", subcategory = "Summary Statistics")
  } else if (has("prep_pgs")) {
    # "prep_pgs_<method>_i-<gwas>" -> "<METHOD> PGS". Everything from "_i-"
    # onwards is dropped, so the label does not depend on the GWAS name.
    method <- gsub("prep_pgs_", "", task_name, fixed = TRUE)
    method <- sub("_i-.*$", "", method)
    list(category = "PGS Methods",
         subcategory = paste0(toupper(method), " PGS"))
  } else if (has("ref_pgs")) {
    list(category = "PGS Methods", subcategory = "Reference PGS")
  } else if (has("target_pgs")) {
    list(category = "Target Scoring", subcategory = "Target PGS")
  } else {
    list(category = "Other", subcategory = "Other")
  }
}

# Read one benchmark file and return its timings as a data frame.
#
# filename: name of the benchmark file (e.g. "ref_pgs.txt").
# dir:      directory containing the file; defaults to the script-level
#           `benchmark_dir`, so existing one-argument calls are unchanged.
#
# Returns a data.frame with columns Task, Category, Subcategory,
# Wall_Time_Seconds (from column `s`), CPU_Time_Seconds (from `cpu_time`, or
# NA when the file has no such column) and Max_RSS_GB (`max_rss` / 1024).
# Returns NULL, after printing a message, when the file does not exist or
# cannot be parsed.
parse_benchmark_file <- function(filename, dir = benchmark_dir) {
  filepath <- file.path(dir, filename)

  if (!file.exists(filepath)) {
    cat("File not found:", filepath, "\n")
    return(NULL)
  }

  tryCatch({
    data <- fread(filepath)

    # Fail loudly (inside the handler below) rather than silently returning a
    # data frame that lacks the timing or memory column.
    missing_cols <- setdiff(c("s", "max_rss"), names(data))
    if (length(missing_cols) > 0) {
      stop("missing required column(s): ",
           paste(missing_cols, collapse = ", "), call. = FALSE)
    }

    # Strip only a trailing literal ".txt". An unanchored ".txt" pattern
    # would treat "." as a wildcard and could remove text inside the name.
    task_name <- sub("\\.txt$", "", filename)
    labels <- .categorise_benchmark_task(task_name)

    # Keep the CPU column present (as NA) when the file does not report it,
    # so every parsed file yields the same set of columns.
    cpu_time <- if ("cpu_time" %in% names(data)) {
      data[["cpu_time"]]
    } else {
      rep(NA_real_, nrow(data))
    }

    data.frame(
      Task = task_name,
      Category = labels$category,
      Subcategory = labels$subcategory,
      Wall_Time_Seconds = data[["s"]],
      CPU_Time_Seconds = cpu_time,
      Max_RSS_GB = data[["max_rss"]] / 1024,  # Convert MB to GB
      stringsAsFactors = FALSE
    )
  }, error = function(e) {
    cat("Error reading", filename, ":", conditionMessage(e), "\n")
    NULL
  })
}

## Step 3: Read and combine all benchmark files
cat("Reading benchmark files...\n")

# Parse every listed file in order, announcing each one first. Files that are
# missing or unreadable come back as NULL and are discarded afterwards.
all_benchmarks <- lapply(
  setNames(benchmark_files, benchmark_files),
  function(benchmark_file) {
    cat("Processing:", benchmark_file, "\n")
    parse_benchmark_file(benchmark_file)
  }
)
all_benchmarks <- Filter(Negate(is.null), all_benchmarks)

# Stack the per-file tables; fill = TRUE pads columns absent from some files
benchmark_data <- rbindlist(all_benchmarks, fill = TRUE)

# Abort early when nothing could be read
if (nrow(benchmark_data) == 0) {
  stop("No benchmark data was successfully read. Please check file paths and formats.")
}

cat("Successfully processed", nrow(benchmark_data), "benchmark files\n")

## Step 4: Add the African subsetting step (911 minutes = 54660 seconds)
# This step has no benchmark file, so its row is entered by hand. Only the
# wall time (911 minutes) is given; CPU time is assumed equal to it and the
# memory figure is an estimate.
subsetting_seconds <- 911 * 60  # minutes -> seconds

african_subsetting <- data.frame(
  Task = "african_subsetting_plink2",
  Category = "Data Subsetting",
  Subcategory = "African Subsetting",
  Wall_Time_Seconds = subsetting_seconds,
  CPU_Time_Seconds = subsetting_seconds,
  Max_RSS_GB = 50,
  stringsAsFactors = FALSE
)

# Append the manual row to the parsed benchmarks
benchmark_data <- rbind(benchmark_data, african_subsetting)

## Step 5: Aggregate by category for timeline plot
# Total wall time (seconds) per category
timeline_data <- benchmark_data %>%
  group_by(Category) %>%
  summarise(
    values = sum(Wall_Time_Seconds),
    .groups = "drop"
  )

# GenoPred's categories plus our additional "Data Subsetting" category, in
# the order they should appear along the timeline.
timeline_data$category <- factor(timeline_data$Category,
                                 levels = c("Data Subsetting",
                                            "GWAS QC",
                                            "Target QC",
                                            "Ancestry Inference",
                                            "PGS Methods",
                                            "Target Scoring",
                                            "Report Creation"))

# Drop rows whose Category is not one of the levels above (e.g. "Other"),
# then put the remaining rows in level order.
timeline_data <- timeline_data[!is.na(timeline_data$category), ]
timeline_data <- timeline_data[order(timeline_data$category), ]

# Percentages, rectangle bounds and labels, computed on whole columns so the
# code also works for zero rows and never indexes a column that does not
# exist yet. Columns are created in the same order as before.
timeline_data <- timeline_data %>%
  mutate(
    perc = values / sum(values) * 100,
    cum_perc = cumsum(perc),
    # Each rectangle starts where the previous one ended; the first starts at 0
    start = head(c(0, cum_perc), -1),
    end = cum_perc,
    # Midpoint of the rectangle, used to anchor its label
    label_position = cum_perc - perc / 2,
    # Human-readable duration: seconds below 1 min, minutes below 1 hr, else hours
    time_clean = case_when(
      is.na(values) ~ NA_character_,
      values < 60 ~ paste0(round(values, 1), " sec"),
      values < 3600 ~ paste0(round(values / 60, 1), " min"),
      TRUE ~ paste0(round(values / 3600, 1), " hr")
    ),
    label = paste0(category,
                   "\n(", time_clean, ", ",
                   round(perc, 1), "%)")
  )

## Step 6: Create the timeline plot using GenoPred's exact styling
cat("\nCreating GenoPred-style timeline plot...\n")

# Fill colour per category (also reused by the subcategory bar plot)
category_colors <- c(
  'Data Subsetting' = '#17becf',  # Added cyan color for the new category
  'GWAS QC' = '#1f77b4',
  'Target QC' = '#ff7f0e',
  'Ancestry Inference' = '#2ca02c',
  'PGS Methods' = '#d62728',
  'Target Scoring' = '#9467bd',
  'Report Creation' = '#8c564b'
)

# One rectangle per category spanning its share of total wall time, with a
# repelled text label anchored at the rectangle's midpoint.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for line
# widths; `size` is kept so older ggplot2 installs still work.
genopred_timeline <- ggplot(timeline_data,
                            aes(
                              xmin = start,
                              xmax = end,
                              ymin = 0,
                              ymax = 0.1,
                              fill = category
                            )) +
  geom_rect(colour = 'black', size = 0.1) +
  geom_text_repel(
    aes(
      x = label_position,
      y = 0.1,
      label = label,
      segment.square  = TRUE,
      segment.inflect = TRUE
    ),
    force = 100,
    nudge_y           = 0.15,
    hjust = 0.5,
    segment.size      = 0.3,
    segment.curvature = -0.1,
    segment.color = 'darkgrey',
    box.padding = 0.6,
    size = 4  # Increased font size
  ) +
  scale_x_continuous(breaks = seq(0, 100, by = 20)) +
  scale_fill_manual(values = category_colors) +
  # Wider x limits and clip = "off" leave room for labels outside 0-100
  coord_cartesian(clip = "off", xlim = c(-20, 120), ylim = c(0, 0.3)) +
  labs(
    #title = "GenoPred Pipeline Computational Timeline",
    x = 'Time (%)',
    y = NULL
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    axis.text.x = element_text(size = 12),
    axis.title.x = element_text(size = 12),
    axis.line.x = element_line(color = "black", size = 0.5),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = "none",
    plot.background = element_rect(fill = "white", color = NA),
    panel.background = element_rect(fill = "white", color = NA)
  )
genopred_timeline

# Save with high DPI. The output directory is created first so a fresh
# checkout does not fail, and the message reports the path actually written.
timeline_png <- "prs_output/genopred_pipeline_timeline_with_subsetting.png"
dir.create(dirname(timeline_png), recursive = TRUE, showWarnings = FALSE)
ggsave(timeline_png, genopred_timeline, width = 14, height = 8, dpi = 300)
cat("GenoPred-style timeline plot saved to: ", timeline_png, "\n", sep = "")

## Detailed breakdown by subcategory
# Total wall time and number of tasks per (Category, Subcategory), sorted by
# category and then by descending time within each category. The time label
# is computed on the whole column, so an empty table is handled without error.
subcategory_data <- benchmark_data %>%
  group_by(Category, Subcategory) %>%
  summarise(
    values = sum(Wall_Time_Seconds),
    Tasks = n(),
    .groups = "drop"
  ) %>%
  arrange(Category, desc(values)) %>%
  mutate(
    # Human-readable duration: seconds below 1 min, minutes below 1 hr, else hours
    time_clean = case_when(
      is.na(values) ~ NA_character_,
      values < 60 ~ paste0(round(values, 1), " sec"),
      values < 3600 ~ paste0(round(values / 60, 1), " min"),
      TRUE ~ paste0(round(values / 3600, 1), " hr")
    )
  )

## Bar plot for subcategory breakdown
# Horizontal bars (via coord_flip) of wall time in hours per subcategory,
# ordered by time, coloured by category and annotated with the time label.
subcategory_plot <- ggplot(subcategory_data, aes(x = reorder(Subcategory, values), y = values / 3600)) +
  geom_col(aes(fill = Category), alpha = 0.8) +
  geom_text(aes(label = time_clean), hjust = -0.1, size = 3.5) +
  scale_fill_manual(values = category_colors) +
  coord_flip() +
  labs(
    #title = "GenoPred Pipeline: Computational Time by Task\n(with African Subsetting)",
    x = "",
    y = "Wall Time (Hours)",
    fill = "Category"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.title = element_text(size = 12),
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 10)
  )
subcategory_plot

# Create the output directory if needed, and report the path actually written
subcategory_png <- "prs_output/genopred_pipeline_subcategory_breakdown_with_subsetting.png"
dir.create(dirname(subcategory_png), recursive = TRUE, showWarnings = FALSE)
ggsave(subcategory_png, subcategory_plot, width = 12, height = 10, dpi = 300)
cat("Subcategory breakdown plot saved to: ", subcategory_png, "\n", sep = "")


## PGS Methods computational cost breakdown
cat("\nCreating PGS Methods computational cost breakdown...\n")

# Rows for the PGS-method tasks only. Method is the subcategory without its
# " PGS" suffix; Population is taken from a population tag in the task name,
# defaulting to "General" when none is present.
pgs_data <- benchmark_data %>%
  filter(Category == "PGS Methods") %>%
  mutate(
    Method = gsub(" PGS", "", Subcategory),
    Population = case_when(
      grepl("AMR", Task) ~ "AMR",
      grepl("SSA", Task) ~ "SSA",
      grepl("TRANS", Task) ~ "TRANS",
      TRUE ~ "General"
    )
  )

# Totals per method and population: summed wall/CPU time, peak memory and
# task count, plus a human-readable wall-time label. Rows are left in the
# order summarise() returns them (no explicit sort).
pgs_summary <- pgs_data %>%
  group_by(Method, Population) %>%
  summarise(
    Wall_Time_Seconds = sum(Wall_Time_Seconds),
    CPU_Time_Seconds = sum(CPU_Time_Seconds),
    Max_RSS_GB = max(Max_RSS_GB),
    Tasks = n(),
    .groups = "drop"
  ) %>%
  mutate(
    time_clean = case_when(
      is.na(Wall_Time_Seconds) ~ NA_character_,
      Wall_Time_Seconds < 60 ~ paste0(round(Wall_Time_Seconds, 1), " sec"),
      Wall_Time_Seconds < 3600 ~ paste0(round(Wall_Time_Seconds / 60, 1), " min"),
      TRUE ~ paste0(round(Wall_Time_Seconds / 3600, 1), " hr")
    )
  )

# Timeline table: one row per method with its share of the total PGS wall
# time. All derived columns are computed on whole columns, so the block also
# runs when there are no PGS rows and never indexes a column that does not
# exist yet. Columns are created in the same order as before.
pgs_timeline_data <- pgs_summary %>%
  group_by(Method) %>%
  summarise(
    values = sum(Wall_Time_Seconds),
    .groups = "drop"
  ) %>%
  mutate(
    perc = values / sum(values) * 100,
    cum_perc = cumsum(perc),
    # Each rectangle starts where the previous one ended; the first starts at 0
    start = head(c(0, cum_perc), -1),
    end = cum_perc,
    # Midpoint of the rectangle, used to anchor its label
    label_position = cum_perc - perc / 2,
    time_clean = case_when(
      is.na(values) ~ NA_character_,
      values < 60 ~ paste0(round(values, 1), " sec"),
      values < 3600 ~ paste0(round(values / 60, 1), " min"),
      TRUE ~ paste0(round(values / 3600, 1), " hr")
    ),
    label = paste0(Method,
                   "\n(", time_clean, ", ",
                   round(perc, 1), "%)")
  )

# Create PGS methods timeline plot
# Same layout as the pipeline timeline: one rectangle per PGS method spanning
# its share of total PGS wall time, labelled at the rectangle's midpoint.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for line
# widths; `size` is kept so older ggplot2 installs still work.
pgs_timeline <- ggplot(pgs_timeline_data,
                       aes(
                         xmin = start,
                         xmax = end,
                         ymin = 0,
                         ymax = 0.1,
                         fill = Method
                       )) +
  geom_rect(colour = 'black', size = 0.1) +
  geom_text_repel(
    aes(
      x = label_position,
      y = 0.1,
      label = label,
      segment.square  = TRUE,
      segment.inflect = TRUE
    ),
    force = 100,
    nudge_y           = 0.15,
    hjust = 0.5,
    segment.size      = 0.3,
    segment.curvature = -0.1,
    segment.color = 'darkgrey',
    box.padding = 0.6,
    size = 4
  ) +
  scale_x_continuous(breaks = seq(0, 100, by = 20)) +
  scale_fill_brewer(palette = "Set3", name = "PGS Method") +
  # Wider x limits and clip = "off" leave room for labels outside 0-100
  coord_cartesian(clip = "off", xlim = c(-20, 120), ylim = c(0, 0.3)) +
  labs(
    #title = "PGS Methods Computational Cost Breakdown",
    x = 'Time (%)',
    y = NULL
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    axis.text.x = element_text(size = 12),
    axis.title.x = element_text(size = 12),
    axis.line.x = element_line(color = "black", size = 0.5),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = "none",
    plot.background = element_rect(fill = "white", color = NA),
    panel.background = element_rect(fill = "white", color = NA)
  )
# Display once (the plot was previously printed a second time after saving)
pgs_timeline

# Save PGS timeline plot. The output directory is created first, and the
# message reports the path actually written.
pgs_timeline_png <- "prs_output/pgs_methods_computational_timeline.png"
dir.create(dirname(pgs_timeline_png), recursive = TRUE, showWarnings = FALSE)
ggsave(pgs_timeline_png, pgs_timeline, width = 14, height = 8, dpi = 300)
cat("PGS methods timeline plot saved to: ", pgs_timeline_png, "\n", sep = "")

## Save summary data
# Output CSV name -> table to write. All files go to the working directory.
summary_exports <- list(
  "genopred_timeline_summary_with_subsetting.csv" = timeline_data,
  "genopred_subcategory_breakdown_with_subsetting.csv" = subcategory_data,
  "genopred_detailed_benchmark_data_with_subsetting.csv" = benchmark_data
)

# Write every table first, then list the files that were written
for (csv_name in names(summary_exports)) {
  write.csv(summary_exports[[csv_name]], csv_name, row.names = FALSE)
}

cat("\n Summary data saved to:\n")
cat(paste0("   - ", names(summary_exports), "\n"), sep = "")

## Print overall summary
cat("\n", strrep("=", 60), "\n")
cat("GENOPRED PIPELINE COMPUTATIONAL SUMMARY (WITH AFRICAN SUBSETTING)\n")
cat(strrep("=", 60), "\n")
cat("Total Pipeline Wall Time:", round(sum(benchmark_data$Wall_Time_Seconds) / 3600, 2), "hours\n")
# The combine step uses fill = TRUE, so CPU time can be NA for tasks whose
# benchmark file has no cpu_time column; sum what is available instead of
# reporting NA for the whole pipeline.
cat("Total Pipeline CPU Time:", round(sum(benchmark_data$CPU_Time_Seconds, na.rm = TRUE) / 3600, 2), "hours\n")
cat("Number of Tasks:", nrow(benchmark_data), "\n")
cat("Peak Memory Usage:", round(max(benchmark_data$Max_RSS_GB), 2), "GB\n\n")

cat("Time Distribution by Category:\n")
# sprintf() is vectorised over the rows, so an empty table prints nothing
# instead of failing on a 1:0 index sequence.
cat(sprintf("  %-20s: %6.1f%% (%s)\n",
            as.character(timeline_data$category),
            timeline_data$perc,
            timeline_data$time_clean),
    sep = "")
cat(strrep("=", 60), "\n")

# Display the main timeline plot
print(genopred_timeline)

### Combined plot
library(gridExtra)

# Add A/B panel labels (left-aligned titles) to the individual plots first
genopred_timeline_A <- genopred_timeline +
  labs(title = "A") +
  theme(plot.title = element_text(size = 16, face = "bold", hjust = 0))

pgs_timeline_B <- pgs_timeline +
  labs(title = "B") +
  theme(plot.title = element_text(size = 16, face = "bold", hjust = 0))

# Stack the two panels vertically: pipeline timeline on top, PGS methods below
combined_plot <- grid.arrange(genopred_timeline_A, pgs_timeline_B,
                              ncol = 1, nrow = 2)

# Save the combined two-panel figure. The output directory is created first,
# and the message reports the path actually written.
combined_png <- "prs_output/computational_timeline.png"
dir.create(dirname(combined_png), recursive = TRUE, showWarnings = FALSE)
ggsave(combined_png, combined_plot, width = 14, height = 8, dpi = 300)
cat("Computational timeline plot saved to: ", combined_png, "\n", sep = "")
