# Preparing a terra.bio sample table from GEO/SRA
TODO

In [None]:
# BioProject accession whose runs will populate the sample table.
# Swap the active line to switch projects.
# bioproject <- "PRJNA668299" # Mellor Lab - Spt4
bioproject <- "PRJNA669852" # Churchman Lab - dozens of regulatory factors

# Assembly name. The download URL and the local file name are derived from it
# so that changing genomes only requires editing this one value.
# NOTE(review): assumes the UCSC goldenPath/<db>/bigZips/<db>.fa.gz layout
# holds for any assembly substituted here - confirm when changing genomeName.
genomeName <- "sacCer3"
genome_fasta <- sprintf(
    "https://hgdownload.soe.ucsc.edu/goldenPath/%s/bigZips/%s.fa.gz",
    genomeName, genomeName
)
genome_local_name <- paste0(genomeName, ".fa")

In [None]:
# Packages this notebook relies on; they are installed on demand and attached
# by install_and_load().
required_packages <- c(
    "AnVIL",
    "xml2",
    "rentrez",
    "glue",
    "kableExtra",
    "tidyverse"
)
# Install any packages that are not yet available, then attach all of them.
#
# Arguments:
#   packages  Character vector of package names.
#
# Returns NULL (invisibly); the function is called for its side effects.
#
# Missing packages are installed through BiocManager::install(). BiocManager
# itself is installed from CRAN first when it is absent, because calling
# BiocManager::install() without it fails before anything can be installed.
install_and_load <- function(packages) {
    # Scan the library once and reuse the result for both the test and install.
    missing_packages <- setdiff(packages, rownames(installed.packages()))
    if (length(missing_packages) > 0) {
        if (!requireNamespace("BiocManager", quietly = TRUE)) {
            install.packages("BiocManager", repos = "https://cloud.r-project.org")
        }
        BiocManager::install(missing_packages)
    }
    # library() is called purely for its side effect, so use a plain loop
    # rather than building a throw-away list with lapply().
    for (pkg in packages) {
        library(pkg, character.only = TRUE)
    }
    invisible(NULL)
}
# Install/attach everything listed in required_packages. The NULL result is
# assigned to x, presumably so that it is not echoed as cell output.
x <- install_and_load(required_packages)

In [None]:
# Define functions
# NOTE: use rentrez to avoid using the SRAdb secondary source.
# Build a one-row-per-run table of SRA metadata for a BioProject.
#
# Arguments:
#   bioproject     Search term for the Entrez "bioproject" database
#                  (e.g. an accession such as "PRJNA669852").
#   semantic_name  Currently unused; kept so existing callers keep working.
#   org            Currently unused; kept so existing callers keep working.
#   retmax         Passed through to entrez_fetch().
#
# Returns a tibble with the columns bioproject, experiment_id, biosample_id,
# sra_sample_id, run_id, sample_title, total_reads, total_bases and size.
# Every value taken from the XML is a character string (NA when absent).
#
# Stops with an error when the BioProject cannot be found or when it has no
# linked SRA records.
create_sample_grid <- function(bioproject,
            semantic_name = "fastq", org = "GCP", retmax = NULL) {

    bpid <- entrez_search(db = "bioproject", term = bioproject)
    if (bpid$count == 0) {
        stop(glue::glue("Bioproject <{bioproject}> not found"))
    }

    bp_links <- entrez_link(dbfrom = "bioproject", id = bpid$ids, db = "sra")
    sra_ids <- bp_links$links$bioproject_sra
    # Without this guard an empty id set reaches entrez_fetch() and fails with
    # an error that says nothing about the BioProject.
    if (length(sra_ids) == 0) {
        stop(glue::glue("Bioproject <{bioproject}> has no linked SRA records"))
    }
    x <- read_xml(entrez_fetch(db = "sra", id = sra_ids, rettype = "xml", retmax = retmax))

    runs <- xml_find_all(x, '/EXPERIMENT_PACKAGE_SET/EXPERIMENT_PACKAGE/RUN_SET/RUN')
    run_id <- xml_attr(runs, "accession")
    # xml_find_first() yields exactly one entry per run (missing -> NA), so the
    # derived columns stay aligned with run_id even if a run has no
    # EXPERIMENT_REF child; xml_find_all() would silently drop such runs.
    expref <- xml_find_first(runs, "EXPERIMENT_REF")
    experiment_id <- xml_attr(expref, "accession")
    biosample_id <- xml_attr(expref, "refname")
    # The attribute name used to be misspelled ("total_readss"), which never
    # matches and left this column entirely NA. Read "total_reads" and fall
    # back to "total_spots" where it is absent.
    # NOTE(review): SRA run records are expected to carry the count as
    # total_spots - confirm against a live record.
    total_reads <- xml_attr(runs, "total_reads")
    total_spots <- xml_attr(runs, "total_spots")
    no_read_count <- is.na(total_reads)
    total_reads[no_read_count] <- total_spots[no_read_count]
    total_bases <- xml_attr(runs, "total_bases")
    size <- xml_attr(runs, "size")
    # Only reporting first member in pool
    member <- xml_find_first(runs, "Pool/Member")
    sample_title <- xml_attr(member, "sample_title")
    sra_sample_id <- xml_attr(member, "accession")
    tibble(bioproject, experiment_id, biosample_id, sra_sample_id, run_id, sample_title, total_reads, total_bases, size)
}

In [None]:
# Look up the BioProject record and render a few of its summary fields as a
# two-column (attribute, value) table.
bpid <- entrez_search(term = bioproject, db = "bioproject")
bp_summary <- entrez_summary(id = bpid$ids, db = "bioproject")
kable(
    t(sapply(
        c("project_acc",
          "project_name",
          "submitter_organization",
          "registration_date"),
        function(field) data.frame(attribute = field, value = bp_summary[[field]]),
        USE.NAMES = FALSE
    )),
    format = "pipe",
    caption = "Bioproject Summary Attributes"
)

In [None]:
# Fetch the run-level metadata for the configured BioProject
result <- create_sample_grid(bioproject)
# TODO Clean up sample_id's if we are going to allow multiple assays
# Infer strain and assay type from sample_title
sample <- result %>%
    separate(
        sample_title,
        into = c("sample_id", "assay"),
        sep = "_",
        remove = FALSE
    ) %>%
    separate(
        sample_id,
        into = "strain",
        sep = "-",
        extra = "drop",
        remove = FALSE
    ) %>%
    # Keep the NETseq rows only
    filter(assay == "NETseq") %>%
    relocate(sample_id) %>%
    arrange(sample_title) %>%
    # The id column is renamed last, after the steps that refer to sample_id
    rename("entity:sample_id" = sample_id)

kable(sample, format = "pipe", caption = "sample entity table")

In [None]:
# Hand the sample table built above to avtable_import().
# NOTE(review): presumably this uploads it as the "sample" entity table of the
# current workspace - confirm against the AnVIL package documentation.
sample %>% avtable_import
# TODO report before and after statistics

In [None]:
# Copy genome fasta to local bucket
# Steps: download the gzipped FASTA into the session temp directory,
# decompress it there, then copy the uncompressed file to the bucket.
work_dir <- tempdir()
genome_local_fa <- file.path(work_dir, genome_local_name)
genome_local_gz <- paste0(genome_local_fa, ".gz")
# mode = "wb": the archive is binary, and a text-mode transfer can corrupt it
# on platforms that translate line endings.
download_status <- download.file(genome_fasta, genome_local_gz, mode = "wb")
if (download_status != 0) {
    stop("Download of ", genome_fasta, " failed with status ", download_status)
}
# TODO Assumes we are in gzip format. For simplicity, save uncompressed for now ...
# The path is shell-quoted so spaces or metacharacters in the temp directory
# cannot break the command, and the exit status is checked instead of ignored.
gunzip_status <- system2("gunzip", c("-f", shQuote(genome_local_gz)))
if (gunzip_status != 0 || !file.exists(genome_local_fa)) {
    stop("gunzip of ", genome_local_gz, " failed with status ", gunzip_status)
}
gs_uri <- glue("{avbucket()}/{genome_local_name}")
gsutil_cp(genome_local_fa, gs_uri)