# Download SRA annotations

## Load libraries

In [1]:
suppressPackageStartupMessages(library(reutils))
suppressPackageStartupMessages(library(xml2))
suppressPackageStartupMessages(library(glue))
suppressPackageStartupMessages(library(tidyverse))

## Define analysis-specific variables

In [2]:
# NCBI BioProject accession; used below as the search term for the SRA query
bioproject_number <- "PRJNA974090"

## Download SRA annotations and process them into the required columns

In [3]:
# Fetch the SRA records for the BioProject and parse them into an XML document.
# Steps, innermost call first:
#   1. esearch: search the "sra" database using the BioProject accession
#   2. efetch:  retrieve the records matched by that search
#   3. content: extract the response as text
#   4. read_xml: parse the text into an xml2 document
sra_info <- read_xml(
    content(
        efetch(
            esearch(bioproject_number, db = "sra"),
            db = "sra"
        ),
        as = "text"
    )
)

# Build the run-level annotation table (one row per run accession paired with
# the alias of its experiment) and write it to a TSV file.
#
# Optional steps that are currently disabled (kept for reference):
#   split the alias into an id and a name before the final select():
#     mutate(data = str_split(sample_name, "_", 2)) %>%
#     unnest_wider(data, names_sep = "_") %>%
#     rename(sample_id = data_1, sample_name = data_2)
#   try out the workflow on just a subset of the submitted files:
#     filter(str_detect(sample_name, '51'))
sra_annotations <- sra_info %>%
    xml_find_all("//EXPERIMENT") %>%
    # One row per EXPERIMENT node, one column per XML attribute
    xml_attrs() %>%
    bind_rows() %>%
    rename(srx = accession, sample_name = alias) %>%
    mutate(
        data = map(srx, function(x) {
            # XPath: find the EXPERIMENT_REF node whose accession attribute
            # equals this experiment accession, then step up to its parent
            # node and collect all of the parent's attributes
            run_xpath <- glue('//EXPERIMENT_REF[@accession="{x}"]/..')
            sra_info %>%
                xml_find_all(run_xpath) %>%
                xml_attrs() %>%
                bind_rows()
        })
    ) %>%
    unnest(data) %>%
    # After unnesting, `accession` holds the parent node's (run) accession
    rename(srr = accession) %>%
    # Keep only the run accession and the experiment alias (as sample_id)
    select(srr, sample_id = sample_name) %>%
    # write_tsv() passes its input through, so the table is still assigned
    write_tsv("../../annotations/sra_annotations.tsv")

# Display the resulting table
sra_annotations

srr,sample_id
<chr>,<chr>
SRR24783016,yeast_cyto_linkage
SRR24728123,47_5_lib_5
SRR24728124,47_5_lib_2
SRR24726858,47_5_lib_4
SRR24726859,47_5_lib_1
SRR24725018,67lib4-2
SRR24725020,67lib2-2
SRR24651716,51lib2
SRR24651717,51lib1
