# Download SRA annotations

## Load libraries

In [13]:
suppressPackageStartupMessages(library(reutils))
suppressPackageStartupMessages(library(xml2))
suppressPackageStartupMessages(library(glue))
suppressPackageStartupMessages(library(tidyverse))

## Define analysis-specific variables

In [2]:
# BioProject accession used as the search term for the SRA query in this notebook
bioproject_number <- "PRJNA785998"

## Download SRA annotations and process them into the required columns

In [56]:
# Query the SRA database for the BioProject, fetch the matching records and
# parse the returned text as an XML document.
# NOTE(review): esearch() may cap the number of returned records by default
# (retmax) - confirm the result is complete for large BioProjects.
sra_info <- local({
    # Search SRA using the BioProject accession as the query term
    search_hits <- esearch(bioproject_number, db = "sra")
    # Retrieve the records for the hits found above
    fetched_records <- efetch(search_hits, db = "sra")
    # Extract the response body as text and parse it into an xml2 document
    read_xml(content(fetched_records, as = "text"))
})

# Build the annotation table (one row per run accession) from the SRA XML:
#   1. collect the attributes of every EXPERIMENT node
#      (accession -> srx, alias -> sample_name),
#   2. attach the attributes of the node(s) that reference each experiment
#      (their accession becomes srr),
#   3. count the SRAFile entries listed under each run,
#   4. split the experiment alias at its first underscore into
#      sample_id and sample_name,
#   5. write the table to disk; the value returned by write_tsv() is kept in
#      sra_annotations and displayed below.
sra_annotations <- sra_info %>%
    xml_find_all("//EXPERIMENT") %>%
    map_df(xml_attrs) %>%
    rename(srx = accession, sample_name = alias) %>%
    # XPath: find the EXPERIMENT_REF node whose accession attribute equals this
    # srx accession, step up to its parent node and take all of its attributes
    mutate(data = map(srx, function(x) {
        parent_nodes <- xml_find_all(
            sra_info,
            glue('//EXPERIMENT_REF[@accession="{x}"]/..')
        )
        bind_rows(xml_attrs(parent_nodes))
    })) %>%
    unnest(data) %>%
    rename(srr = accession) %>%
    # nreads = number of SRAFile entries under the run, minus one.
    # NOTE(review): presumably one SRAFile entry is not an uploaded read file
    # (e.g. the archive itself) - confirm the reason for the offset.
    # 1L keeps the result an integer: `length(x) - 1` is a double, which older
    # purrr versions refuse to coerce inside map_int().
    mutate(nreads = map_int(srr, function(x) {
        sra_files <- xml_find_all(
            sra_info,
            glue('//RUN[@accession="{x}"]/SRAFiles/SRAFile')
        )
        length(sra_files) - 1L
    })) %>%
    # Split the alias into at most two pieces at the first underscore
    mutate(data = str_split(sample_name, "_", n = 2)) %>%
    select(srr, data, nreads) %>%
    # Spread the two pieces into columns data_1 and data_2
    unnest_wider(data, names_sep = "_") %>%
    rename(sample_id = data_1, sample_name = data_2) %>%
    write_tsv("../../annotations/sra_annotations.tsv")

sra_annotations

srr,sample_id,sample_name,nreads
<chr>,<chr>,<chr>,<list>
SRR17125806,146p28,stall4control4_linkage,2
SRR17125807,146p27,stall4control4_linkage,2
SRR17125808,146p26,stall4control4_linkage,2
SRR17125809,146p25,stall4control4_linkage,2
SRR17125810,146p24,stall4control4_linkage,2
SRR17125811,146p23,stall4control4_linkage,2
SRR17125812,139p7,endo12k_linkage,1
SRR17125813,146p16,stall4control4_t8,1
SRR17125814,146p15,stall4control4_t4,1
SRR17125815,146p14,stall4control4_t2,1
