# Move WISTAR's CDS Files To a Temporary Staging Folder
## Date: 2022-02-24
## Author: Jeffrey Grover
**Purpose:** Copy WISTAR's CDS files to a temporary staging folder, as was done for HCI's files, in support of the CDS data submission.

### Load libraries

In [1]:
library(tidyverse)
library(readxl)
library(sevenbridges)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.6     ✔ dplyr   1.0.7
✔ tidyr   1.1.4     ✔ stringr 1.4.0
✔ readr   2.1.1     ✔ forcats 0.5.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()



### Load the metadata

In [2]:
# Read the CDS sample-attributes workbook and keep only its FILENAME column;
# these are the file names to be staged for the CDS upload.
cds_sample_attributes <- read_excel("wistar_files_sampleattributes_DS.xlsx")
cds_files <- cds_sample_attributes$FILENAME

# Sanity check: how many file names were read, and what do they look like?
length(cds_files)
head(cds_files)

In [3]:
# Read the CGC manifest export covering the WISTAR RNA-seq and WES files
cgc_manifest_csv <-
  "2022-02-24_wistar_rnaseq_wes_manifest_20220224_095810.csv"
cgc_files <- read_csv(cgc_manifest_csv)

# Preview the first rows
head(cgc_files)

[1mRows: [22m[34m382[39m [1mColumns: [22m[34m66[39m

[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (30): id, name, project, p1_1, passage, sample_id, sample_type, case_id,...
[32mdbl[39m  (3): size, paired_end, update_manifest
[33mlgl[39m (33): trimmed_read_count, Is FFPE, prop_ribosomal_bases, library_prep_ki...


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.



id,name,size,project,trimmed_read_count,p1_1,Is FFPE,passage,sample_id,prop_ribosomal_bases,⋯,dbgap_upload,model_id,paired_end,__inherit__,update_manifest,species,tumor_id,investigation,prop_20x_cov,experimental_strategy
<chr>,<chr>,<dbl>,<chr>,<lgl>,<chr>,<lgl>,<chr>,<chr>,<lgl>,⋯,<lgl>,<lgl>,<dbl>,<lgl>,<dbl>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>
5d805c02e4b06d0cb47c4dde,WISTAR/WES/wm4281_R1.fastq.gz,10388246367,pdxnet/pdxnet-datapool,,,,tumor,wm4281,,⋯,,,1,,,,,,,WES
5d805c02e4b06d0cb47c4ecf,WISTAR/WES/wm4281_R2.fastq.gz,11865334115,pdxnet/pdxnet-datapool,,,,tumor,wm4281,,⋯,,,2,,,,,,,WES
5d805c02e4b06d0cb47c4f19,WISTAR/WES/wm4281n_R1.fastq.gz,2742803646,pdxnet/pdxnet-datapool,,,,blood,wm4281n,,⋯,,,1,,,,,,,WES
5d805c02e4b06d0cb47c4f82,WISTAR/WES/wm4281n_R2.fastq.gz,3256676515,pdxnet/pdxnet-datapool,,,,blood,wm4281n,,⋯,,,2,,,,,,,WES
5d805c02e4b06d0cb47c4e1e,WISTAR/WES/wm4304_R1.fastq.gz,8838299944,pdxnet/pdxnet-datapool,,,,tumor,wm4304,,⋯,,,1,,,,,,,WES
5d805c02e4b06d0cb47c4e23,WISTAR/WES/wm4304_R2.fastq.gz,10346466097,pdxnet/pdxnet-datapool,,,,tumor,wm4304,,⋯,,,2,,,,,,,WES


In [4]:
# Only two columns are needed from the CGC manifest: the platform file id and
# the file name. Strip the folder prefix from `name` so it can be compared
# directly against the CDS file names.
cgc_files <- cgc_files %>%
  select(id, name) %>%
  mutate(name = basename(name))

head(cgc_files)
nrow(cgc_files)

id,name
<chr>,<chr>
5d805c02e4b06d0cb47c4dde,wm4281_R1.fastq.gz
5d805c02e4b06d0cb47c4ecf,wm4281_R2.fastq.gz
5d805c02e4b06d0cb47c4f19,wm4281n_R1.fastq.gz
5d805c02e4b06d0cb47c4f82,wm4281n_R2.fastq.gz
5d805c02e4b06d0cb47c4e1e,wm4304_R1.fastq.gz
5d805c02e4b06d0cb47c4e23,wm4304_R2.fastq.gz


### Check the files and filter for only cds files
Are all the cds files contained in the cgc files? If so, I can create a manifest that only has the cds files in it.

In [5]:
# Every CDS file must be present in the CGC manifest before a staging manifest
# is built from it. Stop here, naming the missing files, rather than letting
# the later cells quietly stage an incomplete set.
missing_cds_files <- setdiff(cds_files, cgc_files$name)
if (length(missing_cds_files) > 0) {
  stop(
    length(missing_cds_files), " CDS file(s) not found in the CGC manifest: ",
    paste(missing_cds_files, collapse = ", "),
    call. = FALSE
  )
}

# Displays TRUE when the check passes
all(cds_files %in% cgc_files$name)

In [6]:
# Build the manifest of CGC files to stage: the rows of `cgc_files` whose name
# appears in the CDS file list.
# NOTE(review): names were reduced to their basename earlier, so two platform
# files in different folders that share a file name would both be kept --
# confirm the row count below equals length(cds_files).
cds_files_manifest <- filter(cgc_files, name %in% cds_files)

nrow(cds_files_manifest)
head(cds_files_manifest)

id,name
<chr>,<chr>
5d805c02e4b06d0cb47c4dde,wm4281_R1.fastq.gz
5d805c02e4b06d0cb47c4ecf,wm4281_R2.fastq.gz
5d805c02e4b06d0cb47c4f19,wm4281n_R1.fastq.gz
5d805c02e4b06d0cb47c4f82,wm4281n_R2.fastq.gz
5d805c02e4b06d0cb47c4e1e,wm4304_R1.fastq.gz
5d805c02e4b06d0cb47c4e23,wm4304_R2.fastq.gz


### Copy the files to the cds staging folder

In [7]:
# Authenticate with the "cgc" profile from the user configuration file, then
# open the project that holds the files to copy
sbg_auth <- Auth(profile_name = "cgc", from = "file")
sbg_proj <- sbg_auth$project(id = "pdxnet/pdxnet-datapool")

Authenticating with user configuration file: ~/.sevenbridges/credentials

Authenticating with user profile: cgc



In [8]:
# Get the folder object for the staging area.
# The folder is reached by position in the listings: entry 11 of the project
# root, then entry 1 inside that folder. Positions can shift when items are
# added to or removed from the project, so verify the resolved folder's name
# before anything is copied into it.
expected_staging_name <- "2022-02-24_cds_upload_staging"

root_contents <- sbg_proj$get_root_folder()$list_folder_contents(complete = TRUE)
parent_contents <- root_contents[[11]]$list_folder_contents(complete = TRUE)
staging_folder <- parent_contents[[1]]

if (!isTRUE(staging_folder$name == expected_staging_name)) {
  stop(
    "Positional lookup did not resolve to '", expected_staging_name,
    "'; re-check the folder indices before copying.",
    call. = FALSE
  )
}

staging_folder
staging_folder$id

== Files ==
id : 6217a99bb07915019d2c0d29
name : 2022-02-24_cds_upload_staging
project : pdxnet/pdxnet-datapool
parent : 5d60008fe4b0b892538d34b4
type : folder

In [9]:
# Copy every file in the manifest into the staging folder, one at a time
manifest_ids <- cds_files_manifest$id
staging_id <- staging_folder$id

for (file_idx in seq_along(manifest_ids)) {
  # Throttle: wait 300 s (5 min) before every 200th file to stay under the
  # API call limits
  if (file_idx %% 200 == 0) {
    Sys.sleep(300)
  }

  # Fetch the file object by id, then copy it into the staging folder
  source_file <- sbg_proj$file(id = manifest_ids[file_idx])
  source_file$copy_to_folder(staging_id)
}

### Set the metadata for these files after the upload
The CDS upload was successful for all of these files so we can go ahead and add the `cds_status` and `dbgap_accession` metadata fields to these files.

In [10]:
# Tag every file in the manifest with its CDS upload status and the dbGaP
# accession
manifest_ids <- cds_files_manifest$id

for (file_idx in seq_along(manifest_ids)) {
  # Throttle: wait 300 s (5 min) before every 200th file to stay under the
  # API call limits
  if (file_idx %% 200 == 0) {
    Sys.sleep(300)
  }

  # Fetch the file object by id, then set its metadata fields
  uploaded_file <- sbg_proj$file(id = manifest_ids[file_idx])
  uploaded_file$set_meta(
    cds_status = "uploaded",
    dbgap_accession = "phs002432.v1.p1"
  )
}