# Move HCI's CDS Files To a Temporary Staging Folder
## Date: 2022-02-23
## Author: Jeffrey Grover
**Purpose:** I have the full list of HCI files to send to the CDS, as well as credentials for the CDS bucket. Copying these files into a temporary staging area will make the upload easier. After the upload, the staged files will be removed.

### Load libraries

In [1]:
library(tidyverse)
library(sevenbridges)

── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.1     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



### Load the CDS Files and CGC Manifest
These are needed to look up the file IDs, so that the files can be copied to the staging folder.

In [2]:
# Read the dbGaP sample-mapping sheet and keep only its FILENAME column,
# which lists the names of the files to be sent to the CDS.

cds_files <- read_csv('HCI_dbGaP_SampleMapping_SampleAttributes_DS_v1.csv') %>%
    pull(FILENAME)

# Number of file names listed in the sheet
length(cds_files)

New names:
* `` -> ...15
* `` -> ...16

[1mRows: [22m[34m281[39m [1mColumns: [22m[34m16[39m

[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (5): id, SAMPLE_ID, FILENAME, ALT_SAMLE_ID, SUBJECT_ID
[32mdbl[39m (8): DATA_FROMAT, SAMPLE_TYPE, PASSAGE, EXPERIMENTAL_STRATEGY, PAIRED_EN...
[33mlgl[39m (3): FILESIZE, CHECKSUM, ...15


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.



In [5]:
# Read the CGC manifest and reduce it to the two columns needed here: the
# file ID and the file name with any leading directory path stripped off.

hci_files <- read_csv('2022-02-23_hci_fastq_manifest_20220223_150616.csv') %>%
    mutate(name = basename(name)) %>%
    select(id, name)

head(hci_files)

[1mRows: [22m[34m816[39m [1mColumns: [22m[34m66[39m

[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (21): id, name, project, sample_id, sample_type, library_prep_kit, Restr...
[32mdbl[39m  (3): size, platform_unit_id, paired_end
[33mlgl[39m (42): trimmed_read_count, p1_1, Is FFPE, passage, prop_ribosomal_bases, ...


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.



id,name
<chr>,<chr>
615ddbb165de4868125988dc,19120X19_210830_A00421_0362_BHGLM2DSX2_S19_L002_R1_001.fastq.gz
615ddbb165de4868125988df,19120X19_210830_A00421_0362_BHGLM2DSX2_S19_L002_R2_001.fastq.gz
615ddbaf65de4868125988a0,19120X1_210830_A00421_0362_BHGLM2DSX2_S1_L002_R1_001.fastq.gz
615ddbaf65de4868125988a3,19120X1_210830_A00421_0362_BHGLM2DSX2_S1_L002_R2_001.fastq.gz
615ddbb165de4868125988e8,19120X20_210830_A00421_0362_BHGLM2DSX2_S20_L002_R1_001.fastq.gz
615ddbb265de4868125988eb,19120X20_210830_A00421_0362_BHGLM2DSX2_S20_L002_R2_001.fastq.gz


In [7]:
# Keep only the CGC manifest rows whose file name appears in the dbGaP list

cds_files_manifest <- hci_files %>% filter(name %in% cds_files)

# A dbGaP file name with no match in the CGC manifest is dropped silently by
# the filter above, which would leave that file out of the staging copy.
# Report any such names explicitly (missing values in the list are ignored).
unmatched_files <- setdiff(cds_files[!is.na(cds_files)], hci_files$name)
if (length(unmatched_files) > 0) {
    warning(
        length(unmatched_files),
        ' dbGaP file name(s) not found in the CGC manifest: ',
        paste(unmatched_files, collapse = ', '),
        call. = FALSE
    )
}

nrow(cds_files_manifest)
head(cds_files_manifest)

id,name
<chr>,<chr>
615ddba665de4868125987aa,18588X6_1201216_A00421_0267_BHN2TJDSXY_S44_L003_R1_001.fastq.gz
615ddba665de4868125987ad,18588X6_1201216_A00421_0267_BHN2TJDSXY_S44_L003_R2_001.fastq.gz
615ddba665de4868125987b0,18588X7_1201216_A00421_0267_BHN2TJDSXY_S45_L003_R1_001.fastq.gz
615ddba665de4868125987b3,18588X7_1201216_A00421_0267_BHN2TJDSXY_S45_L003_R2_001.fastq.gz
615ddba665de4868125987b6,18588X8_1201216_A00421_0267_BHN2TJDSXY_S46_L003_R1_001.fastq.gz
615ddba765de4868125987b9,18588X8_1201216_A00421_0267_BHN2TJDSXY_S46_L003_R2_001.fastq.gz


### Copy those files to the staging area

In [8]:
# Authenticate using the 'cgc' profile from the user configuration file,
# then open the 'pdxnet/pdxnet-datapool' project on the platform.

sbg_auth <- Auth(
    from = 'file',
    profile_name = 'cgc'
)
sbg_proj <- sbg_auth$project(id = 'pdxnet/pdxnet-datapool')

Authenticating with user configuration file: ~/.sevenbridges/credentials

Authenticating with user profile: cgc



In [14]:
# Get the folder object for the staging area.
# The folder is reached by position: the 6th entry of the project's root
# folder listing, then the 1st entry inside that folder.

root_contents <- sbg_proj$get_root_folder()$list_folder_contents(complete = TRUE)
parent_contents <- root_contents[[6]]$list_folder_contents(complete = TRUE)
staging_folder <- parent_contents[[1]]

# Positional indices depend on the contents and order of the listings, so
# confirm that the expected folder was selected before anything is copied
# into it.
if (!identical(staging_folder$type, 'folder') ||
    !identical(staging_folder$name, '2022-02-23_cds_upload_staging')) {
    stop(
        'Positional lookup did not return the expected staging folder; got: ',
        staging_folder$name,
        call. = FALSE
    )
}

staging_folder
staging_folder$id

== Files ==
id : 621692a9b07915019d2bfe4b
name : 2022-02-23_cds_upload_staging
project : pdxnet/pdxnet-datapool
parent : 5d5ffce2e4b0b892538d3488
type : folder

In [15]:
# Copy every file in the filtered manifest, by ID, into the staging folder

iwalk(cds_files_manifest$id, function(file_id, idx) {

    # Throttle to avoid API call limits: wait 300 seconds (5 minutes)
    # whenever the running index is a multiple of 200
    if (idx %% 200 == 0) {
        Sys.sleep(300)
    }

    # Fetch the file object and copy it into the staging folder
    sbg_proj$file(id = file_id)$copy_to_folder(staging_folder$id)
})

### Set the metadata for these files after the upload
The CDS upload was successful for all of these files so we can go ahead and add the `cds_status` and `dbgap_accession` metadata fields to these files.

In [16]:
# Set the cds_status and dbgap_accession metadata fields on every file in
# the filtered manifest

iwalk(cds_files_manifest$id, function(file_id, idx) {

    # Throttle to avoid API call limits: wait 300 seconds (5 minutes)
    # whenever the running index is a multiple of 200
    if (idx %% 200 == 0) {
        Sys.sleep(300)
    }

    # Fetch the file object and set its metadata fields
    sbg_proj$file(id = file_id)$set_meta(cds_status = 'uploaded', dbgap_accession = 'phs002479.v1.p1')
})