# Update the PDMR Metadata For the PDXNet Portal
## Date: 2022-02-23
## Author: Jeffrey Grover
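**Purpose:** After creating new metadata for the PDTC models on the portal, the PDMR's metadata is also updated: unnecessary columns are removed, `index` and `contributor` columns are added, and the remaining columns are reordered and renamed to snake_case before export.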

### Load libraries

In [1]:
library(tidyverse)

── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.1     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



### Load metadata

In [2]:
# Read both FASTQ manifests and stack them into a single table
# (rows of the 2021-04-01 manifest first, then the 2021-08-16 one).
pdmr_metadata <- bind_rows(
  read_csv('2021-04-01_pdxnet-pdmr_datapool-fastq-manifest.csv'),
  read_csv('2021-08-16_pdxnet_pdmr_datapool_uploaded_fastq-manifest.csv')
)

# Sanity check: total row count and a preview of the combined table
nrow(pdmr_metadata)
head(pdmr_metadata)

[1mRows: [22m[34m9492[39m [1mColumns: [22m[34m21[39m

[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (19): id, name, project, Patient ID, gender, Capture assembly, disease_t...
[32mdbl[39m  (2): paired_end, age_at_diagnosis


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

[1mRows: [22m[34m5014[39m [1mColumns: [22m[34m21[39m

[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

id,name,project,Patient ID,gender,Capture assembly,disease_type,sample_id,PDMR Version,PDMR Sample ID,⋯,Has Matched Normal,Passage,paired_end,sample_type,platform,age_at_diagnosis,Capture kit,case_id,Specimen ID,experimental_strategy
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>
5f43cd0be4b0bf4ad141bd5e,253994~v2.0.1.10.0~germlineWES.R1.FASTQ.gz,jeffrey.grover/pdxnet-pdmr-datapool,253994,Male,hg19,Colorectal cancer - NOS,253994,2_0_1_10_0,,⋯,,,1,normal_germline,Illumina HiSeq,50,SureSelect Human All Exon V5,253994,,WES
5f43cd0be4b0bf4ad141bd27,174316~266-R~AD7I14~v1.2~WES.R1.FASTQ.gz,jeffrey.grover/pdxnet-pdmr-datapool,174316,Male,hg19,Squamous cell carcinoma - skin,174316--266-R--AD7I14,1_2,AD7I14,⋯,Y,P1,1,PDX,Illumina HiSeq,83,SureSelect Human All Exon V5,174316,266-R,WES
5f43cd0be4b0bf4ad141bcde,345371~062-R~P27PQ6~v1.2~RNASEQ.R2.FASTQ.gz,jeffrey.grover/pdxnet-pdmr-datapool,345371,Female,hg19,Salivary gland cancer,345371--062-R--P27PQ6,1_2,P27PQ6,⋯,,P1,2,PDX,Illumina HiSeq,65,TruSeq RNA Access,345371,062-R,RNASEQ
5f43cd0be4b0bf4ad141bce0,417821~307-R~TO3~v1.2~WES.R2.FASTQ.gz,jeffrey.grover/pdxnet-pdmr-datapool,417821,Male,hg19,Squamous cell lung carcinoma,417821--307-R--TO3,1_2,TO3,⋯,Y,P0,2,PDX,Illumina HiSeq,67,SureSelect Human All Exon V5,417821,307-R,WES
5f43cd0be4b0bf4ad141bcdf,594176~295-R~AL8~v1.2~WES.R2.FASTQ.gz,jeffrey.grover/pdxnet-pdmr-datapool,594176,Male,hg19,Osteosarcoma,594176--295-R--AL8,1_2,AL8,⋯,Y,P0,2,PDX,Illumina HiSeq,72,SureSelect Human All Exon V5,594176,295-R,WES
5f43cd0be4b0bf4ad141bcd8,283228~195-R~A04VW5UV9~v1.2~RNASEQ.R2.FASTQ.gz,jeffrey.grover/pdxnet-pdmr-datapool,283228,Male,hg19,Melanoma,283228--195-R--A04VW5UV9,1_2,A04VW5UV9,⋯,,P2,2,PDX,Illumina HiSeq,48,TruSeq RNA Access,283228,195-R,RNASEQ


### Reformatting

In [3]:
# Drop the columns that are not needed in the exported table
# (the `id` and `project` columns of the manifests).

pdmr_metadata <- select(pdmr_metadata, -c(id, project))

In [4]:
# Create index and contributor columns

# seq_len() yields an empty sequence for a zero-row table, whereas
# seq(1, 0) would count down (1, 0) and break the column assignment.
pdmr_metadata$index <- seq_len(nrow(pdmr_metadata))
# Every row gets the same contributor label.
pdmr_metadata$contributor <- 'NCI'

head(pdmr_metadata)

name,Patient ID,gender,Capture assembly,disease_type,sample_id,PDMR Version,PDMR Sample ID,Is PDMR Version 2,Has Matched Normal,⋯,paired_end,sample_type,platform,age_at_diagnosis,Capture kit,case_id,Specimen ID,experimental_strategy,index,contributor
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<dbl>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>
253994~v2.0.1.10.0~germlineWES.R1.FASTQ.gz,253994,Male,hg19,Colorectal cancer - NOS,253994,2_0_1_10_0,,Y,,⋯,1,normal_germline,Illumina HiSeq,50,SureSelect Human All Exon V5,253994,,WES,1,NCI
174316~266-R~AD7I14~v1.2~WES.R1.FASTQ.gz,174316,Male,hg19,Squamous cell carcinoma - skin,174316--266-R--AD7I14,1_2,AD7I14,N,Y,⋯,1,PDX,Illumina HiSeq,83,SureSelect Human All Exon V5,174316,266-R,WES,2,NCI
345371~062-R~P27PQ6~v1.2~RNASEQ.R2.FASTQ.gz,345371,Female,hg19,Salivary gland cancer,345371--062-R--P27PQ6,1_2,P27PQ6,N,,⋯,2,PDX,Illumina HiSeq,65,TruSeq RNA Access,345371,062-R,RNASEQ,3,NCI
417821~307-R~TO3~v1.2~WES.R2.FASTQ.gz,417821,Male,hg19,Squamous cell lung carcinoma,417821--307-R--TO3,1_2,TO3,N,Y,⋯,2,PDX,Illumina HiSeq,67,SureSelect Human All Exon V5,417821,307-R,WES,4,NCI
594176~295-R~AL8~v1.2~WES.R2.FASTQ.gz,594176,Male,hg19,Osteosarcoma,594176--295-R--AL8,1_2,AL8,N,Y,⋯,2,PDX,Illumina HiSeq,72,SureSelect Human All Exon V5,594176,295-R,WES,5,NCI
283228~195-R~A04VW5UV9~v1.2~RNASEQ.R2.FASTQ.gz,283228,Male,hg19,Melanoma,283228--195-R--A04VW5UV9,1_2,A04VW5UV9,N,,⋯,2,PDX,Illumina HiSeq,48,TruSeq RNA Access,283228,195-R,RNASEQ,6,NCI


In [5]:
# List the current column names before reordering
names(pdmr_metadata)

In [6]:
# Put the columns into the order used for the exported table
# (one column per line so the order is easy to audit).

pdmr_metadata <- select(
  pdmr_metadata,
  index,
  contributor,
  name,
  `Patient ID`,
  `PDMR Sample ID`,
  `PDMR Version`,
  `Is PDMR Version 2`,
  sample_id,
  case_id,
  `Specimen ID`,
  sample_type,
  `Has Matched Normal`,
  experimental_strategy,
  gender,
  age_at_diagnosis,
  paired_end,
  `Capture kit`,
  `Capture assembly`,
  platform,
  Passage,
  disease_type
)

# Confirm the resulting names and how many columns remain
colnames(pdmr_metadata)
length(colnames(pdmr_metadata))

In [7]:
# Rename columns to snake_case.
#
# The mapping is written as old name -> new name so each rename is explicit,
# instead of relying purely on the position of each column.
# NOTE(review): the index column is given an empty header here, as in the
# original code - presumably the portal expects that; confirm.
portal_column_names <- c(
  'index' = '',
  'contributor' = 'contributor',
  'name' = 'file_name',
  'Patient ID' = 'patient_id',
  'PDMR Sample ID' = 'pdmr_sample_id',
  'PDMR Version' = 'pdmr_version',
  'Is PDMR Version 2' = 'is_pdmr_version_2',
  'sample_id' = 'sample_id',
  'case_id' = 'case_id',
  'Specimen ID' = 'specimen_id',
  'sample_type' = 'sample_type',
  'Has Matched Normal' = 'has_matched_normal',
  'experimental_strategy' = 'experimental_strategy',
  'gender' = 'gender',
  'age_at_diagnosis' = 'age_at_diagnosis',
  'paired_end' = 'paired_end',
  'Capture kit' = 'capture_kit',
  'Capture assembly' = 'capture_assembly',
  'platform' = 'platform',
  'Passage' = 'passage',
  'disease_type' = 'disease_type'
)

# Skip the rename when the cell is re-run on an already renamed table;
# otherwise fail loudly rather than mislabel columns if the column set or
# order no longer matches the mapping.
if (!identical(colnames(pdmr_metadata), unname(portal_column_names))) {
  stopifnot(identical(colnames(pdmr_metadata), names(portal_column_names)))
  colnames(pdmr_metadata) <- unname(portal_column_names)
}

colnames(pdmr_metadata)
colnames(pdmr_metadata) %>% length()

In [8]:
# Write the reformatted metadata table to a dated CSV file

write_csv(
  x = pdmr_metadata,
  file = '2022-02-23_pdmr_portal_metadata.csv'
)