# Dependencies

In [8]:
library(dplyr)

In [9]:
library(openxlsx)

In [10]:
source("/Users/anabbi/git/ped_CapTCRseq/R/ggplot2_theme.R")
source("/Users/anabbi/git/ped_CapTCRseq/R/color_schemes.R")

# Paths

In [11]:
datapath <- "/Users/anabbi/OneDrive - UHN/Documents/INTERCEPT/Data/"
plotpath <- "/Users/anabbi/OneDrive - UHN/Documents/INTERCEPT/Plots/"
manifestpath <- "/Users/anabbi/OneDrive - UHN/Documents/INTERCEPT/Manifests/"

gitpath <- "/Users/anabbi/git/ped_CapTCRseq/"

# Main

In [12]:
ffpe <- read.xlsx(paste0(datapath, "CHOP Lab Processing Log.xlsx"), sheet = 3)

In [13]:
# Load the sample metadata table (used further down for the Disease_type and
# group lookups keyed on Patient). TRUE/FALSE are spelled out because T and F
# are ordinary variables that can be reassigned.
metadata <- read.csv(
  file = paste0(datapath, "INT_metadata_flow.csv"),
  header = TRUE,
  stringsAsFactors = FALSE
)

In [14]:
# Load the diversity statistics table; the first CSV column becomes the row
# names. TRUE/FALSE are spelled out because T and F can be reassigned.
divstats <- read.csv(
  paste0(datapath, "capTCRseq/divstats_TRBCHP.csv"),
  header = TRUE,
  stringsAsFactors = FALSE,
  row.names = 1
)

# FFPE

In [15]:
# Keep only batch 1. `%in%` is used instead of `==` so that rows with a
# missing Batch are dropped rather than returned as all-NA rows.
ffpe <- ffpe[ffpe$Batch %in% "1", ]

In [16]:
# Start the per-sample summary table with one row per Pugh lab ID.
# The column is named at construction time and FALSE is spelled out
# (F is a reassignable variable).
ffpe_samples <- data.frame(
  sample_id = ffpe$Pugh.Lab.ID,
  stringsAsFactors = FALSE
)

In [17]:
# Tag every row with the total number of rows in the table.
n_ffpe_rows <- nrow(ffpe_samples)
ffpe_samples$Shipped <- paste0("n = ", n_ffpe_rows)

In [18]:
ffpe_samples

sample_id,Shipped
<chr>,<chr>
343-B,n = 25
344-A,n = 25
419,n = 25
358,n = 25
384,n = 25
401,n = 25
412,n = 25
404-A,n = 25
418,n = 25
370,n = 25


In [19]:
# Copy the `DNA.total.(ng)` value from the log row whose Pugh.Lab.ID matches
# each sample_id.
dna_total_row <- match(ffpe_samples$sample_id, ffpe$Pugh.Lab.ID)
ffpe_samples$DNAextracted <- ffpe$`DNA.total.(ng)`[dna_total_row]

In [20]:
# Copy the amount of DNA that went into library prep from the matching log row.
dna_used_row <- match(ffpe_samples$sample_id, ffpe$Pugh.Lab.ID)
ffpe_samples$DNAused <-
  ffpe$`Amount.of.DNA.used.for.Library.Prep.(ng)`[dna_used_row]

In [21]:
# Library MiSeq status per sample, with surrounding whitespace stripped.
library_status_row <- match(ffpe_samples$sample_id, ffpe$Pugh.Lab.ID)
library_status <- ffpe$Miseq.for.Library[library_status_row]
ffpe_samples$librarypassed <- trimws(library_status)

In [22]:
ffpe_samples

sample_id,Shipped,DNAextracted,DNAused,librarypassed
<chr>,<chr>,<dbl>,<dbl>,<chr>
343-B,n = 25,1060,500,Passed
344-A,n = 25,9280,500,Passed
419,n = 25,248,248,Passed
358,n = 25,11900,500,Passed
384,n = 25,2460,500,Passed
401,n = 25,1740,500,Passed
412,n = 25,3060,500,Passed
404-A,n = 25,1010,500,Passed
418,n = 25,3280,500,Passed
370,n = 25,2660,500,Passed


In [23]:
# Label every sample whose library status is "Passed" with the number of
# passed libraries. `%in%` replaces `==` because `x[x == "Passed"]` keeps NA
# entries, so `length()` of that subset would count missing statuses as passed.
library_ok <- ffpe_samples$librarypassed %in% "Passed"
ffpe_samples$Library <- NA
ffpe_samples$Library[library_ok] <- paste0("n = ", sum(library_ok))

In [24]:
# Total library yield, taken from the matching row of the processing log.
library_yield_row <- match(ffpe_samples$sample_id, ffpe$Pugh.Lab.ID)
ffpe_samples$libraryyield <- ffpe$`Total.Library.Yield.(ng)`[library_yield_row]

In [25]:
# Amount of library that went into capture, from the matching log row.
capture_input_row <- match(ffpe_samples$sample_id, ffpe$Pugh.Lab.ID)
ffpe_samples$DNAforcapture <-
  ffpe$`Amount.of.Library.used.for.Capture.(ng)`[capture_input_row]

In [26]:
# Total capture library yield, from the matching log row.
capture_yield_row <- match(ffpe_samples$sample_id, ffpe$Pugh.Lab.ID)
ffpe_samples$captureyield <-
  ffpe$`Total.Capture.Library.Yield.(ng)`[capture_yield_row]

In [27]:
# Capture MiSeq status per sample, with surrounding whitespace stripped.
capture_status_row <- match(ffpe_samples$sample_id, ffpe$Pugh.Lab.ID)
capture_status <- ffpe$Miseq.for.Capture[capture_status_row]
ffpe_samples$capturepassed <- trimws(capture_status)

In [28]:
# Label every sample whose capture status is "Passed" with the number of
# passed captures. `%in%` replaces `==` because `x[x == "Passed"]` keeps NA
# entries, so `length()` of that subset would count missing statuses as passed.
capture_ok <- ffpe_samples$capturepassed %in% "Passed"
ffpe_samples$Capture <- NA
ffpe_samples$Capture[capture_ok] <- paste0("n = ", sum(capture_ok))

In [29]:
ffpe_samples

sample_id,Shipped,DNAextracted,DNAused,librarypassed,Library,libraryyield,DNAforcapture,captureyield,capturepassed,Capture
<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
343-B,n = 25,1060,500,Passed,n = 25,258.24,258.24,10.07,Passed,n = 23
344-A,n = 25,9280,500,Passed,n = 25,287.04,287.04,10.07,Passed,n = 23
419,n = 25,248,248,Passed,n = 25,237.12,237.12,10.07,Passed,n = 23
358,n = 25,11900,500,Passed,n = 25,638.4,617.26,83.22,Passed,n = 23
384,n = 25,2460,500,Passed,n = 25,363.84,363.84,83.22,Passed,n = 23
401,n = 25,1740,500,Passed,n = 25,705.6,449.48,10.108,Passed,n = 23
412,n = 25,3060,500,Passed,n = 25,787.2,550.52,10.108,Passed,n = 23
404-A,n = 25,1010,500,Passed,n = 25,811.2,498.41,42.56,Passed,n = 23
418,n = 25,3280,500,Passed,n = 25,792.0,501.59,42.56,Passed,n = 23
370,n = 25,2660,500,Passed,n = 25,472.32,472.32,5.282,Passed,n = 23


In [30]:
# Deep-sequencing flag from the `Sent.for.Nextseq?` column; a missing value is
# recorded as "No".
nextseq_row <- match(ffpe_samples$sample_id, ffpe$Pugh.Lab.ID)
nextseq_status <- trimws(ffpe$`Sent.for.Nextseq?`[nextseq_row])
nextseq_status[is.na(nextseq_status)] <- "No"
ffpe_samples$deepseq <- nextseq_status

In [31]:
sum(ffpe_samples$deepseq == "Yes")

In [32]:
# Label the deep-sequenced samples with their count; all other rows stay NA.
sent_for_deepseq <- ffpe_samples$deepseq == "Yes"
ffpe_samples$Sequencing <- NA
ffpe_samples$Sequencing[sent_for_deepseq] <-
  paste0("n = ", sum(sent_for_deepseq))

In [33]:
# Build the ID that is compared against divstats$sample_id: prefix with "CHP_"
# and turn dashes into underscores. paste0() has no `sep` argument (the old
# `sep = ""` was simply pasted on as an extra empty string), so it is dropped.
ffpe_samples$mysample_id <- paste0("CHP_", ffpe_samples$sample_id)
ffpe_samples$mysample_id <- gsub("-", "_", ffpe_samples$mysample_id)

In [34]:
# "Yes" when the sample's mysample_id appears in divstats$sample_id, else "No".
in_divstats <- ffpe_samples$mysample_id %in% divstats$sample_id
ffpe_samples$TCR <- ifelse(in_divstats, "Yes", "No")

In [35]:
ffpe_samples

sample_id,Shipped,DNAextracted,DNAused,librarypassed,Library,libraryyield,DNAforcapture,captureyield,capturepassed,Capture,deepseq,Sequencing,mysample_id,TCR
<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
343-B,n = 25,1060,500,Passed,n = 25,258.24,258.24,10.07,Passed,n = 23,Yes,n = 21,CHP_343_B,No
344-A,n = 25,9280,500,Passed,n = 25,287.04,287.04,10.07,Passed,n = 23,Yes,n = 21,CHP_344_A,Yes
419,n = 25,248,248,Passed,n = 25,237.12,237.12,10.07,Passed,n = 23,Yes,n = 21,CHP_419,Yes
358,n = 25,11900,500,Passed,n = 25,638.4,617.26,83.22,Passed,n = 23,No,,CHP_358,No
384,n = 25,2460,500,Passed,n = 25,363.84,363.84,83.22,Passed,n = 23,No,,CHP_384,No
401,n = 25,1740,500,Passed,n = 25,705.6,449.48,10.108,Passed,n = 23,Yes,n = 21,CHP_401,Yes
412,n = 25,3060,500,Passed,n = 25,787.2,550.52,10.108,Passed,n = 23,Yes,n = 21,CHP_412,Yes
404-A,n = 25,1010,500,Passed,n = 25,811.2,498.41,42.56,Passed,n = 23,Yes,n = 21,CHP_404_A,Yes
418,n = 25,3280,500,Passed,n = 25,792.0,501.59,42.56,Passed,n = 23,Yes,n = 21,CHP_418,Yes
370,n = 25,2660,500,Passed,n = 25,472.32,472.32,5.282,Passed,n = 23,Yes,n = 21,CHP_370,Yes


In [36]:
# Label the samples flagged TCR == "Yes" with their count; the rest stay NA.
has_tcr <- ffpe_samples$TCR == "Yes"
ffpe_samples$TCRanalysis <- NA
ffpe_samples$TCRanalysis[has_tcr] <- paste0("n = ", sum(has_tcr))

In [37]:
head(ffpe_samples)

Unnamed: 0_level_0,sample_id,Shipped,DNAextracted,DNAused,librarypassed,Library,libraryyield,DNAforcapture,captureyield,capturepassed,Capture,deepseq,Sequencing,mysample_id,TCR,TCRanalysis
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,343-B,n = 25,1060,500,Passed,n = 25,258.24,258.24,10.07,Passed,n = 23,Yes,n = 21,CHP_343_B,No,
2,344-A,n = 25,9280,500,Passed,n = 25,287.04,287.04,10.07,Passed,n = 23,Yes,n = 21,CHP_344_A,Yes,n = 18
3,419,n = 25,248,248,Passed,n = 25,237.12,237.12,10.07,Passed,n = 23,Yes,n = 21,CHP_419,Yes,n = 18
4,358,n = 25,11900,500,Passed,n = 25,638.4,617.26,83.22,Passed,n = 23,No,,CHP_358,No,
5,384,n = 25,2460,500,Passed,n = 25,363.84,363.84,83.22,Passed,n = 23,No,,CHP_384,No,
6,401,n = 25,1740,500,Passed,n = 25,705.6,449.48,10.108,Passed,n = 23,Yes,n = 21,CHP_401,Yes,n = 18


In [38]:
# Label failed captures with their count. `%in%` keeps the count defined when
# some capture statuses are NA (`sum(x == "Failed")` would then return NA and
# the label would read "n = NA").
capture_failed_rows <- ffpe_samples$capturepassed %in% "Failed"
capturefailed <- sum(capture_failed_rows)
ffpe_samples$Capture[capture_failed_rows] <- paste0("n = ", capturefailed)

In [39]:
# Count the samples that passed capture but were not deep sequenced, and write
# that count into the still-empty Sequencing cells of samples that passed
# capture. `%in%` is used so NA statuses cannot turn the count into NA.
capture_passed_rows <- ffpe_samples$capturepassed %in% "Passed"
seqfailed <- sum(ffpe_samples$deepseq %in% "No" & capture_passed_rows)
ffpe_samples$Sequencing[is.na(ffpe_samples$Sequencing) & capture_passed_rows] <-
  paste0("n = ", seqfailed)

In [40]:
# Samples that passed capture and were deep sequenced but are absent from the
# TCR analysis. The mask is built once and reused for both the count and the
# assignment; `%in%` keeps NA statuses from propagating into the count.
sequenced_without_tcr <- ffpe_samples$TCR %in% "No" &
  ffpe_samples$deepseq %in% "Yes" &
  ffpe_samples$capturepassed %in% "Passed"

tcrfailed <- sum(sequenced_without_tcr)
ffpe_samples$TCRanalysis[sequenced_without_tcr] <- paste0("n = ", tcrfailed)

In [41]:
# Derive the patient ID by removing every "_A" and then every "_B" from
# mysample_id.
patient_id <- ffpe_samples$mysample_id
for (block_suffix in c("_A", "_B")) {
  patient_id <- gsub(block_suffix, "", patient_id)
}
ffpe_samples$Patient <- patient_id

In [42]:
# Disease type of each sample's patient, looked up in the metadata table.
tumor_row <- match(ffpe_samples$Patient, metadata$Patient)
ffpe_samples$Tumor <- metadata$Disease_type[tumor_row]

In [43]:
# Add the tumour group: look up each sample's Patient in the metadata table and
# copy that row's `group` value.
# NOTE(review): in the table printed below, Group is an all-NA logical column
# even for rows that received a Tumor value. Confirm that `metadata` really has
# a populated column named exactly `group` (the name is case-sensitive).
ffpe_samples$Group <- NA
ffpe_samples$Group <- metadata$group[match(ffpe_samples$Patient, metadata$Patient)]

In [44]:
# Blank out Group and Tumor for samples that are not part of the TCR analysis.
outside_tcr <- ffpe_samples$TCR == "No"
ffpe_samples$Group[outside_tcr] <- NA
ffpe_samples$Tumor[outside_tcr] <- NA

In [45]:
ffpe_samples

sample_id,Shipped,DNAextracted,DNAused,librarypassed,Library,libraryyield,DNAforcapture,captureyield,capturepassed,Capture,deepseq,Sequencing,mysample_id,TCR,TCRanalysis,Patient,Tumor,Group
<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>
343-B,n = 25,1060,500,Passed,n = 25,258.24,258.24,10.07,Passed,n = 23,Yes,n = 21,CHP_343_B,No,n = 3,CHP_343,,
344-A,n = 25,9280,500,Passed,n = 25,287.04,287.04,10.07,Passed,n = 23,Yes,n = 21,CHP_344_A,Yes,n = 18,CHP_344,BL,
419,n = 25,248,248,Passed,n = 25,237.12,237.12,10.07,Passed,n = 23,Yes,n = 21,CHP_419,Yes,n = 18,CHP_419,OS,
358,n = 25,11900,500,Passed,n = 25,638.4,617.26,83.22,Passed,n = 23,No,n = 2,CHP_358,No,,CHP_358,,
384,n = 25,2460,500,Passed,n = 25,363.84,363.84,83.22,Passed,n = 23,No,n = 2,CHP_384,No,,CHP_384,,
401,n = 25,1740,500,Passed,n = 25,705.6,449.48,10.108,Passed,n = 23,Yes,n = 21,CHP_401,Yes,n = 18,CHP_401,ERMS,
412,n = 25,3060,500,Passed,n = 25,787.2,550.52,10.108,Passed,n = 23,Yes,n = 21,CHP_412,Yes,n = 18,CHP_412,BLL,
404-A,n = 25,1010,500,Passed,n = 25,811.2,498.41,42.56,Passed,n = 23,Yes,n = 21,CHP_404_A,Yes,n = 18,CHP_404,ERMS,
418,n = 25,3280,500,Passed,n = 25,792.0,501.59,42.56,Passed,n = 23,Yes,n = 21,CHP_418,Yes,n = 18,CHP_418,NB,
370,n = 25,2660,500,Passed,n = 25,472.32,472.32,5.282,Passed,n = 23,Yes,n = 21,CHP_370,Yes,n = 18,CHP_370,ALCL,


In [46]:
# Give the four count columns their display names (old name -> new name).
# A column that is not present is simply left alone.
display_names <- c(
  Library = "Library preparation",
  Capture = "Successful capture",
  Sequencing = "Deep sequencing",
  TCRanalysis = "TCR analysis"
)
for (old_name in names(display_names)) {
  colnames(ffpe_samples)[colnames(ffpe_samples) == old_name] <-
    display_names[[old_name]]
}

In [47]:
save(ffpe_samples, file = paste0(gitpath, "data/tumor_sampleprocessing.RData"))

# PBMC

In [48]:
pbmc <- read.xlsx(paste0(datapath, "CHOP Lab Processing Log.xlsx"), sheet = 2)

In [49]:
pbmc$Specimen.ID <- trimws(pbmc$Specimen.ID)

In [50]:
colnames(pbmc)

In [51]:
head(pbmc[ pbmc$Specimen.ID %in% c("387-03", "387-02"), c(6,25:40)])

Unnamed: 0_level_0,Specimen.ID,Notes,Missing.Samples,Library.Prep.Date,Library.Prep.Technician,Sample.Vol.used.for.Library.Prep.(uL),Amount.of.DNA.used.for.Library.Prep.(ng),Amount.of.DNA.remaining.post-library.prep.(ng),Library.ID,Adapter,Adapter.Sequence,Number.of.PCR.Cycles.used.for.Library.Amplification,Qubit.Concentration,Final.Library.Vol.(uL),Total.Library.Yield.(ng),Miseq.for.Library,Library.Miseq.Flow.Cell.ID
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
300,387-03,,,43815,JW,15.1,500,2820,CHP_387-03_PBMC_DNA,AD11,GTCCTTGT,3,10.3,48,494.4,Passed,191224_M04827_0329_000000000-D7W2Y
308,387-02,Samples re-purified using ethanol precip therefore elution V and concentration are not accurate,,43815,JW,All,55,0,CHP_387-02_PBMC_DNA,AD3,TGTAACCG,6,14.2,48,681.6,Passed,191224_M04827_0329_000000000-D7W2Y


In [52]:
pbmc$Specimen.ID[ grepl("328",pbmc$Specimen.ID) ]

In [53]:
dim(pbmc)

In [54]:
# #CHP_329 is not included in metadata. as per DB, we focused on patients after 330. remove it
# pbmc <- pbmc[pbmc$Specimen.ID != "329-04",]

In [55]:
dim(pbmc)

In [56]:
pbmc$Colour.Legend <- NULL

In [57]:
# Rows the log flags as potentially missing (rows with an empty flag are not
# selected).
missing_flag <- "***Potentially missing! Not found in specified location"
missingsamples <- pbmc[pbmc$Missing.Samples %in% missing_flag, ]

In [58]:
# Also collect the row whose Notes field records that sample 423-02 is missing.
swapped_note <- "Sample 423-02 is missing, this location is sample 413-02"
missingsamples <- rbind(missingsamples, pbmc[pbmc$Notes %in% swapped_note, ])

In [59]:
missingsamples

Unnamed: 0_level_0,Shipment.#,Batch,Original.Box#,Original.Box.Location,Specimen.ID,Date.Frozen,Study.Cohort,Sample.Type,Date.transferred.to.Pugh.Lab,Total.Cell.Number,...,Final.Capture.Vol.(uL),Total.Capture.Library.Yield.(ng),Miseq.for.Capture,Miseq.Flow.Cell.ID,Sent.for.Nextseq?,Date.Submitted.for.Seq,Pugh.Lab.Seq.ID,Vol.used.for.sequencing.(uL),Vol.Remaining.(uL),Capture.Notes
Unnamed: 0_level_1,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<chr>,...,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>
456,1,,NOT SENT,,331-03,,,,,3300000,...,,,,,,,,,,
457,1,,NOT SENT,,338-01,,,,,600000,...,,,,,,,,,,
458,2,,Cell Box 2,I5,423-02,,,Cell,,1530000,...,,,,,,,,,,


In [60]:
# Drop every row that carries a Missing.Samples flag, and the row whose Notes
# field records the 423-02 / 413-02 mix-up.
swapped_note <- "Sample 423-02 is missing, this location is sample 413-02"
not_flagged <- is.na(pbmc$Missing.Samples)
not_swapped <- !(pbmc$Notes %in% swapped_note)
pbmc <- pbmc[not_flagged & not_swapped, ]

In [61]:
dim(pbmc)

In [62]:
# Discard rows that have no Specimen.ID.
has_specimen_id <- !is.na(pbmc$Specimen.ID)
pbmc <- pbmc[has_specimen_id, ]

In [63]:
 length(pbmc$Specimen.ID)

In [64]:
# Start the PBMC summary table with one row per distinct Specimen.ID (first
# occurrence order). The column is named at construction time and FALSE is
# spelled out (F is a reassignable variable).
pbmc_samples <- data.frame(
  sample_id = unique(pbmc$Specimen.ID),
  stringsAsFactors = FALSE
)

In [65]:
colnames(pbmc_samples) <- "sample_id"

In [66]:
# Tag every row with the total number of distinct PBMC samples.
n_pbmc_rows <- nrow(pbmc_samples)
pbmc_samples$Shipped <- paste0("n = ", n_pbmc_rows)

In [67]:
pbmc_samples$sample_id <- trimws(pbmc_samples$sample_id)

In [68]:
pbmc_samples[pbmc_samples == "343-05"]

In [69]:
dim(pbmc_samples)

All samples that were shipped according to DB manifest files

In [70]:
allpbmc <- unlist(read.xlsx(paste0(datapath, "allpbmc_fromDB.xlsx"), sheet = 1))

In [71]:
allpbmc <- allpbmc[!is.na(allpbmc)]

In [72]:
names(allpbmc) <- NULL

In [73]:
#typo
allpbmc[ allpbmc == "388-5"] <- "388-05"

In [74]:
length(allpbmc)

312 cancer samples + 14 normal samples = 326. According to the lab log file we have 321.

In [75]:
allpbmc[! allpbmc %in% pbmc_samples$sample_id ]

In [76]:
pbmc$Miseq.for.Library <- trimws(pbmc$Miseq.for.Library)

Keep only the samples whose library and capture both passed. The samples that failed at each step are added back later.

In [77]:
table(pbmc$Miseq.for.Library, useNA = "always")


     Failed Failed BioA     On Hold      Passed    Received        <NA> 
         29          48          40         307           1          29 

In [78]:
# Rows with a recorded library status other than "Passed" (rows without a
# status are not selected).
library_status <- pbmc$Miseq.for.Library
libfailed <- pbmc[!is.na(library_status) & library_status != "Passed", ]

In [79]:
dim(libfailed)

In [80]:
pbmc$Miseq.for.Capture <- trimws(pbmc$Miseq.for.Capture)

In [81]:
table(pbmc$Miseq.for.Capture, useNA = "always")


         Failed         Failed?   Not Submitted          Passed Passed Re-Miseq 
              6               2              24             277               2 
           <NA> 
            143 

In [82]:
# Normalise the capture status labels: "Failed?" counts as "Failed" and
# "Passed Re-Miseq" counts as "Passed".
is_uncertain_fail <- pbmc$Miseq.for.Capture %in% "Failed?"
pbmc$Miseq.for.Capture[is_uncertain_fail] <- "Failed"
is_rerun_pass <- pbmc$Miseq.for.Capture %in% "Passed Re-Miseq"
pbmc$Miseq.for.Capture[is_rerun_pass] <- "Passed"

In [83]:
table(pbmc$Miseq.for.Capture, useNA = "always")


       Failed Not Submitted        Passed          <NA> 
            8            24           279           143 

In [84]:
# Rows with a recorded capture status other than "Passed" (rows without a
# status are not selected).
capture_status <- pbmc$Miseq.for.Capture
capfailed <- pbmc[!is.na(capture_status) & capture_status != "Passed", ]

In [85]:
dim(capfailed)

In [86]:
# Keep only rows where both the library and the capture status are "Passed".
both_passed <- pbmc$Miseq.for.Library == "Passed" &
  pbmc$Miseq.for.Capture == "Passed"
pbmc <- pbmc[which(both_passed), ]

In [87]:
table(pbmc$Miseq.for.Capture)


Passed 
   279 

Clean up the failed samples. If a sample in `libfailed` is also present in the cleaned-up `pbmc` table, remove it from `libfailed`: those rows are failed attempts, but the sample eventually passed.

In [88]:
# Keep only failed-library rows whose Specimen.ID is absent from the passed set.
found_in_passed <- match(libfailed$Specimen.ID, pbmc$Specimen.ID)
libfailed <- libfailed[is.na(found_in_passed), ]

In [89]:
dim(libfailed)

In [90]:
# Drop rows whose Specimen.ID also appears in capfailed: per the original note,
# these had their library prepared again and then failed at capture.
found_in_capfailed <- match(libfailed$Specimen.ID, capfailed$Specimen.ID)
libfailed <- libfailed[is.na(found_in_capfailed), ]

In [91]:
dim(libfailed)

In [92]:
length(unique(libfailed$Specimen.ID))

In [93]:
# De-duplicate the failed-library rows, keeping the last row listed for each
# Specimen.ID (the original comment treats the last entry as the most recent).
# This replaces a loop that grew the data frame with rbind() on every
# iteration. Output rows follow the first appearance of each ID, as before.
failed_ids <- libfailed$Specimen.ID
last_entries <- libfailed[!duplicated(failed_ids, fromLast = TRUE), ]
libfailed_dedup <-
  last_entries[match(unique(failed_ids), last_entries$Specimen.ID), ]

In [94]:
dim(libfailed_dedup)

If a sample in `capfailed` is also present in the cleaned-up `pbmc` table, remove it from `capfailed`.

In [95]:
# Keep only failed-capture rows whose Specimen.ID is absent from the passed set.
capfailed_in_passed <- match(capfailed$Specimen.ID, pbmc$Specimen.ID)
capfailed <- capfailed[is.na(capfailed_in_passed), ]

In [96]:
dim(capfailed)

In [97]:
p343_05 <- pbmc[ which(pbmc$Specimen.ID == "343-05"),]

One sample (343-05) is duplicated. Keep the entry corresponding to batch2_Aug62019/mixcr/clones/CLONES_TRBCHP_343-05-PBMC-DNA.txt

In [98]:
pbmc <- pbmc[ which(pbmc$Specimen.ID != "343-05"),]

In [99]:
# Re-attach only the 343-05 entry stored at box location "G5". `%in%` is used
# instead of `==` so that a duplicate without a recorded box location does not
# get appended as an all-NA row.
pbmc <- rbind(pbmc, p343_05[p343_05$Original.Box.Location %in% "G5", ])

In [100]:
dim(pbmc)

In [101]:
#bind all pbmcs lib and cap failed
allsamples <- rbind(pbmc,libfailed_dedup, capfailed)

In [102]:
# Copy `DNA.total.(ng)` from the first allsamples row with the same Specimen.ID.
pbmc_dna_row <- match(pbmc_samples$sample_id, allsamples$Specimen.ID)
pbmc_samples$DNAextracted <- allsamples$`DNA.total.(ng)`[pbmc_dna_row]

In [103]:
# Amount of DNA that went into library prep, from the matching allsamples row.
pbmc_dna_used_row <- match(pbmc_samples$sample_id, allsamples$Specimen.ID)
pbmc_samples$DNAused <-
  allsamples$`Amount.of.DNA.used.for.Library.Prep.(ng)`[pbmc_dna_used_row]

In [104]:
# Library MiSeq flow cell ID, from the matching allsamples row.
pbmc_libbatch_row <- match(pbmc_samples$sample_id, allsamples$Specimen.ID)
pbmc_samples$libbatch <- allsamples$Library.Miseq.Flow.Cell.ID[pbmc_libbatch_row]

In [105]:
head(pbmc_samples$libbatch )

In [106]:
# Library MiSeq status, from the matching allsamples row.
pbmc_libstatus_row <- match(pbmc_samples$sample_id, allsamples$Specimen.ID)
pbmc_samples$librarypassed <- allsamples$Miseq.for.Library[pbmc_libstatus_row]

In [107]:
table(pbmc_samples$librarypassed, useNA = "always")


 On Hold   Passed Received     <NA> 
      38      282        1        0 

In [108]:
# Label the samples with a "Passed" library with their count; the other rows
# stay NA for now.
passed_library_idx <- which(pbmc_samples$librarypassed == "Passed")
pbmc_samples$Library <- NA
pbmc_samples$Library[passed_library_idx] <-
  paste0("n = ", length(passed_library_idx))

In [109]:
# Treat libraries still marked "On Hold" or "Received" as "Failed".
unfinished_library <- pbmc_samples$librarypassed %in% c("On Hold", "Received")
pbmc_samples$librarypassed[unfinished_library] <- "Failed"

In [110]:
table(pbmc_samples$librarypassed, useNA = "always")


Failed Passed   <NA> 
    39    282      0 

In [111]:
# Label the samples with a "Failed" library with their count.
failed_library_idx <- which(pbmc_samples$librarypassed == "Failed")
pbmc_samples$Library[failed_library_idx] <-
  paste0("n = ", length(failed_library_idx))

In [112]:
# Total library yield, from the matching allsamples row.
pbmc_libyield_row <- match(pbmc_samples$sample_id, allsamples$Specimen.ID)
pbmc_samples$libraryyield <-
  allsamples$`Total.Library.Yield.(ng)`[pbmc_libyield_row]

In [113]:
# Amount of library that went into capture, from the matching allsamples row.
pbmc_capinput_row <- match(pbmc_samples$sample_id, allsamples$Specimen.ID)
pbmc_samples$DNAforcapture <-
  allsamples$`Amount.of.Library.used.for.Capture.(ng)`[pbmc_capinput_row]

In [114]:
# Total capture library yield, from the matching allsamples row.
pbmc_capyield_row <- match(pbmc_samples$sample_id, allsamples$Specimen.ID)
pbmc_samples$captureyield <-
  allsamples$`Total.Capture.Library.Yield.(ng)`[pbmc_capyield_row]

In [115]:
# Capture MiSeq flow cell ID, from the matching allsamples row.
pbmc_capbatch_row <- match(pbmc_samples$sample_id, allsamples$Specimen.ID)
pbmc_samples$capbatch <- allsamples$Miseq.Flow.Cell.ID[pbmc_capbatch_row]

In [116]:
head(pbmc_samples$capbatch)

In [117]:
# Capture MiSeq status, from the matching allsamples row.
pbmc_capstatus_row <- match(pbmc_samples$sample_id, allsamples$Specimen.ID)
pbmc_samples$capturepassed <- allsamples$Miseq.for.Capture[pbmc_capstatus_row]

In [118]:
# Label the samples with a "Passed" capture with their count; the other rows
# stay NA for now.
passed_capture_idx <- which(pbmc_samples$capturepassed == "Passed")
pbmc_samples$Capture <- NA
pbmc_samples$Capture[passed_capture_idx] <-
  paste0("n = ", length(passed_capture_idx))

In [119]:
table(pbmc_samples$capturepassed, useNA = "always")


       Failed Not Submitted        Passed          <NA> 
            1             3           278            39 

In [120]:
# Treat captures marked "Not Submitted" as "Failed".
never_submitted <- pbmc_samples$capturepassed %in% "Not Submitted"
pbmc_samples$capturepassed[never_submitted] <- "Failed"

In [121]:
table(pbmc_samples$capturepassed, useNA = "always")


Failed Passed   <NA> 
     4    278     39 

In [122]:
# Deep-sequencing flag, looked up in `pbmc` (the rows that passed library and
# capture) rather than in `allsamples`. Samples without a match stay NA; the
# NA -> "No" replacement below is commented out.
pbmc_nextseq_row <- match(pbmc_samples$sample_id, pbmc$Specimen.ID)
pbmc_samples$deepseq <- trimws(pbmc$`Sent.for.Nextseq?`[pbmc_nextseq_row])
#pbmc_samples$deepseq[is.na(pbmc_samples$deepseq)] <- "No"

In [123]:
table(pbmc_samples$deepseq, useNA = "always")


Re-seq    Yes   <NA> 
    14    264     43 

In [124]:
# Count re-sequenced samples as sequenced.
was_resequenced <- pbmc_samples$deepseq %in% "Re-seq"
pbmc_samples$deepseq[was_resequenced] <- "Yes"

In [125]:
table(pbmc_samples$deepseq, useNA = "always")


 Yes <NA> 
 278   43 

In [126]:
table(pbmc_samples$deepseq == "Yes")


TRUE 
 278 

In [127]:
# Label the deep-sequenced samples with their count. The count is taken with
# sum() on a logical mask: the previous `table(deepseq == "Yes")` returns one
# entry per observed value, so as soon as any sample is not "Yes" it yields two
# labels (FALSE and TRUE counts) that get recycled across the rows.
deep_sequenced <- pbmc_samples$deepseq %in% "Yes"
pbmc_samples$Sequencing <- NA
pbmc_samples$Sequencing[deep_sequenced] <- paste0("n = ", sum(deep_sequenced))

In [128]:
# Build the ID that is compared against divstats: prefix with "CHP_" and turn
# dashes into underscores. paste0() has no `sep` argument (the old `sep = ""`
# was simply pasted on as an extra empty string), so it is dropped.
pbmc_samples$mysample_id <- paste0("CHP_", pbmc_samples$sample_id)
pbmc_samples$mysample_id <- gsub("-", "_", pbmc_samples$mysample_id)

# IDs containing "YO", and then IDs containing "MO", get "_00" appended.
for (id_tag in c("YO", "MO")) {
  has_tag <- grepl(id_tag, pbmc_samples$mysample_id)
  pbmc_samples$mysample_id[has_tag] <-
    paste0(pbmc_samples$mysample_id[has_tag], "_00")
}

In [129]:
colnames(divstats)

In [130]:
divstats_pbmc <- divstats[ divstats$Sample == "PBMC",]

In [131]:
dim( divstats_pbmc)

In [132]:
# Flag samples included in the TCR analysis: "Yes" when mysample_id is found in
# divstats_pbmc$sample_id, "No" otherwise.
# NOTE(review): the table printed after this cell reports "No" for all 321
# samples, i.e. not a single ID matched. Confirm that divstats has a `Sample`
# value of "PBMC" and that its `sample_id` values use the same format as
# mysample_id (e.g. "CHP_330_05"); every downstream TCR count depends on this.
pbmc_samples$TCR <- NA
pbmc_samples$TCR[ pbmc_samples$mysample_id %in% divstats_pbmc$sample_id] <- "Yes"
pbmc_samples$TCR[ is.na(pbmc_samples$TCR)] <- "No"

In [133]:
table(pbmc_samples$TCR, useNA = "always")


  No <NA> 
 321    0 

In [134]:
head(pbmc_samples)

Unnamed: 0_level_0,sample_id,Shipped,DNAextracted,DNAused,libbatch,librarypassed,Library,libraryyield,captureyield,capbatch,capturepassed,Capture,deepseq,Sequencing,mysample_id,TCR
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,330-05,n = 321,6160,600,190517_M04827_0203_000000000-D6BYT,Passed,n = 282,825.6,155.42,190624_M04827_0214_000000000-D6F66,Passed,n = 278,Yes,n = 278,CHP_330_05,No
2,329-04,n = 321,11200,600,190517_M04827_0203_000000000-D6BYT,Passed,n = 282,2006.4,155.42,190624_M04827_0214_000000000-D6F66,Passed,n = 278,Yes,n = 278,CHP_329_04,No
3,331-05,n = 321,3780,600,190517_M04827_0203_000000000-D6BYT,Passed,n = 282,1161.6,162.64,190624_M04827_0214_000000000-D6F66,Passed,n = 278,Yes,n = 278,CHP_331_05,No
4,331-04,n = 321,9700,600,190517_M04827_0203_000000000-D6BYT,Passed,n = 282,1555.2,162.64,190624_M04827_0214_000000000-D6F66,Passed,n = 278,Yes,n = 278,CHP_331_04,No
5,338-05,n = 321,1410,600,190517_M04827_0203_000000000-D6BYT,Passed,n = 282,508.8,328.7,190624_M04827_0214_000000000-D6F66,Passed,n = 278,Yes,n = 278,CHP_338_05,No
6,338-04,n = 321,2460,600,190517_M04827_0203_000000000-D6BYT,Passed,n = 282,883.2,328.7,190624_M04827_0214_000000000-D6F66,Passed,n = 278,Yes,n = 278,CHP_338_04,No


In [135]:
# Replace missing capture statuses with an empty string.
pbmc_samples$capturepassed <- replace(
  pbmc_samples$capturepassed,
  is.na(pbmc_samples$capturepassed),
  ""
)

In [136]:
table(pbmc_samples$capturepassed, useNA = "always")


       Failed Passed   <NA> 
    39      4    278      0 

In [137]:
# Label the failed captures with their count.
pbmc_capture_failed <- pbmc_samples$capturepassed == "Failed"
capturefailed <- sum(pbmc_capture_failed)
pbmc_samples$Capture[pbmc_capture_failed] <- paste0("n = ", capturefailed)

In [138]:
# Number of samples that passed capture but were not deep sequenced.
# deepseq still contains NA at this point, so `%in%` is used to keep the sum
# from turning into NA.
seqfailed <- sum(
  pbmc_samples$deepseq %in% "No" & pbmc_samples$capturepassed %in% "Passed"
)
seqfailed

In [139]:
# Label the samples that passed capture, were deep sequenced and are part of
# the TCR analysis with their count. The mask is built once and `tcrpassed`
# is reused for the label instead of recomputing the same sum; `%in%` keeps
# the NA values in deepseq out of the mask.
pbmc_samples$TCRanalysis <- NA

tcr_included <- pbmc_samples$TCR %in% "Yes" &
  pbmc_samples$deepseq %in% "Yes" &
  pbmc_samples$capturepassed %in% "Passed"

tcrpassed <- sum(tcr_included)
pbmc_samples$TCRanalysis[tcr_included] <- paste0("n = ", tcrpassed)

The following sample was annotated as a failed capture MiSeq run, but sequencing data exist for it. For now, it is not included in the Sankey plot.

In [140]:
pbmc_samples[ pbmc_samples$sample_id == "383-03",]

Unnamed: 0_level_0,sample_id,Shipped,DNAextracted,DNAused,libbatch,librarypassed,Library,libraryyield,captureyield,capbatch,capturepassed,Capture,deepseq,Sequencing,mysample_id,TCR,TCRanalysis
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
141,383-03,n = 321,7280,600,191104_M04827_0295_000000000-D7FHM,Passed,n = 282,35.328,47.12,191121_M04827_0308_000000000-D7HCV,Failed,n = 4,,,CHP_383_03,No,


In [141]:
table(pbmc_samples$TCRanalysis )

< table of extent 0 >

In [142]:
# Samples that passed capture and were deep sequenced but are absent from the
# TCR analysis. The mask is built once and reused for both the count and the
# assignment; `%in%` keeps the NA values in deepseq out of the mask.
pbmc_sequenced_without_tcr <- pbmc_samples$TCR %in% "No" &
  pbmc_samples$deepseq %in% "Yes" &
  pbmc_samples$capturepassed %in% "Passed"

tcrfailed <- sum(pbmc_sequenced_without_tcr)
pbmc_samples$TCRanalysis[pbmc_sequenced_without_tcr] <-
  paste0("n = ", tcrfailed)

In [143]:
dim(pbmc_samples)

In [144]:
table(pbmc_samples$TCRanalysis, useNA = "always")


n = 278    <NA> 
    278      43 

In [145]:
# The patient ID is the first seven characters of mysample_id
# (e.g. "CHP_330_05" -> "CHP_330"). R strings are 1-based: a start of 0 is
# silently treated as 1, so the start is written as 1 explicitly.
pbmc_samples$Patient <- substr(pbmc_samples$mysample_id, 1, 7)

In [146]:
# Disease type of each sample's patient, looked up in the metadata table.
pbmc_tumor_row <- match(pbmc_samples$Patient, metadata$Patient)
pbmc_samples$Tumor <- metadata$Disease_type[pbmc_tumor_row]

In [147]:
# `group` value of each sample's patient, looked up in the metadata table.
pbmc_group_row <- match(pbmc_samples$Patient, metadata$Patient)
pbmc_samples$Group <- metadata$group[pbmc_group_row]

In [148]:
# Blank out Group and Tumor for samples that are not part of the TCR analysis.
pbmc_outside_tcr <- pbmc_samples$TCR == "No"
pbmc_samples$Group[pbmc_outside_tcr] <- NA
pbmc_samples$Tumor[pbmc_outside_tcr] <- NA

In [149]:
# Give the four count columns their display names (old name -> new name).
# A column that is not present is simply left alone.
pbmc_display_names <- c(
  Library = "Library preparation",
  Capture = "Successful capture",
  Sequencing = "Deep sequencing",
  TCRanalysis = "TCR analysis"
)
for (pbmc_old_name in names(pbmc_display_names)) {
  colnames(pbmc_samples)[colnames(pbmc_samples) == pbmc_old_name] <-
    pbmc_display_names[[pbmc_old_name]]
}

In [150]:
save(pbmc_samples, file = paste0(gitpath, "data/pbmc_sampleprocessing.RData"))

# cfDNA

I modified the log file in Excel: rows to be removed are marked with "rm" in column C.

In [151]:
cfdna <- read.xlsx(paste0(datapath, "CHOP Lab Processing Log_AN.xlsx"), sheet = 1)

In [152]:
cfdna$Sample.ID <- trimws(cfdna$Sample.ID)

In [153]:
length(cfdna$Sample.ID)

In [154]:
cfdna$Colour.Legend <- NULL

In [155]:
# Drop the rows whose `Batch.#` value is "rm".
# NOTE(review): because the comparison is wrapped in which(), rows with a
# missing `Batch.#` are dropped as well. Confirm that this is intended.
cfdna <- cfdna[ which(cfdna$`Batch.#` != "rm"),]

In [156]:
dim(cfdna)

In [157]:
# Start the cfDNA summary table with one row per row of the cfDNA log.
# The column is named at construction time and FALSE is spelled out
# (F is a reassignable variable).
cfdna_samples <- data.frame(
  sample_id = cfdna$Sample.ID,
  stringsAsFactors = FALSE
)

In [158]:
colnames(cfdna_samples) <- "sample_id"

In [159]:
dim(cfdna_samples)

In [160]:
# Tag every row with the total number of rows in the cfDNA table.
n_cfdna_rows <- nrow(cfdna_samples)
cfdna_samples$DNAsamples <- paste0("n = ", n_cfdna_rows)

In [161]:
head(cfdna_samples)

Unnamed: 0_level_0,sample_id,DNAsamples
Unnamed: 0_level_1,<chr>,<chr>
1,346-01,n = 316
2,348-01,n = 316
3,348-03,n = 316
4,343-04,n = 316
5,329-04,n = 316
6,331-05,n = 316


In [162]:
cfdna_samples$sample_id <- trimws(cfdna_samples$sample_id)

All samples that were shipped according to DB manifest files

In [163]:
allcfdna <- unlist(read.xlsx(paste0(datapath, "allcfdna_fromDB.xlsx"), sheet = 1))

In [164]:
# Drop empty cells and every entry that contains "Box" or "X".
is_sample_entry <- !is.na(allcfdna) &
  !grepl("Box", allcfdna) &
  !grepl("X", allcfdna)
allcfdna <- allcfdna[is_sample_entry]

In [165]:
# Strip semicolons first, then collapse each double dash into a single dash.
without_semicolons <- gsub(";", "", allcfdna, fixed = TRUE)
allcfdna <- gsub("--", "-", without_semicolons, fixed = TRUE)

In [166]:
names(allcfdna) <- NULL

In [167]:
allcfdna <- trimws(allcfdna)

In [168]:
allcfdna[ !allcfdna %in% cfdna_samples$sample_id]

The samples above were in the manifest file from DB but were not processed in our lab; it is not clear what happened to all of them.
335-02: missing. 341-05, 345-05, 365-04: too little DNA extracted. 339-02: unknown. 406-03: tube empty. 355-05: unknown.

In [169]:
cfdna_samples[!cfdna_samples$sample_id %in% allcfdna,]

Unnamed: 0_level_0,sample_id,DNAsamples
Unnamed: 0_level_1,<chr>,<chr>
60,409-02,n = 316
96,364-03,n = 316


The samples above were processed in our lab but were not in the manifest file from DB.

409-02: The sample was not on the processing log but was in the box.
364-03: Unclear; need to ask Steph.

In [170]:
cfdna$Miseq.for.Library <- trimws(cfdna$Miseq.for.Library)

Keep only the samples whose library and capture both passed. The samples that failed at each step are added back later.

In [171]:
table(cfdna$Miseq.for.Library, useNA = "always")


  Passed Received     <NA> 
     315        1        0 

In [172]:
# Count the library marked "Received" as "Passed".
only_received <- cfdna$Miseq.for.Library %in% "Received"
cfdna$Miseq.for.Library[only_received] <- "Passed"

In [173]:
cfdna$Miseq.for.Capture <- trimws(cfdna$Miseq.for.Capture)

In [174]:
table(cfdna$Miseq.for.Capture, useNA = "always")


Failed Passed   <NA> 
     1    295     20 

In [175]:
# Treat a missing capture status as "Failed".
cfdna$Miseq.for.Capture <- replace(
  cfdna$Miseq.for.Capture,
  is.na(cfdna$Miseq.for.Capture),
  "Failed"
)

In [176]:
# Rows with a recorded capture status other than "Passed".
cfdna_capture_status <- cfdna$Miseq.for.Capture
capfailed <- cfdna[
  !is.na(cfdna_capture_status) & cfdna_capture_status != "Passed",
]

In [177]:
dim(capfailed)

In [178]:
# Keep only rows where both the library and the capture status are "Passed".
cfdna_both_passed <- cfdna$Miseq.for.Library == "Passed" &
  cfdna$Miseq.for.Capture == "Passed"
cfdna <- cfdna[which(cfdna_both_passed), ]

In [179]:
table(cfdna$Miseq.for.Capture)


Passed 
   295 

In [180]:
#bind all cfdna and cap failed
allsamples <- rbind(cfdna, capfailed)

In [181]:
# Copy `Total.yield.1+2` from the first allsamples row with the same Sample.ID.
cfdna_dna_row <- match(cfdna_samples$sample_id, allsamples$Sample.ID)
cfdna_samples$DNAextracted <- allsamples$`Total.yield.1+2`[cfdna_dna_row]

In [182]:
# Amount of DNA that went into library prep, from the matching allsamples row.
cfdna_dna_used_row <- match(cfdna_samples$sample_id, allsamples$Sample.ID)
cfdna_samples$DNAused <-
  allsamples$`Amount.of.DNA.used.for.library.prep.(ng)`[cfdna_dna_used_row]

In [183]:
# Library MiSeq status, from the matching allsamples row.
cfdna_libstatus_row <- match(cfdna_samples$sample_id, allsamples$Sample.ID)
cfdna_samples$librarypassed <- allsamples$Miseq.for.Library[cfdna_libstatus_row]

In [184]:
cfdna_samples$libbatch <- allsamples$Library.Miseq.Flow.Cell.ID[ match(cfdna_samples$sample_id, allsamples$Sample.ID)]

In [185]:
table(cfdna_samples$librarypassed, useNA = "always")


Passed   <NA> 
   316      0 

In [186]:
# add number of passed libraries
cfdna_samples$Library <- NA
cfdna_samples$Library[ cfdna_samples$librarypassed == "Passed"] <- paste0("n = ", 
                               length(cfdna_samples$librarypassed[which(cfdna_samples$librarypassed == "Passed")]))

In [187]:
table(cfdna_samples$librarypassed, useNA = "always")


Passed   <NA> 
   316      0 

In [188]:
# Library yield, read from the `Amount.of.DNA.(ng)` column of the matching
# allsamples row.
cfdna_libyield_row <- match(cfdna_samples$sample_id, allsamples$Sample.ID)
cfdna_samples$libraryyield <- allsamples$`Amount.of.DNA.(ng)`[cfdna_libyield_row]

In [189]:
# Amount of library used for capture pooling, from the matching allsamples row.
cfdna_capinput_row <- match(cfdna_samples$sample_id, allsamples$Sample.ID)
cfdna_samples$DNAforcapture <-
  allsamples$`Amount.of.Library.used.for.Capture.Pooling.(ng)`[cfdna_capinput_row]

In [190]:
# Total capture yield (ng) per sample, looked up by sample ID
# (the source column is spelled "Amout.(ng)" in the processing log)
cfdna_samples$captureyield <- allsamples$`Amout.(ng)`[
  match(cfdna_samples$sample_id, allsamples$Sample.ID)
]

In [191]:
# MiSeq flow cell ID per sample, used as the capture batch label
cfdna_samples$capbatch <- allsamples$Miseq.Flow.Cell.ID[
  match(cfdna_samples$sample_id, allsamples$Sample.ID)
]

In [192]:
# MiSeq QC status of each capture, looked up by sample ID.
# Whitespace is trimmed right here, at assignment, so that every later exact
# comparison against "Passed"/"Failed" (including the very next count of
# passed captures) sees normalised values.
cfdna_samples$capturepassed <- trimws(
  allsamples$Miseq.for.Capture[match(cfdna_samples$sample_id, allsamples$Sample.ID)]
)

In [193]:
# Label every sample whose capture passed QC with the total number of passed
# captures ("n = <count>"); all other samples stay NA for now
cfdna_samples$Capture <- local({
  passed <- cfdna_samples$capturepassed == "Passed"
  label <- rep(NA, nrow(cfdna_samples))
  label[passed] <- paste0("n = ", sum(passed, na.rm = TRUE))
  label
})

In [194]:
# Tabulate capture QC status, with an explicit count of missing values
table(cfdna_samples$capturepassed, useNA = "always")


Failed Passed   <NA> 
    21    295      0 

In [195]:
# Strip leading/trailing whitespace so that the exact "Passed"/"Failed"
# comparisons further down are reliable.
# NOTE(review): the "Passed" count written to the Capture column was computed
# before this normalisation -- confirm that ordering is intended.
cfdna_samples$capturepassed <- trimws(cfdna_samples$capturepassed)

In [196]:
# Preview the first rows of the per-sample processing table
head(cfdna_samples)

Unnamed: 0_level_0,sample_id,DNAsamples,DNAextracted,DNAused,librarypassed,libbatch,Library,libraryyield,DNAforcapture,captureyield,capbatch,capturepassed,Capture
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>
1,346-01,n = 316,19.56,20,Passed,200605_M04827_0389_000000000-D86L3,n = 316,237.12,,,,Failed,
2,348-01,n = 316,14.4,20,Passed,200605_M04827_0389_000000000-D86L3,n = 316,159.36,159.36,733.4,200911_M04827_0435_000000000-D9GW9,Passed,n = 295
3,348-03,n = 316,24.4944,20,Passed,200605_M04827_0389_000000000-D86L3,n = 316,227.52,227.52,733.4,200911_M04827_0435_000000000-D9GW9,Passed,n = 295
4,343-04,n = 316,56.0,20,Passed,190730_M04827_0235_000000000-D6WF7,n = 316,291.84,291.84,1052.6,200909_M04827_0434_000000000-D9988,Passed,n = 295
5,329-04,n = 316,40.0768,20,Passed,200605_M04827_0389_000000000-D86L3,n = 316,314.88,314.88,1132.4,200911_M04827_0435_000000000-D9GW9,Passed,n = 295
6,331-05,n = 316,82.4,20,Passed,190730_M04827_0235_000000000-D6WF7,n = 316,811.2,499.13,1565.6,200911_M04827_0435_000000000-D9GW9,Passed,n = 295


In [197]:
# Deep-sequencing flag from the processing log, whitespace-trimmed.
# The lookup is against `cfdna` (samples that passed both QC steps), so any
# sample not present there ends up NA here.
cfdna_samples$deepseq <- trimws(
  cfdna$`Sent.for.Nextseq?`[match(cfdna_samples$sample_id, cfdna$Sample.ID)]
)

In [198]:
# Tabulate the deep-sequencing flag; NA = sample had no match in `cfdna`
table(cfdna_samples$deepseq, useNA = "always")


 Yes <NA> 
 295   21 

In [199]:
# Samples without a deep-sequencing flag are recorded as "No"
cfdna_samples$deepseq <- replace(
  cfdna_samples$deepseq, is.na(cfdna_samples$deepseq), "No"
)

In [200]:
# Count samples with (TRUE) and without (FALSE) deepseq == "Yes"
table(cfdna_samples$deepseq == "Yes")


FALSE  TRUE 
   21   295 

In [201]:
# Rows/columns of the per-sample processing table
dim(cfdna_samples)

In [202]:
# Preview the "n = <count>" labels built from the FALSE and TRUE counts
paste0("n = ", table(cfdna_samples$deepseq == "Yes"))

In [203]:
# Number of samples with deepseq == "Yes"
sum(cfdna_samples$deepseq == "Yes")

In [204]:
# Label every deep-sequenced sample with the total number of deep-sequenced
# samples ("n = <count>"); all other samples stay NA
cfdna_samples[["Sequencing"]] <- NA
cfdna_samples[["Sequencing"]][cfdna_samples[["deepseq"]] == "Yes"] <-
  paste0("n = ", sum(cfdna_samples[["deepseq"]] == "Yes"))

In [205]:
# Build the sample identifier used to match against divstats:
# prefix with "CHP_" and replace every "-" with "_"
# (e.g. "346-01" becomes "CHP_346_01").
# paste0() has no `sep` argument, so none is passed.
cfdna_samples$mysample_id <- gsub("-", "_", paste0("CHP_", cfdna_samples$sample_id))

# IDs containing "YO" get a "_00" suffix appended
cfdna_samples$mysample_id[grepl("YO", cfdna_samples$mysample_id)] <- paste0(
  cfdna_samples$mysample_id[grepl("YO", cfdna_samples$mysample_id)], "_00")

# IDs containing "MO" get a "_00" suffix appended
cfdna_samples$mysample_id[grepl("MO", cfdna_samples$mysample_id)] <- paste0(
  cfdna_samples$mysample_id[grepl("MO", cfdna_samples$mysample_id)], "_00")

In [206]:
# Tabulate the values of divstats$Sample.
# NOTE(review): the printed values are numeric (0.507 ... 1), not sample-type
# labels -- confirm this is the right column to filter on for "cfDNA" below.
table(divstats$Sample)


0.507 0.758 0.898 0.951 0.963 0.979 0.981 0.983 0.986 0.988 0.989 0.991 0.992 
    1     1     1     1     1     1     1     1     1     1     1     1     2 
0.993 0.994 0.995 0.996 0.997 0.998 0.999     1 
    4     1     3    16    13    25    54   450 

In [207]:
# Restrict the diversity statistics to cfDNA samples.
# which() keeps rows with a missing `Sample` value from turning into all-NA rows.
# NOTE(review): the tabulation of divstats$Sample shows only numeric values,
# so this comparison currently matches nothing and every sample is later
# marked TCR = "No". The column that actually holds the sample type is not
# visible here -- confirm and adjust. Until then, warn instead of failing silently.
divstats_cfdna <- divstats[which(divstats$Sample == "cfDNA"), ]
if (nrow(divstats_cfdna) == 0) {
  warning(
    "No rows of divstats have Sample == \"cfDNA\"; ",
    "all cfDNA samples will be flagged as not included in the TCR analysis.",
    call. = FALSE
  )
}

In [208]:
# Rows/columns of the cfDNA subset of the diversity statistics
dim( divstats_cfdna)

In [209]:
# Flag whether each sample appears in the cfDNA diversity statistics:
# "Yes" when its mysample_id is among divstats_cfdna$sample_id, otherwise "No"
cfdna_samples$TCR <- ifelse(
  cfdna_samples$mysample_id %in% divstats_cfdna$sample_id,
  "Yes",
  "No"
)

In [210]:
# Tabulate the TCR inclusion flag, with an explicit count of missing values
table(cfdna_samples$TCR, useNA = "always")


  No <NA> 
 316    0 

In [211]:
# Re-check capture QC status counts before labelling the failed captures
table(cfdna_samples$capturepassed, useNA = "always")


Failed Passed   <NA> 
    21    295      0 

In [212]:
# Count the failed captures, then write that count ("n = <count>") into the
# Capture column of every failed sample
capturefailed <- sum(cfdna_samples[["capturepassed"]] == "Failed")
cfdna_samples[["Capture"]][cfdna_samples[["capturepassed"]] == "Failed"] <-
  paste0("n = ", capturefailed)

In [213]:
# Label the samples that made it into the TCR analysis -- present in the
# diversity statistics, deep-sequenced and capture passed -- with the size of
# that group ("n = <count>"); every other sample is NA at this point
cfdna_samples$TCRanalysis <- local({
  analysed <- cfdna_samples$TCR == "Yes" &
    cfdna_samples$deepseq == "Yes" &
    cfdna_samples$capturepassed == "Passed"
  label <- rep(NA, nrow(cfdna_samples))
  label[analysed] <- paste0("n = ", sum(analysed))
  label
})

In [214]:
# Samples that were deep-sequenced with a passed capture but are absent from
# the TCR analysis: count them, then label each with that count
tcrfailed <- with(
  cfdna_samples,
  sum(TCR == "No" & deepseq == "Yes" & capturepassed == "Passed")
)

cfdna_samples$TCRanalysis[with(
  cfdna_samples,
  TCR == "No" & deepseq == "Yes" & capturepassed == "Passed"
)] <- paste0("n = ", tcrfailed)

In [215]:
# Rows/columns of the per-sample processing table after adding the TCR columns
dim(cfdna_samples)

In [216]:
# Tabulate the TCR analysis labels, with an explicit count of missing values
table(cfdna_samples$TCRanalysis, useNA = "always")


n = 295    <NA> 
    295      21 

In [217]:
# Display the per-sample processing table for visual inspection
cfdna_samples

sample_id,DNAsamples,DNAextracted,DNAused,librarypassed,libbatch,Library,libraryyield,DNAforcapture,captureyield,capbatch,capturepassed,Capture,deepseq,Sequencing,mysample_id,TCR,TCRanalysis
<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
346-01,n = 316,19.5600,20,Passed,200605_M04827_0389_000000000-D86L3,n = 316,237.12,,,,Failed,n = 21,No,,CHP_346_01,No,
348-01,n = 316,14.4000,20,Passed,200605_M04827_0389_000000000-D86L3,n = 316,159.36,159.3600,733.40,200911_M04827_0435_000000000-D9GW9,Passed,n = 295,Yes,n = 295,CHP_348_01,No,n = 295
348-03,n = 316,24.4944,20,Passed,200605_M04827_0389_000000000-D86L3,n = 316,227.52,227.5200,733.40,200911_M04827_0435_000000000-D9GW9,Passed,n = 295,Yes,n = 295,CHP_348_03,No,n = 295
343-04,n = 316,56.0000,20,Passed,190730_M04827_0235_000000000-D6WF7,n = 316,291.84,291.8400,1052.60,200909_M04827_0434_000000000-D9988,Passed,n = 295,Yes,n = 295,CHP_343_04,No,n = 295
329-04,n = 316,40.0768,20,Passed,200605_M04827_0389_000000000-D86L3,n = 316,314.88,314.8800,1132.40,200911_M04827_0435_000000000-D9GW9,Passed,n = 295,Yes,n = 295,CHP_329_04,No,n = 295
331-05,n = 316,82.4000,20,Passed,190730_M04827_0235_000000000-D6WF7,n = 316,811.20,499.1300,1565.60,200911_M04827_0435_000000000-D9GW9,Passed,n = 295,Yes,n = 295,CHP_331_05,No,n = 295
330-05,n = 316,114.4864,20,Passed,200605_M04827_0389_000000000-D86L3,n = 316,120.00,,0.00,,Failed,n = 21,No,,CHP_330_05,No,
350-02,n = 316,115.0065,20,Passed,200605_M04827_0389_000000000-D86L3,n = 316,241.92,241.9200,805.60,200911_M04827_0435_000000000-D9GW9,Passed,n = 295,Yes,n = 295,CHP_350_02,No,n = 295
352-02,n = 316,173.6080,20,Passed,200605_M04827_0389_000000000-D86L3,n = 316,446.40,,0.00,,Failed,n = 21,No,,CHP_352_02,No,
343-02,n = 316,161.9982,20,Passed,200605_M04827_0389_000000000-D86L3,n = 316,326.40,326.4000,1132.40,200911_M04827_0435_000000000-D9GW9,Passed,n = 295,Yes,n = 295,CHP_343_02,No,n = 295


In [218]:
# Patient ID = first seven characters of mysample_id
# (e.g. "CHP_346_01" gives "CHP_346"). R strings are 1-indexed, so the
# substring starts at position 1 rather than relying on 0 being clamped.
cfdna_samples$Patient <- substr(cfdna_samples$mysample_id, 1, 7)

In [219]:
# Tumour (disease) type per sample, looked up in the metadata by patient ID
# (patients without a metadata match get NA)
cfdna_samples$Tumor <- metadata$Disease_type[
  match(cfdna_samples$Patient, metadata$Patient)
]

In [220]:
# Tumour group per sample, looked up in the metadata by patient ID
# (patients without a metadata match get NA)
cfdna_samples$Group <- metadata$group[
  match(cfdna_samples$Patient, metadata$Patient)
]

In [221]:
# Blank out tumour type and group for samples not included in the TCR analysis
cfdna_samples$Tumor <- replace(cfdna_samples$Tumor, cfdna_samples$TCR == "No", NA)
cfdna_samples$Group <- replace(cfdna_samples$Group, cfdna_samples$TCR == "No", NA)

In [222]:
# Replace the internal column names with their display labels;
# columns not listed in the lookup keep their current name
colnames(cfdna_samples) <- local({
  display_names <- c(
    DNAsamples  = "DNA samples",
    Library     = "Library preparation",
    Capture     = "Successful capture",
    Sequencing  = "Deep sequencing",
    TCRanalysis = "TCR analysis"
  )
  current <- colnames(cfdna_samples)
  to_rename <- current %in% names(display_names)
  current[to_rename] <- unname(display_names[current[to_rename]])
  current
})

In [223]:
# Persist the per-sample processing table to the repository's data folder
save(
  list = "cfdna_samples",
  file = paste0(gitpath, "data/cfdna_sampleprocessing.RData")
)