# Dependencies

In [1]:
pacman::p_load(igraph, graphlayouts, ggraph, ggforce, dplyr)

# Functions

In [2]:
#' Split comma-separated sample tags into one column per pattern.
#'
#' Each row's `sample_tags` string is split on commas and every tag is
#' whitespace-trimmed. For every regular expression in `grepvars` the row gets
#' the tag matching it, but only when exactly one tag matches; rows with no
#' match or with several matching tags get NA for that pattern.
#'
#' @param orig_df Data frame with a `sample_tags` column of comma-separated tags.
#' @param grepvars Character vector of regular expressions (passed to `grepl`).
#'   The patterns are also used verbatim as the output column names.
#' @return A data frame with one row per row of `orig_df`, in the same order:
#'   a character column `index` ("1", "2", ...) holding the row position,
#'   followed by one character column per element of `grepvars`.
sampletags_columns <- function(orig_df, grepvars) {
    stopifnot(is.data.frame(orig_df), "sample_tags" %in% names(orig_df))

    # One vector of trimmed tags per row. The row index is kept separate from
    # the tags so that a pattern can never match the index itself.
    tag_sets <- lapply(
        strsplit(as.character(orig_df[["sample_tags"]]), split = ",", fixed = TRUE),
        trimws
    )

    # The single tag matching `pattern`, or NA when none or several match.
    pick_unique <- function(tags, pattern) {
        hits <- tags[grepl(pattern, tags)]
        if (length(hits) == 1L) hits else NA_character_
    }

    # Build the result column-wise; vapply keeps every column character, even
    # when a pattern matches nothing in any row.
    columns <- lapply(grepvars, function(pattern) {
        vapply(tag_sets, pick_unique, character(1), pattern = pattern, USE.NAMES = FALSE)
    })
    # seq_len() keeps this valid for a zero-row input.
    columns <- c(list(as.character(seq_len(nrow(orig_df)))), columns)

    # Temporary syntactic names; the real names (which may contain "|" or
    # spaces) are assigned afterwards so they are never mangled.
    names(columns) <- paste0("V", seq_along(columns))
    out <- as.data.frame(columns, stringsAsFactors = FALSE)
    colnames(out) <- c("index", grepvars)
    out
}

In [3]:
source("/Users/anabbi/git/ped_CapTCRseq/R/ggplot2_theme.R")
source("/Users/anabbi/git/ped_CapTCRseq/R/color_schemes.R")

# Paths

In [4]:
# Absolute, machine-specific base directories. Each ends with "/" because
# file names are appended to them with paste0() elsewhere in this notebook.
datapath <- "/Users/anabbi/OneDrive - UHN/Documents/INTERCEPT/Data/"
plotpath <- "/Users/anabbi/OneDrive - UHN/Documents/INTERCEPT/Plots/"
manifestpath <- "/Users/anabbi/OneDrive - UHN/Documents/INTERCEPT/Manifests/"
gitpath <- "/Users/anabbi/git/ped_CapTCRseq/"

In [5]:
# Base directory under which the Adaptive SampleOverview .tsv files are read
# below (presumably a local mount of the H4H cluster -- confirm).
h4hpath <- "/Users/anabbi/Desktop/H4H/INTERCEPT/"

# Main

Load all metadata

In [6]:
# Restore the objects stored in meta_div_TRB.RData into the workspace.
load(paste0(datapath, "capTCRseq/meta_div_TRB.RData"))

"strings not representable in native encoding will be translated to UTF-8"


In [7]:
# Sample overview table of the Carey_Preterm2017 dataset (tab-separated, with
# a header row). TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
carey <- read.table(
    file = paste0(h4hpath, "Adaptive/Carey_Preterm2017/SampleOverview_11-23-2022_7-18-00_PM.tsv"),
    sep = "\t", header = TRUE
)

In [9]:
# Sample overview table of the Henderson_MIS2022 dataset (tab-separated, with
# a header row). TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
henderson <- read.table(
    file = paste0(h4hpath, "Adaptive/Henderson_MIS2022/SampleOverview_11-23-2022_7-27-08_PM.tsv"),
    sep = "\t", header = TRUE
)

In [10]:
# Sample overview table of the Mitchell_Michels2022 dataset (tab-separated,
# with a header row). TRUE is spelled out because T is an ordinary variable
# that can be reassigned.
mitchell <- read.table(
    file = paste0(h4hpath, "Adaptive/Mitchell_Michels2022/SampleOverview_11-23-2022_7-25-05_PM.tsv"),
    sep = "\t", header = TRUE
)

In [11]:
# Sample overview table of the emerson2017 dataset (tab-separated, with a
# header row). TRUE is spelled out because T is an ordinary variable that can
# be reassigned.
emerson <- read.table(
    file = paste0(h4hpath, "Adaptive/emerson2017/SampleOverview_01-18-2023_8-18-39_PM.tsv"),
    sep = "\t", header = TRUE
)

Clean up the Mitchell metadata: samples whose name contains "Denver" are handled separately from the rest.

In [12]:
# Split the Mitchell table by sample name: rows mentioning "Denver" go to
# mitchell_Denver, every other row to mitchell_1. Original row names are kept.
mitchell_Denver <- mitchell[grep("Denver", mitchell$sample_name), ]
mitchell_1 <- mitchell[grep("Denver", mitchell$sample_name, invert = TRUE), ]

In [79]:
# Reset the row names to 1..n so they line up with the positional index that
# sampletags_columns() assigns. Assigning NULL restores automatic row names
# and, unlike 1:nrow(), is also valid when the subset has zero rows.
rownames(mitchell_Denver) <- NULL

In [13]:
# Regular expressions handed to sampletags_columns(), which matches them with
# grepl() and also uses them verbatim as column names. The trailing spaces in
# "Subject |Control " are therefore part of both the pattern and the resulting
# column name.
mitchellvars <- c("Years at visit", "Years at diagnosis", "Male|Female",
                "Subject |Control ", "Timepoint", "Subject")

In [80]:
# Turn the comma-separated sample tags of each Mitchell subset into columns.
mitchell_1_tags <- sampletags_columns(orig_df = mitchell_1, grepvars = mitchellvars)
mitchell_Denver_tags <- sampletags_columns(
    orig_df = mitchell_Denver,
    grepvars = c("Years at diagnosis", "Male|Female", "Type 1 Diabetes")
)

In [15]:
# Turn the comma-separated Carey sample tags into one column per pattern.
carey_tags <- sampletags_columns(
    orig_df = carey,
    grepvars = c("Blood", "Male|Female", "FACS", "Influenza", "Lung", "Years|Adult", "Cord")
)

In [16]:
# Clean up Emerson tags: blank out the age-bin tags listed below, presumably so
# that the "Years" pattern used when parsing the tags can find a single
# exact-age tag per sample.
# Every bin is removed as a literal string (fixed = TRUE). This matters for
# "60+": read as a regular expression it means "6 followed by one or more 0",
# which never removes the "+" and instead strips "60" out of other tags such
# as "60 Years".
emerson$sample_tags <- Reduce(
    function(tags, age_bin) gsub(age_bin, "", tags, fixed = TRUE),
    c(
        "35-60 Years", "25-29 Years", "35-39 Years", "45-49 Years",
        "55-59 Years", "65-69 Years", "45-65 Years", "18-24 Years",
        "18-35 Years", "15-19 Years", "20-24 Years", "24-45 Years",
        "12-18 Years", "30-34 Years", "40-44 Years", "50-54 Years",
        "60-64 Years", "10-14 Years", "00-02 Years", "01-04 Years",
        "60+"
    ),
    emerson$sample_tags
)

In [17]:
# Turn the cleaned Emerson sample tags into one column per pattern.
emerson_tags <- sampletags_columns(
    orig_df = emerson,
    grepvars = c(
        "Cohort", "Non-Hispanic|Unknown Ethnicity", "Unknown racial|Caucasian|Islander",
        "Inferred CMV", "Years", "Male|Female", "Cytomegalovirus"
    )
)

In [19]:
# Preview the first rows of the parsed Emerson tag table.
head(emerson_tags)

Unnamed: 0_level_0,index,Cohort,Non-Hispanic|Unknown Ethnicity,Unknown racial|Caucasian|Islander,Inferred CMV,Years,Male|Female,Cytomegalovirus
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,Cohort 01,Unknown Ethnicity,Unknown racial group,,,,
2,2,Cohort 01,Unknown Ethnicity,Unknown racial group,,,,
3,3,Cohort 01,Unknown Ethnicity,Unknown racial group,,,,
4,4,Cohort 01,Unknown Ethnicity,Unknown racial group,,,,
5,5,Cohort 01,Non-Hispanic or Latino,Caucasian,Inferred CMV +,22 Years,Male,Cytomegalovirus +
6,6,Cohort 01,Non-Hispanic or Latino,Caucasian,Inferred CMV +,56 Years,Male,Cytomegalovirus +


In [25]:
# Start the Henderson table from the raw tag strings. The single column is
# deliberately named "henderson$sample_tags", the name later code reads back.
henderson_tags <- data.frame(`henderson$sample_tags` = henderson$sample_tags, check.names = FALSE)

# Some clean-up

In [103]:
# Add the harmonised columns to the Henderson table. The raw tag string is
# used as the disease label; age group and sex are filled with NA.
henderson_tags[["study"]] <- "Henderson"
henderson_tags[["disease"]] <- henderson_tags[["henderson$sample_tags"]]
henderson_tags[["sample_name"]] <- henderson[["sample_name"]]
henderson_tags[["Agegroup"]] <- NA
henderson_tags[["Sex"]] <- NA

Acronyms for Henderson

MIS-C, multisystem inflammatory syndrome in children; peds, pediatric; COVID-19, coronavirus disease 2019; KD, Kawasaki disease; sJIA, systemic juvenile idiopathic arthritis; MAS, macrophage activation syndrome.

In [57]:
# Add the harmonised columns to the Carey table.
carey_tags$study <- "Carey"
# carey_tags has no Agegroup column at this point, so reading it back (as the
# previous code did) yields NULL and the assignment fails on a fresh run.
# Derive it from the parsed "Years|Adult" tag instead, dropping anything from
# the first underscore on. Samples without such a tag stay NA.
carey_tags$Agegroup <- gsub("_.*", "", carey_tags[["Years|Adult"]])
carey_tags$Sex <- carey_tags$`Male|Female`
# index holds the row position as a string; carey still has its default
# 1..n row names, so matching on them recovers the sample name for each row.
carey_tags$sample_name <- carey$sample_name[ match(carey_tags$index, rownames(carey)) ]
# Every Carey sample is labelled as a control.
carey_tags$disease <- "Ctrl"

In [66]:
# Add the harmonised columns to the Emerson table: age group from the "Years"
# tag, sex from the "Male|Female" tag, disease from the "Cytomegalovirus" tag.
emerson_tags[["study"]] <- "Emerson"
emerson_tags[["Agegroup"]] <- emerson_tags[["Years"]]
emerson_tags[["Sex"]] <- emerson_tags[["Male|Female"]]
# index is the row position as a string, matched against emerson's row names.
emerson_tags[["sample_name"]] <- emerson[["sample_name"]][
    match(emerson_tags[["index"]], rownames(emerson))
]
emerson_tags[["disease"]] <- emerson_tags[["Cytomegalovirus"]]

In [92]:
# Add the harmonised columns to the (non-Denver) Mitchell table.
mitchell_1_tags$study <- "Mitchell"
mitchell_1_tags$Agegroup <- mitchell_1_tags$`Years at visit`
mitchell_1_tags$Sex <- mitchell_1_tags$`Male|Female`
# index is the row position within mitchell_1 (as a string), so look the
# sample name up by position. mitchell_1 keeps the row names of the unsplit
# table, so matching index against rownames(mitchell_1) would misalign or
# yield NA whenever the Denver rows are not all at the end of that table.
mitchell_1_tags$sample_name <- mitchell_1$sample_name[as.integer(mitchell_1_tags$index)]
# The column name carries the trailing space of its pattern in mitchellvars;
# it is spelled exactly here rather than relying on partial matching by `$`.
mitchell_1_tags$disease <- NA_character_
mitchell_1_tags$disease[grepl("Control", mitchell_1_tags[["Subject |Control "]])] <- "Ctrl"
mitchell_1_tags$disease[grepl("Subject", mitchell_1_tags[["Subject |Control "]])] <- "T1D"


In [93]:
# Preview the first rows of the harmonised Mitchell table.
head(mitchell_1_tags)

Unnamed: 0_level_0,index,Years at visit,Years at diagnosis,Male|Female,Subject |Control,Timepoint,Subject,study,Agegroup,Sex,sample_name,disease
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,3 Years at visit,,Female,Control 006,Timepoint 3,,Mitchell,3 Years at visit,Female,310121_TCRB,Ctrl
2,2,3 Years at visit,12.958904109589 Years at diagnosis,Female,Subject 001,Timepoint 1,Subject 001,Mitchell,3 Years at visit,Female,310102_TCRB,T1D
3,3,10 Years at visit,14.4328767123288 Years at diagnosis,Male,Subject 019,Timepoint 3,Subject 019,Mitchell,10 Years at visit,Male,310156_TCRB,T1D
4,4,0 Years at visit,18.0438356164384 Years at diagnosis,Female,Subject 021,Timepoint 1,Subject 021,Mitchell,0 Years at visit,Female,310204_TCRB,T1D
5,5,6 Years at visit,6.37808219178082 Years at diagnosis,Female,Subject 003,Timepoint 4,Subject 003,Mitchell,6 Years at visit,Female,310245_TCRB,T1D
6,6,12 Years at visit,,Male,Control 012,Timepoint 4,,Mitchell,12 Years at visit,Male,310186_TCRB,Ctrl


In [94]:
# Add the harmonised columns to the Denver subset: age group from the
# "Years at diagnosis" tag, disease from the "Type 1 Diabetes" tag.
mitchell_Denver_tags[["study"]] <- "Mitchell_Denver"
mitchell_Denver_tags[["Agegroup"]] <- mitchell_Denver_tags[["Years at diagnosis"]]
mitchell_Denver_tags[["Sex"]] <- mitchell_Denver_tags[["Male|Female"]]
# index is the row position as a string, matched against the row names.
mitchell_Denver_tags[["sample_name"]] <- mitchell_Denver[["sample_name"]][
    match(mitchell_Denver_tags[["index"]], rownames(mitchell_Denver))
]
mitchell_Denver_tags[["disease"]] <- mitchell_Denver_tags[["Type 1 Diabetes"]]

## bind all together

In [104]:
# Collect the per-study tables, list their column names, and keep only the
# names present in every table (in the order they appear in the first one).
dfList <- list(
    carey_tags,
    henderson_tags,
    mitchell_1_tags,
    mitchell_Denver_tags,
    emerson_tags
)
dfColList <- lapply(dfList, names)
commonCols <- Reduce(function(shared, cols) intersect(shared, cols), dfColList)


In [105]:
# Display the column names of each per-study table.
dfColList

In [108]:
# Display the columns shared by all per-study tables.
commonCols

In [109]:
# Stack the per-study tables, restricted to their shared columns, in the
# order Carey, Henderson, Mitchell, Mitchell (Denver), Emerson.
allAdaptive <- do.call(
    rbind,
    lapply(
        list(carey_tags, henderson_tags, mitchell_1_tags, mitchell_Denver_tags, emerson_tags),
        function(study_tags) study_tags[, commonCols]
    )
)

In [110]:
# Preview the first rows of the combined table.
head(allAdaptive)

Unnamed: 0_level_0,study,Agegroup,Sex,sample_name,disease
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
1,Carey,,Male,Preterm_23_3,Ctrl
2,Carey,,Female,Preterm_25_5,Ctrl
3,Carey,,Male,Preterm_26_B_2,Ctrl
4,Carey,,Female,Preterm_27_4,Ctrl
5,Carey,Adult,,Adult_1_15-04,Ctrl
6,Carey,Adult,,Adult_2_15-07,Ctrl


In [111]:
# Write the combined table to disk under its own object name.
save(list = "allAdaptive", file = paste0(datapath, "Adaptivedatasets/allAdaptive.RData"))