# Dependencies

# Functions

In [1]:
# Write a sample-by-allele dosage table for one HLA locus.
#
# Arguments:
#   datapath  Output prefix. It is pasted directly in front of the file name,
#             so it must already end with a path separator.
#   hladf     Data frame of HLA calls with the columns Sample.ID, Locus,
#             Allele.1 and Allele.2.
#   locus     Value of `Locus` to extract (e.g. "A").
#
# Side effect: writes "<datapath>INT_HLA_<locus>.txt" (tab-separated, unquoted)
# with sample IDs as row names and alleles as column names. Each cell is the
# number of copies (0, 1 or 2) of that allele called for that sample.
# Returns the value of write.table().
hla.fx <- function(datapath, hladf, locus) {
    # Subset the data frame that was passed in (not a global object); rows
    # whose Locus is missing are dropped instead of becoming all-NA rows.
    df <- hladf[!is.na(hladf$Locus) & hladf$Locus == locus, , drop = FALSE]

    allele1 <- as.character(df$Allele.1)
    allele2 <- as.character(df$Allele.2)

    # Every distinct allele seen at this locus; a missing call is not an allele.
    allalleles <- unique(c(allele1, allele2))
    allalleles <- allalleles[!is.na(allalleles)]

    # Matrix with sample IDs as row names and alleles as column names.
    mymat <- matrix(0,
        nrow = nrow(df), ncol = length(allalleles),
        dimnames = list(df$Sample.ID, allalleles)
    )

    # One vectorised pass per allele. %in% never yields NA, so a missing call
    # simply counts as "no match"; a homozygous sample adds up to two.
    for (j in seq_along(allalleles)) {
        mymat[, j] <- (allele1 %in% allalleles[j]) + (allele2 %in% allalleles[j])
    }

    write.table(mymat,
        file = paste0(datapath, "INT_HLA_", locus, ".txt"),
        quote = FALSE, sep = "\t"
    )
}

In [2]:
source("/Users/anabbi/git/ped_CapTCRseq/R/ggplot2_theme.R")
source("/Users/anabbi/git/ped_CapTCRseq/R/color_schemes.R")
source("/Users/anabbi/git/ped_CapTCRseq/R/Misc_functions.R")

# Paths

In [3]:
# Machine-specific absolute locations used throughout this notebook.
# Every value ends in "/" so that sub-paths can be appended with paste0().
# Input/output data.
datapath <- "/Users/anabbi/OneDrive - UHN/Documents/INTERCEPT/Data/"
# Figure output.
plotpath <- "/Users/anabbi/OneDrive - UHN/Documents/INTERCEPT/Plots/"
# Sample manifests.
manifestpath <- "/Users/anabbi/OneDrive - UHN/Documents/INTERCEPT/Manifests/"
# Local checkout of the ped_CapTCRseq repository.
gitpath <- "/Users/anabbi/git/ped_CapTCRseq/"
# INTERCEPT folder under Desktop/H4H.
h4hpath <- "/Users/anabbi/Desktop/H4H/INTERCEPT/"

# Main

# INT

In [15]:
# Load the first worksheet of the HLA typing workbook.
# `datapath` already ends in "/", so the sub-path is appended without a leading
# slash (same convention as the paste0(datapath, "HLA/") used for the output).
hla <- xlsx::read.xlsx(
    file = paste0(datapath, "HLA/UHN-20230127-B257-HLA.xlsx"),
    sheetIndex = 1
)

In [5]:
# Preview the first ten typing records.
head(hla, n = 10)

Unnamed: 0_level_0,Sample.ID,Locus,Allele.1,Allele.2,Comments,Diploid.Ambiguities,Allele.1.Ambiguities,Allele.2.Ambiguities
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,315-05,A,A*02:01:01,A*02:05:01,,,,
2,315-05,B,B*35:01:01,B*49:01:01,,,,
3,315-05,C,C*04:01:01,C*07:01:01,,,,
4,315-05,DPA1,DPA1*01:03:01,DPA1*02:02:02,,,,
5,315-05,DPB1,DPB1*01:01:01,DPB1*04:02:01,,,,
6,315-05,DQA1,DQA1*01:02:01,DQA1*05:05:01,,,,
7,315-05,DQB1,DQB1*03:19:01,DQB1*06:04:01,,,,
8,315-05,DRB1,DRB1*13:02:01,DRB1*13:04,,,,
9,315-05,DRB345,DRB3*02:02:01,DRB3*03:01:01,,,,
10,319-05,A,A*01:01:01,A*03:01:01,,,,


In [6]:
# List the loci present in the typing table.
names(table(hla[["Locus"]]))

In [7]:
# Write one sample-by-allele table per locus into the "HLA/" folder of datapath.
for (locus_name in names(table(hla$Locus))) {
    hla.fx(paste0(datapath, "HLA/"), hla, locus_name)
}

# Mitchell data

In [4]:
# Read the Mitchell sample overview (tab-separated, first line is the header).
mitchell <- read.table(
    file = paste0(
        h4hpath,
        "Adaptive/Mitchell_Michels2022/SampleOverview_11-23-2022_7-25-05_PM.tsv"
    ),
    header = TRUE,
    sep = "\t"
)

In [5]:
# Split the overview by sample name: rows whose sample_name contains "Denver"
# go to mitchell_Denver, all remaining rows to mitchell_1.
mitchell_Denver <- mitchell[grepl("Denver", mitchell[["sample_name"]]), ]
mitchell_1 <- mitchell[!grepl("Denver", mitchell[["sample_name"]]), ]

In [6]:
# Renumber the Denver rows from 1; the row names are matched against the tag
# index further down. seq_len() stays valid when the subset is empty.
rownames(mitchell_Denver) <- seq_len(nrow(mitchell_Denver))

In [7]:
# Tag patterns handed to sampletags_columns() for the main Mitchell cohort
# (presumably regular expressions, given the "|" alternations - confirm).
mitchellvars <- c(
    "Years at visit",
    "Years at diagnosis",
    "Male|Female",
    "Subject |Control ",
    "Timepoint",
    "Subject"
)

In [8]:
# Turn the sample tags of each cohort into one column per requested pattern.
# sampletags_columns() comes from the sourced helper scripts.
mitchell_1_tags <- sampletags_columns(mitchell_1, mitchellvars)
mitchell_Denver_tags <- sampletags_columns(
    mitchell_Denver,
    c("Years at diagnosis", "Male|Female", "Type 1 Diabetes")
)

In [9]:
# Preview the parsed tags of the main cohort.
head(mitchell_1_tags, n = 6)

Unnamed: 0_level_0,index,Years at visit,Years at diagnosis,Male|Female,Subject |Control,Timepoint,Subject
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,3 Years at visit,,Female,Control 006,Timepoint 3,
2,2,3 Years at visit,12.958904109589 Years at diagnosis,Female,Subject 001,Timepoint 1,Subject 001
3,3,10 Years at visit,14.4328767123288 Years at diagnosis,Male,Subject 019,Timepoint 3,Subject 019
4,4,0 Years at visit,18.0438356164384 Years at diagnosis,Female,Subject 021,Timepoint 1,Subject 021
5,5,6 Years at visit,6.37808219178082 Years at diagnosis,Female,Subject 003,Timepoint 4,Subject 003
6,6,12 Years at visit,,Male,Control 012,Timepoint 4,


In [10]:
# Preview the parsed tags of the Denver cohort.
head(mitchell_Denver_tags, n = 6)

Unnamed: 0_level_0,index,Years at diagnosis,Male|Female,Type 1 Diabetes
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>
1,1,12 Years at diagnosis,Female,Type 1 Diabetes
2,2,12 Years at diagnosis,Female,Type 1 Diabetes
3,3,12 Years at diagnosis,Male,Type 1 Diabetes
4,4,11 Years at diagnosis,Female,Type 1 Diabetes
5,5,9 Years at diagnosis,Female,Type 1 Diabetes
6,6,8 Years at diagnosis,Female,Type 1 Diabetes


In [11]:
# Add harmonised annotation columns to the main Mitchell cohort.
mitchell_1_tags$study <- "Mitchell"
mitchell_1_tags$Agegroup <- mitchell_1_tags$`Years at visit`
mitchell_1_tags$Sex <- mitchell_1_tags$`Male|Female`
# NOTE(review): `index` is matched against rownames(mitchell_1), which still
# carry the row numbers of `mitchell`; presumably sampletags_columns() reports
# those same row names - confirm.
mitchell_1_tags$sample_name <- mitchell_1$sample_name[match(mitchell_1_tags$index, rownames(mitchell_1))]
# Create the column at full length before filling it. Assigning through a
# logical subset of a column that does not exist yet can produce a vector
# shorter than the data frame. Rows matching neither pattern stay NA.
mitchell_1_tags$disease <- NA_character_
mitchell_1_tags$disease[grepl("Control", mitchell_1_tags$`Subject |Control`)] <- "Ctrl"
mitchell_1_tags$disease[grepl("Subject", mitchell_1_tags$`Subject |Control`)] <- "T1D"


In [12]:
# Add the same annotation columns to the Denver cohort. Agegroup is taken from
# "Years at diagnosis" here and disease is copied from the "Type 1 Diabetes" tag.
mitchell_Denver_tags$study <- "Mitchell_Denver"
mitchell_Denver_tags$Agegroup <- mitchell_Denver_tags[["Years at diagnosis"]]
mitchell_Denver_tags$Sex <- mitchell_Denver_tags[["Male|Female"]]
mitchell_Denver_tags$sample_name <- mitchell_Denver[["sample_name"]][
    match(mitchell_Denver_tags[["index"]], rownames(mitchell_Denver))
]
mitchell_Denver_tags$disease <- mitchell_Denver_tags[["Type 1 Diabetes"]]

In [13]:
# Spot-check two samples by name.
subset(mitchell_1_tags, sample_name %in% c("310177_TCRB", "310283_TCRB"))

Unnamed: 0_level_0,index,Years at visit,Years at diagnosis,Male|Female,Subject |Control,Timepoint,Subject,study,Agegroup,Sex,sample_name,disease
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
183,183,9 Years at visit,12.6575342465753 Years at diagnosis,Male,Subject 010,Timepoint 3,Subject 010,Mitchell,9 Years at visit,Male,310177_TCRB,T1D
188,188,8 Years at visit,12.6575342465753 Years at diagnosis,Male,Subject 010,Timepoint 2,Subject 010,Mitchell,8 Years at visit,Male,310283_TCRB,T1D


In [14]:
# Preview the Denver tags after the annotation columns were added.
head(mitchell_Denver_tags, n = 6)

Unnamed: 0_level_0,index,Years at diagnosis,Male|Female,Type 1 Diabetes,study,Agegroup,Sex,sample_name,disease
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,12 Years at diagnosis,Female,Type 1 Diabetes,Mitchell_Denver,12 Years at diagnosis,Female,DenverT1D-046_TCRB,Type 1 Diabetes
2,2,12 Years at diagnosis,Female,Type 1 Diabetes,Mitchell_Denver,12 Years at diagnosis,Female,DenverT1D-247_TCRB,Type 1 Diabetes
3,3,12 Years at diagnosis,Male,Type 1 Diabetes,Mitchell_Denver,12 Years at diagnosis,Male,DenverT1D-064_TCRB,Type 1 Diabetes
4,4,11 Years at diagnosis,Female,Type 1 Diabetes,Mitchell_Denver,11 Years at diagnosis,Female,DenverT1D-340_TCRB,Type 1 Diabetes
5,5,9 Years at diagnosis,Female,Type 1 Diabetes,Mitchell_Denver,9 Years at diagnosis,Female,DenverT1D-103_TCRB,Type 1 Diabetes
6,6,8 Years at diagnosis,Female,Type 1 Diabetes,Mitchell_Denver,8 Years at diagnosis,Female,DenverT1D-332_TCRB,Type 1 Diabetes


In [28]:
# Break each sample's comma-separated tag string into its individual tags.
splitsampletags <- strsplit(mitchell_1[["sample_tags"]], split = ",")

In [29]:
# For every sample keep only the tags that mention "HLA", with surrounding
# whitespace stripped.
mydf <- lapply(splitsampletags, function(sampletag) {
    tags <- trimws(unlist(sampletag))
    grep("HLA", tags, value = TRUE)
})


In [30]:
# Label each list element with its sample name.
mydf <- setNames(mydf, mitchell_1$sample_name)

In [33]:
# Look at a few raw tag strings.
head(mitchell[["sample_tags"]])

In [31]:
# Inspect the HLA tags of the last 100 samples.
tail(mydf, n = 100)

In [26]:
# Turn each sample's HLA tags into a long-format data frame: one row per
# allele, with the sample name repeated alongside it. Repeating the name to the
# number of alleles means a sample without any HLA tag yields a 0-row frame
# instead of an error about differing numbers of rows.
mydf1 <- Map(
    function(alleles, sample) {
        data.frame(
            HLA = as.character(alleles),
            sample_name = rep(sample, length(alleles)),
            stringsAsFactors = FALSE
        )
    },
    mydf, names(mydf)
)


In [27]:
# Stack the per-sample frames into one long table.
mitchel_hla <- do.call(what = "rbind", args = mydf1)

In [28]:
# Replace the row names produced by rbind() with a plain running index.
# seq_len() stays valid when the table is empty.
rownames(mitchel_hla) <- seq_len(nrow(mitchel_hla))

In [29]:
# Preview the long allele table.
head(mitchel_hla, n = 6)

Unnamed: 0_level_0,HLA,sample_name
Unnamed: 0_level_1,<chr>,<chr>
1,HLA-A*0101,310121_TCRB
2,HLA-A*0301,310121_TCRB
3,HLA-B*1801,310121_TCRB
4,HLA-C*0701,310121_TCRB
5,HLA-DPA1*0103,310121_TCRB
6,HLA-DPB1*0201,310121_TCRB


In [30]:
# Loci present in the INT typing table, for reference.
names(table(hla[["Locus"]]))

In [31]:
# Empty (all-NA) sample-by-allele matrix: one row per distinct sample name and
# one column per distinct allele, both in order of first appearance.
allhla_preqc <- local({
    sample_ids <- unique(mitchel_hla$sample_name)
    allele_ids <- unique(mitchel_hla$HLA)
    matrix(NA,
        nrow = length(sample_ids), ncol = length(allele_ids),
        dimnames = list(sample_ids, allele_ids)
    )
})

In [32]:
# Preview the still-empty matrix.
head(allhla_preqc, n = 6)

Unnamed: 0,HLA-A*0101,HLA-A*0301,HLA-B*1801,HLA-C*0701,HLA-DPA1*0103,HLA-DPB1*0201,HLA-DPB1*0401,HLA-DQA1*0301,HLA-DQA1*0501,HLA-DQB1*0201,...,HLA-B*4102,HLA-C*1703,HLA-DPB1*1601,HLA-A*0205,HLA-B*0706,HLA-C*1505,HLA-DPB1*12401,HLA-DRB1*0405,HLA-DQA1*0302,HLA-DRB1*0901
310121_TCRB,,,,,,,,,,,...,,,,,,,,,,
310102_TCRB,,,,,,,,,,,...,,,,,,,,,,
310156_TCRB,,,,,,,,,,,...,,,,,,,,,,
310204_TCRB,,,,,,,,,,,...,,,,,,,,,,
310245_TCRB,,,,,,,,,,,...,,,,,,,,,,
310186_TCRB,,,,,,,,,,,...,,,,,,,,,,


In [33]:
# Fill each sample's row with the number of times every allele is listed for
# that sample; alleles the sample does not carry stay NA.
# The tags are counted directly, without de-duplicating first, so an allele
# listed twice is stored as 2 - the value the homozygous pass over `allhlas`
# looks for.
# NOTE(review): assumes a homozygous call appears twice in sample_tags - confirm.
for (i in seq_len(nrow(allhla_preqc))) {
    myhla <- mitchel_hla[mitchel_hla$sample_name == rownames(allhla_preqc)[i], "HLA"]
    mytab <- as.data.frame(table(allalleles = myhla), stringsAsFactors = FALSE)
    allhla_preqc[i, match(mytab$allalleles, colnames(allhla_preqc))] <- mytab$Freq
}


In [34]:
# Preview the filled matrix.
head(allhla_preqc, n = 6)

Unnamed: 0,HLA-A*0101,HLA-A*0301,HLA-B*1801,HLA-C*0701,HLA-DPA1*0103,HLA-DPB1*0201,HLA-DPB1*0401,HLA-DQA1*0301,HLA-DQA1*0501,HLA-DQB1*0201,...,HLA-B*4102,HLA-C*1703,HLA-DPB1*1601,HLA-A*0205,HLA-B*0706,HLA-C*1505,HLA-DPB1*12401,HLA-DRB1*0405,HLA-DQA1*0302,HLA-DRB1*0901
310121_TCRB,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
310102_TCRB,,1.0,,,1,1.0,1.0,,,,...,,,,,,,,,,
310156_TCRB,,,1.0,,1,,,,,,...,,,,,,,,,,
310204_TCRB,,,,,1,1.0,1.0,1.0,,,...,,,,,,,,,,
310245_TCRB,,,,,1,,1.0,1.0,,,...,,,,,,,,,,
310186_TCRB,,,1.0,,1,,,1.0,1.0,1.0,...,,,,,,,,,,


In [35]:
# Save the pre-QC sample-by-allele matrix.
readr::write_rds(
    allhla_preqc,
    paste0(datapath, "HLA/mitchel_allhla_preqc.rds")
)

In [37]:
# Working copy of the pre-QC matrix; the per-locus passes below read from it.
allhlas <- allhla_preqc

In [38]:
# Empty per-sample genotype table: two columns ("<locus>_1", "<locus>_2") for
# each of eight loci, one row per sample of allhlas. DRB345 is not included.
mitch_hla <- local({
    loci <- c("A", "B", "C", "DRB1", "DQB1", "DPB1", "DPA1", "DQA1")
    genotype_cols <- paste0(rep(loci, each = 2), c("_1", "_2"))
    as.data.frame(matrix(NA,
        nrow = nrow(allhlas), ncol = length(genotype_cols),
        dimnames = list(rownames(allhlas), genotype_cols)
    ))
})

In [39]:
# Heterozygous pass: for every sample, collect the alleles recorded with a
# count of 1 and write them into the two genotype columns of each locus.
# - exactly two alleles at a locus: both columns are filled;
# - one allele: first column filled, second NA;
# - none: both NA.
# NOTE(review): more than two count-1 alleles at one locus is not handled
# explicitly - confirm this cannot occur upstream.
for (i in seq_len(nrow(allhlas))) {
    myhla <- allhlas[i, , drop = FALSE]
    # which() skips the NA cells, so x holds allele names only.
    x <- colnames(myhla)[which(myhla == 1)]
    for (locus in c("A", "B", "C", "DRB1", "DQB1", "DPB1", "DPA1", "DQA1")) {
        # "<locus>*" identifies the locus, e.g. "A\\*" matches "HLA-A*0101".
        found <- x[grepl(paste0(locus, "\\*"), x)]
        mitch_hla[i, paste0(locus, c("_1", "_2"))] <-
            if (length(found) == 2) found else c(found, NA)
    }
}

In [40]:
# Homozygous pass: for every sample, look at the alleles recorded with a count
# of 2. When exactly one such allele exists at a locus, it is written into
# both genotype columns of that locus; otherwise the columns are left as is.
for (i in seq_len(nrow(allhlas))) {
    myhla <- allhlas[i, , drop = FALSE]
    # which() skips the NA cells, so y holds allele names only.
    y <- colnames(myhla)[which(myhla == 2)]
    for (locus in c("A", "B", "C", "DRB1", "DQB1", "DPB1", "DPA1", "DQA1")) {
        found <- y[grepl(paste0(locus, "\\*"), y)]
        if (length(found) == 1) {
            mitch_hla[i, paste0(locus, c("_1", "_2"))] <- rep(found, 2)
        }
    }
}

In [64]:
# Inspect the last few genotype rows.
tail(mitch_hla, n = 6)

Unnamed: 0_level_0,A_1,A_2,B_1,B_2,C_1,C_2,DRB1_1,DRB1_2,DQB1_1,DQB1_2,DPB1_1,DPB1_2,DPA1_1,DPA1_2,DQA1_1,DQA1_2
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Daisy-12_TCRB,HLA-A*2301,HLA-A*0205,HLA-B*5001,HLA-B*4102,HLA-C*0602,HLA-C*1703,HLA-DRB1*0301,HLA-DRB1*0701,HLA-DQB1*0201,HLA-DQB1*0202,HLA-DPB1*0201,HLA-DPB1*0301,HLA-DPA1*0103,,HLA-DQA1*0501,HLA-DQA1*0201
Daisy-18_TCRB,HLA-A*2402,HLA-A*0201,HLA-B*3906,HLA-B*0801,HLA-C*0701,HLA-C*0702,HLA-DRB1*0301,HLA-DRB1*0801,HLA-DQB1*0201,HLA-DQB1*0402,HLA-DPB1*0401,HLA-DPB1*1601,HLA-DPA1*0103,,HLA-DQA1*0501,HLA-DQA1*0401
Daisy-9_TCRB,HLA-A*0101,HLA-A*0201,HLA-B*0801,HLA-B*0706,HLA-C*0701,HLA-C*1505,HLA-DRB1*0301,HLA-DRB1*0405,HLA-DQB1*0201,HLA-DQB1*0302,HLA-DPB1*0401,HLA-DPB1*12401,HLA-DPA1*0103,,HLA-DQA1*0501,HLA-DQA1*0303
Daisy-7_TCRB,HLA-A*2402,HLA-A*1101,HLA-B*1801,HLA-B*1501,HLA-C*0501,HLA-C*0102,HLA-DRB1*0301,HLA-DRB1*0901,HLA-DQB1*0201,HLA-DQB1*0303,HLA-DPB1*0401,,HLA-DPA1*0103,,HLA-DQA1*0501,HLA-DQA1*0302
Daisy-8_TCRB,HLA-A*2301,HLA-A*0205,HLA-B*5001,HLA-B*4102,HLA-C*0602,HLA-C*1703,HLA-DRB1*0301,HLA-DRB1*0701,HLA-DQB1*0201,HLA-DQB1*0202,HLA-DPB1*0201,HLA-DPB1*0301,HLA-DPA1*0103,,HLA-DQA1*0501,HLA-DQA1*0201
Daisy-4_TCRB,HLA-A*0101,HLA-A*0201,HLA-B*0801,HLA-B*0706,HLA-C*0701,HLA-C*1505,HLA-DRB1*0301,HLA-DRB1*0405,HLA-DQB1*0201,HLA-DQB1*0302,HLA-DPB1*0401,HLA-DPB1*12401,HLA-DPA1*0103,,HLA-DQA1*0501,HLA-DQA1*0303


In [42]:
# Save the per-sample genotype table.
readr::write_rds(
    mitch_hla,
    paste0(datapath, "HLA/mitch_hla_matrix.rds")
)

In [124]:
# Keep only the A, B and C genotype columns.
mitch_hlaI <- as.data.frame(
    mitch_hla[, c("A_1", "A_2", "B_1", "B_2", "C_1", "C_2")]
)

In [125]:
# Strip the "HLA-" prefix from every allele, column by column.
mitch_hlaI <- apply(
    mitch_hlaI,
    MARGIN = 2, FUN = gsub,
    pattern = "HLA-", replacement = ""
)

In [126]:
# Keep only samples with all six alleles present. drop = FALSE keeps the result
# two-dimensional even when a single sample remains.
mitch_hlaI <- mitch_hlaI[rowSums(is.na(mitch_hlaI)) == 0, , drop = FALSE]

In [127]:
# Insert the field separator: characters 1-4, then ":", then characters 5-7
# (e.g. "A*0301" becomes "A*03:01").
mitch_hla_I <- apply(
    mitch_hlaI, 2,
    function(allele) paste(substr(allele, 1, 4), substr(allele, 5, 7), sep = ":")
)

In [129]:
# Carry the sample names over from the unformatted matrix.
dimnames(mitch_hla_I)[[1]] <- rownames(mitch_hlaI)

In [131]:
# Preview the formatted class I alleles.
head(mitch_hla_I, n = 6)

Unnamed: 0,A_1,A_2,B_1,B_2,C_1,C_2
310102_TCRB,A*03:01,A*24:02,B*39:06,B*44:02,C*07:02,C*07:04
310156_TCRB,A*02:01,A*25:01,B*18:01,B*44:02,C*05:01,C*12:03
310186_TCRB,A*02:06,A*29:02,B*18:01,B*40:02,C*05:01,C*03:05
310296_TCRB,A*11:01,A*68:01,B*40:01,B*51:01,C*03:04,C*15:02
310149_TCRB,A*01:01,A*02:01,B*08:01,B*15:18,C*07:01,C*07:04
310222_TCRB,A*02:01,A*66:01,B*18:01,B*40:01,C*05:01,C*03:04


In [132]:
# One comma-separated string of the six alleles per sample.
hlais <- apply(mitch_hla_I, MARGIN = 1, FUN = paste, collapse = ",")

In [133]:
# Preview the per-sample allele strings.
head(hlais, n = 6)

In [134]:
# Show the output root used by the write below.
h4hpath

In [135]:
# Save the per-sample class I allele strings.
readr::write_rds(
    hlais,
    paste0(h4hpath, "analysis/HLA/mitch_hlaI.rds")
)