# Dependencies

# Functions

In [34]:
# Build and save a sample-by-allele count matrix for one HLA locus.
#
# Arguments:
#   datapath  Output prefix; it is pasted directly in front of the file name,
#             so it must already end with a path separator.
#   hladf     Data frame with the columns Sample.ID, Locus, Allele.1 and
#             Allele.2 (one row per sample and locus).
#   locus     Locus to extract; compared against hladf$Locus.
#
# Side effect:
#   Writes the tab-separated matrix to <datapath>INT_HLA_<locus>.txt.
#
# Returns (invisibly) the numeric matrix: rows are named by Sample.ID, columns
# by allele, and each cell holds how many of the two typed alleles of that
# sample equal the column allele (0, 1, or 2 when homozygous).
hla.fx <- function(datapath, hladf, locus) {
    # Subset the data frame that was passed in (not a global object).
    # %in% keeps rows with a missing Locus out of the subset.
    df <- hladf[hladf$Locus %in% locus, , drop = FALSE]

    allele1 <- as.character(df$Allele.1)
    allele2 <- as.character(df$Allele.2)

    # All distinct alleles typed at this locus; untyped (NA) calls get no column
    allalleles <- unique(c(allele1, allele2))
    allalleles <- allalleles[!is.na(allalleles)]

    # Matrix with sample IDs as row names and alleles as column names
    mymat <- matrix(0,
        nrow = nrow(df), ncol = length(allalleles),
        dimnames = list(df$Sample.ID, allalleles)
    )

    # Add one count per typed allele. The two allele columns are handled
    # separately so a homozygous sample adds to the same cell twice.
    rows <- seq_len(nrow(df))
    for (alleles in list(allele1, allele2)) {
        cols <- match(alleles, allalleles)
        typed <- !is.na(cols)
        idx <- cbind(rows[typed], cols[typed])
        mymat[idx] <- mymat[idx] + 1
    }

    write.table(mymat,
        file = paste0(datapath, "INT_HLA_", locus, ".txt"),
        quote = FALSE, sep = "\t"
    )
    invisible(mymat)
}

In [35]:
source("/Users/anabbi/git/ped_CapTCRseq/R/ggplot2_theme.R")
source("/Users/anabbi/git/ped_CapTCRseq/R/color_schemes.R")
source("/Users/anabbi/git/ped_CapTCRseq/R/Misc_functions.R")

# Paths

In [36]:
# Root locations used by the rest of the notebook. These are absolute,
# machine-specific paths. Every one ends with "/" so that file names can be
# appended with paste0().
datapath <- "/Users/anabbi/OneDrive - UHN/Documents/INTERCEPT/Data/"
plotpath <- "/Users/anabbi/OneDrive - UHN/Documents/INTERCEPT/Plots/"
manifestpath <- "/Users/anabbi/OneDrive - UHN/Documents/INTERCEPT/Manifests/"
gitpath <- "/Users/anabbi/git/ped_CapTCRseq/"
h4hpath <- "/Users/anabbi/Desktop/H4H/INTERCEPT/"

# Main

# INT (INTERCEPT)

In [37]:
# Load the first sheet of the HLA typing workbook.
# NOTE(review): datapath already ends with "/" and the suffix below starts
# with "/", so the resulting path contains "//".
hla <- xlsx::read.xlsx(file = paste0(datapath, "/HLA/UHN-20230127-B257-HLA.xlsx"), sheetIndex = 1)

In [38]:
# Preview the first 10 rows of the typing table
head(hla,10)

Unnamed: 0_level_0,Sample.ID,Locus,Allele.1,Allele.2,Comments,Diploid.Ambiguities,Allele.1.Ambiguities,Allele.2.Ambiguities
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,315-05,A,A*02:01:01,A*02:05:01,,,,
2,315-05,B,B*35:01:01,B*49:01:01,,,,
3,315-05,C,C*04:01:01,C*07:01:01,,,,
4,315-05,DPA1,DPA1*01:03:01,DPA1*02:02:02,,,,
5,315-05,DPB1,DPB1*01:01:01,DPB1*04:02:01,,,,
6,315-05,DQA1,DQA1*01:02:01,DQA1*05:05:01,,,,
7,315-05,DQB1,DQB1*03:19:01,DQB1*06:04:01,,,,
8,315-05,DRB1,DRB1*13:02:01,DRB1*13:04,,,,
9,315-05,DRB345,DRB3*02:02:01,DRB3*03:01:01,,,,
10,319-05,A,A*01:01:01,A*03:01:01,,,,


In [5]:
# List the distinct loci present in the typing table
names(table(hla$Locus))

In [6]:
# Export one sample-by-allele table per locus found in the typing table
hla_outdir <- paste0(datapath, "HLA/")
for (locus_name in names(table(hla$Locus))) {
    hla.fx(hla_outdir, hla, locus_name)
}

# Mitchell data

In [3]:
# Read the tab-separated sample overview export; the first line holds the
# column names.
mitchell <- read.table(
    file = paste0(h4hpath, "Adaptive/Mitchell_Michels2022/SampleOverview_11-23-2022_7-25-05_PM.tsv"),
    sep = "\t", header = TRUE # TRUE, not T: T is an ordinary variable that can be reassigned
)


In [24]:
# Split the overview into samples whose name contains "Denver" and all others
is_denver <- grepl("Denver", mitchell$sample_name)
mitchell_Denver <- mitchell[is_denver, ]
mitchell_1 <- mitchell[!is_denver, ]

In [25]:
# Renumber the rows 1..n after subsetting. seq_len() is safe when the subset
# is empty, whereas 1:nrow() would yield c(1, 0).
rownames(mitchell_Denver) <- seq_len(nrow(mitchell_Denver))

In [29]:
# Patterns handed to sampletags_columns() below; each one also appears as a
# column name in its output. Presumably they are treated as regular
# expressions matched against the sample tags (note the "|" alternations and
# the trailing spaces) -- confirm against the helper's definition.
mitchellvars <- c("Years at visit", "Years at diagnosis", "Male|Female",
                "Subject |Control ", "Timepoint", "Subject")

In [30]:
# Turn the sample tags of the non-Denver samples into one column per pattern.
# sampletags_columns() is not defined in this notebook; presumably it comes
# from one of the sourced helper scripts -- confirm.
mitchell_1_tags <- sampletags_columns(mitchell_1, mitchellvars)
# Denver subset, currently disabled:
# mitchell_Denver_tags <- sampletags_columns(mitchell_Denver, c("Years at diagnosis",
# "Male|Female", "Type 1 Diabetes"))

In [31]:
# Preview the parsed tag columns
head(mitchell_1_tags)

Unnamed: 0_level_0,index,Years at visit,Years at diagnosis,Male|Female,Subject |Control,Timepoint,Subject
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,3 Years at visit,,Female,Control 006,Timepoint 3,
2,2,3 Years at visit,12.958904109589 Years at diagnosis,Female,Subject 001,Timepoint 1,Subject 001
3,3,10 Years at visit,14.4328767123288 Years at diagnosis,Male,Subject 019,Timepoint 3,Subject 019
4,4,0 Years at visit,18.0438356164384 Years at diagnosis,Female,Subject 021,Timepoint 1,Subject 021
5,5,6 Years at visit,6.37808219178082 Years at diagnosis,Female,Subject 003,Timepoint 4,Subject 003
6,6,12 Years at visit,,Male,Control 012,Timepoint 4,


In [32]:
# Add harmonised annotation columns to the parsed tags.
mitchell_1_tags$study <- "Mitchell"
# Agegroup and Sex are straight copies of the parsed tag columns
mitchell_1_tags$Agegroup <- mitchell_1_tags$`Years at visit`
mitchell_1_tags$Sex <- mitchell_1_tags$`Male|Female`
# Look up each sample name by matching `index` against the row names of
# mitchell_1.
# NOTE(review): mitchell_1 keeps the row names of `mitchell` after subsetting
# (they are never renumbered) -- confirm that `index` refers to those row
# names and not to row positions.
mitchell_1_tags$sample_name <- mitchell_1$sample_name[ match(mitchell_1_tags$index, rownames(mitchell_1)) ]
# disease is "Ctrl" where the tag contains "Control" and "T1D" where it
# contains "Subject"
mitchell_1_tags$disease[grepl("Control",mitchell_1_tags$`Subject |Control`)] <- "Ctrl"
mitchell_1_tags$disease[grepl("Subject",mitchell_1_tags$`Subject |Control`)] <- "T1D"


In [40]:
# Split each sample's comma-separated tag string into a character vector
# (one list element per row of mitchell_1)
splitsampletags <- strsplit(mitchell_1$sample_tags, split = ",")

In [45]:
    # For every sample, keep only the tags that mention "HLA"
    # (surrounding whitespace is stripped first)
    mydf <- lapply(splitsampletags, function(tags) {
        tags <- trimws(unlist(tags))
        tags[grepl("HLA", tags)]
    })

In [48]:
# Label each list element with its sample name (same row order as mitchell_1)
names(mydf) <- mitchell_1$sample_name

In [61]:
# for each element in the list, create a data frame with each HLA allele as one row and sample_name as column
mydf1 <- lapply(mydf, function(x) {
    df <- data.frame(matrix(unlist(x), nrow = length(x), ncol = 1))
    colnames(df) <- "HLA"
    df
})
# add sample_name as column for each element in the list
mydf1 <- mapply(cbind, mydf1, sample_name = names(mydf1), SIMPLIFY = FALSE)


In [64]:
# Stack the per-sample data frames into one long table (HLA, sample_name)
mitchel_hla <- do.call(rbind, mydf1)

In [66]:
# Replace the row names produced by rbind() with 1..n. seq_len() is safe when
# the table is empty, whereas 1:nrow() would yield c(1, 0).
rownames(mitchel_hla) <- seq_len(nrow(mitchel_hla))

In [67]:
# Preview the long allele table
head(mitchel_hla)

Unnamed: 0_level_0,HLA,sample_name
Unnamed: 0_level_1,<chr>,<chr>
1,HLA-A*0101,310121_TCRB
2,HLA-A*0301,310121_TCRB
3,HLA-B*1801,310121_TCRB
4,HLA-C*0701,310121_TCRB
5,HLA-DPA1*0103,310121_TCRB
6,HLA-DPB1*0201,310121_TCRB


In [71]:
# Loci present in the xlsx typing table, for reference
names(table(hla$Locus) )

In [72]:
# Empty (all-NA) sample-by-allele matrix: one row per distinct sample name and
# one column per distinct HLA tag, both in order of first appearance
hla_samples <- unique(mitchel_hla$sample_name)
hla_alleles <- unique(mitchel_hla$HLA)
allhla_preqc <- matrix(
    NA,
    nrow = length(hla_samples),
    ncol = length(hla_alleles),
    dimnames = list(hla_samples, hla_alleles)
)

In [73]:
# Preview the matrix before filling (all cells NA)
head(allhla_preqc)

Unnamed: 0,HLA-A*0101,HLA-A*0301,HLA-B*1801,HLA-C*0701,HLA-DPA1*0103,HLA-DPB1*0201,HLA-DPB1*0401,HLA-DQA1*0301,HLA-DQA1*0501,HLA-DQB1*0201,...,HLA-B*4102,HLA-C*1703,HLA-DPB1*1601,HLA-A*0205,HLA-B*0706,HLA-C*1505,HLA-DPB1*12401,HLA-DRB1*0405,HLA-DQA1*0302,HLA-DRB1*0901
310121_TCRB,,,,,,,,,,,...,,,,,,,,,,
310102_TCRB,,,,,,,,,,,...,,,,,,,,,,
310156_TCRB,,,,,,,,,,,...,,,,,,,,,,
310204_TCRB,,,,,,,,,,,...,,,,,,,,,,
310245_TCRB,,,,,,,,,,,...,,,,,,,,,,
310186_TCRB,,,,,,,,,,,...,,,,,,,,,,


In [74]:
# Mark which alleles each sample carries. A cell becomes 1 when the allele is
# listed for that sample and stays NA otherwise. Alleles listed more than once
# for a sample are collapsed, so this records presence, not copy number.
allele_cols <- colnames(allhla_preqc) # loop-invariant, looked up once
for (i in seq_len(nrow(allhla_preqc))) { # seq_len(): no iterations on an empty matrix
    sample_alleles <- mitchel_hla[mitchel_hla$sample_name == rownames(allhla_preqc)[i], "HLA"]
    carried <- unique(sample_alleles[!is.na(sample_alleles)])
    allhla_preqc[i, match(carried, allele_cols)] <- 1L
}


In [75]:
# Preview the matrix after filling
head(allhla_preqc)

Unnamed: 0,HLA-A*0101,HLA-A*0301,HLA-B*1801,HLA-C*0701,HLA-DPA1*0103,HLA-DPB1*0201,HLA-DPB1*0401,HLA-DQA1*0301,HLA-DQA1*0501,HLA-DQB1*0201,...,HLA-B*4102,HLA-C*1703,HLA-DPB1*1601,HLA-A*0205,HLA-B*0706,HLA-C*1505,HLA-DPB1*12401,HLA-DRB1*0405,HLA-DQA1*0302,HLA-DRB1*0901
310121_TCRB,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
310102_TCRB,,1.0,,,1,1.0,1.0,,,,...,,,,,,,,,,
310156_TCRB,,,1.0,,1,,,,,,...,,,,,,,,,,
310204_TCRB,,,,,1,1.0,1.0,1.0,,,...,,,,,,,,,,
310245_TCRB,,,,,1,,1.0,1.0,,,...,,,,,,,,,,
310186_TCRB,,,1.0,,1,,,1.0,1.0,1.0,...,,,,,,,,,,


In [76]:
# Spot check: list the alleles recorded for one sample, to compare with its
# row in allhla_preqc
mitchel_hla[ mitchel_hla$sample_name == "310186_TCRB", "HLA"]

In [77]:
# Save the pre-QC presence matrix as an RDS file under <datapath>HLA/
readr::write_rds(allhla_preqc, paste0(datapath, "HLA/mitchel_allhla_preqc.rds"))