In [1]:
suppressMessages(library('DESeq2'))
library('BiocParallel')
register(MulticoreParam(4))

In [2]:
# Read the project configuration file
config <- yaml::read_yaml("../../config/config.yaml")
tmp_path <- config$tmp_path
data_path <- config$data_path

# Local work: the configured data_path is immediately replaced by a
# repository-relative folder, and results go to a fixed relative folder.
data_path <- "../../data/"
deseq2_results_path <- "../../results/DESeq2/hamster/"

In [3]:
# Load data: the count table (first CSV column becomes the row names) and the
# sample annotation table.
countData <- read.csv(file = paste0(data_path, "all_hamsters_countData.csv"), row.names = 1)
colData <- read.csv(file = paste0(data_path, "all_hamsters_colData.csv"))

# Later cells subset the columns of countData with logical masks computed from
# the rows of colData, so both tables must line up one-to-one by position.
# Fail early here instead of silently analysing misaligned samples.
stopifnot(ncol(countData) == nrow(colData))

In [4]:
head(colData)

Unnamed: 0_level_0,time,organism,replicate,celltype,dosage,ncells
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<int>
1,D0,Dwarfhamster,sample0,NK cells,no dose,15
2,D0,Dwarfhamster,sample0,Bcells,no dose,730
3,D0,Dwarfhamster,sample0,Treml4+Macrophages,no dose,182
4,D0,Dwarfhamster,sample0,Bronchial,no dose,211
5,D0,Dwarfhamster,sample0,InterstitialMacrophages,no dose,19
6,D0,Dwarfhamster,sample0,CD4+ T cells,no dose,223


In [5]:
# Merge the endothelial subtypes into a single 'Endothelials' label
colData$celltype[
    colData$celltype %in% c("Capillary", "Vein", "Bronchial", "Artery")
] <- "Endothelials"

# Merge the macrophage subtypes into a single 'Macrophages' label
colData$celltype[
    colData$celltype %in% c("AlveolarMacrophages", "Treml4+Macrophages",
                            "InterstitialMacrophages", "MonocyticMacrophages")
] <- "Macrophages"

# Manual test

In [6]:
# Cell type used by the manual test cells that follow
celltype <- "Neutrophils"

In [7]:
# Select the samples for the manual test:
#   * only the chosen cell type,
#   * only times where we can compare organisms (there is no D5 and E14 in Dwarf),
#   * no low dose samples,
#   * no groups with less than 20 cells (we heuristically deem a pseudobulk of
#     less than that many cells inadequate).
# The low dose filter has to look at the dosage column: comparing the time
# column against 'low dose' can never match, so it did not exclude anything.
mask <- colData$celltype %in% c(celltype) & colData$time %in% c('D0', 'D2', 'D3') &
        colData$dosage != 'low dose' & colData$ncells >= 20
# Apply the same mask to the annotation rows and to the count columns
scolData <- colData[mask, ]
scountData <- countData[, mask]

In [8]:
# Full model: time and organism main effects plus their interaction.
# Reduced model: main effects only, so the likelihood-ratio test below
# compares the two designs on the interaction terms.
design <- ~ time + organism + time:organism
reduced <- ~ time + organism

# Build the DESeq2 data set from the selected counts and sample annotation
dds <- DESeqDataSetFromMatrix(
    countData = scountData,
    colData = scolData,
    design = design
)

# Drop genes with fewer than 10 counts summed over all selected samples
dds <- dds[rowSums(counts(dds)) >= 10, ]

# Fit with the single cell recommendations: likelihood-ratio test of the full
# against the reduced design, with minReplicatesForReplace set to Inf
dds <- DESeq(dds, reduced = reduced, test = "LRT", minReplicatesForReplace = Inf)

converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



In [9]:
# Pull the results table for the coefficient named
# 'timeD2.organismGoldhamster' and sort the genes by ascending p-value
res <- results(dds, name = "timeD2.organismGoldhamster")
resOrdered <- res[order(res$pvalue), ]

In [10]:
#write.csv(as.data.frame(resOrdered), 
#          file=paste0("../../results/DESeq2/hamster/", celltype, "_GoldvsDwarf_interaction_D2.csv"))

# run1

In [11]:
# run 1 DESeq2 per celltype per organism:
# Tests Day X w.r.t D0 where X is a time after infection.
# Tests per celltype and low/high dose separately.
#
# One `~ time` model is fitted per (celltype, organism/dose) combination and one
# CSV per post-infection time point is written below deseq2_results_path.
for (celltype in c('Neutrophils', 'Endothelials', 'Macrophages')){
    for (organism in c('Goldhamster', 'Dwarfhamster_hd', 'Dwarfhamster_ld')){
        # Define what to test
        if (organism=='Goldhamster') {
            times <- c('D2', 'D3', 'D5', 'E14')
            dosage_str <- ''
            # NOTE(review): 'None' presumably matches no dosage label, i.e. all
            # Goldhamster samples are kept - confirm against the dosage column.
            dosage_to_exclude <- 'None'
        } else {
            times <- c('D2', 'D3')
            if (grepl('ld', organism, fixed = TRUE)){
                dosage_str <- '_ld'
                dosage_to_exclude <- 'high dose'
            } else {
                dosage_str <- '_hd'
                dosage_to_exclude <- 'low dose'
            }
            # Strip the dose suffix so the name matches the organism column;
            # the dose is carried by dosage_str / dosage_to_exclude from here on.
            organism <- 'Dwarfhamster'
        }
        # Mask samples accordingly: current cell type and organism, not the
        # excluded dosage, and at least 20 cells per group
        mask <- colData$celltype %in% c(celltype) & colData$organism %in% c(organism) &
                colData$dosage != dosage_to_exclude & colData$ncells >= 20

        scolData <- colData[mask, ]
        scountData <- countData[, mask]

        # Setup data and supply design matrix
        dds <- DESeqDataSetFromMatrix(countData = scountData, colData = scolData,
                                      design = ~ time)
        # Sum the columns that share a replicate label (e.g. the merged subtypes)
        dds <- collapseReplicates(dds, dds$replicate)
        # Filter genes below 10 counts in total
        dds <- dds[rowSums(counts(dds)) >= 10,]
        # Setup deseq with single cell recommendations, add reduced design matrix
        dds <- DESeq(dds, test="LRT", minReplicatesForReplace=Inf, reduced= ~ 1)

        for (time in times){
            # After an LRT fit, results() keeps the LRT p-value for any contrast.
            # With the reduced design ~ 1 that p-value only says "expression
            # changes at some time point" and would be identical in every
            # per-time file. Ask for Wald statistics so that each file holds the
            # p-values of its own Day X vs D0 comparison.
            res <- results(dds, contrast=c("time", time, "D0"), test="Wald")
            resOrdered <- res[order(res$pvalue),]
            file <- paste0(deseq2_results_path, celltype, "_", organism, dosage_str, "_", time, "_vs_D0.csv")
            write.csv(as.data.frame(resOrdered), file=file)
        }
    }
}


converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion 

In [12]:
# run 1 DESeq2 per organism but for all celltypes merged:
# Tests Day X w.r.t D0 where X is a time after infection.
# Tests low/high dose separately; cell types are NOT separated here, all
# columns sharing a replicate label are summed into one sample.
for (organism in c('Goldhamster', 'Dwarfhamster_hd', 'Dwarfhamster_ld')){
    # Define what to test
    if (organism=='Goldhamster') {
        times <- c('D2', 'D3', 'D5', 'E14')
        dosage_str <- ''
        # NOTE(review): 'None' presumably matches no dosage label, i.e. all
        # Goldhamster samples are kept - confirm against the dosage column.
        dosage_to_exclude <- 'None'
    } else {
        times <- c('D2', 'D3')
        if (grepl('ld', organism, fixed = TRUE)){
            dosage_str <- '_ld'
            dosage_to_exclude <- 'high dose'
        } else {
            dosage_str <- '_hd'
            dosage_to_exclude <- 'low dose'
        }
        # Strip the dose suffix so the name matches the organism column
        organism <- 'Dwarfhamster'
    }
    # Mask samples accordingly (no cell-type and no ncells filter in this run)
    mask <- colData$organism %in% c(organism) & colData$dosage != dosage_to_exclude

    scolData <- colData[mask, ]
    scountData <- countData[, mask]

    # Setup data and supply design matrix
    dds <- DESeqDataSetFromMatrix(countData = scountData, colData = scolData,
                                  design = ~ time)
    dds <- collapseReplicates(dds, dds$replicate)
    # Filter genes below 10 counts in total
    dds <- dds[rowSums(counts(dds)) >= 10,]
    # Setup deseq with single cell recommendations, add reduced design matrix
    dds <- DESeq(dds, test="LRT", minReplicatesForReplace=Inf, reduced= ~ 1)

    for (time in times){
        # After an LRT fit, results() keeps the LRT p-value for any contrast.
        # With the reduced design ~ 1 that p-value would be identical in every
        # per-time file, so request Wald statistics to test each Day X vs D0
        # comparison on its own.
        res <- results(dds, contrast=c("time", time, "D0"), test="Wald")
        resOrdered <- res[order(res$pvalue),]
        file <- paste0(deseq2_results_path, "Allcelltypes_",organism, dosage_str, "_", time, "_vs_D0.csv")
        write.csv(as.data.frame(resOrdered), file=file)
    }
}


converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



# run2 (failing)

In [13]:
# run 2 DESeq2 per celltype: full design time + organism against reduced design time.
# Essentially tests which gene time courses differ between the organisms in a
# constant way. Only Gold vs Dwarf high dose and only comparable time points
# are looked at.
for (celltype in c("Neutrophils", "Endothelials", "Macrophages")) {
    # Sample selection: current cell type, time points D0/D2/D3, no low dose
    # samples, at least 20 cells per group
    mask <- colData$celltype %in% c(celltype)
    mask <- mask & colData$time %in% c("D0", "D2", "D3")
    mask <- mask & colData$dosage != "low dose"
    mask <- mask & colData$ncells >= 20
    scolData <- colData[mask, ]
    scountData <- countData[, mask]

    # Build the data set with the additive design
    dds <- DESeqDataSetFromMatrix(
        countData = scountData,
        colData = scolData,
        design = ~ time + organism
    )
    # NOTE(review): columns are grouped by replicate label only; this assumes
    # the labels are unique per organism and time point - confirm.
    dds <- collapseReplicates(dds, groupby = dds$replicate)
    # Keep genes with at least 10 counts in total
    dds <- dds[rowSums(counts(dds)) >= 10, ]
    # Likelihood-ratio test of the additive design against time alone
    dds <- DESeq(dds, reduced = ~ time, test = "LRT", minReplicatesForReplace = Inf)

    # Goldhamster vs Dwarfhamster table, sorted by ascending p-value
    res <- results(dds, contrast = c("organism", "Goldhamster", "Dwarfhamster"))
    resOrdered <- res[order(res$pvalue), ]
    file <- paste0(deseq2_results_path, celltype, "_GoldvsDwarf_hd.csv")
    write.csv(as.data.frame(resOrdered), file = file)
}


converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



# run3

In [14]:
# run 3 DESeq2 per celltype: full design time * organism against reduced design
# time + organism, i.e. which gene time courses differ between the organisms
# beyond additive effects.
for (celltype in c("Neutrophils", "Endothelials", "Macrophages")) {
    # Sample selection: current cell type, time points D0/D2/D3, no low dose
    # samples, at least 20 cells per group
    mask <- colData$celltype %in% c(celltype)
    mask <- mask & colData$time %in% c("D0", "D2", "D3")
    mask <- mask & colData$dosage != "low dose"
    mask <- mask & colData$ncells >= 20
    scolData <- colData[mask, ]
    scountData <- countData[, mask]

    # Build the data set with the interaction design
    dds <- DESeqDataSetFromMatrix(
        countData = scountData,
        colData = scolData,
        design = ~ time * organism
    )
    dds <- collapseReplicates(dds, groupby = dds$replicate)
    # Keep genes with at least 10 counts in total
    dds <- dds[rowSums(counts(dds)) >= 10, ]
    # Likelihood-ratio test of the interaction design against the additive one
    dds <- DESeq(dds, reduced = ~ time + organism, test = "LRT",
                 minReplicatesForReplace = Inf)

    # NOTE(review): the contrast below selects the organism main effect for the
    # reported fold change, while the p-values of an LRT fit come from the
    # full-vs-reduced comparison (the interaction terms) - confirm that this
    # pairing is what the "_interaction" file is meant to contain.
    res <- results(dds, contrast = c("organism", "Goldhamster", "Dwarfhamster"))
    resOrdered <- res[order(res$pvalue), ]
    file <- paste0(deseq2_results_path, celltype, "_GoldvsDwarf_hd_interaction.csv")
    write.csv(as.data.frame(resOrdered), file = file)
}


converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



# run4 (deprecated)

In [15]:
# run 4 DESeq2 per celltype: infection * organism against infection + organism,
# i.e. which gene regulation upon infection differs between the organisms
# beyond additive effects. Infection is represented by time restricted to D0/D2.
for (celltype in c("Neutrophils", "Endothelials", "Macrophages")) {
    # Sample selection: current cell type at D0 or D2.
    # NOTE(review): unlike the other runs there is no dosage and no ncells
    # filter here - confirm whether that is intended.
    mask <- colData$celltype %in% c(celltype)
    mask <- mask & colData$time %in% c("D0", "D2")
    scolData <- colData[mask, ]
    scountData <- countData[, mask]

    # Build the data set with the interaction design
    dds <- DESeqDataSetFromMatrix(
        countData = scountData,
        colData = scolData,
        design = ~ time * organism
    )
    dds <- collapseReplicates(dds, groupby = dds$replicate)
    # Keep genes with at least 10 counts in total
    dds <- dds[rowSums(counts(dds)) >= 10, ]
    # Likelihood-ratio test of the interaction design against the additive one
    dds <- DESeq(dds, reduced = ~ time + organism, test = "LRT",
                 minReplicatesForReplace = Inf)

    # Goldhamster vs Dwarfhamster table, sorted by ascending p-value; written
    # to the current working directory rather than deseq2_results_path
    res <- results(dds, contrast = c("organism", "Goldhamster", "Dwarfhamster"))
    resOrdered <- res[order(res$pvalue), ]
    file <- paste0("./deseq2_results_", celltype, "_GoldvsDwarf_specific_interaction_D2.csv")
    write.csv(as.data.frame(resOrdered), file = file)
}


converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

converting counts to integer mode

"some variables in design formula are characters, converting to factors"
estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

