In [1]:
setwd('/lustre/scratch117/cellgen/team297/kt16/COVID_imperial_renal/')
source('scripts/glmm_functions.R')

In [2]:
# Basic parameters to use.
min_cells = 10
ncpus = 10

### All memory

In [3]:
sce <- readRDS('h5ad/df.fil3_gex_bcells_vdj_sce_B_ASC_all.RDS')
counts(sce) <- assays(sce)[['X']] # because i'm saving from a h5ad object with anndata2ri
sce$case_control <- factor(sce$case_control, levels = c('NEGATIVE', 'POSITIVE', 'RECOVERY'))
sce$WHO_temp_severity <- factor(sce$WHO_temp_severity, levels = c('NA', 'mild', 'moderate', 'severe', 'critical'))
sce$WHO_temp_severity_group <- factor(sce$WHO_temp_severity, levels = c('NA', 'mild', 'moderate', 'severe', 'critical'), labels = c('NA', 'mild_moderate', 'mild_moderate', 'severe_critical', 'severe_critical'))
sce$WHO_severity_group <- factor(sce$WHO_severity, levels = c('NA', 'mild', 'moderate', 'severe', 'critical'), labels = c('NA', 'mild_moderate', 'mild_moderate', 'severe_critical', 'severe_critical')) # interpreted as peak severity
sce$grouped_temp_severity <- ifelse(sce$WHO_temp_severity %in% c("mild", "moderate"), "mild_moderate", "severe_critical")
sce$grouped_severity <- ifelse(sce$WHO_severity %in% c("mild", "moderate"), "mild_moderate", "severe_critical")
sce$age_scaled <- scale(sce$calc_age) # scale age

In [4]:
# Remove samples with less than 10 cells
nCells <- table(sce$sample_id)
rmSamples <- names(nCells[nCells < min_cells])
sce1 <- sce[, !sce$sample_id %in% rmSamples]
# Summarize Counts
smrzd <- aggregateAcrossCells(sce1, id = as.character(colData(sce1)[, c("sample_id")]))
y <- DGEList(counts = counts(smrzd), samples = colData(smrzd))

y1 <- setupDGElist(y, "WHO_temp_severity_group", remove = "NA") # use grouped_temp_severity later
# sanity check
table(y1$samples$grouped_temp_severity, y1$samples$individual_id)
table(y1$samples$grouped_temp_severity, y1$samples$centre)
table(y1$samples$grouped_temp_severity, y1$samples$sex)
table(y1$samples$grouped_temp_severity, y1$samples$ethnicity)

res1 <- testDGElist(y1,
            formula = as.formula("~ grouped_temp_severity + sex + ethnicity + age_scaled + centre + (1|individual_id)"),
            individual_id = "individual_id",
            ncores = ncpus)
# print(colnames(res1@stats))
results1 <- degTable(res1, contrast = 'grouped_temp_severity', group = 'severe_critical')

                 
                  C20 C21 C23 C33 C34 C36 C40 C42 C58 C60 C63 C65 C93 C101 C123
  mild_moderate     3   1   3   0   1   2   2   2   2   2   1   1   2    1    1
  severe_critical   0   2   0   2   0   0   1   0   0   0   0   2   0    1    2
                 
                  C124 C126 C127 C128 C132 C137 C138 C141 C147 C170 C187 C190
  mild_moderate      3    3    3    3    3    2    1    1    0    1    1    3
  severe_critical    0    0    0    0    0    0    0    1    3    0    0    0

                 
                  Cambridge NCL
  mild_moderate          12  36
  severe_critical         7   7

                 
                   F  M
  mild_moderate   15 33
  severe_critical  0 14

                 
                  asian black other white
  mild_moderate      26     4     4    14
  severe_critical     8     2     0     4


n = 62 samples, 27 individuals
Time difference of 3.792599 mins

q_grouped_temp_severity
-----------------------
Not Significant     Significant 
           3805            4651 

q_sex
-----
Not Significant     Significant 
           6939            1517 

q_ethnicity
-----------
Not Significant     Significant 
           7377            1079 

q_age_scaled
------------
Not Significant     Significant 
           7584             872 

q_centre
--------
Not Significant     Significant 
             44            8412 


### Comparison 2: deg from the interaction between WHO (peak) severity groups and time from infection (time from first symptoms)

In [5]:
# remove samples >21 days
sce2 <- sce[,sce$time_from_infection <= 21] # prevent issues with spline
nCells <- table(sce2$sample_id)
rmSamples <- names(nCells[nCells < min_cells])
sce2 <- sce2[, !sce2$sample_id %in% rmSamples]
# Summarize Counts
smrzd <- aggregateAcrossCells(sce2, id = as.character(colData(sce2)[, c("sample_id")]))
y <- DGEList(counts = counts(smrzd), samples = colData(smrzd))
y2 <- setupDGElist(y, "WHO_severity_group", remove = "NA") # use grouped_severity later
# sanity check
table(y2$samples$grouped_severity, y2$samples$individual_id)
table(y2$samples$grouped_severity, y2$samples$centre)
table(y2$samples$grouped_severity, y2$samples$sex)
table(y2$samples$grouped_severity, y2$samples$ethnicity)

res2 <- testDGElist(y2,
            formula = as.formula("~ splines::bs(time_from_infection, degree = 2) * grouped_severity + sex + ethnicity + age_scaled + centre + (1|individual_id)"),
            individual_id = "individual_id",
            modified = TRUE,
            ncores = ncpus)
# print(colnames(res2$stats))
results2 <- degTable(res2, contrast = 'splines::bs(time_from_infection, degree = 2):grouped_severity', 'severe_critical', modified = TRUE)

                 
                  C20 C21 C23 C33 C34 C36 C40 C42 C58 C60 C63 C65 C93 C101 C123
  mild_moderate     0   0   3   0   1   2   0   2   2   0   1   0   2    0    0
  severe_critical   3   3   0   2   0   0   3   0   0   2   0   3   0    2    3
                 
                  C124 C126 C127 C128 C132 C137 C138 C141 C147 C170 C187 C190
  mild_moderate      3    3    3    3    3    2    1    0    0    1    1    3
  severe_critical    0    0    0    0    0    0    0    2    3    0    0    0

                 
                  Cambridge NCL
  mild_moderate          10  26
  severe_critical         9  17

                 
                   F  M
  mild_moderate   15 21
  severe_critical  0 26

                 
                  asian black other white
  mild_moderate      21     1     4    10
  severe_critical    13     5     0     8


n = 62 samples, 27 individuals
Time difference of 5.233879 mins

q_splines::bs(time_from_infection, degree = 2)
----------------------------------------------
Not Significant     Significant 
           6115            1577 

q_grouped_severity
------------------
Not Significant     Significant 
           6735             957 

q_sex
-----
Not Significant     Significant 
           6747             945 

q_ethnicity
-----------
Not Significant     Significant 
           6774             918 

q_age_scaled
------------
Not Significant     Significant 
           6885             807 

q_centre
--------
Not Significant     Significant 
             25            7667 

q_splines::bs(time_from_infection, degree = 2):grouped_severity
---------------------------------------------------------------
Not Significant     Significant 
           1075            6617 


### Comparison 3: just wave 1, deg from positive vs negative

In [6]:
sce3 <- sce[, sce$centre == 'NCL']
# Remove samples with less than MIN
nCells <- table(sce3$sample_id)
rmSamples <- names(nCells[nCells < min_cells])
sce3 <- sce3[,!sce3$sample_id %in% rmSamples]
# Summarize Counts
smrzd <- aggregateAcrossCells(sce3, id=as.character(colData(sce3)[,c("sample_id")]))
y <- DGEList(counts=counts(smrzd), samples=colData(smrzd))
y3 <- setupDGElist(y, 'case_control')
# sanity check
table(y3$samples$case_control, y3$samples$individual_id)
table(y3$samples$case_control, y3$samples$sex)
table(y3$samples$case_control, y3$samples$ethnicity)

res3 <- testDGElist(y3, 
            formula = as.formula("~ case_control + sex + ethnicity + age_scaled + (1|individual_id)"), 
            individual_id = 'individual_id',
            ncores = ncpus
           )
# print(colnames(res3@stats))
results3 <- degTable(res3, contrast = 'case_control', group = 'POSITIVE')

          
           C20 C21 C23 C34 C36 C40 C42 C60 C63 C65 C69 C85 C93 C103 C104 C106
  NEGATIVE   0   0   0   0   0   0   0   0   0   0   1   1   0    1    1    1
  POSITIVE   3   3   3   1   2   3   2   2   1   3   0   0   2    0    0    0
          
           C123 C124 C126 C127 C128 C132 C142 C152 C161 C164 C167 C193 C195
  NEGATIVE    0    0    0    0    0    0    1    1    1    1    1    1    1
  POSITIVE    3    3    3    3    3    3    0    0    0    0    0    0    0
          
           C197 C214 C238
  NEGATIVE    1    1    1
  POSITIVE    0    0    0

          
            F  M
  NEGATIVE  5 10
  POSITIVE 10 33

          
           asian black other white
  NEGATIVE     5     2     4     4
  POSITIVE    23     4     3    13


n = 58 samples, 32 individuals
Time difference of 3.150984 mins

q_case_control
--------------
Not Significant     Significant 
           1549            7008 

q_sex
-----
Not Significant     Significant 
           8434             123 

q_ethnicity
-----------
Not Significant     Significant 
           8407             150 

q_age_scaled
------------
Not Significant     Significant 
           8425             132 


### Comparison 4: just patients that were negative in wave 1 but positive in wave 2, deg from recovery vs negative

In [7]:
sce4 <- sce[, sce$individual_id %in% c('C101', 'C108', 'C137', 'C138', 'C140', 
                                       'C141', 'C145', 'C146', 'C147', 'C168',
                                        'C169', 'C170', 'C187', 'C190', 'C33')]
# Remove samples with less than MIN
nCells <- table(sce4$sample_id)
rmSamples <- names(nCells[nCells < min_cells])
sce4 <- sce4[,!sce4$sample_id %in% rmSamples]
# remove non-complete data (all have positive), 1 = negative, 3 = recovery
df <- table(sce4$individual_id, sce4$case_control)
keep_ids <- row.names(df)[which(df[,1] != 0 & df[,3] != 0)]
sce4 <- sce4[, sce4$individual_id %in% keep_ids]
# Summarize Counts
smrzd <- aggregateAcrossCells(sce4, id=as.character(colData(sce4)[,c("sample_id")]))
y <- DGEList(counts=counts(smrzd), samples=colData(smrzd))
y4 <- setupDGElist(y, 'case_control', remove = 'POSITIVE')
# sanity check
table(y4$samples$case_control, y4$samples$individual_id)
table(y4$samples$case_control, y4$samples$sex)
table(y4$samples$case_control, y4$samples$ethnicity)

res4 <- testDGElist(y4, 
            formula = as.formula("~ case_control + sex + ethnicity + age_scaled + (1|individual_id)"), 
            individual_id = 'individual_id',
            ncores = ncpus
           )
# print(colnames(res4@stats))
results4 <- degTable(res4, contrast = 'case_control', group = 'RECOVERY')

“no non-missing arguments to min; returning Inf”
“no non-missing arguments to min; returning Inf”
“no non-missing arguments to min; returning Inf”


< table of extent 0 x 0 >

< table of extent 0 x 0 >

< table of extent 0 x 0 >

ERROR: Error in estimateDisp.default(y = y$counts, design = design, group = group, : object 'prior.n' not found


In [11]:
sce4 <- sce[, sce$individual_id %in% c('C101', 'C108', 'C137', 'C138', 'C140', 
                                       'C141', 'C145', 'C146', 'C147', 'C168',
                                        'C169', 'C170', 'C187', 'C190', 'C33')]
sce4

class: SingleCellExperiment 
dim: 32913 1075 
metadata(10): WHO_temp_severity_colors case_control_colors ...
  initial_cluster_B_colors seurat_clusters_colors
assays(2): X counts
rownames(32913): MIR1302-2HG FAM138A ... AC213203.1 FAM231C
rowData names(6): vst.mean vst.variance ... vst.variable GEX
colnames(1075): UK-CIC10690315+UK-CIC10690507_AACCGCGCACTCTGTC-1
  UK-CIC10690315+UK-CIC10690507_CAAGATCGTGGTTTCA-1 ...
  UK-CIC10690384+UK-CIC10690576_TTTCCTCTCATGTCCC-1
  UK-CIC10690384+UK-CIC10690576_TTTGGTTGTTGTCTTT-1
colData names(72): orig.ident nCount_RNA ... grouped_severity
  age_scaled
reducedDimNames(5): X_harmony_rna X_pca_rna UMAP X_umapafterharmony_rna
  X_umapbeforeharmony_rna
altExpNames(0):

### Comparison 5: just patients that were negative in wave 1 but positive in wave 2, deg from positive vs negative

In [8]:
sce5 <- sce[, sce$individual_id %in% c('C101', 'C108', 'C137', 'C138', 'C140', 
                                       'C141', 'C145', 'C146', 'C147', 'C168',
                                        'C169', 'C170', 'C187', 'C190', 'C33')]
# Remove samples with less than MIN
nCells <- table(sce5$sample_id)
rmSamples <- names(nCells[nCells < min_cells])
sce5 <- sce5[,!sce5$sample_id %in% rmSamples]
# remove non-complete data (all have positive), 1 = negative, 2 = positive
df <- table(sce5$individual_id, sce5$case_control)
keep_ids <- row.names(df)[which(df[,1] != 0 & df[,2] != 0)]
sce5 <- sce5[, sce5$individual_id %in% keep_ids]
# Summarize Counts
smrzd <- aggregateAcrossCells(sce5, id=as.character(colData(sce5)[,c("sample_id")]))
y <- DGEList(counts=counts(smrzd), samples=colData(smrzd))
y5 <- setupDGElist(y, 'case_control', remove = 'RECOVERY')
# sanity check
table(y5$samples$case_control, y5$samples$individual_id)
table(y5$samples$case_control, y5$samples$sex)
table(y5$samples$case_control, y5$samples$ethnicity)

res5 <- testDGElist(y5, 
            formula = as.formula("~ case_control + sex + ethnicity + age_scaled + (1|individual_id)"), 
            individual_id = 'individual_id',
            ncores = ncpus
           )
# print(colnames(res5@stats))
results5 <- degTable(res5, contrast = 'case_control', group = 'POSITIVE')

“no non-missing arguments to min; returning Inf”
“no non-missing arguments to min; returning Inf”
“no non-missing arguments to min; returning Inf”


< table of extent 0 x 0 >

< table of extent 0 x 0 >

< table of extent 0 x 0 >

ERROR: Error in estimateDisp.default(y = y$counts, design = design, group = group, : object 'prior.n' not found


In [12]:
save(res1, res2, res3, results1, results2, results3, file = 'h5ad/df.fil3_gex_bcells_vdj_sce_ASC_all_deg.RData')