In [1]:
setwd('/lustre/scratch117/cellgen/team297/kt16/COVID_imperial_renal/')
source('scripts/glmm_functions.R')

In [2]:
# Basic parameters to use.
min_cells = 10
ncpus = 10

### All memory

In [3]:
sce <- readRDS('h5ad/df.fil3_gex_bcells_vdj_sce_B_nonswitched_mem.RDS')
counts(sce) <- assays(sce)[['X']] # because i'm saving from a h5ad object with anndata2ri
sce$case_control <- factor(sce$case_control, levels = c('NEGATIVE', 'POSITIVE', 'RECOVERY'))
sce$WHO_temp_severity <- factor(sce$WHO_temp_severity, levels = c('NA', 'mild', 'moderate', 'severe', 'critical'))
sce$WHO_temp_severity_group <- factor(sce$WHO_temp_severity, levels = c('NA', 'mild', 'moderate', 'severe', 'critical'), labels = c('NA', 'mild_moderate', 'mild_moderate', 'severe_critical', 'severe_critical'))
sce$WHO_severity_group <- factor(sce$WHO_severity, levels = c('NA', 'mild', 'moderate', 'severe', 'critical'), labels = c('NA', 'mild_moderate', 'mild_moderate', 'severe_critical', 'severe_critical')) # interpreted as peak severity
sce$grouped_temp_severity <- ifelse(sce$WHO_temp_severity %in% c("mild", "moderate"), "mild_moderate", "severe_critical")
sce$grouped_severity <- ifelse(sce$WHO_severity %in% c("mild", "moderate"), "mild_moderate", "severe_critical")
sce$age_scaled <- scale(sce$calc_age) # scale age

In [4]:
# Remove samples with less than 10 cells
nCells <- table(sce$sample_id)
rmSamples <- names(nCells[nCells < min_cells])
sce1 <- sce[, !sce$sample_id %in% rmSamples]
# Summarize Counts
smrzd <- aggregateAcrossCells(sce1, id = as.character(colData(sce1)[, c("sample_id")]))
y <- DGEList(counts = counts(smrzd), samples = colData(smrzd))

y1 <- setupDGElist(y, "WHO_temp_severity_group", remove = "NA") # use grouped_temp_severity later
# sanity check
table(y1$samples$grouped_temp_severity, y1$samples$individual_id)
table(y1$samples$grouped_temp_severity, y1$samples$centre)
table(y1$samples$grouped_temp_severity, y1$samples$sex)
table(y1$samples$grouped_temp_severity, y1$samples$ethnicity)

res1 <- testDGElist(y1,
            formula = as.formula("~ grouped_temp_severity + sex + ethnicity + age_scaled + centre + (1|individual_id)"),
            individual_id = "individual_id",
            ncores = ncpus)
# print(colnames(res1@stats))
results1 <- degTable(res1, contrast = 'grouped_temp_severity', group = 'severe_critical')

                 
                  C20 C21 C23 C31 C33 C36 C42 C63 C82 C93 C101 C113 C123 C124
  mild_moderate     2   1   1   3   0   3   3   1   1   3    2    1    0    3
  severe_critical   0   2   0   0   1   0   0   0   0   0    1    1    2    0
                 
                  C126 C127 C128 C132 C137 C138 C140 C141 C146 C147 C169 C170
  mild_moderate      3    1    1    3    1    2    5    2    1    0    1    1
  severe_critical    0    0    0    0    0    0    0    1    0    3    0    0
                 
                  C187 C190
  mild_moderate      2    1
  severe_critical    0    0

                 
                  Cambridge NCL
  mild_moderate          18  30
  severe_critical         6   5

                 
                   F  M
  mild_moderate   19 29
  severe_critical  1 10

                 
                  asian black other white
  mild_moderate      29     6     4     9
  severe_critical     7     1     0     3


n = 59 samples, 28 individuals
Time difference of 1.689719 mins

q_grouped_temp_severity
-----------------------
Not Significant     Significant 
           3275            1183 

q_sex
-----
Not Significant     Significant 
           4274             184 

q_ethnicity
-----------
Not Significant     Significant 
           4268             190 

q_age_scaled
------------
Not Significant     Significant 
           4298             160 

q_centre
--------
Not Significant     Significant 
            913            3545 


### Comparison 2: deg from the interaction between WHO (peak) severity groups and time from infection (time from first symptoms)

In [5]:
# remove samples >21 days
sce2 <- sce[,sce$time_from_infection <= 21] # prevent issues with spline
nCells <- table(sce2$sample_id)
rmSamples <- names(nCells[nCells < min_cells])
sce2 <- sce2[, !sce2$sample_id %in% rmSamples]
# Summarize Counts
smrzd <- aggregateAcrossCells(sce2, id = as.character(colData(sce2)[, c("sample_id")]))
y <- DGEList(counts = counts(smrzd), samples = colData(smrzd))
y2 <- setupDGElist(y, "WHO_severity_group", remove = "NA") # use grouped_severity later
# sanity check
table(y2$samples$grouped_severity, y2$samples$individual_id)
table(y2$samples$grouped_severity, y2$samples$centre)
table(y2$samples$grouped_severity, y2$samples$sex)
table(y2$samples$grouped_severity, y2$samples$ethnicity)

res2 <- testDGElist(y2,
            formula = as.formula("~ splines::bs(time_from_infection, degree = 2) * grouped_severity + sex + ethnicity + age_scaled + centre + (1|individual_id)"),
            individual_id = "individual_id",
            modified = TRUE,
            ncores = ncpus)
# print(colnames(res2$stats))
results2 <- degTable(res2, contrast = 'splines::bs(time_from_infection, degree = 2):grouped_severity', 'severe_critical', modified = TRUE)

                 
                  C20 C21 C23 C31 C33 C36 C42 C63 C82 C93 C101 C113 C123 C124
  mild_moderate     0   0   1   3   0   3   3   1   1   3    0    0    0    3
  severe_critical   2   3   0   0   1   0   0   0   0   0    3    2    2    0
                 
                  C126 C127 C128 C132 C137 C138 C140 C141 C146 C147 C169 C170
  mild_moderate      3    1    1    3    1    2    5    0    0    0    1    1
  severe_critical    0    0    0    0    0    0    0    3    1    3    0    0
                 
                  C187 C190
  mild_moderate      2    1
  severe_critical    0    0

                 
                  Cambridge NCL
  mild_moderate          13  26
  severe_critical        11   9

                 
                   F  M
  mild_moderate   18 21
  severe_critical  2 18

                 
                  asian black other white
  mild_moderate      24     4     4     7
  severe_critical    12     3     0     5


n = 59 samples, 28 individuals
Time difference of 3.083607 mins

q_splines::bs(time_from_infection, degree = 2)
----------------------------------------------
Not Significant     Significant 
            992            2702 

q_grouped_severity
------------------
Not Significant     Significant 
           3571             123 

q_sex
-----
Not Significant     Significant 
           3587             107 

q_ethnicity
-----------
Not Significant     Significant 
           2522            1172 

q_age_scaled
------------
Not Significant     Significant 
           3578             116 

q_centre
--------
Not Significant     Significant 
            491            3203 

q_splines::bs(time_from_infection, degree = 2):grouped_severity
---------------------------------------------------------------
Not Significant     Significant 
            201            3493 


### Comparison 3: just wave 1, deg from positive vs negative

In [6]:
sce3 <- sce[, sce$centre == 'NCL']
# Remove samples with less than MIN
nCells <- table(sce3$sample_id)
rmSamples <- names(nCells[nCells < min_cells])
sce3 <- sce3[,!sce3$sample_id %in% rmSamples]
# Summarize Counts
smrzd <- aggregateAcrossCells(sce3, id=as.character(colData(sce3)[,c("sample_id")]))
y <- DGEList(counts=counts(smrzd), samples=colData(smrzd))
y3 <- setupDGElist(y, 'case_control')
# sanity check
table(y3$samples$case_control, y3$samples$individual_id)
table(y3$samples$case_control, y3$samples$sex)
table(y3$samples$case_control, y3$samples$ethnicity)

res3 <- testDGElist(y3, 
            formula = as.formula("~ case_control + sex + ethnicity + age_scaled + (1|individual_id)"), 
            individual_id = 'individual_id',
            ncores = ncpus
           )
# print(colnames(res3@stats))
results3 <- degTable(res3, contrast = 'case_control', group = 'POSITIVE')

          
           C20 C21 C23 C31 C36 C42 C63 C82 C93 C103 C104 C106 C113 C123 C124
  NEGATIVE   0   0   0   0   0   0   0   0   0    1    1    1    0    0    0
  POSITIVE   2   3   1   3   3   3   1   1   3    0    0    0    2    2    3
          
           C126 C127 C128 C132 C134 C142 C164 C167 C193 C195 C197 C212 C214
  NEGATIVE    0    0    0    0    1    1    1    1    1    1    1    1    1
  POSITIVE    3    1    1    3    0    0    0    0    0    0    0    0    0
          
           C224 C234 C238 C251
  NEGATIVE    1    1    1    1
  POSITIVE    0    0    0    0

          
            F  M
  NEGATIVE  6 10
  POSITIVE 11 24

          
           asian black other white
  NEGATIVE     7     3     2     4
  POSITIVE    18     4     2    11


n = 51 samples, 32 individuals
Time difference of 1.508193 mins
Errors in 1 gene(s):XIST
q_case_control
--------------
Not Significant     Significant 
           4433              88 

q_sex
-----
Not Significant     Significant 
           2596            1925 

q_ethnicity
-----------
Not Significant     Significant 
           4433              88 

q_age_scaled
------------
Not Significant     Significant 
           4444              77 


### Comparison 4: just patients that were negative in wave 1 but positive in wave 2, deg from recovery vs negative

In [7]:
sce4 <- sce[, sce$individual_id %in% c('C101', 'C108', 'C137', 'C138', 'C140', 
                                       'C141', 'C145', 'C146', 'C147', 'C168',
                                        'C169', 'C170', 'C187', 'C190', 'C33')]
# Remove samples with less than MIN
nCells <- table(sce4$sample_id)
rmSamples <- names(nCells[nCells < min_cells])
sce4 <- sce4[,!sce4$sample_id %in% rmSamples]
# remove non-complete data (all have positive), 1 = negative, 3 = recovery
df <- table(sce4$individual_id, sce4$case_control)
keep_ids <- row.names(df)[which(df[,1] != 0 & df[,3] != 0)]
sce4 <- sce4[, sce4$individual_id %in% keep_ids]
# Summarize Counts
smrzd <- aggregateAcrossCells(sce4, id=as.character(colData(sce4)[,c("sample_id")]))
y <- DGEList(counts=counts(smrzd), samples=colData(smrzd))
y4 <- setupDGElist(y, 'case_control', remove = 'POSITIVE')
# sanity check
table(y4$samples$case_control, y4$samples$individual_id)
table(y4$samples$case_control, y4$samples$sex)
table(y4$samples$case_control, y4$samples$ethnicity)

res4 <- testDGElist(y4, 
            formula = as.formula("~ case_control + sex + ethnicity + age_scaled + (1|individual_id)"), 
            individual_id = 'individual_id',
            ncores = ncpus
           )
# print(colnames(res4@stats))
results4 <- degTable(res4, contrast = 'case_control', group = 'RECOVERY')

          
           C137 C140
  NEGATIVE    1    1
  RECOVERY    1    1

          
           F
  NEGATIVE 2
  RECOVERY 2

          
           asian
  NEGATIVE     2
  RECOVERY     2


n = 4 samples, 2 individuals


ERROR: Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]): contrasts can be applied only to factors with 2 or more levels


### Comparison 5: just patients that were negative in wave 1 but positive in wave 2, deg from positive vs negative

In [8]:
sce5 <- sce[, sce$individual_id %in% c('C101', 'C108', 'C137', 'C138', 'C140', 
                                       'C141', 'C145', 'C146', 'C147', 'C168',
                                        'C169', 'C170', 'C187', 'C190', 'C33')]
# Remove samples with less than MIN
nCells <- table(sce5$sample_id)
rmSamples <- names(nCells[nCells < min_cells])
sce5 <- sce5[,!sce5$sample_id %in% rmSamples]
# remove non-complete data (all have positive), 1 = negative, 2 = positive
df <- table(sce5$individual_id, sce5$case_control)
keep_ids <- row.names(df)[which(df[,1] != 0 & df[,2] != 0)]
sce5 <- sce5[, sce5$individual_id %in% keep_ids]
# Summarize Counts
smrzd <- aggregateAcrossCells(sce5, id=as.character(colData(sce5)[,c("sample_id")]))
y <- DGEList(counts=counts(smrzd), samples=colData(smrzd))
y5 <- setupDGElist(y, 'case_control', remove = 'RECOVERY')
# sanity check
table(y5$samples$case_control, y5$samples$individual_id)
table(y5$samples$case_control, y5$samples$sex)
table(y5$samples$case_control, y5$samples$ethnicity)

res5 <- testDGElist(y5, 
            formula = as.formula("~ case_control + sex + ethnicity + age_scaled + (1|individual_id)"), 
            individual_id = 'individual_id',
            ncores = ncpus
           )
# print(colnames(res5@stats))
results5 <- degTable(res5, contrast = 'case_control', group = 'POSITIVE')

          
           C137 C140 C147
  NEGATIVE    1    1    1
  POSITIVE    1    5    3

          
           F M
  NEGATIVE 2 1
  POSITIVE 6 3

          
           asian
  NEGATIVE     3
  POSITIVE     9


n = 12 samples, 3 individuals


ERROR: Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]): contrasts can be applied only to factors with 2 or more levels


In [9]:
save(res1, res2, res3, results1, results2, results3, file = 'h5ad/df.fil3_gex_bcells_vdj_sce_B_nonswitched_mem_deg.RData')