# Analyze ladder pilot counts in DESeq2

In [1]:
# data.table supplies fread/fwrite/setcolorder used below
library("data.table")
# NOTE: attached after data.table, so dplyr masks between/first/last
library("dplyr")
# differential abundance testing on the barcode count matrix
library("DESeq2")
# for parallelization
library("BiocParallel")
# register an 8-worker multicore backend; DESeq(..., parallel=TRUE) picks it up
register(MulticoreParam(8))
# for parsing barcode IDs
library("stringr")

# every relative path used below ("./...") is resolved against this directory
setwd("/home/users/rang/scratch/yeast/crispey3/gxg_bya_aug2021/")


Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: S4Vectors

Loading required package: stats4

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, 

## Setup path to files

In [2]:
# samples info file
samples_info_file <- "./gxg_bya_aug2021_samples.txt"
# counts file
# counts_file <- "./counts/stacked_barcode_counts.txt"
counts_file <- "./counts/barcode_counts.txt"

# output directory: create it only when it is missing, so re-running this
# cell no longer raises an "already exists" warning while genuine creation
# failures (e.g. permissions) are still reported
deseq_output_dir <- "./deseq"
if (!dir.exists(deseq_output_dir)) {
  dir.create(deseq_output_dir, recursive = TRUE)
}
# deseq results file
# deseq_output_file <- "./deseq/gxg_bya_aug2021_results.txt"
deseq_output_file <- "./deseq/gxg_bya_aug2021_umi_results.txt"

# echo the configured paths so they are recorded with the notebook output
print("Input files:")
print(samples_info_file)
print(counts_file)
print("Output files:")
print(deseq_output_file)

“'./deseq' already exists”


[1] "Input files:"
[1] "./gxg_bya_aug2021_samples.txt"
[1] "./counts/barcode_counts.txt"
[1] "Output files:"
[1] "./deseq/gxg_bya_aug2021_umi_results.txt"


## Load samples info, barcode counts

In [3]:
# read the per-sample metadata; the first column supplies the row names
samples <- read.table(
  file = samples_info_file,
  header = TRUE,
  sep = "\t",
  row.names = 1
)
# replicate is used as a categorical term in the design, so store it as a factor
samples$rep <- factor(samples$rep)

# read the barcode count table as a data.table
counts <- fread(
  counts_file,
  header = TRUE,
  sep = "\t",
  na.strings = "",
  data.table = TRUE
)
# put the barcode column first, then the samples in samples-table row order
sample_order <- c("barcode", rownames(samples))
setcolorder(counts, sample_order)

## Run DESeq with default settings to obtain barcode logFC values in CRISPEY competition
Assumes all samples are biological replicates of the same competition setup

In [4]:
# Default DESeq workflow
# convert the count table to a matrix, using its first column as row names
count_matrix <- as.matrix(counts, rownames = 1)
dds <- DESeqDataSetFromMatrix(
  countData = count_matrix,
  colData = samples,
  design = ~ gen + rep
)
dds

# drop barcodes whose reads, summed over all samples, fall below the cutoff
total_reads_cutoff <- 1000
# total_reads_cutoff <- 150 # 10 reads per sample
barcode_totals <- rowSums(counts(dds))
keep <- barcode_totals >= total_reads_cutoff
dds <- dds[keep, ]

# run the standard DESeq pipeline with parallel evaluation enabled
dds <- DESeq(dds, parallel = TRUE)

  the design formula contains one or more numeric variables that have mean or
  standard deviation larger than 5 (an arbitrary threshold to trigger this message).
  it is generally a good idea to center and scale numeric variables in the design
  to improve GLM convergence.



class: DESeqDataSet 
dim: 32078 15 
metadata(1): version
assays(1): counts
rownames(32078): 001_022-6 001_045-6 ... Ladder_043-3 Ladder_043-4
rowData names(0):
colnames(15): BYA-t1-1 BYA-t1-2 ... BYA-t13-2 BYA-t13-3
colData names(2): gen rep

estimating size factors

estimating dispersions

gene-wise dispersion estimates: 8 workers

mean-dispersion relationship

final dispersion estimates, fitting model and testing: 8 workers



In [5]:
# extract the results for the 'gen' coefficient and print a summary
res <- results(dds, name = "gen", alpha = 0.05)
summary(res)

# rank barcodes from smallest to largest p-value and display the table
pvalue_order <- order(res$pvalue)
resOrdered <- res[pvalue_order, ]
resOrdered

# write the ordered table, including its row names, as a tab-delimited file
fwrite(
  as.data.frame(resOrdered),
  file = deseq_output_file,
  sep = "\t",
  quote = FALSE,
  row.names = TRUE,
  col.names = TRUE
)


out of 21016 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 6450, 31%
LFC < 0 (down)     : 6460, 31%
outliers [1]       : 0, 0%
low counts [2]     : 0, 0%
(mean count < 65)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results



log2 fold change (MLE): gen 
Wald test p-value: gen 
DataFrame with 21016 rows and 6 columns
                  baseMean        log2FoldChange                lfcSE
                 <numeric>             <numeric>            <numeric>
047_121-4 3523.73525739594   -0.0236731403089774 0.000552121422124256
047_121-6 3179.40531407283   -0.0301722157156849 0.000799794832632027
061_121-3 2735.36503367647   -0.0273451623565464  0.00076468622147436
044_078-5 1370.97793141205    0.0657431827927165  0.00197657863376068
047_121-3 3318.28354563781    -0.034285167171121  0.00104428542677937
...                    ...                   ...                  ...
047_034-1 6155.53132093995  1.02495582578138e-06 0.000438712099575668
046_040-5 95.4556168155088 -9.11489659803861e-06  0.00469053442246579
071_029-2 229.262778969708  5.11869372735346e-06  0.00264701041426975
074_046-3 2062.66289314876   1.6860081814909e-06 0.000967595698735562
058_059-6 3170.65782923556 -6.38892384219036e-07  0.000634298708116

## To measure fitness relative to neutral barcodes, specify neutral set in controlGenes when estimating size factors and recalculate logFC

In [6]:
# read the table of neutral oligos as a plain data.frame
neutral_oligos_file <- "/home/users/rang/crispey3/library_design/Output/crispey3_neutral_oligos_only.txt"
neutral_oligos <- fread(
  file = neutral_oligos_file,
  header = TRUE,
  sep = "\t",
  data.table = FALSE
)

# the part of each barcode name before the first '-' is compared against
# the barcode_id column of the neutral oligo table
barcode_prefix <- str_split(counts$barcode, "-", simplify = TRUE)[, 1]
is_neutral <- barcode_prefix %in% neutral_oligos$barcode_id
neutral_barcodes <- counts$barcode[is_neutral]

In [7]:
# re-run DESeq with specified neutral set
dds <- DESeqDataSetFromMatrix(countData = as.matrix(counts, rownames = 1),
                              colData = samples,
                              design = ~ gen + rep)

# filter barcodes that do not meet minimum read cutoff
total_reads_cutoff <- 1000
# total_reads_cutoff <- 150 # 10 reads per sample
keep <- rowSums(counts(dds)) >= total_reads_cutoff
dds <- dds[keep, ]

# Locate the neutral barcodes among the rows that survived filtering.
# Exact matching with %in% replaces a regular expression pasted together from
# every barcode name: a regex matches substrings rather than whole IDs, and an
# empty neutral set would yield the pattern "" which matches every row.
neutral_rows <- which(rownames(dds) %in% neutral_barcodes)
if (length(neutral_rows) == 0) {
  stop("no neutral barcodes remain after read-count filtering; ",
       "cannot estimate size factors from controlGenes", call. = FALSE)
}

# estimate size factors, specifying neutral barcodes under controlGenes
dds <- estimateSizeFactors(dds, controlGenes = neutral_rows)
# estimate dispersion
dds <- estimateDispersions(dds)
# calculate logFC
dds <- nbinomWaldTest(dds)

  the design formula contains one or more numeric variables that have mean or
  standard deviation larger than 5 (an arbitrary threshold to trigger this message).
  it is generally a good idea to center and scale numeric variables in the design
  to improve GLM convergence.

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates



In [8]:
# summarize the 'gen' coefficient from the current dds fit
res <- results(dds, name = "gen", alpha = 0.05)
summary(res)

# show the results table ordered by increasing p-value
by_pvalue <- order(res$pvalue)
resOrdered <- res[by_pvalue, ]
resOrdered

# export the ordered table as tab-delimited text, keeping row names
# NOTE(review): this writes to the same deseq_output_file as the earlier
# export cell, so that file is overwritten - confirm this is intended
results_table <- as.data.frame(resOrdered)
fwrite(
  results_table,
  file = deseq_output_file,
  sep = "\t",
  quote = FALSE,
  row.names = TRUE,
  col.names = TRUE
)


out of 21016 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 5387, 26%
LFC < 0 (down)     : 7725, 37%
outliers [1]       : 0, 0%
low counts [2]     : 0, 0%
(mean count < 65)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results



log2 fold change (MLE): gen 
Wald test p-value: gen 
DataFrame with 21016 rows and 6 columns
                  baseMean        log2FoldChange                lfcSE
                 <numeric>             <numeric>            <numeric>
047_121-4 3548.48321306587   -0.0247539149147205 0.000565091355041835
047_121-6 3205.76057480763   -0.0312660952676455 0.000847596772874003
061_121-3 2756.62009290145   -0.0284310811271405 0.000788181019673556
047_121-3 3347.65410690894   -0.0353864697476955  0.00106774764631586
073_121-4 2586.16329638664   -0.0283885860636768 0.000859083292880379
...                    ...                   ...                  ...
048_045-6 122.825606126017  8.07220359134424e-06  0.00464757254575967
043_062-6 3934.93940750412 -9.28494957400973e-07 0.000539243209397071
073_030-6  162.88932799809  6.47315963645418e-06  0.00407303796362289
058_044-3 189.835487987736  -2.6688015881864e-06  0.00382197804736487
192_032-4 574.878015666393  1.05475555581979e-06  0.001550808547425

## To measure the effect of a treatment on fitness, add the treatment variable to the dds design and re-run DESeq
New design should include the <code>gen\*cond</code> and <code>rep\*cond</code> interaction terms.<br>
Ensure that the samples file contains the <code>cond</code> column indicating the treatment status of each sample.<br>
Pseudo-conditions can be created between competition replicates to test for possible variance between competition batches (e.g. rep123 vs rep456). logFC values calculated this way should be the same as when each condition's samples are run in DESeq2 separately.

In [None]:
# path to the samples table that carries the condition label
samples_cond_file <- "./ladder_pilot_samples_pseudoConditions.txt"
# read the table; the first column supplies the row names
samples_cond <- read.table(
  file = samples_cond_file,
  header = TRUE,
  sep = "\t",
  row.names = 1
)
# store replicate as a factor so it enters the design as a categorical term
samples_cond$rep <- factor(samples_cond$rep)

In [None]:
# re-run DESeq with the condition term and its interactions in the design
dds2 <- DESeqDataSetFromMatrix(countData = as.matrix(counts, rownames = 1),
                               colData = samples_cond,
                               design = ~ gen + rep + cond + gen:cond + rep:cond)

# filter barcodes that do not meet minimum read cutoff
total_reads_cutoff <- 1000 # conservative cutoff
# total_reads_cutoff <- 150 # 10 reads per sample, considered bare minimum
keep <- rowSums(counts(dds2)) >= total_reads_cutoff
dds2 <- dds2[keep, ]

# Locate the neutral barcodes among the rows that survived filtering.
# Exact matching with %in% replaces a regular expression pasted together from
# every barcode name: a regex matches substrings rather than whole IDs, and an
# empty neutral set would yield the pattern "" which matches every row.
neutral_rows <- which(rownames(dds2) %in% neutral_barcodes)
if (length(neutral_rows) == 0) {
  stop("no neutral barcodes remain after read-count filtering; ",
       "cannot estimate size factors from controlGenes", call. = FALSE)
}

# estimate size factors, specifying neutral oligos as control genes
dds2 <- estimateSizeFactors(dds2, controlGenes = neutral_rows)
# estimate dispersion
dds2 <- estimateDispersions(dds2)
# calculate logFC
dds2 <- nbinomWaldTest(dds2)

## To view condition-specific fitness, specify contrast when viewing results
If multiple conditions are present, fitness changes between non-reference conditions can be compared directly by specifying: <code>results(dds2, contrast=list("gen.condC", "gen.condB"))</code>

In [None]:
# list the coefficient names available in the fitted model
resultsNames(dds2)

# view results:
### fitness in cond A (control)
res2 <- results(dds2, name = "gen")
summary(res2)

### fitness in cond B (treatment): the 'gen' and 'gen.condB' terms combined
cond_b_terms <- list(c("gen", "gen.condB"))
res3 <- results(dds2, contrast = cond_b_terms)
summary(res3)

### changes in fitness between cond A and B
res4 <- results(dds2, name = "gen.condB")
summary(res4)

# for fitness in cond A, show the table ordered by increasing p-value
cond_a_order <- order(res2$pvalue)
resOrdered2 <- res2[cond_a_order, ]
resOrdered2

# # export results to file
# fwrite(as.data.frame(resOrdered2), file="deseq_fitness_condA.txt", sep="\t", quote=FALSE, row.names=TRUE, col.names=TRUE)