# Analyze ladder pilot counts in DESeq2

In [1]:
# Package setup for the DESeq2 barcode-count analysis in this notebook.
library("data.table")
library("dplyr")
library("DESeq2")
# for parallelization
library("BiocParallel")
# register a multicore backend with 8 workers as the default BiocParallel backend
register(MulticoreParam(8))
# for parsing barcode IDs
library("stringr")

# relative paths used later in the notebook are resolved against this directory
# NOTE(review): absolute, user-specific path -- adjust when running elsewhere
setwd("/home/users/rang/scratch/yeast/crispey3/gxg_bya_aug2021/")


Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: S4Vectors

Loading required package: stats4

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, 

## Setup path to files

In [9]:
# Paths to the input tables and the DESeq2 results file for this analysis.
# All paths are relative to the current working directory.

# samples info file
samples_info_file <- "./gxg_bya_aug2021_samples.txt"
# counts file
# counts_file <- "./counts/stacked_barcode_counts.txt"
counts_file <- "./counts/barcode_counts.txt"

# output directory
# only create it when missing, so re-running this cell does not emit an
# "already exists" warning; genuine creation failures still warn
if (!dir.exists("./deseq/")) {
  dir.create("./deseq/")
}
# deseq results file
# deseq_output_file <- "./deseq/gxg_bya_aug2021_results.txt"
deseq_output_file <- "./deseq/gxg_bya_aug2021_umi_results.txt"

# echo the configured paths for the record
print("Input files:")
print(samples_info_file)
print(counts_file)
print("Output files:")
print(deseq_output_file)

“'./deseq' already exists”


[1] "Input files:"
[1] "./gxg_bya_aug2021_samples.txt"
[1] "./counts/barcode_counts.txt"
[1] "Output files:"
[1] "./deseq/gxg_bya_aug2021_umi_results.txt"


## Load samples info, barcode counts

In [10]:
# Read the tab-separated samples table; its first column becomes the row names.
samples <- read.table(
  file = samples_info_file,
  header = TRUE,
  sep = "\t",
  row.names = 1
)
# store the replicate column as a factor
samples$rep <- factor(samples$rep)

# Read the tab-separated counts table as a data.table (empty fields become NA).
counts <- fread(
  counts_file,
  header = TRUE,
  sep = "\t",
  na.strings = "",
  data.table = TRUE
)
# Move 'barcode' to the front, followed by the sample columns in the same order
# as the rows of the samples table.
setcolorder(counts, c("barcode", rownames(samples)))

## Run DESeq with default settings to obtain barcode logFC values in CRISPEY competition
Assumes all samples are biological replicates of the same competition setup

In [11]:
# Standard DESeq2 workflow: build the dataset from the counts table, taking the
# first column as row names, with samples as column data and design ~ gen + rep.
dds <- DESeqDataSetFromMatrix(
  countData = as.matrix(counts, rownames = 1),
  colData = samples,
  design = ~ gen + rep
)
# display the dataset summary
dds

# Keep only barcodes whose total read count across samples reaches the cutoff.
# total_reads_cutoff <- 1000 # conservative cutoff
total_reads_cutoff <- 150 # 10 reads per sample
# counts() is called as a function here (the DESeq2 accessor), so it does not
# clash with the 'counts' table variable
keep <- rowSums(counts(dds)) >= total_reads_cutoff
dds <- dds[keep, ]

# Run DESeq with default settings, using the registered parallel backend.
dds <- DESeq(dds, parallel = TRUE)

  the design formula contains one or more numeric variables that have mean or
  standard deviation larger than 5 (an arbitrary threshold to trigger this message).
  it is generally a good idea to center and scale numeric variables in the design
  to improve GLM convergence.



class: DESeqDataSet 
dim: 32078 15 
metadata(1): version
assays(1): counts
rownames(32078): 001_022-6 001_045-6 ... Ladder_043-3 Ladder_043-4
rowData names(0):
colnames(15): BYA-t1-1 BYA-t1-2 ... BYA-t13-2 BYA-t13-3
colData names(2): gen rep

estimating size factors

estimating dispersions

gene-wise dispersion estimates: 8 workers

mean-dispersion relationship

final dispersion estimates, fitting model and testing: 8 workers



In [12]:
# Pull the results for the 'gen' coefficient (alpha = 0.05) and summarize them.
res <- results(dds, name = "gen", alpha = 0.05)
summary(res)

# Sort the results by ascending p-value and display the table.
pvalue_rank <- order(res$pvalue)
resOrdered <- res[pvalue_rank, ]
resOrdered

# Write the sorted table, including row names and a header, to the results file.
# NOTE(review): the later neutral-normalized export in this notebook writes to
# the same deseq_output_file, so this file is overwritten if both cells run.
fwrite(
  as.data.frame(resOrdered),
  file = deseq_output_file,
  sep = "\t",
  quote = FALSE,
  row.names = TRUE,
  col.names = TRUE
)


out of 27661 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 8283, 30%
LFC < 0 (down)     : 8807, 32%
outliers [1]       : 0, 0%
low counts [2]     : 0, 0%
(mean count < 9)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results



log2 fold change (MLE): gen 
Wald test p-value: gen 
DataFrame with 27661 rows and 6 columns
                  baseMean        log2FoldChange                lfcSE
                 <numeric>             <numeric>            <numeric>
047_121-4 3495.62565319293   -0.0226017083924747 0.000547720039634798
047_121-6 3150.15685766237   -0.0290856121276593 0.000769539422939968
061_121-3 2711.61973720458   -0.0262657909379971 0.000753703066762615
044_078-5 1384.65631552339    0.0668437662632251  0.00194218285678182
047_121-3  3285.5164241415   -0.0331885439355236  0.00100742797060434
...                    ...                   ...                  ...
042_094-4 71.8666118576679  4.75329305677299e-06  0.00584139140055678
045_013-2 897.881114635419   6.7745670673461e-07  0.00151837457667883
045_117-5 811.446206093225 -3.98736236793644e-07  0.00117861694680714
068_073-2 58.7434160347429  4.64534345628324e-06   0.0138625475590925
068_073-3 79.7726660008753  1.50090732618533e-06  0.005346982550840

## To measure fitness relative to neutral barcodes, specify neutral set in controlGenes when estimating size factors and recalculate logFC

In [13]:
# Load the table of neutral oligos and their associated barcodes as a data.frame.
neutral_oligos_file <- "/home/users/rang/crispey3/library_design/Output/crispey3_neutral_oligos_only.txt"
neutral_oligos <- fread(
  file = neutral_oligos_file,
  header = TRUE,
  sep = "\t",
  data.table = FALSE
)

# Build the neutral barcode set: split each barcode name on '-', and keep the
# barcodes whose first piece appears in the barcode_id column of the neutral
# oligo table.
barcode_pieces <- str_split(counts$barcode, "-", simplify = TRUE)
is_neutral <- barcode_pieces[, 1] %in% neutral_oligos$barcode_id
neutral_barcodes <- counts$barcode[is_neutral]

In [14]:
# Re-run DESeq2 step by step, estimating size factors from the neutral barcode
# set only (passed as controlGenes) instead of from all barcodes.
dds <- DESeqDataSetFromMatrix(countData = as.matrix(counts, rownames = 1),
                              colData = samples,
                              design = ~gen+rep)

# filter barcodes that do not meet minimum read cutoff
# total_reads_cutoff <- 1000 # conservative cutoff
total_reads_cutoff <- 150 # 10 reads per sample, considered bare minimum
keep <- rowSums(counts(dds)) >= total_reads_cutoff
dds <- dds[keep, ]

# Locate the neutral barcodes among the remaining rows by exact name match.
# A single "a|b|c" regex passed to grep() would match substrings of row names,
# grows with the size of the neutral set, and collapses to "" (which matches
# every row) when the neutral set is empty.
neutral_rows <- which(rownames(dds) %in% neutral_barcodes)
if (length(neutral_rows) == 0) {
  stop("No neutral barcodes remain after read-count filtering; ",
       "cannot estimate size factors from controlGenes.", call. = FALSE)
}

# estimate size factors, specifying neutral barcodes under controlGenes
dds <- estimateSizeFactors(dds, controlGenes = neutral_rows)
# estimate dispersion
dds <- estimateDispersions(dds)
# calculate logFC
dds <- nbinomWaldTest(dds)

  the design formula contains one or more numeric variables that have mean or
  standard deviation larger than 5 (an arbitrary threshold to trigger this message).
  it is generally a good idea to center and scale numeric variables in the design
  to improve GLM convergence.

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates



In [15]:
# view results summary
res <- results(dds, name='gen', alpha=0.05)
summary(res)

# view table of results sorted by pvalue
resOrdered <- res[order(res$pvalue),]
resOrdered

# export results to file
fwrite(as.data.frame(resOrdered), file=deseq_output_file, sep="\t", quote=FALSE, row.names=TRUE, col.names=TRUE)


out of 27661 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 6295, 23%
LFC < 0 (down)     : 10946, 40%
outliers [1]       : 0, 0%
low counts [2]     : 0, 0%
(mean count < 10)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results



log2 fold change (MLE): gen 
Wald test p-value: gen 
DataFrame with 27661 rows and 6 columns
                  baseMean       log2FoldChange                lfcSE
                 <numeric>            <numeric>            <numeric>
047_121-4 3538.57706113862  -0.0244489126830365 0.000558650612357991
047_121-6 3195.69046672959  -0.0309545096426945 0.000837175222329178
061_121-3 2748.33705869788  -0.0281224018023505 0.000778324495565551
047_121-3 3336.55653831723   -0.035071211415918  0.00104697778203243
073_121-4 2578.36383537885  -0.0280762312866567 0.000841718975033322
...                    ...                  ...                  ...
048_097-4  291.07050328049 2.55702129150978e-06  0.00339469821425607
065_021-6 804.249093980508 9.10506412869197e-07  0.00130977351565253
056_015-2 718.424769674904 8.21157572798714e-07  0.00139149693654893
071_097-4 4807.75789400708 3.98828321649142e-07 0.000778985495085711
035_001-1 606.534041881413 2.36366000809452e-07  0.00149845249806366
          

## To measure the effect of a treatment on fitness, add the treatment variable to the dds design and re-run DESeq
New design should include the <code>gen\*cond</code> and <code>rep\*cond</code> interaction terms.<br>
Ensure that the samples file contains the <code>cond</code> column indicating the treatment status of each sample.<br>
Pseudo-conditions can be created between competition replicates to test for possible variance between competition batches (e.g. rep123 vs rep456). logFC values calculated this way should be the same as when each condition's samples are run in DESeq2 separately.

In [None]:
# Samples table that also carries the condition label for each sample.
# NOTE(review): this file name refers to the ladder pilot rather than the
# gxg_bya_aug2021 files used earlier -- confirm it is the intended input.
samples_cond_file <- "./ladder_pilot_samples_pseudoConditions.txt"
# Read the tab-separated table; its first column becomes the row names.
samples_cond <- read.table(
  file = samples_cond_file,
  header = TRUE,
  sep = "\t",
  row.names = 1
)
# store the replicate column as a factor
samples_cond$rep <- factor(samples_cond$rep)

In [None]:
# Re-run DESeq2 with the condition variable in the design, including the
# gen:cond and rep:cond interaction terms, and size factors estimated from the
# neutral barcode set only (passed as controlGenes).
dds2 <- DESeqDataSetFromMatrix(countData = as.matrix(counts, rownames = 1),
                               colData = samples_cond,
                               design = ~gen+rep+cond+gen:cond+rep:cond)

# filter barcodes that do not meet minimum read cutoff
total_reads_cutoff <- 1000 # conservative cutoff
# total_reads_cutoff <- 150 # 10 reads per sample, considered bare minimum
keep <- rowSums(counts(dds2)) >= total_reads_cutoff
dds2 <- dds2[keep, ]

# Locate the neutral barcodes among the remaining rows by exact name match.
# A single "a|b|c" regex passed to grep() would match substrings of row names,
# grows with the size of the neutral set, and collapses to "" (which matches
# every row) when the neutral set is empty.
neutral_rows2 <- which(rownames(dds2) %in% neutral_barcodes)
if (length(neutral_rows2) == 0) {
  stop("No neutral barcodes remain after read-count filtering; ",
       "cannot estimate size factors from controlGenes.", call. = FALSE)
}

# estimate size factors, specifying neutral oligos as control genes
dds2 <- estimateSizeFactors(dds2, controlGenes = neutral_rows2)
# estimate dispersion
dds2 <- estimateDispersions(dds2)
# calculate logFC
dds2 <- nbinomWaldTest(dds2)

## To view condition-specific fitness, specify contrast when viewing results
If multiple conditions are present, fitness changes between non-reference conditions can be compared directly by specifying: <code>results(dds2, contrast=list("gen.condC", "gen.condB"))</code>

In [None]:
# List the names of all terms in the fitted DESeq model.
resultsNames(dds2)

# view results:
### fitness in cond A (control): the 'gen' term on its own
res2 <- results(dds2, name = "gen")
summary(res2)

### fitness in cond B (treatment): 'gen' and 'gen.condB' given together as the
### numerator of a list-style contrast
cond_b_terms <- c("gen", "gen.condB")
res3 <- results(dds2, contrast = list(cond_b_terms))
summary(res3)

### changes in fitness between cond A and B: the 'gen.condB' interaction term
res4 <- results(dds2, name = "gen.condB")
summary(res4)

# for fitness in cond A, sort the results by ascending p-value and display them
cond_a_rank <- order(res2$pvalue)
resOrdered2 <- res2[cond_a_rank, ]
resOrdered2

# # export results to file
# fwrite(as.data.frame(resOrdered2), file="deseq_fitness_condA.txt", sep="\t", quote=FALSE, row.names=TRUE, col.names=TRUE)