# Analyze ladder pilot counts in DESeq2

In [1]:
# packages used throughout this notebook
library("data.table")
library("dplyr")
library("DESeq2")
# for parallelization
library("BiocParallel")
# register a multicore backend with 8 workers as the default BiocParallel backend
register(MulticoreParam(8))
# for parsing barcode IDs
library("stringr")

# relative paths used below are resolved against this working directory
setwd("/home/users/rang/scratch/yeast/crispey3/pool1_scm_aug2021/")


Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: S4Vectors

Loading required package: stats4

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, 

## Setup path to files

In [2]:
# Input and output locations, given relative to the working directory.

# samples info file
samples_info_file <- "./pool1_scm_aug2021_samples.txt"
# counts file
counts_file <- "./counts/stacked_barcode_counts.txt"

# output directory: only create it when missing, so that re-running this cell
# does not emit an "already exists" warning while genuine creation failures
# (e.g. permissions) are still reported by dir.create()
if (!dir.exists("./deseq/")) {
  dir.create("./deseq/")
}
# deseq results file
deseq_output_file <- "./deseq/pool1_scm_aug2021_results.txt"

# echo the configured paths so they are recorded in the notebook output
print("Input files:")
print(samples_info_file)
print(counts_file)
print("Output files:")
print(deseq_output_file)

“'./deseq' already exists”


[1] "Input files:"
[1] "./pool1_scm_aug2021_samples.txt"
[1] "./counts/stacked_barcode_counts.txt"
[1] "Output files:"
[1] "./deseq/pool1_scm_aug2021_results.txt"


## Load samples info, barcode counts

In [3]:
# sample metadata: tab-delimited with a header row; the first column supplies
# the row names
samples <- read.table(
  file = samples_info_file,
  header = TRUE,
  sep = "\t",
  row.names = 1
)
# store the replicate label as a factor
samples[["rep"]] <- factor(samples[["rep"]])

# barcode counts: tab-delimited with a header row, returned as a data.table;
# na.strings = "" asks fread to read empty fields as NA
counts <- fread(
  counts_file,
  header = TRUE,
  sep = "\t",
  na.strings = "",
  data.table = TRUE
)

## Run DESeq with default settings to obtain barcode logFC values in CRISPEY competition
Assumes all samples are biological replicates of the same competition setup

In [4]:
# Default DESeq2 workflow.
# The first column of `counts` becomes the row names of the count matrix, and
# the design models each barcode's counts with a `gen` term plus a `rep` term.
dds <- DESeqDataSetFromMatrix(
  countData = as.matrix(counts, rownames = 1),
  colData = samples,
  design = ~ gen + rep
)
dds

# keep only barcodes whose reads, summed across all samples, reach the cutoff
total_reads_cutoff <- 1000
keep <- rowSums(counts(dds)) >= total_reads_cutoff
dds <- dds[keep, ]

# run DESeq with default settings, parallelised over the registered backend
dds <- DESeq(dds, parallel = TRUE)

  the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

  the design formula contains one or more numeric variables that have mean or
  standard deviation larger than 5 (an arbitrary threshold to trigger this message).
  it is generally a good idea to center and scale numeric variables in the design
  to improve GLM convergence.



class: DESeqDataSet 
dim: 4577 36 
metadata(1): version
assays(1): counts
rownames(4577): 001_001 001_002 ... Ladder_042 Ladder_043
rowData names(0):
colnames(36): P1-SCM-T1-F1 P1-SCM-T1-F2 ... P1-SCM-T6-F5 P1-SCM-T6-F6
colData names(2): gen rep

estimating size factors

estimating dispersions

gene-wise dispersion estimates: 8 workers

mean-dispersion relationship

final dispersion estimates, fitting model and testing: 8 workers



In [5]:
# extract the results for the `gen` coefficient and summarise them at an
# adjusted p-value threshold of 0.05
res <- results(dds, name = "gen", alpha = 0.05)
summary(res)

# display the results table ordered by p-value (smallest first)
resOrdered <- res[order(res$pvalue), ]
resOrdered

# write the ordered table to the tab-delimited results file, keeping the
# barcode row names and the column header
fwrite(
  as.data.frame(resOrdered),
  file = deseq_output_file,
  sep = "\t",
  quote = FALSE,
  row.names = TRUE,
  col.names = TRUE
)


out of 3508 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 62, 1.8%
LFC < 0 (down)     : 62, 1.8%
outliers [1]       : 0, 0%
low counts [2]     : 0, 0%
(mean count < 25)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results



log2 fold change (MLE): gen 
Wald test p-value: gen 
DataFrame with 3508 rows and 6 columns
                   baseMean        log2FoldChange               lfcSE
                  <numeric>             <numeric>           <numeric>
Ladder_006 468.472602997419   -0.0847131617881736 0.00214126498775399
034_121    1525.68683332542     -0.11649036303777 0.00331869499662713
020_120    522.247853367154    -0.109810129891145 0.00373009939833936
Ladder_009 293.320127365387   -0.0705203927061838 0.00244571302754112
015_121    299.872971170762    -0.091806321357343 0.00389919356456986
...                     ...                   ...                 ...
016_011    64.2838721996283  6.45097329325252e-06 0.00446226190442581
021_083    49.0623921652223 -6.29764617160151e-06 0.00524662742799386
016_064    165.557503022584  2.06145499257209e-06 0.00333520160141386
002_027    397.534606228477 -1.14298895429429e-06 0.00272648746427979
025_116     26.082687367377   1.4827236057696e-06  0.008868529368285

## To measure fitness relative to neutral barcodes, specify neutral set in controlGenes when estimating size factors and recalculate logFC

In [48]:
# tab-delimited table of neutral oligos, read as a plain data.frame
neutral_oligos_file <- "/home/users/rang/crispey3/library_design/Output/crispey3_neutral_oligos_only.txt"
neutral_oligos <- fread(
  file = neutral_oligos_file,
  header = TRUE,
  sep = "\t",
  data.table = FALSE
)

# neutral barcode set: barcodes from the counts table whose leading field (the
# text before the first "-") appears in the neutral oligos' barcode_id column
# NOTE(review): the barcode IDs printed in the results tables (e.g. 001_001)
# contain no "-", in which case the whole ID is compared - confirm the separator
neutral_barcodes <- counts$barcode[
  str_split(counts$barcode, "-", simplify = TRUE)[, 1] %in%
    neutral_oligos$barcode_id
]


# ladder_oligos_annotation_file <- 'ladder_oligos_annotated.txt'
# ladder_oligos_annotation <- fread(file = ladder_oligos_annotation_file, header = TRUE, sep = '\t', data.table=FALSE)

# # get neutral barcode set
# neutral_barcodes = as.vector(ladder_oligos_annotation[ladder_oligos_annotation$breslow_growth_rate==1,'PBC'])
# neutral_barcodes

In [52]:
# Re-run DESeq with size factors estimated from the neutral barcode set only.
dds <- DESeqDataSetFromMatrix(
  countData = as.matrix(counts, rownames = 1),
  colData = samples,
  design = ~ gen + rep
)

# filter barcodes that do not meet minimum read cutoff
total_reads_cutoff <- 1000
keep <- rowSums(counts(dds)) >= total_reads_cutoff
dds <- dds[keep, ]

# Row indices of the neutral barcodes that survived filtering.
# Exact ID matching (%in%) is used rather than a regex pasted together with "|":
# a regex would match substrings of longer IDs, treat any metacharacter in an ID
# as pattern syntax, and collapse to "" (matching every row) if the neutral set
# were empty.
neutral_rows <- which(rownames(dds) %in% neutral_barcodes)
if (length(neutral_rows) == 0) {
  stop(
    "no neutral barcodes remain after read-count filtering; ",
    "cannot estimate size factors from the neutral set",
    call. = FALSE
  )
}

# estimate size factors, specifying neutral barcodes under controlGenes
dds <- estimateSizeFactors(dds, controlGenes = neutral_rows)
# estimate dispersion
dds <- estimateDispersions(dds)
# calculate logFC
dds <- nbinomWaldTest(dds)

  the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

  the design formula contains one or more numeric variables that have mean or
  standard deviation larger than 5 (an arbitrary threshold to trigger this message).
  it is generally a good idea to center and scale numeric variables in the design
  to improve GLM convergence.

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates



In [58]:
# results for the `gen` coefficient after re-fitting, summarised at an
# adjusted p-value threshold of 0.05
res <- results(dds, name = "gen", alpha = 0.05)
summary(res)

# show the table with the smallest p-values first
resOrdered <- res[order(res$pvalue), ]
resOrdered

# export the ordered table (tab-delimited, row names and header included);
# this writes to the same deseq_output_file as the earlier export, so that
# file is overwritten
fwrite(
  as.data.frame(resOrdered),
  file = deseq_output_file,
  sep = "\t",
  quote = FALSE,
  row.names = TRUE,
  col.names = TRUE
)


out of 3508 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 32, 0.91%
LFC < 0 (down)     : 100, 2.9%
outliers [1]       : 0, 0%
low counts [2]     : 0, 0%
(mean count < 25)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results



log2 fold change (MLE): gen 
Wald test p-value: gen 
DataFrame with 3508 rows and 6 columns
                   baseMean        log2FoldChange               lfcSE
                  <numeric>             <numeric>           <numeric>
Ladder_006 471.469787323985   -0.0858628174140032 0.00225927265592334
034_121    1535.92539390841    -0.117562701280057 0.00329803335028865
Ladder_009  295.00562261264   -0.0716867875935566 0.00248111007537022
020_120    526.145522702677    -0.110956255209262 0.00386344900360967
033_118    356.299286813241   -0.0495374413266234 0.00210024762701001
...                     ...                   ...                 ...
005_035     1634.7134306282   7.5532219972448e-06 0.00203309466334145
009_112    1613.61306808239 -1.27212389953005e-05  0.0034496245226822
011_002    222.754247060867 -8.02021226431887e-06 0.00267546961538225
015_053    173.481239405436  8.64967622427587e-06 0.00632455931906663
005_088    84.2998321478103 -5.78980910399331e-07 0.0048467438290937

In [57]:
# open the help page for fwrite
help(fwrite)

## To measure the effect of a treatment on fitness, add the treatment variable to the dds design and re-run DESeq
New design should include the <code>gen\*cond</code> and <code>rep\*cond</code> interaction terms.<br>
Ensure that the samples file contains the <code>cond</code> column indicating the treatment status of each sample.<br>
Pseudo-conditions can be created between competition replicates to test for possible variance between competition batches (e.g. rep123 vs rep456). logFC values calculated this way should be the same as when each condition's samples are run in DESeq2 separately.

In [None]:
# sample metadata that additionally carries the condition label
# NOTE(review): the file name refers to the ladder pilot rather than
# pool1_scm_aug2021 - confirm this is the intended samples file
samples_cond_file <- "./ladder_pilot_samples_pseudoConditions.txt"
# tab-delimited with a header row; the first column supplies the row names
samples_cond <- read.table(
  file = samples_cond_file,
  header = TRUE,
  sep = "\t",
  row.names = 1
)
# store the replicate label as a factor
samples_cond[["rep"]] <- factor(samples_cond[["rep"]])

In [None]:
# Re-run DESeq with a design that adds the condition term and its interactions
# with generation and replicate.
# NOTE(review): if `cond` is defined by grouping replicates (e.g. rep123 vs
# rep456) then `rep` is nested within `cond`, and the rep:cond term may make the
# model matrix rank-deficient - confirm how `rep` is coded in the samples file.
dds2 <- DESeqDataSetFromMatrix(
  countData = as.matrix(counts, rownames = 1),
  colData = samples_cond,
  design = ~ gen + rep + cond + gen:cond + rep:cond
)

# filter barcodes that do not meet minimum read cutoff
total_reads_cutoff <- 1000

keep <- rowSums(counts(dds2)) >= total_reads_cutoff
dds2 <- dds2[keep, ]

# Row indices of the neutral barcodes that survived filtering.
# Exact ID matching (%in%) is used rather than a regex pasted together with "|":
# a regex would match substrings of longer IDs, treat any metacharacter in an ID
# as pattern syntax, and collapse to "" (matching every row) if the neutral set
# were empty.
neutral_rows <- which(rownames(dds2) %in% neutral_barcodes)
if (length(neutral_rows) == 0) {
  stop(
    "no neutral barcodes remain after read-count filtering; ",
    "cannot estimate size factors from the neutral set",
    call. = FALSE
  )
}

# estimate size factors, specifying neutral oligos as control genes
dds2 <- estimateSizeFactors(dds2, controlGenes = neutral_rows)
# estimate dispersion
dds2 <- estimateDispersions(dds2)
# calculate logFC
dds2 <- nbinomWaldTest(dds2)

## To view condition-specific fitness, specify contrast when viewing results
If multiple conditions are present, fitness changes between non-reference conditions can be compared directly by specifying: <code>results(dds2, contrast=list("gen.condC", "gen.condB"))</code>

In [None]:
# list the coefficient names available in the fitted model
resultsNames(dds2)

# view results:
### fitness in cond A (control): the `gen` coefficient on its own
res2 <- results(dds2, name = "gen")
summary(res2)

### fitness in cond B (treatment): `gen` and `gen.condB` combined in one contrast
res3 <- results(dds2, contrast = list(c("gen", "gen.condB")))
summary(res3)

### changes in fitness between cond A and B: the `gen.condB` coefficient
res4 <- results(dds2, name = "gen.condB")
summary(res4)

# cond A fitness table, ordered by p-value (smallest first)
resOrdered2 <- res2[order(res2$pvalue), ]
resOrdered2

# # export results to file
# fwrite(as.data.frame(resOrdered2), file="deseq_fitness_condA.txt", sep="\t", quote=FALSE, row.names=TRUE, col.names=TRUE)