 # Table of Contents
<div class="toc" style="margin-top: 1em;"><ul class="toc-item" id="toc-level0"></ul></div>

In [1]:
# This script prepares data for the haematopoiesis analysis.
# It involves two batches of publicly available data.

##########################################
##########################################

# Download (unless a local copy already exists) and read the HTSeq count
# table of Nestorowa et al. 2016 (GEO accession GSE81682).
fname <- "GSE81682_HTSeq_counts.txt.gz"
if (!file.exists(fname)) {
    # The file is a gzip archive and must be transferred in binary mode.
    # The URL ends in the percent-encoded "%2Egz" rather than a literal ".gz",
    # so download.file() cannot infer binary mode from the extension on
    # Windows; request it explicitly.
    download.file("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE81682&format=file&file=GSE81682%5FHTSeq%5Fcounts%2Etxt%2Egz", fname, mode="wb")
}
# The first column supplies the row names; check.names=FALSE keeps the column
# names exactly as they appear in the file.
dataF <- read.table(fname, header=TRUE, row.names=1, check.names=FALSE)
dataF <- as.matrix(dataF)
dim(dataF)

In [18]:
# Preview the first rows of the raw count matrix.
head(dataF)

Unnamed: 0,other,LMPP,LMPP.1,MPP,MPP.1,MPP.2,LTHSC,MPP.3,LMPP.2,MPP.4,⋯,CMP,CMP.1,MEP,MEP.1,CMP.2,CMP.3,GMP,GMP.1,MEP.2,CMP.4
ENSMUSG00000000001,0,7,1,185,2,2,136,232,354,181,⋯,43,1652,182,500,516,137,267,317,85,676
ENSMUSG00000000003,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000028,4,1,2,4,3,1,1,2,2,0,⋯,5,4,50,401,293,5,596,649,102,457
ENSMUSG00000000031,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000037,0,0,0,0,0,0,0,0,1,20,⋯,3,1,0,0,0,0,0,0,1,134
ENSMUSG00000000049,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [2]:
# Fetch the cell type annotation table unless a local copy already exists.
fname <- "metaF.txt"
if (!file.exists(fname)) {
    download.file("http://blood.stemcells.cam.ac.uk/data/all_cell_types.txt", fname)
}
metaF <- read.table(fname, header=TRUE, check.names=FALSE, stringsAsFactors=FALSE)

# Line the annotation rows up with the columns of the count matrix.
metainds <- match(colnames(dataF), rownames(metaF))
# Cells of dataF that have no annotation row.
missing.meta <- is.na(metainds)
# Indexing with NA yields all-NA rows for those cells; this is intentional at
# this stage, so that metaF keeps one row per column of dataF.
metaF <- metaF[metainds,]

In [3]:
# Derive one cell type label per cell from the indicator columns of metaF.
# Cells flagged in no column keep the default label "other".
metatypeF <- rep("other", nrow(metaF))
# Columns are visited last-to-first, so when a cell is flagged in several
# columns the label of the left-most one is written last and wins.
for (col in rev(colnames(metaF))) {
    # which() discards the NA comparisons produced by cells lacking metadata.
    chosen <- which(metaF[[col]]==1)
    # The label is the column name with an optional digit and everything from
    # the underscore onwards removed.
    metatypeF[chosen] <- sub("[0-9]?_.*", "", col)
}
# "ESLAM" cells are reported under the label "HSPC".
metatypeF <- replace(metatypeF, metatypeF=="ESLAM", "HSPC")

In [4]:
# For cells without metadata, fall back to the sorting label, i.e. the part of
# the original column name before the first underscore.
metatypeF[missing.meta] <- sub("_.*", "", colnames(dataF)[missing.meta])

# Harmonize label spellings: "LT-HSC" becomes "LTHSC" and "Prog" becomes "other".
renamed <- c("LT-HSC"="LTHSC", "Prog"="other")
hit <- metatypeF %in% names(renamed)
metatypeF[hit] <- unname(renamed[metatypeF[hit]])

# From here on the columns of dataF are named by cell type.
colnames(dataF) <- metatypeF

In [5]:
# Size factor normalization of the first data set, using scran.
library(scran)
# Only genes whose average count (as computed by scater) exceeds 1 are used
# for clustering the cells and for estimating the size factors.
high.abF <- scater::calcAverage(dataF) > 1
clustF <- quickCluster(dataF, subset.row=high.abF, method="igraph")
sizeF <- computeSumFactors(dataF, subset.row=high.abF, cluster=clustF)
# Divide every column (cell) by its size factor.
dataF2 <- sweep(dataF, MARGIN=2, STATS=sizeF, FUN="/")

Loading required package: BiocParallel
Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, cbind, colMeans, colSums, colnames, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, lengths, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, rank, rbind, rowM

In [19]:
# Preview the first rows of the size-factor-normalized matrix.
head(dataF2)

Unnamed: 0,other,LMPP,LMPP.1,MPP,MPP.1,MPP.2,LTHSC,MPP.3,LMPP.2,MPP.4,⋯,CMP,CMP.1,MEP,MEP.1,CMP.2,CMP.3,GMP,GMP.1,MEP.2,CMP.4
ENSMUSG00000000001,0.0,70.77609,60.62664,344.66905,1.544963,3.802663,774.353951,94.3758999,637.94114,522.2063,⋯,58.42189,907.1998198,275.51728,243.1427,201.3878,70.785144,145.4816,239.2203,124.7205,234.27721
ENSMUSG00000000003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000028,230.4496,10.11087,121.25327,7.452304,2.317445,1.901332,5.693779,0.8135853,3.604187,0.0,⋯,6.793243,2.1966097,75.69156,195.0005,114.3539,2.583399,324.7454,489.7603,149.6646,158.37971
ENSMUSG00000000031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.802094,57.70235,⋯,4.075946,0.5491524,0.0,0.0,0.0,0.0,0.0,0.0,1.4673,46.43956
ENSMUSG00000000049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Cleaning up memory.
# Runs a garbage collection; the printed table reports memory usage.
gc() 

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,3839102,205.1,5684620,303.6,4816072,257.3
Vcells,136594542,1042.2,323757950,2470.1,270016852,2060.1


In [7]:
##########################################
##########################################

# Download (unless a local copy already exists) and read the UMI count table
# of Paul et al. 2015 (GEO accession GSE72857).
fname <- "umitab_Amit.txt.gz"
if (!file.exists(fname)) {
    # The file is a gzip archive and must be transferred in binary mode.
    # The URL ends in the percent-encoded "%2Egz" rather than a literal ".gz",
    # so download.file() cannot infer binary mode from the extension on
    # Windows; request it explicitly.
    download.file("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE72857&format=file&file=GSE72857%5Fumitab%2Etxt%2Egz", fname, mode="wb")
}
dataA <- read.table(fname, header=TRUE, row.names=1)
# The metadata file MAP.csv is not downloaded by this script and must already
# be present in the working directory.
# NOTE(review): read.csv2() keeps its default dec="," even though sep is
# overridden; confirm MAP.csv holds no decimal values.
metaA <- read.csv2("MAP.csv", sep=",", stringsAsFactors=FALSE, header=TRUE, row.names=1)
dim(dataA)

In [30]:
# Largest entry of the first data set's count matrix (presumably a sanity check).
max(dataF)

In [8]:
# Keep only the cells listed in the metadata, arranged in metadata row order,
# and convert the resulting table to a matrix.
metainds <- match(rownames(metaA), colnames(dataA))
dataA <- as.matrix(dataA[,metainds])

In [9]:
# Translate the first metadata column into cell type labels:
# values below 7 are "ERY", values above 6 and below 12 are "CMP", and values
# above 11 are "GMP". Later assignments overwrite earlier ones.
# NOTE(review): the first column is presumably an integer cluster id; confirm.
cluster.id <- metaA[,1]
metatypeA <- rep("", nrow(metaA))
metatypeA[which(cluster.id<7)] <- "ERY"
metatypeA[which(cluster.id>6 & cluster.id<12)] <- "CMP"
metatypeA[which(cluster.id>11)] <- "GMP"
# From here on the columns of dataA are named by cell type.
colnames(dataA) <- metatypeA

In [10]:
# Size factor normalization of the second data set, using the same steps as
# for the first one.
# Only genes whose average count (as computed by scater) exceeds 1 are used
# for clustering the cells and for estimating the size factors.
high.abA <- scater::calcAverage(dataA) > 1
clustA <- quickCluster(dataA, subset.row=high.abA, method="igraph")
sizeA <- computeSumFactors(dataA, subset.row=high.abA, cluster=clustA)
# Divide every column (cell) by its size factor.
dataA2 <- sweep(dataA, MARGIN=2, STATS=sizeA, FUN="/")

In [11]:
# Cleaning up memory.
# Runs a garbage collection; the printed table reports memory usage.
gc() 

##########################################
##########################################

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,3878161,207.2,5684620,303.6,5684620,303.6
Vcells,250845441,1913.8,537419139,4100.2,671755572,5125.1


In [12]:
# Download the list of highly variable genes identified by Nestorowa et al. 2016.
fname <- "coordinates_gene_counts_flow_cytometry.txt.gz"
if (!file.exists(fname)) {
    download.file("http://blood.stemcells.cam.ac.uk/data/coordinates_gene_counts_flow_cytometry.txt.gz", fname)
}
# Only the first line of the file is read; its fields are the candidate names.
TFs <- read.table(fname, nrows=1, stringsAsFactors=FALSE)
# Retain the fields that contain "ENSMUS" (Ensembl mouse identifiers).
features <- grep("ENSMUS", as.character(unlist(TFs)), value=TRUE)

In [13]:
# Query Ensembl through biomaRt for the MGI symbol of each selected gene id.
library(biomaRt)
mart <- useMart(biomart="ensembl", dataset="mmusculus_gene_ensembl", host="www.ensembl.org")
# 'out' holds the columns ensembl_gene_id and mgi_symbol, restricted to the
# Ensembl ids in 'features'.
out <- getBM(
    attributes=c("ensembl_gene_id", "mgi_symbol"),
    filters="ensembl_gene_id",
    values=features,
    mart=mart
)

In [14]:
# Locate every queried gene in both normalized matrices.
# The first data set is indexed by Ensembl id (exact match).
mF <- match(out[["ensembl_gene_id"]], rownames(dataF2))
# The second data set is indexed by gene symbol; partial matching is used
# because some of its row names are concatenated gene symbols.
mA <- pmatch(out[["mgi_symbol"]], rownames(dataA2))
# Keep a gene only if it is an HVG _and_ was found in both data sets.
keep <- !(is.na(mF) | is.na(mA))

In [15]:
# Restrict both normalized matrices to the shared features, in the same order.
# drop=FALSE keeps the results as matrices even when a single feature remains;
# otherwise they would collapse to vectors and the row name assignment below
# would fail.
dataA3 <- dataA2[mA[keep], , drop=FALSE]
dataF3 <- dataF2[mF[keep], , drop=FALSE]
# Use the row names of the first data set for both matrices.
rownames(dataA3) <- rownames(dataF3)

In [16]:
# Rescaling the first dataset to match the coverage of the second.
# The scaling factor is the median, across genes, of the ratio between the
# per-gene mean in the second data set and the per-gene mean in the first.
aveA <- rowMeans(dataA3)
aveF <- rowMeans(dataF3)
# A gene with zero mean in both data sets gives 0/0 = NaN; without na.rm=TRUE a
# single such gene would make the median NA and turn all of dataF3 into NA.
dataF3 <- dataF3 * median(aveA/aveF, na.rm=TRUE)

In [17]:
# Log-transform both matrices after adding a pseudo-count of 1, then store
# the two results together in one RData file.
logDataA3 <- log(dataA3 + 1)
logDataF3 <- log(dataF3 + 1)
save(list=c("logDataA3", "logDataF3"), file="logdataFandA_all.RData")

In [23]:
# Inspect the first rows and the dimensions of the log-transformed, rescaled
# first data set.
head(logDataF3)
dim(logDataF3)

Unnamed: 0,other,LMPP,LMPP.1,MPP,MPP.1,MPP.2,LTHSC,MPP.3,LMPP.2,MPP.4,⋯,CMP,CMP.1,MEP,MEP.1,CMP.2,CMP.3,GMP,GMP.1,MEP.2,CMP.4
ENSMUSG00000000171,0.35842577,0.04232184,0.17765409,0.152970186,0.0041200426,0.006078509,0.31127608,0.002605536,0.342431623,0.17496338,⋯,0.210110648,0.439775422,0.246811766,0.405838365,0.35219428,0.216196916,0.44337546,0.4716004,0.39093511,0.1248594407
ENSMUSG00000000290,0.0,0.01074956,0.0,0.001989468,0.0008253676,0.006078509,0.0,0.234919301,0.001924412,0.0,⋯,0.001451211,0.001759424,0.023982209,0.039243797,0.01284963,0.001103952,0.002326966,0.0,0.11401322,0.0007406125
ENSMUSG00000000594,0.0,0.35170863,0.2304894,0.639826507,0.0228566193,0.346844989,0.21297841,0.253650963,0.496750415,0.04522065,⋯,0.383648271,0.578323153,0.152764215,0.041240956,0.31300941,0.524282681,0.753058981,0.8417665,0.07112449,0.5166663503
ENSMUSG00000001082,0.220202,0.17775768,0.06279077,0.00595657,0.003297391,0.014126137,0.01209872,0.001737778,0.27890831,0.25932552,⋯,0.063305166,0.615800037,0.241742005,0.101723403,0.19564782,0.186393464,0.407602696,0.4476589,0.19721662,0.1417185034
ENSMUSG00000001380,0.35842577,0.05262824,0.0,0.278103768,0.0041200426,0.008096496,0.01809362,0.003472541,0.003845127,0.24976209,⋯,0.002900319,0.26548351,0.21730329,0.567637634,0.39945996,0.002206687,0.195151354,0.2549436,0.18295089,0.1271455535
ENSMUSG00000001750,0.05976036,0.05262824,0.06279077,0.313655325,0.0,0.004056442,0.01809362,0.0,0.001924412,0.43557557,⋯,0.276150267,0.107846057,0.003231054,0.001558162,0.10184778,0.156146959,0.237831198,0.3876372,0.39093511,0.0022201941


In [38]:
# Inspect the first rows of the log-transformed second data set, and the
# largest entry of its count matrix dataA.
head(logDataA3)
max(dataA)

Unnamed: 0,GMP,ERY,GMP.1,ERY.1,GMP.2,ERY.2,ERY.3,ERY.4,ERY.5,ERY.6,⋯,CMP,CMP.1,CMP.2,GMP.3,GMP.4,ERY.7,GMP.5,CMP.3,GMP.6,ERY.8
ENSMUSG00000000171,0.0,0.7133989,0.7626812,0.6200165,0.0,0.0,0.0,0.6782274,0.5081123,0.8107793,⋯,0.0,1.013343,0.0,0.0,0.0,0.8050633,0.5404358,0,1.4376265,1.2533303
ENSMUSG00000000290,0.6317197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,⋯,0.0,0.0,0.0,0.6718648,0.0,0.0,0.0,0,0.0,0.0
ENSMUSG00000000594,0.6317197,0.0,0.9991696,0.0,2.231913,0.5136644,0.0,0.0,0.5081123,0.0,⋯,1.125407,1.013343,1.250356,0.6718648,0.0,0.0,0.8893345,0,0.5892703,0.2721228
ENSMUSG00000001082,1.0158408,0.0,0.0,0.0,1.124741,0.0,0.0,0.6782274,0.5081123,0.4854034,⋯,0.0,1.013343,0.0,0.6718648,0.0,0.6013382,0.8893345,0,0.5892703,0.0
ENSMUSG00000001380,1.0158408,0.7133989,0.0,0.8278695,0.0,0.5136644,1.917527,0.0,0.5081123,0.4854034,⋯,0.0,0.0,0.0,0.0,0.0,0.9742266,0.5404358,0,0.0,0.6617828
ENSMUSG00000001750,0.0,0.0,0.7626812,0.0,1.124741,0.0,0.0,0.0,0.0,0.0,⋯,0.0,0.0,0.0,0.0,0.9982723,0.0,0.5404358,0,0.9575651,0.0
