 # Table of Contents
<div class="toc" style="margin-top: 1em;"><ul class="toc-item" id="toc-level0"></ul></div>

In [1]:
# Download and read the counts, metadata of Nestorowa et al. 2016
fname <- "GSE81682_HTSeq_counts.txt.gz"
if (!file.exists(fname)) { download.file("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE81682&format=file&file=GSE81682%5FHTSeq%5Fcounts%2Etxt%2Egz", fname) }
dataF <- read.table(fname, header=TRUE, row.names=1, check.names=FALSE)
dataF <- as.matrix(dataF)
dim(dataF)


In [2]:
fname <- "metaF.txt"
if (!file.exists(fname)) { download.file("http://blood.stemcells.cam.ac.uk/data/all_cell_types.txt", fname) }
metaF <- read.table(fname, stringsAsFactors = FALSE, header=TRUE, check.names=FALSE)
metainds <- match(colnames(dataF), rownames(metaF))
missing.meta <- is.na(metainds)
metaF <- metaF[metainds,] # This will contain NA's... which is okay, at this point, to preserve length.

In [3]:
metaF

Unnamed: 0,LTHSC_broad,LMPP_broad,MPP_broad,CMP_broad,MEP_broad,GMP_broad,MPP1_broad,MPP2_broad,MPP3_broad,STHSC_broad,⋯,CMP,MEP,GMP,MPP1,MPP2,MPP3,STHSC,ESLAM,HSC1,Projected
HSPC_007,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
HSPC_013,0,1,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
HSPC_019,0,1,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
HSPC_025,0,0,1,0,0,0,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
HSPC_031,0,0,1,0,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,0
HSPC_037,0,0,1,0,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,0
LT-HSC_001,1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
HSPC_001,0,0,1,0,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,0
HSPC_008,0,1,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
HSPC_014,0,0,1,0,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,0


In [4]:
#Defining the cell type based on the metadata.
metatypeF <- rep("other", nrow(metaF))
for (col in rev(colnames(metaF))) { # reverse, so earlier columns end up overwriting later ones.
    chosen <- metaF[,col]==1
    metatypeF[chosen] <- sub("[0-9]?_.*", "", col)
}
metatypeF[metatypeF=="ESLAM"] <- "HSPC"

In [5]:
dataF

Unnamed: 0,HSPC_007,HSPC_013,HSPC_019,HSPC_025,HSPC_031,HSPC_037,LT-HSC_001,HSPC_001,HSPC_008,HSPC_014,⋯,Prog_851,Prog_809,Prog_816,Prog_822,Prog_828,Prog_834,Prog_840,Prog_846,Prog_852,Prog_810
ENSMUSG00000000001,0,7,1,185,2,2,136,232,354,181,⋯,43,1652,182,500,516,137,267,317,85,676
ENSMUSG00000000003,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000028,4,1,2,4,3,1,1,2,2,0,⋯,5,4,50,401,293,5,596,649,102,457
ENSMUSG00000000031,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000037,0,0,0,0,0,0,0,0,1,20,⋯,3,1,0,0,0,0,0,0,1,134
ENSMUSG00000000049,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000056,1,1,2,1,2,3,0,0,0,0,⋯,33,715,169,502,304,477,390,3,4,320
ENSMUSG00000000058,1,0,0,0,0,0,0,0,242,1,⋯,0,1,0,2,1,0,0,1,0,349
ENSMUSG00000000078,0,18,3,9,51,3,46,56,1,364,⋯,15,0,1,39,32,5,3,8,8,105
ENSMUSG00000000085,2,1,2,0,0,153,0,0,0,59,⋯,1,0,25,26,182,182,178,167,1,1


In [6]:
# Filling in metadata from the cell sorting label, if metadata was missing.
metatypeF[missing.meta] <- sub("_.*", "", colnames(dataF)[missing.meta])
metatypeF[metatypeF=="LT-HSC"] <- "LTHSC"
metatypeF[metatypeF=="Prog"] <- "other"
# colnames(dataF)<-metatypeF

In [7]:
metatypeF

In [8]:
for(i in seq(1, length(metatypeF), by = 1))  { 
    if (metatypeF[i] == 'GMP' ){
        metatypeF[i] <- TRUE
    } else if (metatypeF[i] == 'CMP' ){
        metatypeF[i] <- TRUE
    } else if (metatypeF[i] == 'MEP' ){
        metatypeF[i] <- TRUE
    } else {
        metatypeF[i] <- FALSE
    }
}
metatypeF

In [9]:
keep <- metatypeF
keep

In [10]:
fname <- "metaF.txt"
if (!file.exists(fname)) { download.file("http://blood.stemcells.cam.ac.uk/data/all_cell_types.txt", fname) }
metaF <- read.table(fname, stringsAsFactors = FALSE, header=TRUE, check.names=FALSE)
metainds <- match(colnames(dataF), rownames(metaF))
missing.meta <- is.na(metainds)
metaF <- metaF[metainds,] # This will contain NA's... which is okay, at this point, to preserve length.

In [11]:
#Defining the cell type based on the metadata.
metatypeF1 <- rep("other", nrow(metaF))
for (col in rev(colnames(metaF))) { # reverse, so earlier columns end up overwriting later ones.
    chosen <- metaF[,col]==1
    metatypeF1[chosen] <- sub("[0-9]?_.*", "", col)
}
metatypeF1[metatypeF1=="ESLAM"] <- "HSPC"

In [12]:
metatypeF1

In [13]:
dim(dataF)

In [14]:
length(keep)

In [None]:
n = c(1, 3, 5) 
s = c("aa", "bb", "cc") 
z = c("a1","b1","c1")
df = data.frame(n, s, z)

In [None]:
df[,c(TRUE,FALSE,TRUE)]

In [None]:
c(TRUE,FALSE,TRUE)

In [17]:
metatypeF[1:1919]

In [18]:
cutdata <- dataF[,as.logical(metatypeF)]

In [19]:
head(cutdata)

Unnamed: 0,Prog_007,Prog_013,Prog_019,Prog_025,Prog_031,Prog_037,Prog_001,Prog_008,Prog_014,Prog_020,⋯,Prog_851,Prog_809,Prog_816,Prog_822,Prog_828,Prog_834,Prog_840,Prog_846,Prog_852,Prog_810
ENSMUSG00000000001,239,568,234,95,2,4,22,22,97,69,⋯,43,1652,182,500,516,137,267,317,85,676
ENSMUSG00000000003,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000028,72,3,20,48,1,0,2,2443,210,4,⋯,5,4,50,401,293,5,596,649,102,457
ENSMUSG00000000031,0,1,0,0,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000037,0,0,0,42,0,0,0,0,16,1,⋯,3,1,0,0,0,0,0,0,1,134
ENSMUSG00000000049,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [20]:
dim(cutdata)
tail(cutdata,100)

Unnamed: 0,Prog_007,Prog_013,Prog_019,Prog_025,Prog_031,Prog_037,Prog_001,Prog_008,Prog_014,Prog_020,⋯,Prog_851,Prog_809,Prog_816,Prog_822,Prog_828,Prog_834,Prog_840,Prog_846,Prog_852,Prog_810
ENSMUSG00000107390,6,0,1,1,1,1,4,1,4,2,⋯,2,4,1,0,2,2,0,1,1,2
ENSMUSG00000107391,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000107392,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ERCC-00002,90663,88218,43694,10960,87230,48575,79422,59417,21562,66961,⋯,17754,17515,7952,12581,21927,26493,16578,7940,4610,20510
ERCC-00003,9543,6659,3617,1243,7908,3079,8437,5564,2614,5442,⋯,1112,1467,344,1058,2005,2579,1973,834,421,1394
ERCC-00004,33820,28065,16780,6246,30743,15308,29145,19790,9287,25209,⋯,5997,6800,2255,5120,8778,10868,6949,3379,1779,8188
ERCC-00009,9854,14659,5722,1472,8016,9179,10197,10750,2416,8222,⋯,2400,1420,946,1580,3563,3144,1524,1741,953,2701
ERCC-00012,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ERCC-00013,0,0,0,0,0,0,0,0,0,0,⋯,0,1,0,0,0,105,81,0,0,0
ERCC-00014,0,0,0,0,0,0,0,0,0,0,⋯,0,0,1,0,0,0,0,0,40,0


In [21]:
processed <- cutdata[1:46078,]



In [22]:
tail(processed,100)
dim(processed)

Unnamed: 0,Prog_007,Prog_013,Prog_019,Prog_025,Prog_031,Prog_037,Prog_001,Prog_008,Prog_014,Prog_020,⋯,Prog_851,Prog_809,Prog_816,Prog_822,Prog_828,Prog_834,Prog_840,Prog_846,Prog_852,Prog_810
ENSMUSG00000107293,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000107294,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000107295,0,0,0,0,0,0,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000107296,0,0,1,0,0,0,0,0,0,0,⋯,0,1,0,0,0,0,0,0,0,0
ENSMUSG00000107297,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000107298,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000107299,1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,1,0,0
ENSMUSG00000107300,0,0,0,0,0,0,0,0,0,0,⋯,0,1,0,0,0,0,0,0,0,0
ENSMUSG00000107301,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000107302,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [23]:
as.logical(metatypeF[1:10])

In [24]:
c(rep(FALSE, 10))

In [199]:
write.table(processed, file = "dataset2_test.txt",row.names=TRUE,col.names=TRUE, sep="\t")

In [None]:
metatypeF[5:10] <- FALSE

In [None]:
metatypeF[1:10]

In [28]:
metatypeF <- metatypeF[1:1920]
length(metatypeF)
dataF[,metatypeF]

ERROR: Error in dataF[, metatypeF]: subscript out of bounds


In [29]:
# Cleaning up memory.
gc() 

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,514125,27.5,940480,50.3,940480,50.3
Vcells,83090136,634.0,133439906,1018.1,166793096,1272.6


In [30]:
# Download and read the counts and meta data of Paul et al. 2015
fname <- "umitab_Amit.txt.gz"
if (!file.exists(fname)) { download.file("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE72857&format=file&file=GSE72857%5Fumitab%2Etxt%2Egz", fname) }
dataA <- read.table(fname, header=TRUE, row.names=1)
metaA <- read.csv2("MAP.csv",sep=",",stringsAsFactors = FALSE, head=TRUE, row.names=1)
dim(dataA)

In [31]:
head(metaA)

Unnamed: 0,X7
W31106,15
W31107,3
W31108,15
W31109,3
W31110,15
W31111,4


In [32]:
# Only selecting cells that are in the metadata.
metainds <- match(rownames(metaA), colnames(dataA))
dataA <- dataA[,metainds]
dataA <- as.matrix(dataA)

In [33]:
# Organizing cell type labels.
metatypeA <- character(nrow(metaA))
metatypeA[metaA[,1]<7] <- "ERY"
metatypeA[metaA[,1]>6 & metaA[,1]<12] <- "CMP"
metatypeA[metaA[,1]>11] <- "GMP"
colnames(dataA) <- metatypeA

In [34]:
# Cleaning up memory.
gc() 

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,544215,29.1,940480,50.3,940480,50.3
Vcells,120414677,918.7,460069672,3510.1,575080728,4387.6


In [35]:
# Download list of highly variable genes identified by Nestrowa et al. 2016
fname <- "coordinates_gene_counts_flow_cytometry.txt.gz"
if (!file.exists(fname)) { download.file("http://blood.stemcells.cam.ac.uk/data/coordinates_gene_counts_flow_cytometry.txt.gz", fname) }
TFs <- read.table(fname, nrows=1, stringsAsFactors=FALSE)
features <- as.character(unlist(TFs))
features <- features[grep("ENSMUS", features)]

In [1]:
source("https://bioconductor.org/biocLite.R")
biocLite("biomaRt")
library("biomaRt")

Bioconductor version 3.6 (BiocInstaller 1.28.0), ?biocLite for help
A new version of Bioconductor is available after installing the most recent
  version of R; see http://bioconductor.org/install
BioC_mirror: https://bioconductor.org
Using Bioconductor 3.6 (BiocInstaller 1.28.0), R 3.4.3 (2017-11-30).
Installing package(s) ‘biomaRt’
also installing the dependency ‘XML’

“installation of package ‘biomaRt’ had non-zero exit status”Updating HTML index of packages in '.Library'
Making 'packages.html' ... done
Old packages: 'BH', 'bindr', 'bindrcpp', 'bit', 'blob', 'broom', 'callr',
  'car', 'caret', 'chron', 'cluster', 'config', 'curl', 'CVST', 'data.table',
  'dbplyr', 'ddalpha', 'digest', 'dplyr', 'DRR', 'forcats', 'foreign',
  'glmnet', 'haven', 'hexbin', 'highr', 'hms', 'htmlwidgets', 'httpuv',
  'IRdisplay', 'kernlab', 'knitr', 'lava', 'lme4', 'lubridate', 'maps', 'MASS',
  'Matrix', 'mgcv', 'miniUI', 'modelr', 'mongolite', 'munsell', 'nlme', 'odbc',
  'openssl', 'packrat', 'pbdZMQ', 

ERROR: Error in library("biomaRt"): there is no package called ‘biomaRt’


In [38]:
Sys.getenv("R_HOME")

In [None]:
# Pull down IDs from BioMaRt.

In [1]:
library(biomaRt)

ERROR: Error in library(biomaRt): there is no package called ‘biomaRt’


In [36]:
mart <- useMart("ensembl", dataset = "mmusculus_gene_ensembl", host="www.ensembl.org" )
out <- getBM(attributes = c("ensembl_gene_id", "mgi_symbol"), values = features, mart = mart,filters = "ensembl_gene_id")

ERROR: Error in library(biomaRt): there is no package called ‘biomaRt’


In [None]:
# Select features that are HVGs _and_ present in both data sets.
mF <- match(out$ensembl_gene_id, rownames(dataF))
mA <- pmatch(out$mgi_symbol, rownames(dataA)) # partial, due to use of concatenated gene symbols.
keep <- !is.na(mF) & !is.na(mA)

In [None]:
dataA3 <- dataA[mA[keep],]
dataF3 <- dataF[mF[keep],]
rownames(dataA3) <- rownames(dataF3)

In [None]:
head(dataA3)
ncol(dataA3)
head(dataF3)
ncol(dataF3)

In [None]:
write.table(dataA3, file = "dataset1.txt",row.names=TRUE,col.names=TRUE, sep="\t")

In [None]:
write.table(dataF3, file = "dataset2.txt",row.names=TRUE,col.names=TRUE, sep="\t")

In [17]:
rows_a3 <- rownames(dataA3)

In [18]:
length(unique(rows_a3))

In [19]:
col_a3 <- colnames(dataA3)

In [20]:
length(unique(col_a3))

In [21]:
length(rows_a3)

In [22]:
rows_f3 <- rownames(dataF3)

In [23]:
length(rows_f3)

In [24]:
length(unique(rows_f3))

In [25]:
test_a3 <- dataA3[0:3,0:3]

In [26]:
test_a3

Unnamed: 0,GMP,ERY,GMP.1
ENSMUSG00000000171,0,2,2
ENSMUSG00000000290,1,0,0
ENSMUSG00000000594,1,0,3


In [27]:
t(rowsum(t(test_a3), group = colnames(test_a3), na.rm = T))


Unnamed: 0,ERY,GMP
ENSMUSG00000000171,2,2
ENSMUSG00000000290,0,1
ENSMUSG00000000594,0,4


In [28]:
sum_a3 <- t(rowsum(t(dataA3), group = colnames(dataA3), na.rm = T))
head(sum_a3)

Unnamed: 0,CMP,ERY,GMP
ENSMUSG00000000171,204,2568,992
ENSMUSG00000000290,42,68,158
ENSMUSG00000000594,252,151,1256
ENSMUSG00000001082,130,586,930
ENSMUSG00000001380,75,2053,504
ENSMUSG00000001750,22,63,403


In [29]:
sum_f3 <- t(rowsum(t(dataF3), group = colnames(dataF3), na.rm = T))
head(sum_f3)

Unnamed: 0,CMP,GMP,LMPP,LTHSC,MEP,MPP,other
ENSMUSG00000000171,139537,51571,53923,82175,341859,73589,13808
ENSMUSG00000000290,29199,14210,17044,12929,23879,13488,1191
ENSMUSG00000000594,198541,98261,88370,101559,48808,140520,7650
ENSMUSG00000001082,138908,62798,42161,63321,130052,69625,6583
ENSMUSG00000001380,77986,30407,25547,67145,284592,43361,7423
ENSMUSG00000001750,71727,47157,38155,40180,23081,38933,3749


In [30]:
keeps <- c("CMP", "GMP")
test_f3 <- sum_f3[,keeps]

In [67]:
#write.table(sum_a3, file = "dataset1.txt",row.names=TRUE,col.names=TRUE, sep="\t")
# doesn't  work with seurat 

In [68]:
#write.table(test_f3, file = "dataset2.txt",row.names=TRUE,col.names=TRUE, sep="\t")