Updated all data sets to a named list with elements 'x' and 'y'. The …

…data objects match the first author's last name.
ramhiser · Mar 7, 2012 · 56c3d6f · 56c3d6f
1 parent 47fa37b
commit 56c3d6f
Show file tree

Hide file tree

Showing 27 changed files with 151 additions and 119 deletions.
diff --git a/TODO b/TODO
@@ -1,33 +1,14 @@
-Update the following data sets to the new scheme (see Chiaretti, 2004 for example)
-	Alon
-	Christensen
-	Gravier
-	Shipp
-	Singh
+* Package TODO
+** Update the following data sets to the new scheme (see Chiaretti, 2004 for example)
 	Cho et al. (1998) - Yeast Cell Cycle
 	Bhattacharjee et al. (2001) - Lung Cancer
 	Wen et al. (1998) - Rat CNS
 	Yeoh et al. (2002) - St. Jude Leukemia
+** Add data set descriptions to help.r
+** Add data set descriptions to github Wiki
 
-For 1000 splits, store the training_obs, test_obs, and the the gene ordering with Dudoit's var_sel.
-	Alon
-	Chiaretti
-	Christensen
-	Golub
-	Gravier
-	Khan
-	Shipp
-	Singh
-	Cho et al. (1998) - Yeast Cell Cycle
-	Bhattacharjee et al. (2001) - Lung Cancer
-	Wen et al. (1998) - Rat CNS
-	Yeoh et al. (2002) - St. Jude Leukemia
-Create a helper function that returns the splits along with the reduced dimension.
-	Takes dataset_name and q.
-Add unit test to make sure that q's are specified correctly for each data set.
-Also, add unit test to make sure helper function doesn't mess up with other names.
-	Example: NULL, "Awesome_data", "Asmodean"
 
-Store data sets with RGoogleStorage (or RAmazonS3) and download them to the end user.
+* Maybe/Someday
+** Store data sets with RGoogleStorage (or RAmazonS3) and download them to the end user.
 	How to download them only once?
 	More information here: http://www.omegahat.org/
diff --git a/alon-colon/1-download.r → alon-1999/1-download.r b/alon-colon/1-download.r → alon-1999/1-download.r
@@ -1,7 +1,6 @@
 # A number of cancer data sets are on Bioconductor (http://www.bioconductor.org)
 # First, we source Bioconductor's installation script, which provides biocLite().
 source("http://bioconductor.org/biocLite.R")
-biocLite()
 
 # Downloading the Alon Colon Cancer Data Set
-biocLite("colonCA")
+biocLite("colonCA")
diff --git a/alon-colon/2-clean.r → alon-1999/2-clean.r b/alon-colon/2-clean.r → alon-1999/2-clean.r
@@ -5,11 +5,8 @@ data('colonCA')
 # Bioconductor requires exprs() on the data sets.
 # We rename the columns of the data matrix because some of the microarray codes
 # exceed 256 characters in length, which causes errors in subsequent code.
-colon.x <- t(exprs(colonCA))
-colnames(colon.x) <- paste("X", seq_len(ncol(colon.x)), sep = "")
-colon.labels <- colonCA@phenoData$class
-
-colon.df <- data.frame(labels = colon.labels, colon.x)
-
-write.table(colon.df, "colon-cancer.csv", sep = ",", row = FALSE)
+x <- t(exprs(colonCA))
+colnames(x) <- paste("X", seq_len(ncol(x)), sep = "")
+y <- colonCA@phenoData$class
 
+alon <- list(x = x, y = factor(y))
diff --git a/alon-1999/3-save.r b/alon-1999/3-save.r
@@ -0,0 +1,5 @@
+# Write the Alon et al. (1999) data set to a compressed .RData file.
+# We pick 'xz' over the default 'gzip' format because it compresses
+# the data more; the trade-off is a slightly longer save time
+# (~2 seconds longer).
+save(list = "alon", file = "alon.RData", compress = "xz")
diff --git a/chiaretti-2004/2-clean.r b/chiaretti-2004/2-clean.r
@@ -31,4 +31,4 @@ if(two_classes) {
   y <- y[idx]
 }
 
-chiaretti <- list(x = x, y = factor(y))
+chiaretti <- list(x = x, y = factor(y))
diff --git a/chiaretti-2004/3-save.r b/chiaretti-2004/3-save.r
@@ -2,4 +2,4 @@
 # The 'xz' compression format will compress the data more than the
 # default 'gzip' format. However, the 'xz' takes slightly longer
 # (~2 seconds longer) than 'gzip'.
-save(chiaretti, file = "chiaretti.RData", compress = "xz")
+save(chiaretti, file = "chiaretti.RData", compress = "xz")
diff --git a/christensen-methylation/1-download.r → christensen-2009/1-download.r b/christensen-methylation/1-download.r → christensen-2009/1-download.r
@@ -7,4 +7,4 @@
 
 download.file(url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-19434/E-GEOD-19434.processed.1.zip", destfile = "christensen.zip")
 unzip("christensen.zip", exdir = "christensen")
-download.file(url = "http://www.ebi.ac.uk/arrayexpress/files/E-GEOD-19434/E-GEOD-19434.sdrf.txt", destfile = "additional_info.txt")
+download.file(url = "http://www.ebi.ac.uk/arrayexpress/files/E-GEOD-19434/E-GEOD-19434.sdrf.txt", destfile = "additional_info.txt")
diff --git a/christensen-2009/2-clean.r b/christensen-2009/2-clean.r
@@ -0,0 +1,38 @@
+# Methylation Data Set from Christensen et al. (2009)
+library('plyr')
+
+# Meta data: column 1 becomes the subject ID and column 8 the class label.
+temp <- read.table("additional_info.txt", header = TRUE, sep ="\t", stringsAsFactors = FALSE, comment.char = "")
+temp <- temp[,c(1,8)]
+names(temp) <- c("subject_id", "labels")
+
+# Keeps only the text before the first space, which drops the
+# extraneous " 1" from each subject's ID.
+temp$subject_id <- apply(temp, 1, function(meta_row) {
+  strsplit(meta_row[1], " ")[[1]][1]
+})
+
+# The paper considers the three groups: "blood", "placenta", and "other"
+temp[which(temp$labels == "guthrie blood"),]$labels <- "blood"
+temp[temp$labels != "blood" & temp$labels != "placenta",]$labels <- "other"
+
+# Each file in 'christensen/' holds one subject; the subject ID is the part
+# of the file name before the first underscore. Every file becomes one row:
+# the subject's label followed by the values in the file's second column,
+# named by the file's first column.
+subjects_files <- list.files("christensen")
+christensen <- ldply(subjects_files, function(file_name) {
+  id <- strsplit(file_name, "_")[[1]][1]
+  raw <- read.table(file.path("christensen", file_name), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
+  row_df <- rbind.data.frame(raw[,2])
+  row_df <- cbind.data.frame(temp[which(temp[,1] == id), 2], row_df)
+  names(row_df) <- c("labels", raw[,1])
+  row_df
+}, .progress = "text")
+
+# Split the combined data frame into the features 'x' and the labels 'y'.
+christensen <- list(
+  x = subset(christensen, select=-labels),
+  y = factor(christensen$labels)
+)
+
+# Removes the downloaded, compressed ZIP file along with the meta data.
+file.remove("additional_info.txt")
+file.remove("christensen.zip")
+
+# Removes the folder 'christensen' that contained the decompressed data
+unlink("christensen/", recursive = TRUE)
+
diff --git a/christensen-2009/3-save.r b/christensen-2009/3-save.r
@@ -0,0 +1,5 @@
+# Write the Christensen et al. (2009) data set to a compressed .RData file.
+# We pick 'xz' over the default 'gzip' format because it compresses
+# the data more; the trade-off is a slightly longer save time
+# (~2 seconds longer).
+save(list = "christensen", file = "christensen.RData", compress = "xz")
diff --git a/christensen-methylation/2-clean.r b/christensen-methylation/2-clean.r
diff --git a/christensen-methylation/3-load.r b/christensen-methylation/3-load.r
diff --git a/golub-1999/2-clean.r b/golub-1999/2-clean.r
@@ -7,9 +7,6 @@ library('golubEsets')
 # By default, we only consider the original two classes (i.e. ALL or AML)
 two_classes <- TRUE
 
-# TODO:
-# Rename file to 2-clean.r
-
 # The training data set.
 data('Golub_Train')
 x <- t(exprs(Golub_Train))

diff --git a/gravier-breast/1-download.r → gravier-2010/1-download.r b/gravier-breast/1-download.r → gravier-2010/1-download.r
diff --git a/gravier-2010/2-clean.r b/gravier-2010/2-clean.r
@@ -0,0 +1,45 @@
+# Breast Cancer Data Set from Gravier et al. (2010)
+library('plyr')
+
+# Removes leading and trailing white space from each string. The code is from:
+# http://stackoverflow.com/questions/2261079/whitespace-in-r
+trim <- function(x) gsub("^\\s+|\\s+$", "", x)
+
+# Meta data: column 1 becomes the subject ID and column 5 the class label.
+# Subjects whose label is blank are dropped.
+temp <- read.table("additional_info.txt", header = TRUE, sep ="\t", stringsAsFactors = FALSE, comment.char = "")
+temp <- temp[,c(1,5)]
+names(temp) <- c("subject_id", "labels")
+temp <- temp[which(trim(temp$labels) != ""), ]
+
+# Removes the extraneous " 1" from each subject's ID by keeping only the
+# text before the first space.
+temp$subject_id <- vapply(
+  strsplit(as.character(temp$subject_id), " "),
+  function(parts) parts[1],
+  character(1)
+)
+
+# From the paper's abstract:
+# The authors used Comparative Genomic Hybridization (CGH) array to analyze 168 pT1T2pN0 invasive ductal carcinoma patients
+# with either good (no event 5 years after diagnosis: 111 patients) or poor (57 patients with early onset metastasis) outcome.
+#
+# NOTE: There are only 106 patients marked with "No Event" and there are 62 that had an event. We are off by 5.
+# NOTE(review): the comparison below is exact (case- and white-space-sensitive),
+# so any label that is not literally "No event" is coded as "poor" -- confirm
+# against the raw labels.
+temp[,2] <- ifelse(temp[,2] == "No event", "good", "poor")
+
+# Each file in 'gravier/' holds one subject; the subject ID is the part of
+# the file name before the first underscore. Every file becomes one row:
+# the subject's label followed by the values in the file's second column,
+# named by the file's first column.
+subjects_files <- dir("gravier")
+gravier <- ldply(subjects_files, function(subject_file) {
+  subject_id <- unlist(strsplit(subject_file, "_"))[1]
+  subject_data <- read.table(paste("gravier/", subject_file, sep = ""), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
+  subject_df <- rbind.data.frame(subject_data[,2])
+  subject_df <- cbind.data.frame(temp[which(temp[,1] == subject_id), 2], subject_df)
+  names(subject_df) <- c("labels", subject_data[,1])
+  subject_df
+}, .progress = "text")
+
+# Split the combined data frame into the features 'x' and the labels 'y'.
+gravier <- list(
+  x = subset(gravier, select=-labels),
+  y = factor(gravier$labels)
+)
+
+# Removes the downloaded, compressed ZIP file along with the meta data.
+file.remove("additional_info.txt")
+file.remove("gravier.zip")
+
+# Removes the folder 'gravier' that contained the decompressed data
+unlink("gravier/", recursive = TRUE)
+
diff --git a/gravier-2010/3-save.r b/gravier-2010/3-save.r
@@ -0,0 +1,5 @@
+# Write the Gravier et al. (2010) data set to a compressed .RData file.
+# We pick 'xz' over the default 'gzip' format because it compresses
+# the data more; the trade-off is a slightly longer save time
+# (~2 seconds longer).
+save(list = "gravier", file = "gravier.RData", compress = "xz")
diff --git a/gravier-breast/2-clean.r b/gravier-breast/2-clean.r
diff --git a/gravier-breast/3-load.r b/gravier-breast/3-load.r
diff --git a/shipp-DLBCL/1-download.r → shipp-2002/1-download.r b/shipp-DLBCL/1-download.r → shipp-2002/1-download.r
@@ -13,5 +13,5 @@
 # patients (labelled as 'cured') while 26 of them are from patients with fatal or refractory disease (labelled as 'fatal').
 # The expression profile contains 6817 genes.
 
-download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/DLBCL/DLBCL-Harvard.zip", destfile = "DLBCL.zip")
-unzip("DLBCL.zip")
+download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/DLBCL/DLBCL-Harvard.zip", destfile = "shipp.zip")
+unzip("shipp.zip")
diff --git a/shipp-2002/2-clean.r b/shipp-2002/2-clean.r
@@ -0,0 +1,11 @@
+# Diffuse Large B-cell Lymphoma (DLBCL) Data Set from Shipp et al. (2002)
+# The last column of the file is used as the class label 'y'; the
+# remaining columns form 'x'.
+temp <- read.csv("DLBCLTumor.data", header = FALSE)
+label_col <- ncol(temp)
+
+# factor() keeps 'y' a factor, as in the other data sets, even when
+# read.csv() returns the labels as character strings.
+shipp <- list(
+  x = temp[, -label_col],
+  y = factor(temp[, label_col])
+)
+
+# Removes downloaded files
+unlink("shipp.zip")
+unlink("DLBCL*")
diff --git a/shipp-2002/3-save.r b/shipp-2002/3-save.r
@@ -0,0 +1,5 @@
+# Write the Shipp et al. (2002) data set to a compressed .RData file.
+# We pick 'xz' over the default 'gzip' format because it compresses
+# the data more; the trade-off is a slightly longer save time
+# (~2 seconds longer).
+save(list = "shipp", file = "shipp.RData", compress = "xz")
diff --git a/shipp-DLBCL/2-clean.r b/shipp-DLBCL/2-clean.r
diff --git a/shipp-DLBCL/3-load.r b/shipp-DLBCL/3-load.r
diff --git a/singh-prostate/1-download.r → singh-2002/1-download.r b/singh-prostate/1-download.r → singh-2002/1-download.r
@@ -19,5 +19,5 @@
 # (B) Prediction of clinical outcome: in this data set, 21 patients were evaluable with respect to recurrence following surgery
 # with 8 patients having relapsed and 13 patients having remained relapse free ("non-relapse") for at least 4 years.
 
-download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/ProstateCancer/ProstateCancer.zip", destfile = "ProstateCancer.zip")
-unzip("ProstateCancer.zip")
+download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/ProstateCancer/ProstateCancer.zip", destfile = "singh.zip")
+unzip("singh.zip")
diff --git a/singh-2002/2-clean.r b/singh-2002/2-clean.r
@@ -0,0 +1,14 @@
+# Prostate Cancer Data Set from Singh et al. (2002)
+# The last column of the file is used as the class label 'y'; the
+# remaining columns form 'x'.
+temp <- read.csv("prostate/prostate_TumorVSNormal_train.data", header = FALSE)
+label_col <- ncol(temp)
+
+# factor() keeps 'y' a factor, as in the other data sets, even when
+# read.csv() returns the labels as character strings.
+singh <- list(
+  x = temp[, -label_col],
+  y = factor(temp[, label_col])
+)
+
+# Removes downloaded files
+unlink("singh.zip")
+unlink("prostate/", recursive = TRUE)
+unlink("prostate*")
+
diff --git a/singh-2002/3-save.r b/singh-2002/3-save.r
@@ -0,0 +1,5 @@
+# Write the Singh et al. (2002) data set to a compressed .RData file.
+# We pick 'xz' over the default 'gzip' format because it compresses
+# the data more; the trade-off is a slightly longer save time
+# (~2 seconds longer).
+save(list = "singh", file = "singh.RData", compress = "xz")
diff --git a/singh-prostate/2-clean.r b/singh-prostate/2-clean.r
diff --git a/singh-prostate/3-load.r b/singh-prostate/3-load.r