From 56c3d6fa83850e11e94e876f2441ea036bc47067 Mon Sep 17 00:00:00 2001
From: John Ramey <johnramey@gmail.com>
Date: Tue, 6 Mar 2012 21:36:57 -0800
Subject: [PATCH] Updated all data sets to a named list with elements 'x' and
 'y'. The data objects match the first author's last name.

---
 TODO                                          | 31 +++----------
 {alon-colon => alon-1999}/1-download.r        |  3 +-
 {alon-colon => alon-1999}/2-clean.r           | 11 ++---
 alon-1999/3-save.r                            |  5 +++
 chiaretti-2004/2-clean.r                      |  2 +-
 chiaretti-2004/3-save.r                       |  2 +-
 .../1-download.r                              |  2 +-
 christensen-2009/2-clean.r                    | 38 ++++++++++++++++
 christensen-2009/3-save.r                     |  5 +++
 christensen-methylation/2-clean.r             | 27 -----------
 christensen-methylation/3-load.r              |  1 -
 golub-1999/2-clean.r                          |  3 --
 {gravier-breast => gravier-2010}/1-download.r |  0
 gravier-2010/2-clean.r                        | 45 +++++++++++++++++++
 gravier-2010/3-save.r                         |  5 +++
 gravier-breast/2-clean.r                      | 34 --------------
 gravier-breast/3-load.r                       |  1 -
 {shipp-DLBCL => shipp-2002}/1-download.r      |  4 +-
 shipp-2002/2-clean.r                          | 11 +++++
 shipp-2002/3-save.r                           |  5 +++
 shipp-DLBCL/2-clean.r                         |  5 ---
 shipp-DLBCL/3-load.r                          |  1 -
 {singh-prostate => singh-2002}/1-download.r   |  4 +-
 singh-2002/2-clean.r                          | 14 ++++++
 singh-2002/3-save.r                           |  5 +++
 singh-prostate/2-clean.r                      |  5 ---
 singh-prostate/3-load.r                       |  1 -
 27 files changed, 151 insertions(+), 119 deletions(-)
 rename {alon-colon => alon-1999}/1-download.r (88%)
 rename {alon-colon => alon-1999}/2-clean.r (52%)
 create mode 100644 alon-1999/3-save.r
 rename {christensen-methylation => christensen-2009}/1-download.r (99%)
 create mode 100644 christensen-2009/2-clean.r
 create mode 100644 christensen-2009/3-save.r
 delete mode 100644 christensen-methylation/2-clean.r
 delete mode 100644 christensen-methylation/3-load.r
 rename {gravier-breast => gravier-2010}/1-download.r (100%)
 create mode 100644 gravier-2010/2-clean.r
 create mode 100644 gravier-2010/3-save.r
 delete mode 100644 gravier-breast/2-clean.r
 delete mode 100644 gravier-breast/3-load.r
 rename {shipp-DLBCL => shipp-2002}/1-download.r (93%)
 create mode 100644 shipp-2002/2-clean.r
 create mode 100644 shipp-2002/3-save.r
 delete mode 100644 shipp-DLBCL/2-clean.r
 delete mode 100644 shipp-DLBCL/3-load.r
 rename {singh-prostate => singh-2002}/1-download.r (93%)
 create mode 100644 singh-2002/2-clean.r
 create mode 100644 singh-2002/3-save.r
 delete mode 100644 singh-prostate/2-clean.r
 delete mode 100644 singh-prostate/3-load.r

diff --git a/TODO b/TODO
index c537e52..d1b9da1 100644
--- a/TODO
+++ b/TODO
@@ -1,33 +1,14 @@
-Update the following data sets to the new scheme (see Chiaretti, 2004 for example)
-	Alon
-	Christensen
-	Gravier
-	Shipp
-	Singh
+* Package TODO
+** Update the following data sets to the new scheme (see Chiaretti, 2004 for example)
 	Cho et al. (1998) - Yeast Cell Cycle
 	Bhattacharjee et al. (2001) - Lung Cancer
 	Wen et al. (1998) - Rat CNS
 	Yeoh et al. (2002) - St. Jude Leukemia
+** Add data set descriptions to help.r
+** Add data set descriptions to github Wiki
 
-For 1000 splits, store the training_obs, test_obs, and the the gene ordering with Dudoit's var_sel.
-	Alon
-	Chiaretti
-	Christensen
-	Golub
-	Gravier
-	Khan
-	Shipp
-	Singh
-	Cho et al. (1998) - Yeast Cell Cycle
-	Bhattacharjee et al. (2001) - Lung Cancer
-	Wen et al. (1998) - Rat CNS
-	Yeoh et al. (2002) - St. Jude Leukemia
-Create a helper function that returns the splits along with the reduced dimension.
-	Takes dataset_name and q.
-Add unit test to make sure that q's are specified correctly for each data set.
-Also, add unit test to make sure helper function doesn't mess up with other names.
-	Example: NULL, "Awesome_data", "Asmodean"
 
-Store data sets with RGoogleStorage (or RAmazonS3) and download them to the end user.
+* Maybe/Someday
+** Store data sets with RGoogleStorage (or RAmazonS3) and download them to the end user.
 	How to download them only once?
 	More information here: http://www.omegahat.org/
diff --git a/alon-colon/1-download.r b/alon-1999/1-download.r
similarity index 88%
rename from alon-colon/1-download.r
rename to alon-1999/1-download.r
index 7d28ff8..1bbc7ee 100644
--- a/alon-colon/1-download.r
+++ b/alon-1999/1-download.r
@@ -1,7 +1,6 @@
 # A number of cancer data sets are on Bioconductor (http://www.bioconductor.org)
 # First, we install ALL of Bioconductor's R packages.
 source("http://bioconductor.org/biocLite.R")
-biocLite()
 
 # Downloading the Alon Colon Cancer Data Set
-biocLite("colonCA")
\ No newline at end of file
+biocLite("colonCA")
diff --git a/alon-colon/2-clean.r b/alon-1999/2-clean.r
similarity index 52%
rename from alon-colon/2-clean.r
rename to alon-1999/2-clean.r
index a6bb291..9f4fae6 100644
--- a/alon-colon/2-clean.r
+++ b/alon-1999/2-clean.r
@@ -5,11 +5,8 @@ data('colonCA')
 # Bioconductor requires exprs() on the data sets.
 # We rename the columns of the data matrix because some of the microarray codes
 # exceed 256 characters in length, which causes errors in subsequent code.
-colon.x <- t(exprs(colonCA))
-colnames(colon.x) <- paste("X", seq_len(ncol(colon.x)), sep = "")
-colon.labels <- colonCA@phenoData$class
-
-colon.df <- data.frame(labels = colon.labels, colon.x)
-
-write.table(colon.df, "colon-cancer.csv", sep = ",", row = FALSE)
+x <- t(exprs(colonCA))
+colnames(x) <- paste("X", seq_len(ncol(x)), sep = "")
+y <- colonCA@phenoData$class
 
+alon <- list(x = x, y = factor(y))
diff --git a/alon-1999/3-save.r b/alon-1999/3-save.r
new file mode 100644
index 0000000..56a7201
--- /dev/null
+++ b/alon-1999/3-save.r
@@ -0,0 +1,5 @@
+# Save a compressed version of the Alon et al. (1999) data set.
+# The 'xz' compression format will compress the data more than the
+# default 'gzip' format. However, the 'xz' takes slightly longer
+# (~2 seconds longer) than 'gzip'.
+save(alon, file = "alon.RData", compress = "xz")
diff --git a/chiaretti-2004/2-clean.r b/chiaretti-2004/2-clean.r
index e92c0ab..ddc4158 100644
--- a/chiaretti-2004/2-clean.r
+++ b/chiaretti-2004/2-clean.r
@@ -31,4 +31,4 @@ if(two_classes) {
   y <- y[idx]
 }
 
-chiaretti <- list(x = x, y = factor(y))
\ No newline at end of file
+chiaretti <- list(x = x, y = factor(y))
diff --git a/chiaretti-2004/3-save.r b/chiaretti-2004/3-save.r
index effe9b0..dbe8eab 100644
--- a/chiaretti-2004/3-save.r
+++ b/chiaretti-2004/3-save.r
@@ -2,4 +2,4 @@
 # The 'xz' compression format will compress the data more than the
 # default 'gzip' format. However, the 'xz' takes slightly longer
 # (~2 seconds longer) than 'gzip'.
-save(chiaretti, file = "chiaretti.RData", compress = "xz")
\ No newline at end of file
+save(chiaretti, file = "chiaretti.RData", compress = "xz")
diff --git a/christensen-methylation/1-download.r b/christensen-2009/1-download.r
similarity index 99%
rename from christensen-methylation/1-download.r
rename to christensen-2009/1-download.r
index 0d274fe..2989582 100644
--- a/christensen-methylation/1-download.r
+++ b/christensen-2009/1-download.r
@@ -7,4 +7,4 @@
 
 download.file(url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-19434/E-GEOD-19434.processed.1.zip", destfile = "christensen.zip")
 unzip("christensen.zip", exdir = "christensen")
-download.file(url = "http://www.ebi.ac.uk/arrayexpress/files/E-GEOD-19434/E-GEOD-19434.sdrf.txt", destfile = "additional_info.txt")
\ No newline at end of file
+download.file(url = "http://www.ebi.ac.uk/arrayexpress/files/E-GEOD-19434/E-GEOD-19434.sdrf.txt", destfile = "additional_info.txt")
diff --git a/christensen-2009/2-clean.r b/christensen-2009/2-clean.r
new file mode 100644
index 0000000..666c1bb
--- /dev/null
+++ b/christensen-2009/2-clean.r
@@ -0,0 +1,38 @@
+# Methylation Data Set from Christensen et al. (2009)
+library('plyr')
+
+temp <- read.table("additional_info.txt", header = TRUE, sep ="\t", stringsAsFactors = FALSE, comment.char = "")
+temp <- temp[,c(1,8)]
+names(temp) <- c("subject_id", "labels")
+
+# Removes the extraneous " 1" from each subject's ID.
+temp$subject_id <- apply(temp, 1, function(subject) {
+	unlist(strsplit(subject[1], " "))[1]
+})
+
+# The paper considers the three groups: "blood", "placenta", and "other"
+temp[which(temp$labels == "guthrie blood"),]$labels <- "blood"
+temp[temp$labels != "blood" & temp$labels != "placenta",]$labels <- "other"
+
+subjects_files <- dir("christensen")
+christensen <- ldply(subjects_files, function(subject_file) {
+	subject_id <- unlist(strsplit(subject_file, "_"))[1]
+	subject_data <- read.table(paste("christensen/", subject_file, sep = ""), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
+	subject_df <- rbind.data.frame(subject_data[,2])
+	subject_df <- cbind.data.frame(temp[which(temp[,1] == subject_id), 2], subject_df)
+	names(subject_df) <- c("labels", subject_data[,1])
+	subject_df
+}, .progress = "text")
+christensen <- list(
+                    x = subset(christensen, select=-labels),
+                    y = factor(christensen$labels)
+                   )
+
+# Removes the downloaded, compressed ZIP file along with the meta data.
+file.remove("additional_info.txt")
+file.remove("christensen.zip")
+
+# Removes the folder 'christensen' that contained the decompressed data
+unlink("christensen/", recursive = TRUE)
+
+
diff --git a/christensen-2009/3-save.r b/christensen-2009/3-save.r
new file mode 100644
index 0000000..5d1c9cc
--- /dev/null
+++ b/christensen-2009/3-save.r
@@ -0,0 +1,5 @@
+# Save a compressed version of the Christensen et al. (2009) data set.
+# The 'xz' compression format will compress the data more than the
+# default 'gzip' format. However, the 'xz' takes slightly longer
+# (~2 seconds longer) than 'gzip'.
+save(christensen, file = "christensen.RData", compress = "xz")
diff --git a/christensen-methylation/2-clean.r b/christensen-methylation/2-clean.r
deleted file mode 100644
index d7155a8..0000000
--- a/christensen-methylation/2-clean.r
+++ /dev/null
@@ -1,27 +0,0 @@
-# Methylation Data Set from Christensen et al. (2009)
-library(plyr)
-temp <- read.table("additional_info.txt", header = TRUE, sep ="\t", stringsAsFactors = FALSE)
-temp <- temp[,c(1,4)]
-names(temp) <- c("subject.id", "labels")
-
-# Removes the extraneous " 1" from each subject's ID.
-temp$subject.id <- apply(temp, 1, function(subject) {
-	unlist(strsplit(subject[1], " "))[1]
-})
-
-# The paper considers the three groups: "blood", "placenta", and "other"
-temp[which(temp$labels == "guthrie blood"),]$labels <- "blood"
-temp[temp$labels != "blood" & temp$labels != "placenta",]$labels <- "other"
-
-subjects.files <- dir("christensen")
-christensen.df <- ldply(subjects.files, function(subject.file) {
-	subject.id <- unlist(strsplit(subject.file, "_"))[1]
-	subject.data <- read.table(paste("christensen/", subject.file, sep = ""), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
-	subject.df <- rbind.data.frame(subject.data[,2])
-	subject.df <- cbind.data.frame(temp[which(temp[,1] == subject.id), 2], subject.df)
-	names(subject.df) <- c("labels", subject.data[,1])
-	subject.df
-}, .progress = "text")
-christensen.df$labels <- factor(christensen.df$labels)
-
-write.csv(christensen.df, bzfile("christensen.csv.bz2"), row.names = FALSE)
\ No newline at end of file
diff --git a/christensen-methylation/3-load.r b/christensen-methylation/3-load.r
deleted file mode 100644
index 6b63d0d..0000000
--- a/christensen-methylation/3-load.r
+++ /dev/null
@@ -1 +0,0 @@
-christensen.df <- read.csv(bzfile("christensen.csv.bz2", "r"))
\ No newline at end of file
diff --git a/golub-1999/2-clean.r b/golub-1999/2-clean.r
index 0aac1bd..5420d1f 100644
--- a/golub-1999/2-clean.r
+++ b/golub-1999/2-clean.r
@@ -7,9 +7,6 @@ library('golubEsets')
 # By default, we only consider the original two classes (i.e. ALL or AML)
 two_classes <- TRUE
 
-# TODO:
-# Rename file to 2-clean.r
-
 # The training data set.
 data('Golub_Train')
 x <- t(exprs(Golub_Train))
diff --git a/gravier-breast/1-download.r b/gravier-2010/1-download.r
similarity index 100%
rename from gravier-breast/1-download.r
rename to gravier-2010/1-download.r
diff --git a/gravier-2010/2-clean.r b/gravier-2010/2-clean.r
new file mode 100644
index 0000000..1fb5c41
--- /dev/null
+++ b/gravier-2010/2-clean.r
@@ -0,0 +1,45 @@
+# Leading and trailing white space is stripped from the labels with a regular
+# expression so that this script does not rely on an external 'trim' helper:
+# http://stackoverflow.com/questions/2261079/whitespace-in-r
+
+# Breast Cancer Data Set from Gravier et al. (2010)
+library('plyr')
+temp <- read.table("additional_info.txt", header = TRUE, sep ="\t", stringsAsFactors = FALSE, comment.char = "")
+temp <- temp[,c(1,5)]
+names(temp) <- c("subject_id", "labels")
+temp <- temp[which(gsub("^\\s+|\\s+$", "", temp$labels) != ""), ]
+
+# Removes the extraneous " 1" from each subject's ID.
+temp$subject_id <- apply(temp, 1, function(subject) {
+	unlist(strsplit(subject[1], " "))[1]
+})
+
+# From the paper's abstract:
+# The authors used Comparative Genomic Hybridization (CGH) array to analyze 168 pT1T2pN0 invasive ductal carcinoma patients
+# with either good (no event 5 years after diagnosis: 111 patients) or poor (57 patients with early onset metastasis) outcome.
+#
+# NOTE: There are only 106 patients marked with "No Event" and there are 62 that had an event. We are off by 5.
+temp[,2] <- ifelse(temp[,2] == "No event", "good", "poor")
+
+subjects_files <- dir("gravier")
+gravier <- ldply(subjects_files, function(subject_file) {
+	subject_id <- unlist(strsplit(subject_file, "_"))[1]
+	subject_data <- read.table(paste("gravier/", subject_file, sep = ""), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
+	subject_df <- rbind.data.frame(subject_data[,2])
+	subject_df <- cbind.data.frame(temp[which(temp[,1] == subject_id), 2], subject_df)
+	names(subject_df) <- c("labels", subject_data[,1])
+	subject_df
+}, .progress = "text")
+
+gravier <- list(
+                x = subset(gravier, select=-labels),
+                y = factor(gravier$labels)
+               )
+
+# Removes the downloaded, compressed ZIP file along with the meta data.
+file.remove("additional_info.txt")
+file.remove("gravier.zip")
+
+# Removes the folder 'gravier' that contained the decompressed data
+unlink("gravier/", recursive = TRUE)
+
diff --git a/gravier-2010/3-save.r b/gravier-2010/3-save.r
new file mode 100644
index 0000000..16932bf
--- /dev/null
+++ b/gravier-2010/3-save.r
@@ -0,0 +1,5 @@
+# Save a compressed version of the Gravier et al. (2010) data set.
+# The 'xz' compression format will compress the data more than the
+# default 'gzip' format. However, the 'xz' takes slightly longer
+# (~2 seconds longer) than 'gzip'.
+save(gravier, file = "gravier.RData", compress = "xz")
diff --git a/gravier-breast/2-clean.r b/gravier-breast/2-clean.r
deleted file mode 100644
index b757e32..0000000
--- a/gravier-breast/2-clean.r
+++ /dev/null
@@ -1,34 +0,0 @@
-# Breast Cancer Data Set from Gravier et al. (2010)
-library(plyr)
-temp <- read.table("additional_info.txt", header = TRUE, sep ="\t", stringsAsFactors = FALSE)
-temp <- temp[,c(1,5)]
-names(temp) <- c("subject.id", "labels")
-temp <- temp[which(temp$labels != ""), ]
-
-# Removes the extraneous " 1" from each subject's ID.
-temp$subject.id <- apply(temp, 1, function(subject) {
-	unlist(strsplit(subject[1], " "))[1]
-})
-
-# From the paper's abstract:
-# The authors used Comparative Genomic Hybridization (CGH) array to analyze 168 pT1T2pN0 invasive ductal carcinoma patients
-# with either good (no event 5 years after diagnosis: 111 patients) or poor (57 patients with early onset metastasis) outcome.
-#
-# NOTE: There are only 106 patients marked with "No Event" and there are 62 that had an event. We are off by 5.
-temp[,2] <- ifelse(temp[,2] == "No event", "good", "poor")
-
-subjects.files <- dir("gravier")
-gravier.df <- ldply(subjects.files, function(subject.file) {
-	subject.id <- unlist(strsplit(subject.file, "_"))[1]
-	subject.data <- read.table(paste("gravier/", subject.file, sep = ""), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
-	subject.df <- rbind.data.frame(subject.data[,2])
-	subject.df <- cbind.data.frame(temp[which(temp[,1] == subject.id), 2], subject.df)
-	names(subject.df) <- c("labels", subject.data[,1])
-	subject.df
-}, .progress = "text")
-gravier.df$labels <- factor(gravier.df$labels)
-
-write.csv(gravier.df, bzfile("gravier.csv.bz2"), row.names = FALSE)
-
-
-
diff --git a/gravier-breast/3-load.r b/gravier-breast/3-load.r
deleted file mode 100644
index 4509e30..0000000
--- a/gravier-breast/3-load.r
+++ /dev/null
@@ -1 +0,0 @@
-gravier.df <- read.csv(bzfile("gravier.csv.bz2", "r"))
\ No newline at end of file
diff --git a/shipp-DLBCL/1-download.r b/shipp-2002/1-download.r
similarity index 93%
rename from shipp-DLBCL/1-download.r
rename to shipp-2002/1-download.r
index dc3e7e1..cae4a1b 100644
--- a/shipp-DLBCL/1-download.r
+++ b/shipp-2002/1-download.r
@@ -13,5 +13,5 @@
 # patients (labelled as 'cured')while 26 of them are from patients with fatal or refractory disease (labelled as 'fatal').
 # The expression profile contains 6817 genes.
 
-download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/DLBCL/DLBCL-Harvard.zip", destfile = "DLBCL.zip")
-unzip("DLBCL.zip")
\ No newline at end of file
+download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/DLBCL/DLBCL-Harvard.zip", destfile = "shipp.zip")
+unzip("shipp.zip")
diff --git a/shipp-2002/2-clean.r b/shipp-2002/2-clean.r
new file mode 100644
index 0000000..505eb9c
--- /dev/null
+++ b/shipp-2002/2-clean.r
@@ -0,0 +1,11 @@
+# Diffuse Large B-cell Lymphoma (DLBCL) Data Set from Shipp et al. (2002)
+temp <- read.csv("DLBCLTumor.data", header = FALSE)
+# The last column is used as the class labels (y); the other columns form x.
+shipp <- list(
+              x = temp[,-ncol(temp)],
+              y = factor(temp[,ncol(temp)])
+             )
+
+# Removes downloaded files
+unlink("shipp.zip")
+unlink("DLBCL*")
diff --git a/shipp-2002/3-save.r b/shipp-2002/3-save.r
new file mode 100644
index 0000000..8b2cec5
--- /dev/null
+++ b/shipp-2002/3-save.r
@@ -0,0 +1,5 @@
+# Save a compressed version of the Shipp et al. (2002) data set.
+# The 'xz' compression format will compress the data more than the
+# default 'gzip' format. However, the 'xz' takes slightly longer
+# (~2 seconds longer) than 'gzip'.
+save(shipp, file = "shipp.RData", compress = "xz")
diff --git a/shipp-DLBCL/2-clean.r b/shipp-DLBCL/2-clean.r
deleted file mode 100644
index 0122758..0000000
--- a/shipp-DLBCL/2-clean.r
+++ /dev/null
@@ -1,5 +0,0 @@
-# Diffuse Large B-cell Lymphoma (DLBCL) Data Set from Shipp et al. (2002)
-temp <- read.csv("DLBCLTumor.data", header = FALSE)
-DLBCL.df <- data.frame(labels = temp[,ncol(temp)], temp[,-ncol(temp)])
-
-write.csv(DLBCL.df, bzfile("DLBCL.csv.bz2"), row.names = FALSE)
\ No newline at end of file
diff --git a/shipp-DLBCL/3-load.r b/shipp-DLBCL/3-load.r
deleted file mode 100644
index 0edd07f..0000000
--- a/shipp-DLBCL/3-load.r
+++ /dev/null
@@ -1 +0,0 @@
-DLBCL.df <- read.csv(bzfile("DLBCL.csv.bz2", "r"))
\ No newline at end of file
diff --git a/singh-prostate/1-download.r b/singh-2002/1-download.r
similarity index 93%
rename from singh-prostate/1-download.r
rename to singh-2002/1-download.r
index 2254901..174c473 100644
--- a/singh-prostate/1-download.r
+++ b/singh-2002/1-download.r
@@ -19,5 +19,5 @@
 # (B) Prediction of clinical outcome: in this data set, 21 patients were evaluable with respect to recurrence following surgery
 # with 8 patients having relapsed and 13 patients having remained relapse free ("non-relapse") for at least 4 years.
 
-download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/ProstateCancer/ProstateCancer.zip", destfile = "ProstateCancer.zip")
-unzip("ProstateCancer.zip")
\ No newline at end of file
+download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/ProstateCancer/ProstateCancer.zip", destfile = "singh.zip")
+unzip("singh.zip")
diff --git a/singh-2002/2-clean.r b/singh-2002/2-clean.r
new file mode 100644
index 0000000..e01b067
--- /dev/null
+++ b/singh-2002/2-clean.r
@@ -0,0 +1,14 @@
+# Prostate Cancer Data Set from Singh et al. (2002)
+# The last column is used as the class labels (y); the other columns form x.
+temp <- read.csv("prostate/prostate_TumorVSNormal_train.data", header = FALSE)
+
+singh <- list(
+              x = temp[,-ncol(temp)],
+              y = factor(temp[,ncol(temp)])
+             )
+
+# Removes downloaded files
+unlink("singh.zip")
+unlink("prostate/", recursive = TRUE)
+unlink("prostate*")
+
diff --git a/singh-2002/3-save.r b/singh-2002/3-save.r
new file mode 100644
index 0000000..5b01f5b
--- /dev/null
+++ b/singh-2002/3-save.r
@@ -0,0 +1,5 @@
+# Save a compressed version of the Singh et al. (2002) data set.
+# The 'xz' compression format will compress the data more than the
+# default 'gzip' format. However, the 'xz' takes slightly longer
+# (~2 seconds longer) than 'gzip'.
+save(singh, file = "singh.RData", compress = "xz")
diff --git a/singh-prostate/2-clean.r b/singh-prostate/2-clean.r
deleted file mode 100644
index 803520b..0000000
--- a/singh-prostate/2-clean.r
+++ /dev/null
@@ -1,5 +0,0 @@
-# Prostate Cancer Data Set from Singh et al. (2002)
-temp <- read.csv("prostate/prostate_TumorVSNormal_train.data", header = FALSE)
-prostate.df <- data.frame(labels = temp[,ncol(temp)], temp[,-ncol(temp)])
-
-write.csv(prostate.df, bzfile("prostate.csv.bz2"), row.names = FALSE)
\ No newline at end of file
diff --git a/singh-prostate/3-load.r b/singh-prostate/3-load.r
deleted file mode 100644
index 8d0a050..0000000
--- a/singh-prostate/3-load.r
+++ /dev/null
@@ -1 +0,0 @@
-prostate.df <- read.csv(bzfile("prostate.csv.bz2", "r"))
\ No newline at end of file