Skip to content

Commit

Permalink
Updated all data sets to a named list with elements 'x' and 'y'. The …
Browse files Browse the repository at this point in the history
…data objects match the first author's last name.
  • Loading branch information
ramhiser committed Mar 7, 2012
1 parent 47fa37b commit 56c3d6f
Show file tree
Hide file tree
Showing 27 changed files with 151 additions and 119 deletions.
31 changes: 6 additions & 25 deletions TODO
@@ -1,33 +1,14 @@
Update the following data sets to the new scheme (see Chiaretti, 2004 for example)
Alon
Christensen
Gravier
Shipp
Singh
* Package TODO
** Update the following data sets to the new scheme (see Chiaretti, 2004 for example)
Cho et al. (1998) - Yeast Cell Cycle
Bhattacharjee et al. (2001) - Lung Cancer
Wen et al. (1998) - Rat CNS
Yeoh et al. (2002) - St. Jude Leukemia
** Add data set descriptions to help.r
** Add data set descriptions to github Wiki

For 1000 splits, store the training_obs, test_obs, and the gene ordering with Dudoit's var_sel.
Alon
Chiaretti
Christensen
Golub
Gravier
Khan
Shipp
Singh
Cho et al. (1998) - Yeast Cell Cycle
Bhattacharjee et al. (2001) - Lung Cancer
Wen et al. (1998) - Rat CNS
Yeoh et al. (2002) - St. Jude Leukemia
Create a helper function that returns the splits along with the reduced dimension.
Takes dataset_name and q.
Add unit test to make sure that q's are specified correctly for each data set.
Also, add unit test to make sure helper function doesn't mess up with other names.
Example: NULL, "Awesome_data", "Asmodean"

Store data sets with RGoogleStorage (or RAmazonS3) and download them to the end user.
* Maybe/Someday
** Store data sets with RGoogleStorage (or RAmazonS3) and download them to the end user.
How to download them only once?
More information here: http://www.omegahat.org/
3 changes: 1 addition & 2 deletions alon-colon/1-download.r → alon-1999/1-download.r
@@ -1,7 +1,6 @@
# A number of cancer data sets are on Bioconductor (http://www.bioconductor.org)
# First, we install ALL of Bioconductor's R packages.
source("http://bioconductor.org/biocLite.R")
biocLite()

# Downloading the Alon Colon Cancer Data Set
biocLite("colonCA")
biocLite("colonCA")
11 changes: 4 additions & 7 deletions alon-colon/2-clean.r → alon-1999/2-clean.r
Expand Up @@ -5,11 +5,8 @@ data('colonCA')
# Bioconductor requires exprs() on the data sets.
# We rename the columns of the data matrix because some of the microarray codes
# exceed 256 characters in length, which causes errors in subsequent code.
colon.x <- t(exprs(colonCA))
colnames(colon.x) <- paste("X", seq_len(ncol(colon.x)), sep = "")
colon.labels <- colonCA@phenoData$class

colon.df <- data.frame(labels = colon.labels, colon.x)

write.table(colon.df, "colon-cancer.csv", sep = ",", row = FALSE)
x <- t(exprs(colonCA))
colnames(x) <- paste("X", seq_len(ncol(x)), sep = "")
y <- colonCA@phenoData$class

alon <- list(x = x, y = factor(y))
5 changes: 5 additions & 0 deletions alon-1999/3-save.r
@@ -0,0 +1,5 @@
# Write the Alon et al. (1999) data set to disk in compressed form.
# 'xz' is chosen over the default 'gzip' because it produces a smaller
# file, at the cost of a slightly longer save time (~2 seconds more).
save(list = "alon", file = "alon.RData", compress = "xz")
2 changes: 1 addition & 1 deletion chiaretti-2004/2-clean.r
Expand Up @@ -31,4 +31,4 @@ if(two_classes) {
y <- y[idx]
}

chiaretti <- list(x = x, y = factor(y))
chiaretti <- list(x = x, y = factor(y))
2 changes: 1 addition & 1 deletion chiaretti-2004/3-save.r
Expand Up @@ -2,4 +2,4 @@
# The 'xz' compression format will compress the data more than the
# default 'gzip' format. However, the 'xz' takes slightly longer
# (~2 seconds longer) than 'gzip'.
save(chiaretti, file = "chiaretti.RData", compress = "xz")
save(chiaretti, file = "chiaretti.RData", compress = "xz")
Expand Up @@ -7,4 +7,4 @@

download.file(url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-19434/E-GEOD-19434.processed.1.zip", destfile = "christensen.zip")
unzip("christensen.zip", exdir = "christensen")
download.file(url = "http://www.ebi.ac.uk/arrayexpress/files/E-GEOD-19434/E-GEOD-19434.sdrf.txt", destfile = "additional_info.txt")
download.file(url = "http://www.ebi.ac.uk/arrayexpress/files/E-GEOD-19434/E-GEOD-19434.sdrf.txt", destfile = "additional_info.txt")
38 changes: 38 additions & 0 deletions christensen-2009/2-clean.r
@@ -0,0 +1,38 @@
# Methylation Data Set from Christensen et al. (2009)
#
# Reads the sample metadata ("additional_info.txt") and the per-subject files
# in the "christensen" folder, then builds the named list 'christensen' with
# elements 'x' (one row per subject file) and 'y' (factor of class labels).
# The downloaded inputs are deleted at the end of the script.
library("plyr")

temp <- read.table("additional_info.txt", header = TRUE, sep = "\t",
                   stringsAsFactors = FALSE, comment.char = "")

# Columns 1 and 8 are kept as the subject ID and the class label.
temp <- temp[, c(1, 8)]
names(temp) <- c("subject_id", "labels")

# Removes the extraneous " 1" from each subject's ID by keeping only the text
# before the first space. This is done on the column directly rather than with
# apply(), which would coerce the whole data frame to a matrix first.
temp$subject_id <- vapply(
  strsplit(as.character(temp$subject_id), " ", fixed = TRUE),
  function(parts) parts[1],
  character(1)
)

# The paper considers the three groups: "blood", "placenta", and "other".
# Assigning into the column (instead of into a row subset of the data frame)
# is a no-op when nothing matches, and %in% never yields NA in the mask.
temp$labels[temp$labels %in% "guthrie blood"] <- "blood"
is_other <- !is.na(temp$labels) & !(temp$labels %in% c("blood", "placenta"))
temp$labels[is_other] <- "other"

subjects_files <- dir("christensen")
christensen <- ldply(subjects_files, function(subject_file) {
  # The subject ID is the part of the file name before the first underscore.
  subject_id <- unlist(strsplit(subject_file, "_"))[1]
  subject_data <- read.table(file.path("christensen", subject_file),
                             header = TRUE, sep = "\t",
                             stringsAsFactors = FALSE)

  subject_label <- temp[which(temp[, 1] == subject_id), 2]
  if (length(subject_label) == 0L) {
    stop("No label found in 'additional_info.txt' for subject file: ",
         subject_file, call. = FALSE)
  }

  # Column 2 of the subject file becomes a single row of values; column 1
  # supplies the corresponding column names.
  subject_df <- rbind.data.frame(subject_data[, 2])
  subject_df <- cbind.data.frame(subject_label, subject_df)
  names(subject_df) <- c("labels", subject_data[, 1])
  subject_df
}, .progress = "text")

christensen <- list(
  x = subset(christensen, select = -labels),
  y = factor(christensen$labels)
)

# Removes the downloaded, compressed ZIP file along with the meta data.
file.remove("additional_info.txt")
file.remove("christensen.zip")

# Removes the folder 'christensen' that contained the decompressed data
unlink("christensen/", recursive = TRUE)


5 changes: 5 additions & 0 deletions christensen-2009/3-save.r
@@ -0,0 +1,5 @@
# Write the Christensen et al. (2009) data set to disk in compressed form.
# 'xz' is chosen over the default 'gzip' because it produces a smaller
# file, at the cost of a slightly longer save time (~2 seconds more).
save(list = "christensen", file = "christensen.RData", compress = "xz")
27 changes: 0 additions & 27 deletions christensen-methylation/2-clean.r

This file was deleted.

1 change: 0 additions & 1 deletion christensen-methylation/3-load.r

This file was deleted.

3 changes: 0 additions & 3 deletions golub-1999/2-clean.r
Expand Up @@ -7,9 +7,6 @@ library('golubEsets')
# By default, we only consider the original two classes (i.e. ALL or AML)
two_classes <- TRUE

# TODO:
# Rename file to 2-clean.r

# The training data set.
data('Golub_Train')
x <- t(exprs(Golub_Train))
Expand Down
File renamed without changes.
45 changes: 45 additions & 0 deletions gravier-2010/2-clean.r
@@ -0,0 +1,45 @@
# I'm using a 'trim' function that removes trailing and leading white space from a string.
# The code is from:
# http://stackoverflow.com/questions/2261079/whitespace-in-r
#
# It is defined here so that this script does not depend on 'trim' being
# available from some other source.
trim <- function(x) {
  gsub("^\\s+|\\s+$", "", x)
}

# Breast Cancer Data Set from Gravier et al. (2010)
#
# Reads the sample metadata ("additional_info.txt") and the per-subject files
# in the "gravier" folder, then builds the named list 'gravier' with elements
# 'x' (one row per subject file) and 'y' (factor of class labels).
# The downloaded inputs are deleted at the end of the script.
library("plyr")

temp <- read.table("additional_info.txt", header = TRUE, sep = "\t",
                   stringsAsFactors = FALSE, comment.char = "")

# Columns 1 and 5 are kept as the subject ID and the class label.
temp <- temp[, c(1, 5)]
names(temp) <- c("subject_id", "labels")

# Drops the rows whose label is empty once surrounding white space is removed.
temp <- temp[which(trim(temp$labels) != ""), ]

# Removes the extraneous " 1" from each subject's ID by keeping only the text
# before the first space. This is done on the column directly rather than with
# apply(), which would coerce the whole data frame to a matrix first.
temp$subject_id <- vapply(
  strsplit(as.character(temp$subject_id), " ", fixed = TRUE),
  function(parts) parts[1],
  character(1)
)

# From the paper's abstract:
# The authors used Comparative Genomic Hybridization (CGH) array to analyze 168 pT1T2pN0 invasive ductal carcinoma patients
# with either good (no event 5 years after diagnosis: 111 patients) or poor (57 patients with early onset metastasis) outcome.
#
# NOTE: There are only 106 patients marked with "No Event" and there are 62 that had an event. We are off by 5.
# NOTE(review): the comparison below is exact (case- and white-space-sensitive),
# unlike the trimmed filter above -- confirm that no "No event" labels are
# being missed because of stray white space or different capitalization.
temp[, 2] <- ifelse(temp[, 2] == "No event", "good", "poor")

subjects_files <- dir("gravier")
gravier <- ldply(subjects_files, function(subject_file) {
  # The subject ID is the part of the file name before the first underscore.
  subject_id <- unlist(strsplit(subject_file, "_"))[1]
  subject_data <- read.table(file.path("gravier", subject_file),
                             header = TRUE, sep = "\t",
                             stringsAsFactors = FALSE)

  subject_label <- temp[which(temp[, 1] == subject_id), 2]
  if (length(subject_label) == 0L) {
    stop("No label found in 'additional_info.txt' for subject file: ",
         subject_file, call. = FALSE)
  }

  # Column 2 of the subject file becomes a single row of values; column 1
  # supplies the corresponding column names.
  subject_df <- rbind.data.frame(subject_data[, 2])
  subject_df <- cbind.data.frame(subject_label, subject_df)
  names(subject_df) <- c("labels", subject_data[, 1])
  subject_df
}, .progress = "text")

gravier <- list(
  x = subset(gravier, select = -labels),
  y = factor(gravier$labels)
)

# Removes the downloaded, compressed ZIP file along with the meta data.
file.remove("additional_info.txt")
file.remove("gravier.zip")

# Removes the folder 'gravier' that contained the decompressed data
unlink("gravier/", recursive = TRUE)

5 changes: 5 additions & 0 deletions gravier-2010/3-save.r
@@ -0,0 +1,5 @@
# Write the Gravier et al. (2010) data set to disk in compressed form.
# 'xz' is chosen over the default 'gzip' because it produces a smaller
# file, at the cost of a slightly longer save time (~2 seconds more).
save(list = "gravier", file = "gravier.RData", compress = "xz")
34 changes: 0 additions & 34 deletions gravier-breast/2-clean.r

This file was deleted.

1 change: 0 additions & 1 deletion gravier-breast/3-load.r

This file was deleted.

4 changes: 2 additions & 2 deletions shipp-DLBCL/1-download.r → shipp-2002/1-download.r
Expand Up @@ -13,5 +13,5 @@
# patients (labelled as 'cured') while 26 of them are from patients with fatal or refractory disease (labelled as 'fatal').
# The expression profile contains 6817 genes.

download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/DLBCL/DLBCL-Harvard.zip", destfile = "DLBCL.zip")
unzip("DLBCL.zip")
download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/DLBCL/DLBCL-Harvard.zip", destfile = "shipp.zip")
unzip("shipp.zip")
11 changes: 11 additions & 0 deletions shipp-2002/2-clean.r
@@ -0,0 +1,11 @@
# Diffuse Large B-cell Lymphoma (DLBCL) Data Set from Shipp et al. (2002)
#
# Reads "DLBCLTumor.data" (no header row) and builds the named list 'shipp':
#   x - every column except the last one
#   y - the last column, as a factor of class labels
temp <- read.csv("DLBCLTumor.data", header = FALSE)

# The last column holds the class label. It is wrapped in factor() so that
# 'y' is a factor regardless of read.csv()'s stringsAsFactors default, which
# matches how 'y' is stored for the other data sets.
label_col <- ncol(temp)
shipp <- list(
  x = temp[, -label_col],
  y = factor(temp[, label_col])
)

# Removes downloaded files
unlink("shipp.zip")
unlink("DLBCL*")
5 changes: 5 additions & 0 deletions shipp-2002/3-save.r
@@ -0,0 +1,5 @@
# Write the Shipp et al. (2002) data set to disk in compressed form.
# 'xz' is chosen over the default 'gzip' because it produces a smaller
# file, at the cost of a slightly longer save time (~2 seconds more).
save(list = "shipp", file = "shipp.RData", compress = "xz")
5 changes: 0 additions & 5 deletions shipp-DLBCL/2-clean.r

This file was deleted.

1 change: 0 additions & 1 deletion shipp-DLBCL/3-load.r

This file was deleted.

4 changes: 2 additions & 2 deletions singh-prostate/1-download.r → singh-2002/1-download.r
Expand Up @@ -19,5 +19,5 @@
# (B) Prediction of clinical outcome: in this data set, 21 patients were evaluable with respect to recurrence following surgery
# with 8 patients having relapsed and 13 patients having remained relapse free ("non-relapse") for at least 4 years.

download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/ProstateCancer/ProstateCancer.zip", destfile = "ProstateCancer.zip")
unzip("ProstateCancer.zip")
download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/ProstateCancer/ProstateCancer.zip", destfile = "singh.zip")
unzip("singh.zip")
14 changes: 14 additions & 0 deletions singh-2002/2-clean.r
@@ -0,0 +1,14 @@
# Prostate Cancer Data Set from Singh et al. (2002)
#
# Reads "prostate/prostate_TumorVSNormal_train.data" (no header row) and
# builds the named list 'singh':
#   x - every column except the last one
#   y - the last column, as a factor of class labels
temp <- read.csv("prostate/prostate_TumorVSNormal_train.data", header = FALSE)

# The last column holds the class label. It is wrapped in factor() so that
# 'y' is a factor regardless of read.csv()'s stringsAsFactors default, which
# matches how 'y' is stored for the other data sets.
label_col <- ncol(temp)
singh <- list(
  x = temp[, -label_col],
  y = factor(temp[, label_col])
)

# Removes downloaded files
unlink("singh.zip")
unlink("prostate/", recursive = TRUE)
unlink("prostate*")

5 changes: 5 additions & 0 deletions singh-2002/3-save.r
@@ -0,0 +1,5 @@
# Write the Singh et al. (2002) data set to disk in compressed form.
# 'xz' is chosen over the default 'gzip' because it produces a smaller
# file, at the cost of a slightly longer save time (~2 seconds more).
save(list = "singh", file = "singh.RData", compress = "xz")
5 changes: 0 additions & 5 deletions singh-prostate/2-clean.r

This file was deleted.

1 change: 0 additions & 1 deletion singh-prostate/3-load.r

This file was deleted.

0 comments on commit 56c3d6f

Please sign in to comment.