From 56c3d6fa83850e11e94e876f2441ea036bc47067 Mon Sep 17 00:00:00 2001 From: John Ramey Date: Tue, 6 Mar 2012 21:36:57 -0800 Subject: [PATCH] Updated all data sets to a named list with elements 'x' and 'y'. The data objects match the first author's last name. --- TODO | 31 +++---------- {alon-colon => alon-1999}/1-download.r | 3 +- {alon-colon => alon-1999}/2-clean.r | 11 ++--- alon-1999/3-save.r | 5 +++ chiaretti-2004/2-clean.r | 2 +- chiaretti-2004/3-save.r | 2 +- .../1-download.r | 2 +- christensen-2009/2-clean.r | 38 ++++++++++++++++ christensen-2009/3-save.r | 5 +++ christensen-methylation/2-clean.r | 27 ----------- christensen-methylation/3-load.r | 1 - golub-1999/2-clean.r | 3 -- {gravier-breast => gravier-2010}/1-download.r | 0 gravier-2010/2-clean.r | 45 +++++++++++++++++++ gravier-2010/3-save.r | 5 +++ gravier-breast/2-clean.r | 34 -------------- gravier-breast/3-load.r | 1 - {shipp-DLBCL => shipp-2002}/1-download.r | 4 +- shipp-2002/2-clean.r | 11 +++++ shipp-2002/3-save.r | 5 +++ shipp-DLBCL/2-clean.r | 5 --- shipp-DLBCL/3-load.r | 1 - {singh-prostate => singh-2002}/1-download.r | 4 +- singh-2002/2-clean.r | 14 ++++++ singh-2002/3-save.r | 5 +++ singh-prostate/2-clean.r | 5 --- singh-prostate/3-load.r | 1 - 27 files changed, 151 insertions(+), 119 deletions(-) rename {alon-colon => alon-1999}/1-download.r (88%) rename {alon-colon => alon-1999}/2-clean.r (52%) create mode 100644 alon-1999/3-save.r rename {christensen-methylation => christensen-2009}/1-download.r (99%) create mode 100644 christensen-2009/2-clean.r create mode 100644 christensen-2009/3-save.r delete mode 100644 christensen-methylation/2-clean.r delete mode 100644 christensen-methylation/3-load.r rename {gravier-breast => gravier-2010}/1-download.r (100%) create mode 100644 gravier-2010/2-clean.r create mode 100644 gravier-2010/3-save.r delete mode 100644 gravier-breast/2-clean.r delete mode 100644 gravier-breast/3-load.r rename {shipp-DLBCL => shipp-2002}/1-download.r (93%) create mode 100644 shipp-2002/2-clean.r create mode 100644 shipp-2002/3-save.r delete mode 100644 shipp-DLBCL/2-clean.r delete mode 100644 shipp-DLBCL/3-load.r rename {singh-prostate => singh-2002}/1-download.r (93%) create mode 100644 singh-2002/2-clean.r create mode 100644 singh-2002/3-save.r delete mode 100644 singh-prostate/2-clean.r delete mode 100644 singh-prostate/3-load.r diff --git a/TODO b/TODO index c537e52..d1b9da1 100644 --- a/TODO +++ b/TODO @@ -1,33 +1,14 @@ -Update the following data sets to the new scheme (see Chiaretti, 2004 for example) - Alon - Christensen - Gravier - Shipp - Singh +* Package TODO +** Update the following data sets to the new scheme (see Chiaretti, 2004 for example) Cho et al. (1998) - Yeast Cell Cycle Bhattacharjee et al. (2001) - Lung Cancer Wen et al. (1998) - Rat CNS Yeoh et al. (2002) - St. Jude Leukemia +** Add data set descriptions to help.r +** Add data set descriptions to github Wiki -For 1000 splits, store the training_obs, test_obs, and the the gene ordering with Dudoit's var_sel. - Alon - Chiaretti - Christensen - Golub - Gravier - Khan - Shipp - Singh - Cho et al. (1998) - Yeast Cell Cycle - Bhattacharjee et al. (2001) - Lung Cancer - Wen et al. (1998) - Rat CNS - Yeoh et al. (2002) - St. Jude Leukemia -Create a helper function that returns the splits along with the reduced dimension. - Takes dataset_name and q. -Add unit test to make sure that q's are specified correctly for each data set. -Also, add unit test to make sure helper function doesn't mess up with other names. - Example: NULL, "Awesome_data", "Asmodean" -Store data sets with RGoogleStorage (or RAmazonS3) and download them to the end user. +* Maybe/Someday +** Store data sets with RGoogleStorage (or RAmazonS3) and download them to the end user. How to download them only once? More information here: http://www.omegahat.org/ diff --git a/alon-colon/1-download.r b/alon-1999/1-download.r similarity index 88% rename from alon-colon/1-download.r rename to alon-1999/1-download.r index 7d28ff8..1bbc7ee 100644 --- a/alon-colon/1-download.r +++ b/alon-1999/1-download.r @@ -1,7 +1,6 @@ # A number of cancer data sets are on Bioconductor (http://www.bioconductor.org) # First, we install ALL of Bioconductor's R packages. source("http://bioconductor.org/biocLite.R") -biocLite() # Downloading the Alon Colon Cancer Data Set -biocLite("colonCA") \ No newline at end of file +biocLite("colonCA") diff --git a/alon-colon/2-clean.r b/alon-1999/2-clean.r similarity index 52% rename from alon-colon/2-clean.r rename to alon-1999/2-clean.r index a6bb291..9f4fae6 100644 --- a/alon-colon/2-clean.r +++ b/alon-1999/2-clean.r @@ -5,11 +5,8 @@ data('colonCA') # Bioconductor requires exprs() on the data sets. # We rename the columns of the data matrix because some of the microarray codes # exceed 256 characters in length, which causes errors in subsequent code. -colon.x <- t(exprs(colonCA)) -colnames(colon.x) <- paste("X", seq_len(ncol(colon.x)), sep = "") -colon.labels <- colonCA@phenoData$class - -colon.df <- data.frame(labels = colon.labels, colon.x) - -write.table(colon.df, "colon-cancer.csv", sep = ",", row = FALSE) +x <- t(exprs(colonCA)) +colnames(x) <- paste("X", seq_len(ncol(x)), sep = "") +y <- colonCA@phenoData$class +alon <- list(x = x, y = factor(y)) diff --git a/alon-1999/3-save.r b/alon-1999/3-save.r new file mode 100644 index 0000000..56a7201 --- /dev/null +++ b/alon-1999/3-save.r @@ -0,0 +1,5 @@ +# Save a compressed version of the Alon et al. (1999) data set. +# The 'xz' compression format will compress the data more than the +# default 'gzip' format. However, the 'xz' takes slightly longer +# (~2 seconds longer) than 'gzip'. +save(alon, file = "alon.RData", compress = "xz") diff --git a/chiaretti-2004/2-clean.r b/chiaretti-2004/2-clean.r index e92c0ab..ddc4158 100644 --- a/chiaretti-2004/2-clean.r +++ b/chiaretti-2004/2-clean.r @@ -31,4 +31,4 @@ if(two_classes) { y <- y[idx] } -chiaretti <- list(x = x, y = factor(y)) \ No newline at end of file +chiaretti <- list(x = x, y = factor(y)) diff --git a/chiaretti-2004/3-save.r b/chiaretti-2004/3-save.r index effe9b0..dbe8eab 100644 --- a/chiaretti-2004/3-save.r +++ b/chiaretti-2004/3-save.r @@ -2,4 +2,4 @@ # The 'xz' compression format will compress the data more than the # default 'gzip' format. However, the 'xz' takes slightly longer # (~2 seconds longer) than 'gzip'. -save(chiaretti, file = "chiaretti.RData", compress = "xz") \ No newline at end of file +save(chiaretti, file = "chiaretti.RData", compress = "xz") diff --git a/christensen-methylation/1-download.r b/christensen-2009/1-download.r similarity index 99% rename from christensen-methylation/1-download.r rename to christensen-2009/1-download.r index 0d274fe..2989582 100644 --- a/christensen-methylation/1-download.r +++ b/christensen-2009/1-download.r @@ -7,4 +7,4 @@ download.file(url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-19434/E-GEOD-19434.processed.1.zip", destfile = "christensen.zip") unzip("christensen.zip", exdir = "christensen") -download.file(url = "http://www.ebi.ac.uk/arrayexpress/files/E-GEOD-19434/E-GEOD-19434.sdrf.txt", destfile = "additional_info.txt") \ No newline at end of file +download.file(url = "http://www.ebi.ac.uk/arrayexpress/files/E-GEOD-19434/E-GEOD-19434.sdrf.txt", destfile = "additional_info.txt") diff --git a/christensen-2009/2-clean.r b/christensen-2009/2-clean.r new file mode 100644 index 0000000..666c1bb --- /dev/null +++ b/christensen-2009/2-clean.r @@ -0,0 +1,38 @@ +# Methylation Data Set from Christensen et al. (2009) +library('plyr') + +temp <- read.table("additional_info.txt", header = TRUE, sep ="\t", stringsAsFactors = FALSE, comment.char = "") +temp <- temp[,c(1,8)] +names(temp) <- c("subject_id", "labels") + +# Removes the extraneous " 1" from each subject's ID. +temp$subject_id <- apply(temp, 1, function(subject) { + unlist(strsplit(subject[1], " "))[1] +}) + +# The paper considers the three groups: "blood", "placenta", and "other" +temp[which(temp$labels == "guthrie blood"),]$labels <- "blood" +temp[temp$labels != "blood" & temp$labels != "placenta",]$labels <- "other" + +subjects_files <- dir("christensen") +christensen <- ldply(subjects_files, function(subject_file) { + subject_id <- unlist(strsplit(subject_file, "_"))[1] + subject_data <- read.table(paste("christensen/", subject_file, sep = ""), header = TRUE, sep = "\t", stringsAsFactors = FALSE) + subject_df <- rbind.data.frame(subject_data[,2]) + subject_df <- cbind.data.frame(temp[which(temp[,1] == subject_id), 2], subject_df) + names(subject_df) <- c("labels", subject_data[,1]) + subject_df +}, .progress = "text") +christensen <- list( + x = subset(christensen, select=-labels), + y = factor(christensen$labels) + ) + +# Removes the downloaded, compressed ZIP file along with the meta data. +file.remove("additional_info.txt") +file.remove("christensen.zip") + +# Removes the folder 'christensen' that contained the decompressed data +unlink("christensen/", recursive = TRUE) + + diff --git a/christensen-2009/3-save.r b/christensen-2009/3-save.r new file mode 100644 index 0000000..5d1c9cc --- /dev/null +++ b/christensen-2009/3-save.r @@ -0,0 +1,5 @@ +# Save a compressed version of the Christensen et al. (2009) data set. +# The 'xz' compression format will compress the data more than the +# default 'gzip' format. However, the 'xz' takes slightly longer +# (~2 seconds longer) than 'gzip'. +save(christensen, file = "christensen.RData", compress = "xz") diff --git a/christensen-methylation/2-clean.r b/christensen-methylation/2-clean.r deleted file mode 100644 index d7155a8..0000000 --- a/christensen-methylation/2-clean.r +++ /dev/null @@ -1,27 +0,0 @@ -# Methylation Data Set from Christensen et al. (2009) -library(plyr) -temp <- read.table("additional_info.txt", header = TRUE, sep ="\t", stringsAsFactors = FALSE) -temp <- temp[,c(1,4)] -names(temp) <- c("subject.id", "labels") - -# Removes the extraneous " 1" from each subject's ID. -temp$subject.id <- apply(temp, 1, function(subject) { - unlist(strsplit(subject[1], " "))[1] -}) - -# The paper considers the three groups: "blood", "placenta", and "other" -temp[which(temp$labels == "guthrie blood"),]$labels <- "blood" -temp[temp$labels != "blood" & temp$labels != "placenta",]$labels <- "other" - -subjects.files <- dir("christensen") -christensen.df <- ldply(subjects.files, function(subject.file) { - subject.id <- unlist(strsplit(subject.file, "_"))[1] - subject.data <- read.table(paste("christensen/", subject.file, sep = ""), header = TRUE, sep = "\t", stringsAsFactors = FALSE) - subject.df <- rbind.data.frame(subject.data[,2]) - subject.df <- cbind.data.frame(temp[which(temp[,1] == subject.id), 2], subject.df) - names(subject.df) <- c("labels", subject.data[,1]) - subject.df -}, .progress = "text") -christensen.df$labels <- factor(christensen.df$labels) - -write.csv(christensen.df, bzfile("christensen.csv.bz2"), row.names = FALSE) \ No newline at end of file diff --git a/christensen-methylation/3-load.r b/christensen-methylation/3-load.r deleted file mode 100644 index 6b63d0d..0000000 --- a/christensen-methylation/3-load.r +++ /dev/null @@ -1 +0,0 @@ -christensen.df <- read.csv(bzfile("christensen.csv.bz2", "r")) \ No newline at end of file diff --git a/golub-1999/2-clean.r b/golub-1999/2-clean.r index 0aac1bd..5420d1f 100644 --- a/golub-1999/2-clean.r +++ b/golub-1999/2-clean.r @@ -7,9 +7,6 @@ library('golubEsets') # By default, we only consider the original two classes (i.e. ALL or AML) two_classes <- TRUE -# TODO: -# Rename file to 2-clean.r - # The training data set. data('Golub_Train') x <- t(exprs(Golub_Train)) diff --git a/gravier-breast/1-download.r b/gravier-2010/1-download.r similarity index 100% rename from gravier-breast/1-download.r rename to gravier-2010/1-download.r diff --git a/gravier-2010/2-clean.r b/gravier-2010/2-clean.r new file mode 100644 index 0000000..1fb5c41 --- /dev/null +++ b/gravier-2010/2-clean.r @@ -0,0 +1,45 @@ +# I'm using a 'trim' function that removes trailing and leading white space from a string. +# The code is from: +# http://stackoverflow.com/questions/2261079/whitespace-in-r + +# Breast Cancer Data Set from Gravier et al. (2010) +library('plyr') +temp <- read.table("additional_info.txt", header = TRUE, sep ="\t", stringsAsFactors = FALSE, comment.char = "") +temp <- temp[,c(1,5)] +names(temp) <- c("subject_id", "labels") +temp <- temp[which(trim(temp$labels) != ""), ] + +# Removes the extraneous " 1" from each subject's ID. +temp$subject_id <- apply(temp, 1, function(subject) { + unlist(strsplit(subject[1], " "))[1] +}) + +# From the paper's abstract: +# The authors used Comparative Genomic Hybridization (CGH) array to analyze 168 pT1T2pN0 invasive ductal carcinoma patients +# with either good (no event 5 years after diagnosis: 111 patients) or poor (57 patients with early onset metastasis) outcome. +# +# NOTE: There are only 106 patients marked with "No Event" and there are 62 that had an event. We are off by 5. +temp[,2] <- ifelse(temp[,2] == "No event", "good", "poor") + +subjects_files <- dir("gravier") +gravier <- ldply(subjects_files, function(subject_file) { + subject_id <- unlist(strsplit(subject_file, "_"))[1] + subject_data <- read.table(paste("gravier/", subject_file, sep = ""), header = TRUE, sep = "\t", stringsAsFactors = FALSE) + subject_df <- rbind.data.frame(subject_data[,2]) + subject_df <- cbind.data.frame(temp[which(temp[,1] == subject_id), 2], subject_df) + names(subject_df) <- c("labels", subject_data[,1]) + subject_df +}, .progress = "text") + +gravier <- list( + x = subset(gravier, select=-labels), + y = factor(gravier$labels) + ) + +# Removes the downloaded, compressed ZIP file along with the meta data. +file.remove("additional_info.txt") +file.remove("gravier.zip") + +# Removes the folder 'christensen' that contained the decompressed data +unlink("gravier/", recursive = TRUE) + diff --git a/gravier-2010/3-save.r b/gravier-2010/3-save.r new file mode 100644 index 0000000..16932bf --- /dev/null +++ b/gravier-2010/3-save.r @@ -0,0 +1,5 @@ +# Save a compressed version of the Gravier et al. (2010) data set. +# The 'xz' compression format will compress the data more than the +# default 'gzip' format. However, the 'xz' takes slightly longer +# (~2 seconds longer) than 'gzip'. +save(gravier, file = "gravier.RData", compress = "xz") diff --git a/gravier-breast/2-clean.r b/gravier-breast/2-clean.r deleted file mode 100644 index b757e32..0000000 --- a/gravier-breast/2-clean.r +++ /dev/null @@ -1,34 +0,0 @@ -# Breast Cancer Data Set from Gravier et al. (2010) -library(plyr) -temp <- read.table("additional_info.txt", header = TRUE, sep ="\t", stringsAsFactors = FALSE) -temp <- temp[,c(1,5)] -names(temp) <- c("subject.id", "labels") -temp <- temp[which(temp$labels != ""), ] - -# Removes the extraneous " 1" from each subject's ID. -temp$subject.id <- apply(temp, 1, function(subject) { - unlist(strsplit(subject[1], " "))[1] -}) - -# From the paper's abstract: -# The authors used Comparative Genomic Hybridization (CGH) array to analyze 168 pT1T2pN0 invasive ductal carcinoma patients -# with either good (no event 5 years after diagnosis: 111 patients) or poor (57 patients with early onset metastasis) outcome. -# -# NOTE: There are only 106 patients marked with "No Event" and there are 62 that had an event. We are off by 5. -temp[,2] <- ifelse(temp[,2] == "No event", "good", "poor") - -subjects.files <- dir("gravier") -gravier.df <- ldply(subjects.files, function(subject.file) { - subject.id <- unlist(strsplit(subject.file, "_"))[1] - subject.data <- read.table(paste("gravier/", subject.file, sep = ""), header = TRUE, sep = "\t", stringsAsFactors = FALSE) - subject.df <- rbind.data.frame(subject.data[,2]) - subject.df <- cbind.data.frame(temp[which(temp[,1] == subject.id), 2], subject.df) - names(subject.df) <- c("labels", subject.data[,1]) - subject.df -}, .progress = "text") -gravier.df$labels <- factor(gravier.df$labels) - -write.csv(gravier.df, bzfile("gravier.csv.bz2"), row.names = FALSE) - - - diff --git a/gravier-breast/3-load.r b/gravier-breast/3-load.r deleted file mode 100644 index 4509e30..0000000 --- a/gravier-breast/3-load.r +++ /dev/null @@ -1 +0,0 @@ -gravier.df <- read.csv(bzfile("gravier.csv.bz2", "r")) \ No newline at end of file diff --git a/shipp-DLBCL/1-download.r b/shipp-2002/1-download.r similarity index 93% rename from shipp-DLBCL/1-download.r rename to shipp-2002/1-download.r index dc3e7e1..cae4a1b 100644 --- a/shipp-DLBCL/1-download.r +++ b/shipp-2002/1-download.r @@ -13,5 +13,5 @@ # patients (labelled as 'cured')while 26 of them are from patients with fatal or refractory disease (labelled as 'fatal'). # The expression profile contains 6817 genes. -download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/DLBCL/DLBCL-Harvard.zip", destfile = "DLBCL.zip") -unzip("DLBCL.zip") \ No newline at end of file +download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/DLBCL/DLBCL-Harvard.zip", destfile = "shipp.zip") +unzip("shipp.zip") diff --git a/shipp-2002/2-clean.r b/shipp-2002/2-clean.r new file mode 100644 index 0000000..505eb9c --- /dev/null +++ b/shipp-2002/2-clean.r @@ -0,0 +1,11 @@ +# Diffuse Large B-cell Lymphoma (DLBCL) Data Set from Shipp et al. (2002) +temp <- read.csv("DLBCLTumor.data", header = FALSE) + +shipp <- list( + x = temp[,-ncol(temp)], + y = temp[,ncol(temp)] + ) + +# Removes downloaded files +unlink("shipp.zip") +unlink("DLBCL*") diff --git a/shipp-2002/3-save.r b/shipp-2002/3-save.r new file mode 100644 index 0000000..8b2cec5 --- /dev/null +++ b/shipp-2002/3-save.r @@ -0,0 +1,5 @@ +# Save a compressed version of the Shipp et al. (2002) data set. +# The 'xz' compression format will compress the data more than the +# default 'gzip' format. However, the 'xz' takes slightly longer +# (~2 seconds longer) than 'gzip'. +save(shipp, file = "shipp.RData", compress = "xz") diff --git a/shipp-DLBCL/2-clean.r b/shipp-DLBCL/2-clean.r deleted file mode 100644 index 0122758..0000000 --- a/shipp-DLBCL/2-clean.r +++ /dev/null @@ -1,5 +0,0 @@ -# Diffuse Large B-cell Lymphoma (DLBCL) Data Set from Shipp et al. (2002) -temp <- read.csv("DLBCLTumor.data", header = FALSE) -DLBCL.df <- data.frame(labels = temp[,ncol(temp)], temp[,-ncol(temp)]) - -write.csv(DLBCL.df, bzfile("DLBCL.csv.bz2"), row.names = FALSE) \ No newline at end of file diff --git a/shipp-DLBCL/3-load.r b/shipp-DLBCL/3-load.r deleted file mode 100644 index 0edd07f..0000000 --- a/shipp-DLBCL/3-load.r +++ /dev/null @@ -1 +0,0 @@ -DLBCL.df <- read.csv(bzfile("DLBCL.csv.bz2", "r")) \ No newline at end of file diff --git a/singh-prostate/1-download.r b/singh-2002/1-download.r similarity index 93% rename from singh-prostate/1-download.r rename to singh-2002/1-download.r index 2254901..174c473 100644 --- a/singh-prostate/1-download.r +++ b/singh-2002/1-download.r @@ -19,5 +19,5 @@ # (B) Prediction of clinical outcome: in this data set, 21 patients were evaluable with respect to recurrence following surgery # with 8 patients having relapsed and 13 patients having remained relapse free ("non-relapse") for at least 4 years. -download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/ProstateCancer/ProstateCancer.zip", destfile = "ProstateCancer.zip") -unzip("ProstateCancer.zip") \ No newline at end of file +download.file(url = "http://datam.i2r.a-star.edu.sg/datasets/krbd/ProstateCancer/ProstateCancer.zip", destfile = "singh.zip") +unzip("singh.zip") diff --git a/singh-2002/2-clean.r b/singh-2002/2-clean.r new file mode 100644 index 0000000..e01b067 --- /dev/null +++ b/singh-2002/2-clean.r @@ -0,0 +1,14 @@ +# Prostate Cancer Data Set from Singh et al. (2002) +temp <- read.csv("prostate/prostate_TumorVSNormal_train.data", header = FALSE) +prostate.df <- data.frame(labels = temp[,ncol(temp)], temp[,-ncol(temp)]) + +singh <- list( + x = temp[,-ncol(temp)], + y = temp[,ncol(temp)] + ) + +# Removes downloaded files +unlink("singh.zip") +unlink("prostate/", recursive = TRUE) +unlink("prostate*") + diff --git a/singh-2002/3-save.r b/singh-2002/3-save.r new file mode 100644 index 0000000..5b01f5b --- /dev/null +++ b/singh-2002/3-save.r @@ -0,0 +1,5 @@ +# Save a compressed version of the Singh et al. (2002) data set. +# The 'xz' compression format will compress the data more than the +# default 'gzip' format. However, the 'xz' takes slightly longer +# (~2 seconds longer) than 'gzip'. +save(singh, file = "singh.RData", compress = "xz") diff --git a/singh-prostate/2-clean.r b/singh-prostate/2-clean.r deleted file mode 100644 index 803520b..0000000 --- a/singh-prostate/2-clean.r +++ /dev/null @@ -1,5 +0,0 @@ -# Prostate Cancer Data Set from Singh et al. (2002) -temp <- read.csv("prostate/prostate_TumorVSNormal_train.data", header = FALSE) -prostate.df <- data.frame(labels = temp[,ncol(temp)], temp[,-ncol(temp)]) - -write.csv(prostate.df, bzfile("prostate.csv.bz2"), row.names = FALSE) \ No newline at end of file diff --git a/singh-prostate/3-load.r b/singh-prostate/3-load.r deleted file mode 100644 index 8d0a050..0000000 --- a/singh-prostate/3-load.r +++ /dev/null @@ -1 +0,0 @@ -prostate.df <- read.csv(bzfile("prostate.csv.bz2", "r")) \ No newline at end of file