# This notebook collects data for assessing whether the time-since-TB-infection predictor generalizes to the Adolescent Acquisition study, or whether the current positive predictions result from a distribution mismatch between RNA-seq and microarray data

# Load Libraries

In [2]:
library(MetaIntegrator)
library(GEOquery)

# Pull the TB datasets together

## Functions to help

In [4]:
# GEO series accessions of the microarray datasets loaded by this notebook.
gses.array <- c(
    "GSE19491",
    "GSE37250",
    "GSE39939",
    "GSE39940",
    "GSE42825",
    "GSE42826",
    "GSE42827",
    "GSE42830",
    "GSE42831",
    "GSE42832"
)

# Every listed series is used as test data.
test.gses <- gses.array

In [5]:
# Read the gene-level expression table and the phenotype table of each series.
#
# For every accession GSE in `gses` two CSV files are read, using the first
# column as row names:
#   <data.dir>/<GSE>/<GSE>_genelevel.csv
#   <data.dir>/<GSE>/<GSE>_pheno.csv
#
# Args:
#   data.dir: directory that contains one sub-directory per accession.
#   gses:     character vector of accessions to load.
#
# Returns:
#   A list with two elements, `exprs` and `pheno.list`; each is a list of data
#   frames named by accession.
#
# Raises:
#   An error naming the accession and path when an expected file is missing.
make.total.dataset <- function(data.dir, gses) {
    # Reads one of the two per-series CSV files, failing early with a clear
    # message instead of the generic "cannot open the connection".
    read.gse.csv <- function(GSE, suffix) {
        csv.path <- file.path(data.dir, GSE, paste0(GSE, suffix))
        if (!file.exists(csv.path)) {
            stop("Missing input file for ", GSE, ": ", csv.path, call. = FALSE)
        }
        read.csv(csv.path, row.names = 1, header = TRUE)
    }

    exprs.list <- list()
    pheno.list <- list()
    for (GSE in gses) {
        exprs.list[[GSE]] <- read.gse.csv(GSE, "_genelevel.csv")
        pheno.list[[GSE]] <- read.gse.csv(GSE, "_pheno.csv")
    }

    list(exprs = exprs.list, pheno.list = pheno.list)
}

In [6]:
# Directory handed to make.total.dataset() as the root of the per-series folders.
data.dir <- "/master/rault/bitbucket/deep_tb/GEO_Warsinske_TB/"

## Read in the GSEs data for test set (validation of recent exposure predictor)

In [7]:
test.data = make.total.dataset(data.dir, test.gses)

In [8]:
str(test.data)

List of 2
 $ exprs     :List of 10
  ..$ GSE19491:'data.frame':	15836 obs. of  498 variables:
  .. ..$ GSM484368: num [1:15836] 14.29 9.51 6.16 7.9 11.51 ...
  .. ..$ GSM484369: num [1:15836] 14.12 9.74 6.1 7.8 11.02 ...
  .. ..$ GSM484370: num [1:15836] 13.81 9.59 6.23 7.85 11.43 ...
  .. ..$ GSM484371: num [1:15836] 13.94 9.68 6.3 7.82 11.59 ...
  .. ..$ GSM484372: num [1:15836] 14.08 9.47 6.02 7.8 11.51 ...
  .. ..$ GSM484373: num [1:15836] 14.24 9.7 6.05 8.11 11.41 ...
  .. ..$ GSM484374: num [1:15836] 14.3 9.49 6.04 7.65 11.71 ...
  .. ..$ GSM484375: num [1:15836] 14.32 9.5 5.95 7.58 11.85 ...
  .. ..$ GSM484376: num [1:15836] 13.96 9.22 5.92 7.81 11.33 ...
  .. ..$ GSM484377: num [1:15836] 14.25 9.25 6.02 7.82 11.8 ...
  .. ..$ GSM484378: num [1:15836] 14.11 9.99 5.78 7.82 11.88 ...
  .. ..$ GSM484379: num [1:15836] 13.96 9.37 6.21 7.66 11.57 ...
  .. ..$ GSM484380: num [1:15836] 13.76 10.16 6.06 7.88 11.32 ...
  .. ..$ GSM484381: num [1:15836] 13.65 9.92 5.93 7.94 11.85 ...
  ..

## Format the test phenotype data

"status" is the TB variable. "hiv.status" is the HIV variable. These are all I care about.

In [9]:
lapply(test.data$pheno.list, function (x) {colnames(x)})

### Format GSE19491

- I think some of the TB and control samples may be longitudinal; these were excluded in the Duffy et al. paper. Treated TB and any control samples shouldn't show recent infection, so for now I will just include them. If needed, I can dive deeper into the analysis.

In [None]:
# Plan for GSE19491.
# Rows whose source_name_ch1 equals one of the two strings below are excluded:
"Whole blood from patient with active TB 12 months after treatment started (i.e. after treatment completed)"
"Whole blood from patient with active TB 2 months after treatment started"

# illness.ch1 is the key phenotype column.
# Any value containing "Control" (matched with grepl) becomes "control";
# any value containing "Latent" or "LATENT" becomes "LTBI".
# "PTB" becomes "TB".
# Every other value is kept as-is.
#


In [11]:
# Drop the post-treatment TB samples of GSE19491 and seed `status` with the raw
# illness.ch1 labels (as character).
test.data$pheno.list$GSE19491 <- local({
    pheno <- test.data$pheno.list$GSE19491
    treated.sources <- c(
        "Whole blood from patient with active TB 12 months after treatment started (i.e. after treatment completed)",
        "Whole blood from patient with active TB 2 months after treatment started"
    )
    is.treated <- pheno$source_name_ch1 %in% treated.sources
    pheno <- droplevels(pheno[!is.treated, ])
    pheno$status <- as.character(pheno$illness.ch1)
    pheno
})

In [12]:
# Harmonise the GSE19491 status labels.
#
# Precedence (highest first), matched against the raw label:
#   contains "Control"           -> "control"
#   contains "Latent" / "LATENT" -> "LTBI"
#   contains "PTB"               -> "TB"
#   equals "ASLE"                -> "SLE"
#   missing (NA)                 -> "control"
#   anything else                -> kept unchanged
#
# The previous nested ifelse() never reached its is.na() branch: for a missing
# label the enclosing `== "ASLE"` test is NA, so ifelse() returned NA. The masks
# below are computed from the raw labels and applied lowest precedence first,
# so later assignments win and missing labels really become "control".
test.data$pheno.list$GSE19491$status <- local({
    raw.label <- as.character(test.data$pheno.list$GSE19491$status)
    relabelled <- raw.label
    relabelled[is.na(raw.label)] <- "control"
    relabelled[raw.label %in% "ASLE"] <- "SLE"
    relabelled[grepl("PTB", raw.label)] <- "TB"
    relabelled[grepl("Latent|LATENT", raw.label)] <- "LTBI"
    relabelled[grepl("Control", raw.label)] <- "control"
    relabelled
})

In [13]:
# Label any status that is still missing as "control", then print how many
# missing values remain.
test.data$pheno.list$GSE19491$status <- replace(
    test.data$pheno.list$GSE19491$status,
    is.na(test.data$pheno.list$GSE19491$status),
    "control"
)
sum(is.na(test.data$pheno.list$GSE19491$status))

In [14]:
test.data$pheno.list$GSE19491$status

In [16]:
test.data$pheno.list$GSE19491$status == "TB"

In [17]:
table(test.data$pheno.list$GSE19491$status)


control    LTBI    PSLE     SLE   Staph   Still   Strep      TB 
    133      69      82      28      40      31      12      89 

In [19]:
test.data$pheno.list$GSE19491$hiv.status = "negative"

In [20]:
summary(test.data$pheno.list$GSE19491)

                          title       geo_accession    status         
 CON_LON_test035             :  1   GSM484368:  1   Length:484        
 CON_LON_test035_long_0_022  :  1   GSM484369:  1   Class :character  
 CON_LON_test035_sep_CD4_034 :  1   GSM484370:  1   Mode  :character  
 CON_LON_test035_sep_CD8_035 :  1   GSM484371:  1                     
 CON_LON_test035_sep_Mono_033:  1   GSM484372:  1                     
 CON_LON_test035_sep_Neut_032:  1   GSM484373:  1                     
 (Other)                     :478   (Other)  :478                     
    submission_date    last_update_date  type     channel_count
 Dec 11 2009: 61    Aug 11 2010:387     RNA:484   Min.   :1    
 Dec 12 2009:149    Jan 04 2018: 97               1st Qu.:1    
 Jun 02 2010:193                                  Median :1    
 Jun 03 2010: 81                                  Mean   :1    
                                                  3rd Qu.:1    
                                                

In [24]:
table(test.data$pheno.list$GSE19491$illness.ch1, test.data$pheno.list$GSE19491$source_name_ch1)

                
                 CD4+ cells from healthy control
  ASLE                                         0
  Control                                      0
  Control (BCG-)                               0
  Control (BCG+)                               4
  Latent                                       0
  LATENT TB                                    0
  PSLE                                         0
  PTB                                          0
  Staph                                        0
  Still                                        0
  Strep                                        0
                
                 CD4+ cells from patient with Active TB
  ASLE                                                0
  Control                                             0
  Control (BCG-)                                      0
  Control (BCG+)                                      0
  Latent                                              0
  LATENT TB                               

In [17]:
table(test.data$pheno.list$GSE19491$illness.ch1, test.data$pheno.list$GSE19491$source_name_ch1)

                
                 CD4+ cells from healthy control
  ASLE                                         0
  Control                                      0
  Control (BCG-)                               0
  Control (BCG+)                               4
  Latent                                       0
  LATENT TB                                    0
  PSLE                                         0
  PTB                                          0
  Staph                                        0
  Still                                        0
  Strep                                        0
                
                 CD4+ cells from patient with Active TB
  ASLE                                                0
  Control                                             0
  Control (BCG-)                                      0
  Control (BCG+)                                      0
  Latent                                              0
  LATENT TB                               

### Format GSE37250

- Other disease is a general indication and is not specified, so I will keep it as "other_disease".
- Some individuals are HIV positive in this cohort. I will keep this with TB and examine HIV status later.

In [21]:
# Derive `status` for GSE37250 from disease.state.ch1: active TB -> "TB",
# latent infection -> "LTBI", every other label -> "other_disease".
test.data$pheno.list$GSE37250$status <- local({
    disease <- as.character(test.data$pheno.list$GSE37250$disease.state.ch1)
    ifelse(
        disease == "active tuberculosis",
        "TB",
        ifelse(disease == "latent TB infection", "LTBI", "other_disease")
    )
})

In [22]:
table(test.data$pheno.list$GSE37250$status)


         LTBI other_disease            TB 
          167           175           195 

In [26]:
# Derive `hiv.status` for GSE37250: "HIV negative" -> "negative", any other
# label -> "positive"; then tabulate the result.
test.data$pheno.list$GSE37250$hiv.status <- local({
    hiv <- as.character(test.data$pheno.list$GSE37250$hiv.status.ch1)
    ifelse(hiv == "HIV negative", "negative", "positive")
})
table(test.data$pheno.list$GSE37250$hiv.status)


negative positive 
     263      274 

In [27]:
summary(test.data$pheno.list$GSE37250)

                                                  title       geo_accession
 active tuberculosis HIV negative Malawi WB_1_M_39619:  1   GSM914353:  1  
 active tuberculosis HIV negative Malawi WB_1_M_39668:  1   GSM914354:  1  
 active tuberculosis HIV negative Malawi WB_1_M_39827:  1   GSM914355:  1  
 active tuberculosis HIV negative Malawi WB_1_M_39896:  1   GSM914356:  1  
 active tuberculosis HIV negative Malawi WB_1_M_39956:  1   GSM914357:  1  
 active tuberculosis HIV negative Malawi WB_1_M_39985:  1   GSM914358:  1  
 (Other)                                             :531   (Other)  :531  
    status             submission_date    last_update_date  type    
 Length:537         Apr 13 2012:537    Dec 17 2018:537     RNA:537  
 Class :character                                                   
 Mode  :character                                                   
                                                                    
                                               

In [28]:
table(test.data$pheno.list$GSE37250$title)


       active tuberculosis HIV negative Malawi WB_1_M_39619 
                                                          1 
       active tuberculosis HIV negative Malawi WB_1_M_39668 
                                                          1 
       active tuberculosis HIV negative Malawi WB_1_M_39827 
                                                          1 
       active tuberculosis HIV negative Malawi WB_1_M_39896 
                                                          1 
       active tuberculosis HIV negative Malawi WB_1_M_39956 
                                                          1 
       active tuberculosis HIV negative Malawi WB_1_M_39985 
                                                          1 
       active tuberculosis HIV negative Malawi WB_1_M_40029 
                                                          1 
       active tuberculosis HIV negative Malawi WB_1_M_40176 
                                                          1 
       active tuberculo

### Remainder except GSE42831 ('GSE42825', 'GSE42826', 'GSE42827', 'GSE42830', 'GSE42832')

In [29]:
names(test.data$pheno.list)

In [31]:
to.analyze = c('GSE42825', 'GSE42826', 'GSE42827', 'GSE42830', 'GSE42832')

In [36]:
# Harmonise `status` for each series in `to.analyze` and mark all samples as
# HIV negative.
#
# Each pattern is matched against the raw disease.state.ch1 label; assignments
# run from lowest to highest precedence so that later ones win:
#   "Pneumonia" or "Baseline" -> "pneumonia"
#   "lung cancer"             -> "lung_cancer"
#   "Control"                 -> "control"
#   "arcoid"                  -> "sarcoidosis"
# Labels matching none of the patterns are kept unchanged.
for (gse in to.analyze) {
    pheno <- test.data$pheno.list[[gse]]
    pheno$status <- as.character(pheno$disease.state.ch1)
    pheno$status[grepl("Pneumonia|Baseline", pheno$disease.state.ch1)] <- "pneumonia"
    pheno$status[grepl("lung cancer", pheno$disease.state.ch1)] <- "lung_cancer"
    pheno$status[grepl("Control", pheno$disease.state.ch1)] <- "control"
    pheno$status[grepl("arcoid", pheno$disease.state.ch1)] <- "sarcoidosis"
    pheno$hiv.status <- "negative"
    test.data$pheno.list[[gse]] <- pheno
}

In [37]:
lapply(to.analyze, function(x) {table(test.data$pheno.list[[x]][,"status"])})

[[1]]

    control sarcoidosis          TB 
         23          11           8 

[[2]]

    control lung_cancer   pneumonia sarcoidosis          TB 
         52           8           6          25          11 

[[3]]

pneumonia 
       10 

[[4]]

    control lung_cancer   pneumonia sarcoidosis          TB 
         38           8           8          25          16 

[[5]]

    control sarcoidosis          TB 
         30          30          30 


In [38]:
lapply(to.analyze, function(x) {table(test.data$pheno.list[[x]][,"disease.state.ch1"])})

[[1]]

    Active sarcoidosis                Control Non-active sarcoidosis 
                     6                     23                      5 
                    TB 
                     8 

[[2]]

        Active Sarcoid                Control            lung cancer 
                    16                     52                      8 
Non-active sarcoidosis              Pneumonia                     TB 
                     9                      6                     11 

[[3]]

pneumonia 
       10 

[[4]]

        Active Sarcoid               Baseline                Control 
                    17                      8                     38 
           lung cancer Non-active sarcoidosis                     TB 
                     8                      8                     16 

[[5]]

Control Sarcoid      TB 
     30      30      30 


In [39]:
summary(droplevels(test.data$pheno.list$GSE42830[test.data$pheno.list$GSE42830$disease.state.ch1 == "Baseline",]))

                              title      geo_accession    status         
 Pneumonia Sample 18 training set:1   GSM1050945:1     Length:8          
 Pneumonia Sample 33 training set:1   GSM1050960:1     Class :character  
 Pneumonia Sample 36 training set:1   GSM1050963:1     Mode  :character  
 Pneumonia Sample 38 training set:1   GSM1050965:1                       
 Pneumonia Sample 47 training set:1   GSM1050974:1                       
 Pneumonia Sample 63 training set:1   GSM1050990:1                       
 (Other)                         :2   (Other)   :2                       
    submission_date    last_update_date  type   channel_count
 Dec 10 2012:8      Oct 31 2013:8       RNA:8   Min.   :1    
                                                1st Qu.:1    
                                                Median :1    
                                                Mean   :1    
                                                3rd Qu.:1    
                                    

In [40]:
summary(test.data$pheno.list$GSE42825)

                              title       geo_accession    status         
 Control Sample 10 validation set: 1   GSM1050732: 1    Length:42         
 Control Sample 11 validation set: 1   GSM1050733: 1    Class :character  
 Control Sample 12 validation set: 1   GSM1050734: 1    Mode  :character  
 Control Sample 13 validation set: 1   GSM1050735: 1                      
 Control Sample 14 validation set: 1   GSM1050736: 1                      
 Control Sample 15 validation set: 1   GSM1050737: 1                      
 (Other)                         :36   (Other)   :36                      
    submission_date    last_update_date  type    channel_count
 Dec 10 2012:42     Oct 31 2013:42      RNA:42   Min.   :1    
                                                 1st Qu.:1    
                                                 Median :1    
                                                 Mean   :1    
                                                 3rd Qu.:1    
                      

### Format GSE42831

- All samples are sarcoidosis samples. Keep only the pre-treatment sarcoidosis samples.

In [44]:
# GSE42831: label every sample as HIV-negative sarcoidosis and keep only the
# rows marked "Pre-treatment".
# NOTE(review): the column is spelled `treament.ch1` here; presumably this
# matches the header of the phenotype CSV - confirm before "fixing" the name.
test.data$pheno.list$GSE42831 <- local({
    pheno <- test.data$pheno.list$GSE42831
    pheno$status <- "sarcoidosis"
    pheno$hiv.status <- "negative"
    droplevels(pheno[pheno$treament.ch1 == "Pre-treatment", ])
})

In [45]:
summary(test.data$pheno.list$GSE42831)

                                             title      geo_accession
 Sarcoid Sample 10 6011 Pre-treatment time pt. 1:1   GSM1051029:1    
 Sarcoid Sample 11 2010 Pre-treatment time pt. 1:1   GSM1051030:1    
 Sarcoid Sample 14 2140 Pre-treatment time pt. 1:1   GSM1051031:1    
 Sarcoid Sample 15 6009 Pre-treatment time pt. 1:1   GSM1051032:1    
 Sarcoid Sample 7 2099 Pre-treatment time pt. 1 :1   GSM1051033:1    
 Sarcoid Sample 8 2018 Pre-treatment time pt. 1 :1   GSM1051036:1    
 Sarcoid Sample 9 2100 Pre-treatment time pt. 1 :1   GSM1051037:1    
    status             submission_date    last_update_date  type  
 Length:7           Dec 10 2012:7      Oct 31 2013:7       RNA:7  
 Class :character                                                 
 Mode  :character                                                 
                                                                  
                                                                  
                                      

In [46]:
table(test.data$pheno.list$GSE42831$title)


Sarcoid Sample 10 6011 Pre-treatment time pt. 1 
                                              1 
Sarcoid Sample 11 2010 Pre-treatment time pt. 1 
                                              1 
Sarcoid Sample 14 2140 Pre-treatment time pt. 1 
                                              1 
Sarcoid Sample 15 6009 Pre-treatment time pt. 1 
                                              1 
 Sarcoid Sample 7 2099 Pre-treatment time pt. 1 
                                              1 
 Sarcoid Sample 8 2018 Pre-treatment time pt. 1 
                                              1 
 Sarcoid Sample 9 2100 Pre-treatment time pt. 1 
                                              1 

### Format GSE39939

In [47]:
summary(test.data$pheno.list$GSE39939)

              title        geo_accession                   status   
 WB_20_KIL_100091:  1   GSM1249177:  1   Public on May 01 2014:157  
 WB_20_KIL_100360:  1   GSM1249178:  1                              
 WB_20_KIL_100398:  1   GSM1249179:  1                              
 WB_20_KIL_100406:  1   GSM1249180:  1                              
 WB_20_KIL_500086:  1   GSM1249181:  1                              
 WB_20_KIL_500119:  1   GSM1249182:  1                              
 (Other)         :151   (Other)   :151                              
    submission_date    last_update_date  type     channel_count
 Oct 23 2013:157    May 01 2014:157     RNA:157   Min.   :1    
                                                  1st Qu.:1    
                                                  Median :1    
                                                  Mean   :1    
                                                  3rd Qu.:1    
                                                  Max.   :1    


In [49]:
# GSE39939: derive `status` from illness.ch1 and `hiv.status` from
# hiv.status.ch1.
#   status:     contains "active tuberculosis" -> "TB",
#               contains "other disease"       -> "other_disease",
#               everything else                -> "LTBI"
#   hiv.status: "HIV negative" -> "negative", any other label -> "positive"
# NOTE(review): "LTBI" is the catch-all, so an unexpected or missing illness
# label would also be counted as LTBI.
test.data$pheno.list$GSE39939 <- local({
    pheno <- test.data$pheno.list$GSE39939

    illness <- as.character(pheno$illness.ch1)
    status <- rep("LTBI", length(illness))
    status[grepl("other disease", illness)] <- "other_disease"
    # Assigned last so that it takes precedence, as in a nested ifelse().
    status[grepl("active tuberculosis", illness)] <- "TB"
    pheno$status <- status

    hiv <- as.character(pheno$hiv.status.ch1)
    pheno$hiv.status <- ifelse(hiv == "HIV negative", "negative", "positive")
    pheno
})

In [50]:
test.data$pheno.list$GSE39939$hiv.status

In [51]:
table(test.data$pheno.list$GSE39939$status)


         LTBI other_disease            TB 
           14            64            79 

### Format GSE39940

In [52]:
summary(test.data$pheno.list$GSE39940)

           title       geo_accession                   status   
 WB_20_M_40012:  1   GSM981627:  1   Public on May 01 2014:334  
 WB_20_M_40182:  1   GSM981628:  1                              
 WB_20_M_40263:  1   GSM981629:  1                              
 WB_20_M_40376:  1   GSM981630:  1                              
 WB_20_M_40384:  1   GSM981631:  1                              
 WB_20_M_40385:  1   GSM981632:  1                              
 (Other)      :328   (Other)  :328                              
    submission_date    last_update_date  type     channel_count
 Aug 07 2012:334    May 01 2014:334     RNA:334   Min.   :1    
                                                  1st Qu.:1    
                                                  Median :1    
                                                  Mean   :1    
                                                  3rd Qu.:1    
                                                  Max.   :1    
                                

In [54]:
# GSE39940: derive `status` from disease.status.ch1 and `hiv.status` from
# hiv.status.ch1.
#   status:     "active tuberculosis" -> "TB", "other disease" -> "other_disease",
#               any other label -> "LTBI"
#   hiv.status: "HIV negative" -> "negative", any other label -> "positive"
test.data$pheno.list$GSE39940 <- local({
    pheno <- test.data$pheno.list$GSE39940

    disease <- as.character(pheno$disease.status.ch1)
    pheno$status <- ifelse(
        disease == "active tuberculosis",
        "TB",
        ifelse(disease == "other disease", "other_disease", "LTBI")
    )

    hiv <- as.character(pheno$hiv.status.ch1)
    pheno$hiv.status <- ifelse(hiv == "HIV negative", "negative", "positive")
    pheno
})

In [89]:
table(test.data$pheno.list$GSE39940$status)


         LTBI other_disease            TB 
           54           169           111 

In [55]:
table(test.data$pheno.list$GSE39940$status, test.data$pheno.list$GSE39940$hiv.status)

               
                negative positive
  LTBI                54        0
  other_disease      103       66
  TB                  70       41

## Review that the phenotype formatting is correct and save to csv files (status and GSM identifier)

### Review phenotype data

In [56]:
lapply(test.data$pheno.list, function(x) {table(x$status)})

$GSE19491

control    LTBI    PSLE     SLE   Staph   Still   Strep      TB 
    133      69      82      28      40      31      12      89 

$GSE37250

         LTBI other_disease            TB 
          167           175           195 

$GSE39939

         LTBI other_disease            TB 
           14            64            79 

$GSE39940

         LTBI other_disease            TB 
           54           169           111 

$GSE42825

    control sarcoidosis          TB 
         23          11           8 

$GSE42826

    control lung_cancer   pneumonia sarcoidosis          TB 
         52           8           6          25          11 

$GSE42827

pneumonia 
       10 

$GSE42830

    control lung_cancer   pneumonia sarcoidosis          TB 
         38           8           8          25          16 

$GSE42831

sarcoidosis 
          7 

$GSE42832

    control sarcoidosis          TB 
         30          30          30 


In [57]:
Reduce("+", lapply(test.data$pheno.list, function(x) {sum(x$status == "TB")}))

In [58]:
lapply(test.data$pheno.list, function(x) {sum(x$status == "TB")})

In [74]:
lapply(test.data$pheno.list, function(x) {table(x$status)})

$GSE19491

control    LTBI    PSLE     SLE   Staph   Still   Strep      TB 
    133      69      82      28      40      31      12      89 

$GSE37250

         LTBI other_disease            TB 
          167           175           195 

$GSE39939

         LTBI other_disease            TB 
           14            64            79 

$GSE39940

         LTBI other_disease            TB 
           54           169           111 

$GSE42826

    control lung_cancer   pneumonia sarcoidosis          TB 
         52           8           6          25          11 

$GSE42827

pneumonia 
       10 

$GSE42830

    control lung_cancer   pneumonia sarcoidosis          TB 
         38           8           8          25          16 

$GSE42831

sarcoidosis 
          7 


### Combine phenotype data and GSMs across all tables, excluding GSE42825 and GSE42832 because they have fewer genes

In [73]:
# Remove the phenotype tables of the two excluded series.
test.data$pheno.list[c("GSE42825", "GSE42832")] <- NULL

In [75]:
# Stack the `status` and `hiv.status` columns of every remaining phenotype
# table into one data frame.
#
# The tables are bound in a single rbind() call instead of growing the result
# inside a loop. The list is unnamed first: rbind() on a *named* list would
# prefix every row name with the list name, and the row names are needed
# unchanged to match expression columns later. The leading empty data frame
# keeps the result a data frame even when there are no tables.
test.pheno <- do.call(
    rbind,
    c(
        list(data.frame()),
        unname(lapply(test.data$pheno.list, function(pheno) {
            pheno[, c("status", "hiv.status"), drop = FALSE]
        }))
    )
)

In [76]:
dim(test.pheno)
table(test.pheno)

               hiv.status
status          negative positive
  control            223        0
  LTBI               220       84
  lung_cancer         16        0
  other_disease      223      185
  pneumonia           24        0
  PSLE                82        0
  sarcoidosis         57        0
  SLE                 28        0
  Staph               40        0
  Still               31        0
  Strep               12        0
  TB                 335      166

In [77]:
write.csv(test.pheno, "data/Illumina_microarray_GSE_validation/Illumina_microarray_validation_pheno.csv")

## Analyze the number of genes in each microarray and determine how to do overlap

In [63]:
lapply(test.data$exprs, function(x) {dim(x)[1]})

In [71]:
lapply(test.data$exprs, function(x) {dim(x)[1]})

### Exclude GSE42832 and GSE42825 because they have fewer genes. Keep GSE19491 as the common denominator since it was in my RNA-seq validation

In [65]:
# Remove the expression tables of the two excluded series.
test.data$exprs[c("GSE42825", "GSE42832")] <- NULL

## Merge the expression datasets

In [83]:
# Genes (row names) present in both GSE19491 and GSE42830; print how many there
# are and show the first few.
# NOTE(review): only these two series are intersected. A gene absent from any
# other series would come out of a left-join merge as NA for that series -
# confirm the remaining series share the GSE42830 gene set.
genes.to.keep <- Reduce(
    intersect,
    lapply(test.data$exprs[c("GSE19491", "GSE42830")], rownames)
)
length(genes.to.keep)
head(genes.to.keep)

In [80]:
# Column-bind several expression tables by row name, starting from one seed.
#
# The seed table is left-joined (merge(..., by = "row.names", all.x = TRUE))
# with each of the other tables in turn, so every row of the seed is kept and
# rows missing from another table are filled with NA for that table's columns.
#
# Args:
#   datasets:  named list of data frames whose row names are the join key.
#   seed.data: name of the element of `datasets` to start from.
#
# Returns:
#   A data frame with the merged columns and the join key restored as row names.
#
# Side effects:
#   Prints the names of the tables merged onto the seed.
#   Warns when a table shares column names with the data merged so far, since
#   merge() then renames those columns with ".x"/".y" suffixes.
#
# Raises:
#   An error when `seed.data` is not a single name found in `names(datasets)`.
merge.exprs <- function(datasets, seed.data) {
    if (!(is.character(seed.data) && length(seed.data) == 1 &&
          seed.data %in% names(datasets))) {
        stop("seed.data must be a single name present in names(datasets)",
             call. = FALSE)
    }

    merged <- datasets[[seed.data]]
    remaining <- setdiff(names(datasets), seed.data)
    print(remaining)

    for (GSE in remaining) {
        clashing <- intersect(colnames(merged), colnames(datasets[[GSE]]))
        if (length(clashing) > 0) {
            warning(GSE, " repeats ", length(clashing),
                    " column name(s) already merged; merge() will add suffixes",
                    call. = FALSE)
        }
        merged <- merge(merged, datasets[[GSE]], by = "row.names", all.x = TRUE)
        # merge() returns the key as a `Row.names` column; turn it back into
        # row names so the next iteration can join on them again.
        rownames(merged) <- merged$Row.names
        merged$Row.names <- NULL
    }

    merged
}

In [84]:
# Restrict every expression table to the rows listed in `genes.to.keep`.
# drop = FALSE keeps each result a data frame even if a table has one column.
test.data$exprs.fil <- lapply(test.data$exprs, function(exprs) {
    exprs[rownames(exprs) %in% genes.to.keep, , drop = FALSE]
})

In [85]:
lapply(test.data$exprs.fil, function(x) {dim(x)[1]})

In [86]:
test.merge = merge.exprs(test.data$exprs.fil, "GSE19491")

[1] "GSE37250" "GSE39939" "GSE39940" "GSE42826" "GSE42827" "GSE42830" "GSE42831"


In [88]:
length(intersect(rownames(test.pheno), colnames(test.merge)))

In [89]:
# Keep only the samples that also have phenotype rows, ordered as in
# `test.pheno`. Selecting by name (rather than with an %in% mask) aligns the
# expression columns to the phenotype row order instead of the merge order.
test.merge <- test.merge[
    ,
    intersect(rownames(test.pheno), colnames(test.merge)),
    drop = FALSE
]

In [91]:
identical(rownames(test.pheno), colnames(test.merge))

In [90]:
dim(test.pheno)
dim(test.merge)

In [92]:
write.csv(test.merge, "data/Illumina_microarray_GSE_validation/Illumina_microarray_validation_exprs.csv")