# Add datasets from Amato et al. and Liu et al.

In [1]:
# Print the current working directory; the relative paths used later in this
# notebook ("Liu.csv", "Amato.csv") are resolved against it.
getwd()

In [2]:
# plyr is attached before dplyr, so dplyr's versions win wherever the two
# packages share a function name (see the masking messages printed below).
# plyr supplies rbind.fill() and caret supplies createDataPartition(), both
# used further down.
library(plyr)
library(dplyr)
library(scales)
library(caret)

# Fix the RNG seed so the random train/test partition created later in the
# notebook is repeatable when the cells are run in order.
set.seed(64)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:plyr’:

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: lattice

Loading required package: ggplot2



## Read Gide Hugo Riaz AntiPD1 data

In [3]:
# Load the previously merged Gide/Hugo/Riaz anti-PD1 table.
# check.names=FALSE keeps the CSV headers verbatim; several columns have purely
# numeric names (presumably gene identifiers -- TODO confirm) that read.csv
# would otherwise rewrite into syntactic R names.
GideHugoRiaz_AntiPD1 <- read.csv("/home/jp/ICP_Responders/DataCollectionFormatting/MergeTables/GideHugoRiaz_AntiPD1.csv", check.names=FALSE)

In [4]:
# Preview the first rows to check that the table loaded with the expected
# columns and types.
head(GideHugoRiaz_AntiPD1)

Unnamed: 0_level_0,Patient,OS,OS.Event,RECIST,Age,Gender,Response,1,503538,2,⋯,11130,7789,158586,79364,440590,79699,7791,23140,26009,Source
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,1,147,1,0,48,1,0,,,,⋯,,,,,,,,,,GideAntiPD1
2,10,551,0,0,90,0,0,0.44967917,-0.1256063,-0.9428421,⋯,1.5168029,0.2051403085,0.2497006,0.50803986,0.70851142,0.45721398,-0.7961158,0.2429606,1.18213316,GideAntiPD1
3,11,166,1,0,66,0,0,-1.16982847,-0.6828899,-0.8272336,⋯,-1.0120033,0.0005757279,-0.4148035,0.05113678,0.09727642,-0.16385787,-0.2469282,0.1171039,-0.74993078,GideAntiPD1
4,12,199,1,1,52,0,0,-0.04601746,1.5028679,-0.8496055,⋯,-0.6297312,0.1098136507,-0.4097749,-0.18345745,0.16842307,-0.19196153,0.6580495,0.9784882,-0.27105096,GideAntiPD1
5,13,96,1,0,69,0,0,0.67577556,-0.4820313,-1.6330965,⋯,1.1309116,-0.277710882,-0.1297598,-0.0600326,-0.12662805,-0.13856663,-1.2178791,-0.2099152,0.08612588,GideAntiPD1
6,14,993,1,0,58,0,0,-0.11535008,-0.1847109,-0.455336,⋯,-0.1761315,-0.3655941746,-0.7592193,0.41185003,-0.11559656,-0.05134074,-0.2665028,-0.3051941,-0.84401986,GideAntiPD1


## Add Amato and Liu datasets

In [3]:
# Load the Liu and Amato cohort tables, keeping the CSV headers verbatim
# (check.names=FALSE) so their column names can be matched against the merged
# Gide/Hugo/Riaz table.
# NOTE(review): these are relative paths, so they depend on the working
# directory, unlike the absolute paths used elsewhere in this notebook.
liu <- read.csv("Liu.csv", check.names=FALSE)
amato <- read.csv("Amato.csv", check.names=FALSE)

In [4]:
# Report the number of rows and columns in the Liu table.
dim(liu)

In [5]:
# Recode the Response label of the Amato and Liu cohorts from "Y"/"N" to
# integer 1/0, then tag every row with its cohort of origin.
#
# Why a helper instead of `df[df$Response == "Y", "Response"] <- 1`:
#   * a logical row index containing NA makes the data-frame assignment fail
#     ("missing values are not allowed in subscripted assignments"), so a
#     single missing Response value would abort the cell;
#   * if Response is a factor (the read.csv default before R 4.0), assigning 1
#     produces NA and as.integer() then returns level codes, not labels.
# Working on a character copy and matching with %in% (which never yields NA)
# avoids both problems and gives the same result for plain "Y"/"N" columns.
#
# x: vector of response labels ("Y"/"N", or values already coded as 0/1).
# Returns an integer vector of the same length; missing values stay NA, and
# any label that is neither "Y"/"N" nor an integer string becomes NA with the
# usual coercion warning.
recode_response <- function(x) {
  x <- as.character(x)
  x[x %in% "Y"] <- "1"
  x[x %in% "N"] <- "0"
  as.integer(x)
}

amato$Response <- recode_response(amato$Response)
liu$Response <- recode_response(liu$Response)

# Record the cohort of origin so rows stay traceable after merging.
liu$Source <- "LiuAntiPD1"
amato$Source <- "AmatoAntiPD1"

In [8]:
# Find the columns shared by all three tables: collect each table's column
# names, then fold the list with intersect().
common_col_names_pd1 <- Reduce(
  intersect,
  lapply(list(GideHugoRiaz_AntiPD1, liu, amato), names)
)

# Restrict every table to the shared columns, in the same column order.
GideHugoRiaz_AntiPD1_sub_pd1 <- GideHugoRiaz_AntiPD1[, common_col_names_pd1]
liu_sub_pd1 <- liu[, common_col_names_pd1]
amato_sub_pd1 <- amato[, common_col_names_pd1]

# Stack the three cohorts row-wise (Gide/Hugo/Riaz first, then Liu, then
# Amato) and show the last rows of the result.
GideHugoRiazAmatoLiu_AntiPD1 <- do.call(
  rbind.fill,
  list(GideHugoRiaz_AntiPD1_sub_pd1, liu_sub_pd1, amato_sub_pd1)
)
tail(GideHugoRiazAmatoLiu_AntiPD1)

Unnamed: 0_level_0,Patient,Response,1,503538,2,144571,144568,53947,8086,65985,⋯,11130,7789,158586,79364,440590,79699,7791,23140,26009,Source
Unnamed: 0_level_1,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
200,Sample_09_MB_3838,0,5.02768,2.26477661,8.438858,2.7710172,-9.965784,1.09326268,5.823064,1.717662,⋯,1.887584,-1.719409,-0.80585746,3.639116,-1.991938,0.8673825,6.046609,3.393444,2.103484,AmatoAntiPD1
201,Sample_10_MB_2786,0,3.345794,1.67680899,7.548968,-2.4127387,-9.965784,0.07054046,4.600829,3.308489,⋯,3.637587,-1.435475,0.35039541,2.935084,-9.965784,0.9410462,6.247924,3.369159,2.580196,AmatoAntiPD1
202,Sample_11_MB_2855,0,2.264726,-1.39103183,4.350009,-2.6968073,-3.916469,-0.51541843,5.461306,2.77115,⋯,3.362021,-1.918022,-0.02245629,2.469852,-9.965784,0.1489992,7.33731,3.319098,3.273065,AmatoAntiPD1
203,Sample_12_MB_3316,0,3.912621,1.59729756,5.930177,-0.6404878,-5.350815,1.44436493,4.843733,3.206188,⋯,3.530395,-1.511876,-0.06183919,3.48926,-9.965784,0.646559,6.761897,3.321501,2.410561,AmatoAntiPD1
204,Sample_13_MB_3432,0,3.980784,0.57359751,6.099373,-4.7721978,-9.965784,1.18828064,4.656788,2.969366,⋯,5.07931,-1.519859,-0.07129389,3.558721,-9.965784,0.2066057,6.861993,2.619746,2.895762,AmatoAntiPD1
205,Sample_14_MB_4117,0,2.809468,0.09937601,8.358831,-2.3488679,-6.170485,-1.36468951,3.033005,2.617985,⋯,3.661088,-0.449383,0.64732372,2.597178,-9.965784,2.1150033,7.261879,1.52443,2.690998,AmatoAntiPD1


# Make train and test subsets

In [9]:
# Split the merged cohort 80/20 into train and test sets and save the full
# table plus both subsets as .rds files.
#
# Changes relative to the earlier version of this cell:
#   * Response is passed as a factor. createDataPartition() samples within
#     each class only for factor outcomes; a numeric outcome is binned by
#     percentiles instead, which for a 0/1 vector does not guarantee that
#     responders and non-responders are balanced across the two subsets.
#   * The seed is set in this cell, so the partition is the same no matter
#     which cells ran before it (same seed value as the notebook setup cell).
#   * `list = FALSE` replaces `F`, which is an ordinary variable that can be
#     reassigned.
#   * The output directory is defined once; the resulting file paths are
#     unchanged.
set.seed(64)
train_pos <- createDataPartition(
  factor(GideHugoRiazAmatoLiu_AntiPD1$Response),
  p = 0.8,
  list = FALSE
)
train <- GideHugoRiazAmatoLiu_AntiPD1[train_pos, ]
test <- GideHugoRiazAmatoLiu_AntiPD1[-train_pos, ]

# Report the subset sizes as a sanity check on the 80/20 split.
nrow(train)
nrow(test)
# nrow(GideHugoRiazAmatoLiu_AntiPD1_filtered)

out_dir <- "/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection"

saveRDS(
  GideHugoRiazAmatoLiu_AntiPD1,
  file.path(out_dir, "GideHugoRiazAmatoLiu_AntiPD1.rds")
)

# saveRDS(GideHugoRiazAmatoLiu_AntiPD1_filtered, "/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/GideHugoRiazAmatoLiu_AntiPD1_filtered.rds")

saveRDS(train, file.path(out_dir, "GideHugoRiazAmatoLiu_AntiPD1_train.rds"))

saveRDS(test, file.path(out_dir, "GideHugoRiazAmatoLiu_AntiPD1_test.rds"))

# Clean Liu

In [6]:
# # Check which columns have > 50% NA values
# countNA <- function(x=NULL,cutOff=NULL){
#   output<-FALSE
#   perc<-sum(is.na(x))*100/length(x)
#   if(perc>cutOff){output<-TRUE}
#   output  
# }
# col_nas <- apply(liu,2,function(x){countNA(x, 50)})
# cat("Columns with NAs > 50% = ", sum(col_nas), "\n")
# # all columns have <50% NAs

# # Check which rows have > 50% NA values
# row_nas <- apply(liu,1,function(x){countNA(x, 50)})
# cat("Rows with NAs > 50% = ", sum(row_nas), "\n")
# # all rows have <50% NAs


# cat("Dimensions of the filtered dataset = ", dim(liu))

Columns with NAs > 50% =  0 
Rows with NAs > 50% =  0 
Dimensions of the filtered dataset =  74 20851

# Divide Liu into test and train

In [8]:
# train_pos <- createDataPartition(liu$Response, p = 0.8, list = F)
# liu_train <- liu[train_pos, ]
# liu_test <- liu[-train_pos, ]

# nrow(liu_train)
# nrow(liu_test)
# nrow(liu)

# saveRDS(liu_train, 
#         "/home/jp/ICP_Responders/DataCollectionFormatting/MergeTables/Liu_AntiPD1_train.rds")

# saveRDS(liu_test, 
#         "/home/jp/ICP_Responders/DataCollectionFormatting/MergeTables/Liu_AntiPD1_test.rds")

# Write into csv file

In [13]:
# Export the merged table as CSV next to the other merged tables.
# row.names = FALSE leaves the automatic row index out of the file.
write.csv(GideHugoRiazAmatoLiu_AntiPD1, "/home/jp/ICP_Responders/DataCollectionFormatting/MergeTables/GideHugoRiazAmatoLiu_AntiPD1.csv", row.names = FALSE)

In [14]:
# Preview the first rows of the merged table as a final check of its columns
# and types.
head(GideHugoRiazAmatoLiu_AntiPD1)

Unnamed: 0_level_0,Patient,Response,1,503538,2,144571,144568,53947,8086,65985,⋯,11130,7789,158586,79364,440590,79699,7791,23140,26009,Source
Unnamed: 0_level_1,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,1,0,,,,,,,,,⋯,,,,,,,,,,GideAntiPD1
2,10,0,0.44967917,-0.1256063,-0.9428421,-0.6138966,0.3001431,-0.8007014,-0.20012314,0.6050572,⋯,1.5168029,0.2051403085,0.2497006,0.50803986,0.70851142,0.45721398,-0.7961158,0.2429606,1.18213316,GideAntiPD1
3,11,0,-1.16982847,-0.6828899,-0.8272336,-0.4466098,4.9701435,1.267812,-0.15970481,2.302274,⋯,-1.0120033,0.0005757279,-0.4148035,0.05113678,0.09727642,-0.16385787,-0.2469282,0.1171039,-0.74993078,GideAntiPD1
4,12,0,-0.04601746,1.5028679,-0.8496055,-0.7025466,0.6318195,1.4189363,0.74020824,0.6934887,⋯,-0.6297312,0.1098136507,-0.4097749,-0.18345745,0.16842307,-0.19196153,0.6580495,0.9784882,-0.27105096,GideAntiPD1
5,13,0,0.67577556,-0.4820313,-1.6330965,0.4909002,2.3912536,-0.7919329,-0.07076524,0.6020135,⋯,1.1309116,-0.277710882,-0.1297598,-0.0600326,-0.12662805,-0.13856663,-1.2178791,-0.2099152,0.08612588,GideAntiPD1
6,14,0,-0.11535008,-0.1847109,-0.455336,0.1180231,3.3019088,-0.4543087,0.24283611,0.1082517,⋯,-0.1761315,-0.3655941746,-0.7592193,0.41185003,-0.11559656,-0.05134074,-0.2665028,-0.3051941,-0.84401986,GideAntiPD1
