# Testing the model on new data

## Import Libraries

In [1]:
library('org.Hs.eg.db')
library(caret)
library(GeneAnswers)
library(openxlsx)

Loading required package: AnnotationDbi

Loading required package: stats4

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min


Loading required package: Biobase

Welco

## Download the data

[Liu et al, 2019](https://www.nature.com/articles/s41591-019-0654-5)<br>
[Suplementary Data](https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-019-0654-5/MediaObjects/41591_2019_654_MOESM2_ESM.xlsx)<br>
[tpm matrix](https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-019-0654-5/MediaObjects/41591_2019_654_MOESM3_ESM.txt)<br>
[Data Guidelines](https://academic.oup.com/jnci/article/92/3/205/2965042)

In [2]:
system("wget --content-disposition https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-019-0654-5/MediaObjects/41591_2019_654_MOESM3_ESM.txt")
system("wget --content-disposition https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-019-0654-5/MediaObjects/41591_2019_654_MOESM4_ESM.xlsx")

## Read the data

In [3]:
expr <- read.csv("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1Pre-processingFeatureSelection/Model testing/41591_2019_654_MOESM3_ESM.txt", check.names=FALSE, sep="\t")  # 121 patients(manually verified)
colnames(expr)[1] <- "Patient"
expr <- expr[order(expr$Patient), ]
tail(expr)
dim(expr)

Unnamed: 0_level_0,Patient,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A3GALT2,A4GALT,A4GNT,⋯,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
102,Patient87,3.958186,0.4757435,0.05708922,205.5402,0.5328327,0.01902974,0.0,0.9324572,0.30447583,⋯,34.86248,25.499851,1.389171,2.264539,8.201818,0.51380296,16.63199,50.6762,30.37146,29.95281
108,Patient9,17.70744,0.8023684,0.0,102.6201,0.05533575,0.24901087,0.0,2.434773,0.13833937,⋯,14.80231,18.454473,3.403149,4.42686,12.146197,0.0,20.64023,90.47395,38.01566,26.50582
104,Patient94,14.144119,0.7752792,0.01890925,329.5315,0.30254799,2.93093366,0.05672775,1.0967365,0.26472949,⋯,36.47594,60.944511,2.136745,3.744031,9.624808,0.075637,22.44528,71.06096,28.32606,18.6067
105,Patient96,14.90618,1.3364161,0.0,290.5163,0.2570031,0.20560248,0.05140062,0.3084037,0.05140062,⋯,13.77537,1.028012,1.953224,2.672832,9.714717,0.0,14.44357,57.8257,20.09764,12.64455
106,Patient98,13.099469,1.2362082,0.08675145,873.11,0.65063589,0.06506359,0.0,1.0843932,0.21687863,⋯,12.79584,8.545018,2.927862,3.665249,9.065527,0.17350291,19.49739,73.54354,38.58271,22.98913
107,Patient99,19.99728,0.815055,0.0,727.882,0.51177873,0.15163814,0.03790954,1.4595171,0.17059291,⋯,20.41428,15.06904,2.293527,2.615758,8.529645,0.01895477,22.17708,126.23875,38.87623,20.77443


### Convert values to log2(x)

In [4]:
expr[expr == 0] <- 0.001

# convert values to log
expr[2:ncol(expr)] <- log(expr[2:ncol(expr)], 2)

### Map Gene Names to Entrez IDs

In [5]:
# mapping gene names to Entrez IDs
colnames(expr)[2:ncol(expr)] <- mapIds(org.Hs.eg.db, colnames(expr)[2:ncol(expr)], 'ENTREZID', 'ALIAS')

'select()' returned 1:many mapping between keys and columns



## Clinical Data

In [10]:
clinical <- read.xlsx("41591_2019_654_MOESM4_ESM.xlsx", sheet=1, startRow = 3)
clinical <- clinical[1:144,]
colnames(clinical)[1] <- "Patient"
clinical <- clinical[,c("Patient", "gender.(Male=1,.Female=0)", "BR", "priorCTLA4")]
clinical <- clinical[clinical$priorCTLA4 == 0, ] # ONLY 84 out of 144 have no prior AntiCTLA4 therapy
clinical <- clinical[order(clinical$Patient), ]
colnames(clinical)[2:3] <- c("Gender", "Response")
clinical[clinical == "PD"] <- "N"
clinical[clinical == "SD"] <- "N"
clinical[clinical == "PR"] <- "Y"
clinical[clinical == "MR"] <- "Y"
clinical[clinical == "CR"] <- "Y"
clinical$Response <- as.factor(clinical$Response)
table(clinical$Response)
dim(clinical)
head(clinical)

Unnamed: 0_level_0,X1,total_muts,nonsyn_muts,clonal_muts,subclonal_muts,heterogeneity,total_neoantigens,CNA_prop,"gender.(Male=1,.Female=0)",biopsy.site,⋯,postCTLA4,postMAPKTx,postCombinedCTLA_PD1,numPriorTherapies,biopsy.site_categ,biopsyContext.(1=Pre-Ipi;.2=On-Ipi;.3=Pre-PD1;.4=On-PD1),daysBiopsyToPD1,daysBiopsyAfterIpiStart,purity,ploidy
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,⋯,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
1,Patient1,34,22,12,10,0.4545455,49,0.32141702,0,skin,⋯,0,0,0,1,skin,3,-84,unk,0.92,1.73
2,Patient10,96,71,48,22,0.3142857,230,0.39138395,0,skin,⋯,1,1,0,2,skin,3,-12,107,0.83,1.84
3,Patient100,200,126,98,24,0.1967213,301,0.02944673,0,skin,⋯,0,0,0,1,skin,3,-94,33,0.11,2.17
4,Patient102,370,246,215,26,0.1078838,825,0.16938949,1,brain,⋯,0,0,0,0,brain,3,-64,na,0.7,3.24
5,Patient104,130,96,65,28,0.3010753,329,0.20651842,0,lymph node,⋯,0,0,0,0,lymph node,3,-57,na,0.86,4.58
6,Patient105,185,125,85,23,0.212963,334,0.39430586,0,skin,⋯,0,0,1,0,skin,3,-22,na,0.86,2.42


# Mapping response to expression

In [7]:
expr_res <- merge(expr, clinical[, c("Patient", "Response")], by = 'Patient')
head(expr_res)

“column names ‘146’, ‘205’, ‘26289’, ‘215’, ‘283373’, ‘56899’, ‘23780’, ‘334’, ‘231’, ‘26286’, ‘57584’, ‘51326’, ‘379’, ‘158’, ‘146712’, ‘587’, ‘55859’, ‘40’, ‘552900’, ‘220869’, ‘1238’, ‘1238’, ‘1016’, ‘55602’, ‘1066’, ‘29082’, ‘1120’, ‘29097’, ‘1371’, ‘1513’, ‘100130361’, ‘1543’, ‘1548’, ‘1564’, ‘26’, ‘8642’, ‘1649’, ‘1663’, ‘100287029’, ‘55789’, ‘27351’, ‘26220’, ‘1719’, ‘420’, ‘49860’, ‘1808’, ‘1824’, ‘8655’, ‘1942’, ‘5610’, ‘1965’, ‘8663’, ‘3692’, ‘8507’, ‘54869’, ‘23265’, ‘1272’, ‘2175’, ‘1056’, ‘55294’, ‘2213’, ‘84824’, ‘2260’, ‘286380’, ‘6624’, ‘2553’, ‘79623’, ‘1258’, ‘2657’, ‘2645’, ‘2720’, ‘2776’, ‘55889’, ‘23015’, ‘643699’, ‘2865’, ‘1798’, ‘653188’, ‘3030’, ‘328’, ‘9734’, ‘220296’, ‘10614’, ‘3017’, ‘3113’, ‘2775’, ‘3077’, ‘2034’, ‘773’, ‘1659’, ‘1665’, ‘726’, ‘10643’, ‘8570’, ‘688’, ‘8507’, ‘55975’, ‘3853’, ‘57830’, ‘3846’, ‘22866’, ‘7852’, ‘1962’, ‘11025’, ‘96626’, ‘NA’, ‘1056’, ‘3990’, ‘8048’, ‘80856’, ‘3936’, ‘4038’, ‘4034’, ‘4034’, ‘4053’, ‘2108’, ‘10219’, ‘4102’, ‘5143

Unnamed: 0_level_0,Patient,1,503538,29974,2,144571,144568,127550,53947,51146,⋯,11130,7789,158586,79364,440590,79699,7791,23140,26009,Response
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
1,Patient102,4.327359,-0.8475669,-2.262604,9.663733,-1.7320897,-9.965784,-9.965784,1.2113268,-0.5745484,⋯,2.701897,1.4003606,1.834257,3.762227,-3.847567,4.796289,7.912183,5.170169,4.024851,Y
2,Patient105,3.701457,-9.9657843,-9.965784,6.76131,-0.6234737,-4.208436,-3.208436,-0.8161188,-2.8865081,⋯,2.946382,-0.6848742,1.020383,2.209416,-4.208436,4.649545,4.322945,3.519484,5.063027,Y
3,Patient106,4.19874,-0.147657,-9.965784,6.551687,-3.3955845,-3.073656,-9.965784,-1.2256595,-3.0736564,⋯,3.462396,0.9263436,1.659698,3.092256,-5.395585,4.280373,5.412575,4.915028,4.160922,Y
4,Patient107,-9.965784,-9.9657843,-9.965784,9.078291,-9.9657843,-9.965784,-9.965784,-9.9657843,-9.9657843,⋯,-9.965784,-9.9657843,-9.965784,-9.965784,-9.965784,-9.965784,8.560465,4.292071,-9.965784,Y
5,Patient108,-0.722509,-9.9657843,-9.965784,9.443654,-9.9657843,-9.965784,-9.965784,-9.9657843,-9.9657843,⋯,-9.965784,-9.9657843,-1.137546,3.703756,-9.965784,2.921347,6.015583,5.499078,2.581272,N
6,Patient112,4.152404,0.2525682,-3.104984,8.719776,-1.1049838,3.024299,-9.965784,2.3044072,-9.9657843,⋯,2.966479,1.5243729,2.064941,3.819829,-9.965784,3.386869,6.673093,5.017844,3.922922,N


## Clean data

In [8]:
write.csv(expr_res, "/home/jp/ICP_Responders/DataCollectionFormatting/MergeTables/Liu.csv", row.names = FALSE)

In [9]:
dim(expr_res)

# Modelling

In [7]:
expr_mat <- expr_res[,-c(ncol(expr_res))] # to remove last column i.e. Response

## GBM

### MergedRank200

In [8]:
gbm_mergedRank <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/gbm_mergedRank.rds")
feature.mergedRank <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.mergedRank.rds")

res <- list()
tmp <- feature.mergedRank[, -which(colnames(feature.mergedRank) %in% "Response")]


for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

gbm_inp_mergedRank <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(gbm_inp_mergedRank) %in% 0.001)

# # Replace 0 with 0.001
# gbm_inp_mergedRank[gbm_inp_mergedRank == 0] <- 0.001

# # convert values to log
# gbm_inp_mergedRank <- log(gbm_inp_mergedRank, 2)

gbm_predictions_mergedRank <- predict(gbm_mergedRank, gbm_inp_mergedRank)
confusionMatrix(data = gbm_predictions_mergedRank, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N  9  3
         Y 32 30
                                          
               Accuracy : 0.527           
                 95% CI : (0.4075, 0.6443)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.7214          
                                          
                  Kappa : 0.1184          
                                          
 Mcnemar's Test P-Value : 2.214e-06       
                                          
            Sensitivity : 0.9091          
            Specificity : 0.2195          
         Pos Pred Value : 0.4839          
         Neg Pred Value : 0.7500          
             Prevalence : 0.4459          
         Detection Rate : 0.4054          
   Detection Prevalence : 0.8378          
      Balanced Accuracy : 0.5643          
                                          
       'Positive' Class : Y               
                                    

### Variance

In [9]:
gbm_top200var <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/gbm_top200var.rds")
feature.top200var <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.top200var.rds")

res <- list()
tmp <- feature.top200var[, -which(colnames(feature.top200var) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

gbm_inp_top200var <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(gbm_inp_top200var) %in% 0.001)

# # Replace 0 with 0.001
# gbm_inp_top200var[gbm_inp_top200var == 0] <- 0.001

# # convert values to log
# gbm_inp_top200var <- log(gbm_inp_top200var, 2)

gbm_predictions_top200var <- predict(gbm_top200var, gbm_inp_top200var)
confusionMatrix(data = gbm_predictions_top200var, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 11 11
         Y 30 22
                                          
               Accuracy : 0.4459          
                 95% CI : (0.3302, 0.5661)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.976264        
                                          
                  Kappa : -0.0616         
                                          
 Mcnemar's Test P-Value : 0.004937        
                                          
            Sensitivity : 0.6667          
            Specificity : 0.2683          
         Pos Pred Value : 0.4231          
         Neg Pred Value : 0.5000          
             Prevalence : 0.4459          
         Detection Rate : 0.2973          
   Detection Prevalence : 0.7027          
      Balanced Accuracy : 0.4675          
                                          
       'Positive' Class : Y               
                                    

### FCBF

In [10]:
gbm_fcbf <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/gbm_fcbf.rds")
feature.fcbf <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.fcbf.rds")

res <- list()
tmp <- feature.fcbf[, -which(colnames(feature.fcbf) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

gbm_inp_fcbf <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(gbm_inp_fcbf) %in% 0.001)

# # Replace 0 with 0.001
# gbm_inp_fcbf[gbm_inp_fcbf == 0] <- 0.001

# # convert values to log
# gbm_inp_fcbf <- log(gbm_inp_fcbf, 2)

gbm_predictions_fcbf <- predict(gbm_fcbf, gbm_inp_fcbf)
confusionMatrix(data = gbm_predictions_fcbf, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N  9  4
         Y 32 29
                                          
               Accuracy : 0.5135          
                 95% CI : (0.3944, 0.6315)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.7938          
                                          
                  Kappa : 0.0908          
                                          
 Mcnemar's Test P-Value : 6.795e-06       
                                          
            Sensitivity : 0.8788          
            Specificity : 0.2195          
         Pos Pred Value : 0.4754          
         Neg Pred Value : 0.6923          
             Prevalence : 0.4459          
         Detection Rate : 0.3919          
   Detection Prevalence : 0.8243          
      Balanced Accuracy : 0.5492          
                                          
       'Positive' Class : Y               
                                    

### Limma

In [11]:
gbm_limma <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/gbm_limma.rds")
feature.limma <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.limma.rds")

res <- list()
tmp <- feature.limma[, -which(colnames(feature.limma) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

gbm_inp_limma <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(gbm_inp_limma) %in% 0.001)

# # Replace 0 with 0.001
# gbm_inp_limma[gbm_inp_limma == 0] <- 0.001

# # convert values to log
# gbm_inp_limma <- log(gbm_inp_limma, 2)

gbm_predictions_limma <- predict(gbm_limma, gbm_inp_limma)
confusionMatrix(data = gbm_predictions_limma, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 13  4
         Y 28 29
                                          
               Accuracy : 0.5676          
                 95% CI : (0.4472, 0.6823)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.4552          
                                          
                  Kappa : 0.1829          
                                          
 Mcnemar's Test P-Value : 4.785e-05       
                                          
            Sensitivity : 0.8788          
            Specificity : 0.3171          
         Pos Pred Value : 0.5088          
         Neg Pred Value : 0.7647          
             Prevalence : 0.4459          
         Detection Rate : 0.3919          
   Detection Prevalence : 0.7703          
      Balanced Accuracy : 0.5979          
                                          
       'Positive' Class : Y               
                                    

### Random Forest Permutation

In [12]:
gbm_ranger <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/gbm_ranger.rds")
feature.ranger <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.ranger.rds")

res <- list()
tmp <- feature.ranger[, -which(colnames(feature.ranger) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

gbm_inp_ranger <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(gbm_inp_ranger) %in% 0.001)

# # Replace 0 with 0.001
# gbm_inp_ranger[gbm_inp_ranger == 0] <- 0.001

# # convert values to log
# gbm_inp_ranger <- log(gbm_inp_ranger, 2)

gbm_predictions_ranger <- predict(gbm_ranger, gbm_inp_ranger)
confusionMatrix(data = gbm_predictions_ranger, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 22 10
         Y 19 23
                                          
               Accuracy : 0.6081          
                 95% CI : (0.4877, 0.7196)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.2070          
                                          
                  Kappa : 0.2275          
                                          
 Mcnemar's Test P-Value : 0.1374          
                                          
            Sensitivity : 0.6970          
            Specificity : 0.5366          
         Pos Pred Value : 0.5476          
         Neg Pred Value : 0.6875          
             Prevalence : 0.4459          
         Detection Rate : 0.3108          
   Detection Prevalence : 0.5676          
      Balanced Accuracy : 0.6168          
                                          
       'Positive' Class : Y               
                                    

### Mutual Information

In [14]:
gbm_jmim <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/gbm_jmim.rds")
feature.jmim <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.jmim.rds")

res <- list()
tmp <- feature.jmim[, -which(colnames(feature.jmim) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

gbm_inp_jmim <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(gbm_inp_jmim) %in% 0.001)

# # Replace 0 with 0.001
# gbm_inp_jmim[gbm_inp_jmim == 0] <- 0.001

# # convert values to log
# gbm_inp_jmim <- log(gbm_inp_jmim, 2)

gbm_predictions_jmim <- predict(gbm_jmim, gbm_inp_jmim)
confusionMatrix(data = gbm_predictions_jmim, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 19 12
         Y 22 21
                                          
               Accuracy : 0.5405          
                 95% CI : (0.4207, 0.6571)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.6384          
                                          
                  Kappa : 0.0969          
                                          
 Mcnemar's Test P-Value : 0.1227          
                                          
            Sensitivity : 0.6364          
            Specificity : 0.4634          
         Pos Pred Value : 0.4884          
         Neg Pred Value : 0.6129          
             Prevalence : 0.4459          
         Detection Rate : 0.2838          
   Detection Prevalence : 0.5811          
      Balanced Accuracy : 0.5499          
                                          
       'Positive' Class : Y               
                                    

### Literature Survey

In [15]:
gbm_litSur <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/gbm_litSur.rds")
feature.litSur <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.litSur.rds")

res <- list()
tmp <- feature.litSur[, -which(colnames(feature.litSur) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

gbm_inp_litSur <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(gbm_inp_litSur) %in% 0.001)

# # Replace 0 with 0.001
# gbm_inp_litSur[gbm_inp_litSur == 0] <- 0.001

# # convert values to log
# gbm_inp_litSur <- log(gbm_inp_litSur, 2)

gbm_predictions_litSur <- predict(gbm_litSur, gbm_inp_litSur)
confusionMatrix(data = gbm_predictions_litSur, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 31 21
         Y 10 12
                                          
               Accuracy : 0.5811          
                 95% CI : (0.4606, 0.6949)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.36440         
                                          
                  Kappa : 0.1238          
                                          
 Mcnemar's Test P-Value : 0.07249         
                                          
            Sensitivity : 0.3636          
            Specificity : 0.7561          
         Pos Pred Value : 0.5455          
         Neg Pred Value : 0.5962          
             Prevalence : 0.4459          
         Detection Rate : 0.1622          
   Detection Prevalence : 0.2973          
      Balanced Accuracy : 0.5599          
                                          
       'Positive' Class : Y               
                                    

### Prat et al.

In [16]:
gbm_prat <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/gbm_prat.rds")
feature.prat <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.prat.rds")

res <- list()
tmp <- feature.prat[, -which(colnames(feature.prat) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

gbm_inp_prat <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(gbm_inp_prat) %in% 0.001)

# # Replace 0 with 0.001
# gbm_inp_prat[gbm_inp_prat == 0] <- 0.001

# # convert values to log
# gbm_inp_prat <- log(gbm_inp_prat, 2)

gbm_predictions_prat <- predict(gbm_prat, gbm_inp_prat)
confusionMatrix(data = gbm_predictions_prat, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N  9  6
         Y 32 27
                                          
               Accuracy : 0.4865          
                 95% CI : (0.3685, 0.6056)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.9005          
                                          
                  Kappa : 0.035           
                                          
 Mcnemar's Test P-Value : 5.002e-05       
                                          
            Sensitivity : 0.8182          
            Specificity : 0.2195          
         Pos Pred Value : 0.4576          
         Neg Pred Value : 0.6000          
             Prevalence : 0.4459          
         Detection Rate : 0.3649          
   Detection Prevalence : 0.7973          
      Balanced Accuracy : 0.5188          
                                          
       'Positive' Class : Y               
                                    

## Cforest

### MergedRank200

In [17]:
cforest_mergedRank <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/cforest_mergedRank.rds")
feature.mergedRank <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.mergedRank.rds")

res <- list()
tmp <- feature.mergedRank[, -which(colnames(feature.mergedRank) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

cforest_inp_mergedRank <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(cforest_inp_mergedRank) %in% 0.001)

# # Replace 0 with 0.001
# cforest_inp_mergedRank[cforest_inp_mergedRank == 0] <- 0.001

# # convert values to log
# cforest_inp_mergedRank <- log(cforest_inp_mergedRank, 2)

cforest_predictions_mergedRank <- predict(cforest_mergedRank, cforest_inp_mergedRank)
confusionMatrix(data = cforest_predictions_mergedRank, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 39 30
         Y  2  3
                                          
               Accuracy : 0.5676          
                 95% CI : (0.4472, 0.6823)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.4552          
                                          
                  Kappa : 0.0459          
                                          
 Mcnemar's Test P-Value : 1.815e-06       
                                          
            Sensitivity : 0.09091         
            Specificity : 0.95122         
         Pos Pred Value : 0.60000         
         Neg Pred Value : 0.56522         
             Prevalence : 0.44595         
         Detection Rate : 0.04054         
   Detection Prevalence : 0.06757         
      Balanced Accuracy : 0.52106         
                                          
       'Positive' Class : Y               
                                    

### Variance

In [18]:
cforest_top200var <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/cforest_top200var.rds")
feature.top200var <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.top200var.rds")

res <- list()
tmp <- feature.top200var[, -which(colnames(feature.top200var) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

cforest_inp_top200var <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(cforest_inp_top200var) %in% 0.001)

# # Replace 0 with 0.001
# cforest_inp_top200var[cforest_inp_top200var == 0] <- 0.001

# # convert values to log
# cforest_inp_top200var <- log(cforest_inp_top200var, 2)

cforest_predictions_top200var <- predict(cforest_top200var, cforest_inp_top200var)
confusionMatrix(data = cforest_predictions_top200var, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 29 23
         Y 12 10
                                          
               Accuracy : 0.527           
                 95% CI : (0.4075, 0.6443)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.72140         
                                          
                  Kappa : 0.0107          
                                          
 Mcnemar's Test P-Value : 0.09097         
                                          
            Sensitivity : 0.3030          
            Specificity : 0.7073          
         Pos Pred Value : 0.4545          
         Neg Pred Value : 0.5577          
             Prevalence : 0.4459          
         Detection Rate : 0.1351          
   Detection Prevalence : 0.2973          
      Balanced Accuracy : 0.5052          
                                          
       'Positive' Class : Y               
                                    

### FCBF

In [19]:
cforest_fcbf <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/cforest_fcbf.rds")
feature.fcbf <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.fcbf.rds")

res <- list()
tmp <- feature.fcbf[, -which(colnames(feature.fcbf) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

cforest_inp_fcbf <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(cforest_inp_fcbf) %in% 0.001)

# # Replace 0 with 0.001
# cforest_inp_fcbf[cforest_inp_fcbf == 0] <- 0.001

# # convert values to log
# cforest_inp_fcbf <- log(cforest_inp_fcbf, 2)

cforest_predictions_fcbf <- predict(cforest_fcbf, cforest_inp_fcbf)
confusionMatrix(data = cforest_predictions_fcbf, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 17 10
         Y 24 23
                                          
               Accuracy : 0.5405          
                 95% CI : (0.4207, 0.6571)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.63839         
                                          
                  Kappa : 0.1072          
                                          
 Mcnemar's Test P-Value : 0.02578         
                                          
            Sensitivity : 0.6970          
            Specificity : 0.4146          
         Pos Pred Value : 0.4894          
         Neg Pred Value : 0.6296          
             Prevalence : 0.4459          
         Detection Rate : 0.3108          
   Detection Prevalence : 0.6351          
      Balanced Accuracy : 0.5558          
                                          
       'Positive' Class : Y               
                                    

### Limma

In [20]:
cforest_limma <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/cforest_limma.rds")
feature.limma <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.limma.rds")

res <- list()
tmp <- feature.limma[, -which(colnames(feature.limma) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

cforest_inp_limma <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(cforest_inp_limma) %in% 0.001)

# # Replace 0 with 0.001
# cforest_inp_limma[cforest_inp_limma == 0] <- 0.001

# # convert values to log
# cforest_inp_limma <- log(cforest_inp_limma, 2)

cforest_predictions_limma <- predict(cforest_limma, cforest_inp_limma)
confusionMatrix(data = cforest_predictions_limma, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 40 30
         Y  1  3
                                          
               Accuracy : 0.5811          
                 95% CI : (0.4606, 0.6949)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.3644          
                                          
                  Kappa : 0.0728          
                                          
 Mcnemar's Test P-Value : 4.932e-07       
                                          
            Sensitivity : 0.09091         
            Specificity : 0.97561         
         Pos Pred Value : 0.75000         
         Neg Pred Value : 0.57143         
             Prevalence : 0.44595         
         Detection Rate : 0.04054         
   Detection Prevalence : 0.05405         
      Balanced Accuracy : 0.53326         
                                          
       'Positive' Class : Y               
                                    

### Random Forest Permutation

In [21]:
cforest_ranger <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/cforest_ranger.rds")
feature.ranger <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.ranger.rds")

res <- list()
tmp <- feature.ranger[, -which(colnames(feature.ranger) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

cforest_inp_ranger <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(cforest_inp_ranger) %in% 0.001)

# # Replace 0 with 0.001
# cforest_inp_ranger[cforest_inp_ranger == 0] <- 0.001

# # convert values to log
# cforest_inp_ranger <- log(cforest_inp_ranger, 2)

cforest_predictions_ranger <- predict(cforest_ranger, cforest_inp_ranger)
confusionMatrix(data = cforest_predictions_ranger, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 33 20
         Y  8 13
                                          
               Accuracy : 0.6216          
                 95% CI : (0.5013, 0.7319)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.14617         
                                          
                  Kappa : 0.2061          
                                          
 Mcnemar's Test P-Value : 0.03764         
                                          
            Sensitivity : 0.3939          
            Specificity : 0.8049          
         Pos Pred Value : 0.6190          
         Neg Pred Value : 0.6226          
             Prevalence : 0.4459          
         Detection Rate : 0.1757          
   Detection Prevalence : 0.2838          
      Balanced Accuracy : 0.5994          
                                          
       'Positive' Class : Y               
                                    

### Mutual Information

In [22]:
cforest_jmim <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/cforest_jmim.rds")
feature.jmim <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.jmim.rds")

res <- list()
tmp <- feature.jmim[, -which(colnames(feature.jmim) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

cforest_inp_jmim <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(cforest_inp_jmim) %in% 0.001)

# # Replace 0 with 0.001
# cforest_inp_jmim[cforest_inp_jmim == 0] <- 0.001

# # convert values to log
# cforest_inp_jmim <- log(cforest_inp_jmim, 2)

cforest_predictions_jmim <- predict(cforest_jmim, cforest_inp_jmim)
confusionMatrix(data = cforest_predictions_jmim, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 37 26
         Y  4  7
                                          
               Accuracy : 0.5946          
                 95% CI : (0.4741, 0.7073)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.280480        
                                          
                  Kappa : 0.1225          
                                          
 Mcnemar's Test P-Value : 0.000126        
                                          
            Sensitivity : 0.21212         
            Specificity : 0.90244         
         Pos Pred Value : 0.63636         
         Neg Pred Value : 0.58730         
             Prevalence : 0.44595         
         Detection Rate : 0.09459         
   Detection Prevalence : 0.14865         
      Balanced Accuracy : 0.55728         
                                          
       'Positive' Class : Y               
                                    

### Literature Survey

In [23]:
cforest_litSur <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/cforest_litSur.rds")
feature.litSur <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.litSur.rds")

res <- list()
tmp <- feature.litSur[, -which(colnames(feature.litSur) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

cforest_inp_litSur <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(cforest_inp_litSur) %in% 0.001)

# # Replace 0 with 0.001
# cforest_inp_litSur[cforest_inp_litSur == 0] <- 0.001

# # convert values to log
# cforest_inp_litSur <- log(cforest_inp_litSur, 2)

cforest_predictions_litSur <- predict(cforest_litSur, cforest_inp_litSur)
confusionMatrix(data = cforest_predictions_litSur, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 30 24
         Y 11  9
                                          
               Accuracy : 0.527           
                 95% CI : (0.4075, 0.6443)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.72140         
                                          
                  Kappa : 0.0046          
                                          
 Mcnemar's Test P-Value : 0.04252         
                                          
            Sensitivity : 0.2727          
            Specificity : 0.7317          
         Pos Pred Value : 0.4500          
         Neg Pred Value : 0.5556          
             Prevalence : 0.4459          
         Detection Rate : 0.1216          
   Detection Prevalence : 0.2703          
      Balanced Accuracy : 0.5022          
                                          
       'Positive' Class : Y               
                                    

### Prat et al.

In [43]:
cforest_prat <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/cforest_prat.rds")
feature.prat <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.prat.rds")

res <- list()
tmp <- feature.prat[, -which(colnames(feature.prat) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

cforest_inp_prat <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(cforest_inp_prat) %in% 0.001)

# # Replace 0 with 0.001
# cforest_inp_prat[cforest_inp_prat == 0] <- 0.001

# # convert values to log
# cforest_inp_prat <- log(cforest_inp_prat, 2)

cforest_predictions_prat <- predict(cforest_prat, cforest_inp_prat)
confusionMatrix(data = cforest_predictions_prat, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N  3  1
         Y 38 32
                                          
               Accuracy : 0.473           
                 95% CI : (0.3557, 0.5925)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.9354          
                                          
                  Kappa : 0.0386          
                                          
 Mcnemar's Test P-Value : 8.185e-09       
                                          
            Sensitivity : 0.96970         
            Specificity : 0.07317         
         Pos Pred Value : 0.45714         
         Neg Pred Value : 0.75000         
             Prevalence : 0.44595         
         Detection Rate : 0.43243         
   Detection Prevalence : 0.94595         
      Balanced Accuracy : 0.52143         
                                          
       'Positive' Class : Y               
                                    

# Naive Bayes

## MergedRank200

In [25]:
nb_mergedRank <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/nb_mergedRank.rds")
feature.mergedRank <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.mergedRank.rds")

res <- list()
tmp <- feature.mergedRank[, -which(colnames(feature.mergedRank) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

nb_inp_mergedRank <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(nb_inp_mergedRank) %in% 0.001)

# # Replace 0 with 0.001
# nb_inp_mergedRank[nb_inp_mergedRank == 0] <- 0.001

# # convert values to log
# nb_inp_mergedRank <- log(nb_inp_mergedRank, 2)

nb_predictions_mergedRank <- predict(nb_mergedRank, nb_inp_mergedRank)
confusionMatrix(data = nb_predictions_mergedRank, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 27 21
         Y 14 12
                                          
               Accuracy : 0.527           
                 95% CI : (0.4075, 0.6443)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.7214          
                                          
                  Kappa : 0.0226          
                                          
 Mcnemar's Test P-Value : 0.3105          
                                          
            Sensitivity : 0.3636          
            Specificity : 0.6585          
         Pos Pred Value : 0.4615          
         Neg Pred Value : 0.5625          
             Prevalence : 0.4459          
         Detection Rate : 0.1622          
   Detection Prevalence : 0.3514          
      Balanced Accuracy : 0.5111          
                                          
       'Positive' Class : Y               
                                    

## Variance

In [26]:
nb_top200var <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/nb_top200var.rds")
feature.top200var <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.top200var.rds")

res <- list()
tmp <- feature.top200var[, -which(colnames(feature.top200var) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

nb_inp_top200var <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(nb_inp_top200var) %in% 0.001)

# # Replace 0 with 0.001
# nb_inp_top200var[nb_inp_top200var == 0] <- 0.001

# # convert values to log
# nb_inp_top200var <- log(nb_inp_top200var, 2)

nb_predictions_top200var <- predict(nb_top200var, nb_inp_top200var)
confusionMatrix(data = nb_predictions_top200var, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N  6  7
         Y 35 26
                                          
               Accuracy : 0.4324          
                 95% CI : (0.3177, 0.5528)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.9866          
                                          
                  Kappa : -0.0608         
                                          
 Mcnemar's Test P-Value : 3.097e-05       
                                          
            Sensitivity : 0.7879          
            Specificity : 0.1463          
         Pos Pred Value : 0.4262          
         Neg Pred Value : 0.4615          
             Prevalence : 0.4459          
         Detection Rate : 0.3514          
   Detection Prevalence : 0.8243          
      Balanced Accuracy : 0.4671          
                                          
       'Positive' Class : Y               
                                    

## FCBF

In [27]:
nb_fcbf <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/nb_fcbf.rds")
feature.fcbf <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.fcbf.rds")

res <- list()
tmp <- feature.fcbf[, -which(colnames(feature.fcbf) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

nb_inp_fcbf <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(nb_inp_fcbf) %in% 0.001)

# # Replace 0 with 0.001
# nb_inp_fcbf[nb_inp_fcbf == 0] <- 0.001

# # convert values to log
# nb_inp_fcbf <- log(nb_inp_fcbf, 2)

nb_predictions_fcbf <- predict(nb_fcbf, nb_inp_fcbf)
confusionMatrix(data = nb_predictions_fcbf, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N  2  2
         Y 39 31
                                          
               Accuracy : 0.4459          
                 95% CI : (0.3302, 0.5661)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.9763          
                                          
                  Kappa : -0.0107         
                                          
 Mcnemar's Test P-Value : 1.885e-08       
                                          
            Sensitivity : 0.93939         
            Specificity : 0.04878         
         Pos Pred Value : 0.44286         
         Neg Pred Value : 0.50000         
             Prevalence : 0.44595         
         Detection Rate : 0.41892         
   Detection Prevalence : 0.94595         
      Balanced Accuracy : 0.49409         
                                          
       'Positive' Class : Y               
                                    

## Limma

In [28]:
nb_limma <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/nb_limma.rds")
feature.limma <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.limma.rds")

res <- list()
tmp <- feature.limma[, -which(colnames(feature.limma) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

nb_inp_limma <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(nb_inp_limma) %in% 0.001)

# # Replace 0 with 0.001
# nb_inp_limma[nb_inp_limma == 0] <- 0.001

# # convert values to log
# nb_inp_limma <- log(nb_inp_limma, 2)

nb_predictions_limma <- predict(nb_limma, nb_inp_limma)
confusionMatrix(data = nb_predictions_limma, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 32 23
         Y  9 10
                                          
               Accuracy : 0.5676          
                 95% CI : (0.4472, 0.6823)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.45517         
                                          
                  Kappa : 0.0871          
                                          
 Mcnemar's Test P-Value : 0.02156         
                                          
            Sensitivity : 0.3030          
            Specificity : 0.7805          
         Pos Pred Value : 0.5263          
         Neg Pred Value : 0.5818          
             Prevalence : 0.4459          
         Detection Rate : 0.1351          
   Detection Prevalence : 0.2568          
      Balanced Accuracy : 0.5418          
                                          
       'Positive' Class : Y               
                                    

## Random Forest Permutation

In [29]:
nb_ranger <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/nb_ranger.rds")
feature.ranger <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.ranger.rds")

res <- list()
tmp <- feature.ranger[, -which(colnames(feature.ranger) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

nb_inp_ranger <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(nb_inp_ranger) %in% 0.001)

# # Replace 0 with 0.001
# nb_inp_ranger[nb_inp_ranger == 0] <- 0.001

# # convert values to log
# nb_inp_ranger <- log(nb_inp_ranger, 2)

nb_predictions_ranger <- predict(nb_ranger, nb_inp_ranger)
confusionMatrix(data = nb_predictions_ranger, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N  8  5
         Y 33 28
                                          
               Accuracy : 0.4865          
                 95% CI : (0.3685, 0.6056)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.9005          
                                          
                  Kappa : 0.0403          
                                          
 Mcnemar's Test P-Value : 1.187e-05       
                                          
            Sensitivity : 0.8485          
            Specificity : 0.1951          
         Pos Pred Value : 0.4590          
         Neg Pred Value : 0.6154          
             Prevalence : 0.4459          
         Detection Rate : 0.3784          
   Detection Prevalence : 0.8243          
      Balanced Accuracy : 0.5218          
                                          
       'Positive' Class : Y               
                                    

## Mutual Information

In [30]:
nb_jmim <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/nb_jmim.rds")
feature.jmim <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.jmim.rds")

res <- list()
tmp <- feature.jmim[, -which(colnames(feature.jmim) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

nb_inp_jmim <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(nb_inp_jmim) %in% 0.001)

# # Replace 0 with 0.001
# nb_inp_jmim[nb_inp_jmim == 0] <- 0.001

# # convert values to log
# nb_inp_jmim <- log(nb_inp_jmim, 2)

nb_predictions_jmim <- predict(nb_jmim, nb_inp_jmim)
confusionMatrix(data = nb_predictions_jmim, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N  1  0
         Y 40 33
                                          
               Accuracy : 0.4595          
                 95% CI : (0.3429, 0.5793)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.9599          
                                          
                  Kappa : 0.0218          
                                          
 Mcnemar's Test P-Value : 6.984e-10       
                                          
            Sensitivity : 1.00000         
            Specificity : 0.02439         
         Pos Pred Value : 0.45205         
         Neg Pred Value : 1.00000         
             Prevalence : 0.44595         
         Detection Rate : 0.44595         
   Detection Prevalence : 0.98649         
      Balanced Accuracy : 0.51220         
                                          
       'Positive' Class : Y               
                                    

## Literature Survey

In [31]:
nb_litSur <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/nb_litSur.rds")
feature.litSur <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.litSur.rds")

res <- list()
tmp <- feature.litSur[, -which(colnames(feature.litSur) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

nb_inp_litSur <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(nb_inp_litSur) %in% 0.001)

# # Replace 0 with 0.001
# nb_inp_litSur[nb_inp_litSur == 0] <- 0.001

# # convert values to log
# nb_inp_litSur <- log(nb_inp_litSur, 2)

nb_predictions_litSur <- predict(nb_litSur, nb_inp_litSur)
confusionMatrix(data = nb_predictions_litSur, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N 21 15
         Y 20 18
                                          
               Accuracy : 0.527           
                 95% CI : (0.4075, 0.6443)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.7214          
                                          
                  Kappa : 0.0568          
                                          
 Mcnemar's Test P-Value : 0.4990          
                                          
            Sensitivity : 0.5455          
            Specificity : 0.5122          
         Pos Pred Value : 0.4737          
         Neg Pred Value : 0.5833          
             Prevalence : 0.4459          
         Detection Rate : 0.2432          
   Detection Prevalence : 0.5135          
      Balanced Accuracy : 0.5288          
                                          
       'Positive' Class : Y               
                                    

## Prat et al.

In [32]:
nb_prat <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/nb_prat.rds")
feature.prat <- readRDS("/home/jp/ICP_Responders/ML/GideHugoRiaz_AntiPD1/Pre-processingFeatureSelection/Model testing/SavedModels/feature.prat.rds")

res <- list()
tmp <- feature.prat[, -which(colnames(feature.prat) %in% "Response")]

for (i in 1:ncol(tmp)){
    gene_name <- colnames(tmp)[i]
    check_gene <- length(which(colnames(expr_mat) %in% gene_name))
    if(check_gene == 1){
        res[[i]] <- expr_mat[,which(colnames(expr_mat) %in% gene_name)]
    }else{
        res[[i]] <- rep(0.001, nrow(expr_mat))
    }
    names(res)[i] <- gene_name 
}

nb_inp_prat <- do.call(cbind, res)

# which columns have missing genes
which(colMeans(nb_inp_prat) %in% 0.001)

# # Replace 0 with 0.001
# nb_inp_prat[nb_inp_prat == 0] <- 0.001

# # convert values to log
# nb_inp_prat <- log(nb_inp_prat, 2)

nb_predictions_prat <- predict(nb_prat, nb_inp_prat)
confusionMatrix(data = nb_predictions_prat, reference = expr_res$Response, positive = "Y")

Confusion Matrix and Statistics

          Reference
Prediction  N  Y
         N  0  0
         Y 41 33
                                          
               Accuracy : 0.4459          
                 95% CI : (0.3302, 0.5661)
    No Information Rate : 0.5541          
    P-Value [Acc > NIR] : 0.9763          
                                          
                  Kappa : 0               
                                          
 Mcnemar's Test P-Value : 4.185e-10       
                                          
            Sensitivity : 1.0000          
            Specificity : 0.0000          
         Pos Pred Value : 0.4459          
         Neg Pred Value :    NaN          
             Prevalence : 0.4459          
         Detection Rate : 0.4459          
   Detection Prevalence : 1.0000          
      Balanced Accuracy : 0.5000          
                                          
       'Positive' Class : Y               
                                    

In [None]:
# ncol(mod_inp_mat)
# # sort(table(colnames(mod_inp_mat)), decreasing=TRUE)

# getSymbols("163059", 'org.Hs.eg.db')
# colnames(expression)[grep("ZNF433", colnames(expression))]