In [69]:
load('.RData')

In [70]:
library(ggplot2)
library(tidyverse)
library(ggpubr)
library(scatterplot3d)
library(reshape2)
library(MASS)
library(caret)
library(klaR)

In [71]:
fit_control <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           ## repeated ten times
                           repeats = 10,
                           classProbs=TRUE)

In [72]:
### LDA tiene problemas con NA values, eliminamos las columnas con NA
na_colnames <- c('BAD_N', 'BCL2_N', 'pCFOS_N', 'H3AcK18_N', 'EGR1_N')
colnames <- colnames(df[, c(!colnames(df) %in% na_colnames)])
df <- na.omit(df[, colnames])

In [73]:
df.sc <- as.data.frame(scale(df[, 6:77]))
df.sc$class <- df$class

In [74]:
head(df.sc)

Unnamed: 0_level_0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,⋯,PSD95_N,SNCA_N,Ubiquitin_N,pGSK3B_Tyr216_N,SHH_N,pS6_N,SYP_N,H3MeK4_N,CaNA_N,class
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
1,0.547726,0.74697648,2.31466,1.5387589,2.467996,-0.2328129,-0.02204335,-0.830639,0.69547783,0.9874968,⋯,-0.7299433,-2.059763,-1.067441,-0.160245748,-1.334753,-0.9300461,-0.14752255,-1.382456,1.027899,cCSm
2,0.613498,0.43280233,1.9450962,1.4616414,2.129321,-0.4193892,-0.21344088,-0.8893354,0.53711574,0.5721689,⋯,-0.7686801,-1.996836,-1.280355,0.032113311,-0.928804,-0.9102021,0.06695985,-1.330374,1.232157,cCSm
3,0.5809273,0.65538591,2.0763853,1.1673928,2.059414,-0.4874763,-0.09634079,-0.8956755,0.63586312,0.4779545,⋯,-0.7225655,-2.061349,-1.359435,0.004295111,-1.1649196,-0.7918839,-0.01900458,-1.395854,1.781646,cCSm
4,0.1788796,0.04373118,0.8779864,0.5339503,1.34618,-0.1276231,-0.06649765,-0.9899385,-0.06501135,0.5687006,⋯,-0.9473704,-1.575652,-1.399614,-0.141565084,-1.2201852,-1.1459113,-0.67192451,-1.050128,1.102773,cCSm
5,0.1359247,0.04564276,0.8815183,0.243011,1.056666,-0.3812694,-0.18079441,-1.0030937,-0.51269761,0.3245529,⋯,-0.7516929,-1.590564,-1.353809,0.351345465,-0.7460477,-1.0352628,-0.04304257,-1.023869,1.521065,cCSm
6,0.2112447,0.10372175,1.0539193,0.3009735,1.155408,-0.2393552,-0.07573764,-0.9978648,-0.4213405,0.1583996,⋯,-0.7727167,-1.542239,-1.824557,-0.028601767,-1.2779017,-0.918218,0.04106952,-1.133952,1.450907,cCSm


In [80]:
set.seed(99)
index <- createDataPartition(df.sc$class, p = 0.70, list = FALSE)
train_data <- df.sc[index, ]
test_data <- df.sc[-index, ]
sprintf("The trainining set contains %d rows", nrow(train_data))
sprintf("The test set contains %d rows", nrow(test_data))

In [81]:
table(train_data$class)
train_balanced <- upSample(x = train_data, y = train_data$class)
train_balanced <- train_balanced[, !(colnames(train_balanced) %in% c("Class"))]

table(train_balanced$class)


cCSm cCSs cSCm cSCs tCSm tCSs tSCm tSCs 
  63   53   63   63   74   74   63   93 


cCSm cCSs cSCm cSCs tCSm tCSs tSCm tSCs 
  93   93   93   93   93   93   93   93 

In [82]:
#El modelo da error por variables coliniares.
#eliminamos las variables con alta correlacion
correlationMatrix <- cor(df.sc[, 1:72])
highlyCorrelated <- findCorrelation(correlationMatrix, cutoff=0.9)
highCorrelatedColumns <- colnames(correlationMatrix[, highlyCorrelated])
highCorrelatedColumns
train_data <- train_data[, !colnames(train_data) %in% c(highCorrelatedColumns, class)]
test_data <- test_data[, !colnames(test_data) %in% c(highCorrelatedColumns, class)]


In [83]:
lda_model <- train(class ~ ., 
                   data = train_data, 
                   method = 'lda', 
                   trControl = fit_control)
lda_model

Linear Discriminant Analysis 

546 samples
 66 predictor
  8 classes: 'cCSm', 'cCSs', 'cSCm', 'cSCs', 'tCSm', 'tCSs', 'tSCm', 'tSCs' 

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times) 
Summary of sample sizes: 492, 492, 492, 493, 493, 493, ... 
Resampling results:

  Accuracy   Kappa    
  0.9944681  0.9936505


In [84]:
lda_preds <- predict(lda_model, newdata = test_data)
confusionMatrix(lda_preds, test_data$class)

Confusion Matrix and Statistics

          Reference
Prediction cCSm cCSs cSCm cSCs tCSm tCSs tSCm tSCs
      cCSm   26    0    0    0    1    0    0    0
      cCSs    0   22    0    0    0    0    0    0
      cSCm    0    0   27    0    0    0    0    0
      cSCs    0    0    0   27    0    0    0    0
      tCSm    1    0    0    0   30    0    0    0
      tCSs    0    0    0    0    0   31    0    0
      tSCm    0    0    0    0    0    0   27    0
      tSCs    0    0    0    0    0    0    0   39

Overall Statistics
                                          
               Accuracy : 0.9913          
                 95% CI : (0.9691, 0.9989)
    No Information Rate : 0.1688          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.9901          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: cCSm Class: cCSs Class: cSCm C

In [87]:
# Warning message:
# “model fit failed for Fold01.Rep01: parameter=none Error in qda.default(x, grouping, ...) : 
#   some group is too small for 'qda'



In [86]:
qda_model <- train(class ~ ., 
                   data = train_data, 
                   method = 'qda', 
                   trControl = fit_control)
qda_model

“model fit failed for Fold01.Rep01: parameter=none Error in qda.default(x, grouping, ...) : 
  some group is too small for 'qda'
”
“model fit failed for Fold02.Rep01: parameter=none Error in qda.default(x, grouping, ...) : 
  some group is too small for 'qda'
”
“model fit failed for Fold03.Rep01: parameter=none Error in qda.default(x, grouping, ...) : 
  some group is too small for 'qda'
”
“model fit failed for Fold04.Rep01: parameter=none Error in qda.default(x, grouping, ...) : 
  some group is too small for 'qda'
”
“model fit failed for Fold05.Rep01: parameter=none Error in qda.default(x, grouping, ...) : 
  some group is too small for 'qda'
”
“model fit failed for Fold06.Rep01: parameter=none Error in qda.default(x, grouping, ...) : 
  some group is too small for 'qda'
”
“model fit failed for Fold07.Rep01: parameter=none Error in qda.default(x, grouping, ...) : 
  some group is too small for 'qda'
”
“model fit failed for Fold08.Rep01: parameter=none Error in qda.default(x, grouping

“model fit failed for Fold07.Rep06: parameter=none Error in qda.default(x, grouping, ...) : 
  some group is too small for 'qda'
”
“model fit failed for Fold08.Rep06: parameter=none Error in qda.default(x, grouping, ...) : 
  some group is too small for 'qda'
”
“model fit failed for Fold09.Rep06: parameter=none Error in qda.default(x, grouping, ...) : 
  some group is too small for 'qda'
”
“model fit failed for Fold10.Rep06: parameter=none Error in qda.default(x, grouping, ...) : 
  some group is too small for 'qda'
”
“model fit failed for Fold01.Rep07: parameter=none Error in qda.default(x, grouping, ...) : 
  some group is too small for 'qda'
”
“model fit failed for Fold02.Rep07: parameter=none Error in qda.default(x, grouping, ...) : 
  some group is too small for 'qda'
”
“model fit failed for Fold03.Rep07: parameter=none Error in qda.default(x, grouping, ...) : 
  some group is too small for 'qda'
”
“model fit failed for Fold04.Rep07: parameter=none Error in qda.default(x, grouping

Something is wrong; all the Accuracy metric values are missing:
    Accuracy       Kappa    
 Min.   : NA   Min.   : NA  
 1st Qu.: NA   1st Qu.: NA  
 Median : NA   Median : NA  
 Mean   :NaN   Mean   :NaN  
 3rd Qu.: NA   3rd Qu.: NA  
 Max.   : NA   Max.   : NA  
 NA's   :1     NA's   :1    


ERROR: Error: Stopping


In [59]:
summary(train_data)

     BDNF_N             NR2A_N             pAKT_N            pBRAF_N         
 Min.   :-2.36197   Min.   :-2.25239   Min.   :-2.77009   Min.   :-2.840172  
 1st Qu.:-0.71282   1st Qu.:-0.77192   1st Qu.:-0.70944   1st Qu.:-0.708365  
 Median :-0.10030   Median :-0.18207   Median :-0.11318   Median :-0.001692  
 Mean   :-0.04791   Mean   :-0.07682   Mean   :-0.03136   Mean   :-0.002824  
 3rd Qu.: 0.50305   3rd Qu.: 0.51222   3rd Qu.: 0.56938   3rd Qu.: 0.639969  
 Max.   : 3.04751   Max.   : 3.72296   Max.   : 3.00342   Max.   : 5.600817  
   pCAMKII_N           pCREB_N             pELK_N             pERK_N        
 Min.   :-1.53558   Min.   :-2.46295   Min.   :-2.56470   Min.   :-1.49216  
 1st Qu.:-0.87186   1st Qu.:-0.72335   1st Qu.:-0.57953   1st Qu.:-0.74584  
 Median :-0.22525   Median :-0.06933   Median :-0.15214   Median :-0.35667  
 Mean   :-0.03701   Mean   :-0.02292   Mean   :-0.05853   Mean   :-0.02908  
 3rd Qu.: 0.69428   3rd Qu.: 0.67183   3rd Qu.: 0.37824   3rd Qu.: 0.