In [1]:
load('.RData')

In [59]:
library(ggplot2)
library(ggpubr)
library(caret)
library(gmodels)

In [4]:
fit_control <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           ## repeated ten times
                           repeats = 10,
                           classProbs=TRUE)

In [14]:
df.sc <- as.data.frame(scale(df[, 1:11]))
df.sc$k2 <- as.factor(paste0('c', df$k2))
df.sc$k3 <- as.factor(paste0('c', df$k3))

## - LDA K2

In [15]:
head(df.sc)

Unnamed: 0_level_0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,k2,k3
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<fct>
1,0.35387757,-0.1245628,0.1045781,-0.819306,0.5731841,-0.3695676,0.2751009,-0.2236219,-0.6400338,1.19385673,-0.09933544,c2,c1
2,0.84484966,-0.1247789,0.2316356,0.28266546,0.2111517,-1.1178159,-0.4335474,-0.2236155,-1.3345612,-1.04006015,-0.01580629,c2,c1
3,0.6811923,-0.1150527,-0.1495369,-0.73453897,-1.1541126,0.5033889,-0.9423206,-0.2236089,0.2439103,0.2165181,-0.26639374,c1,c2
4,0.84484966,-0.1245628,-0.2130657,-0.77692249,0.7340873,1.0645752,1.3289883,-0.2236207,-0.5768949,-0.20234132,-0.26639374,c2,c1
5,0.02656285,-0.1228337,0.8669233,-0.0987862,-1.1549171,-1.6790022,-1.5419462,-0.2236334,-1.1451446,0.00708839,1.65477673,c1,c2
6,-0.30075187,-0.1256435,-0.7212958,-0.01401916,0.9754422,0.2539728,-0.1791608,-0.2236153,0.4964657,-0.69101063,-0.93462694,c2,c1


In [31]:
set.seed(100)
index <- createDataPartition(df.sc$k2, p = 0.70, list = FALSE)
train_data <- df.sc[index, ]
test_data <- df.sc[-index, ]
sprintf("The trainining set contains %d rows", nrow(train_data))
sprintf("The test set contains %d rows", nrow(test_data))

In [32]:
table(train_data$k2)
train_balanced <- upSample(x = train_data, y = train_data$k2)
train_balanced <- train_balanced[, !(colnames(train_balanced) %in% c("Class"))]
table(train_balanced$k2)


 c1  c2 
136 320 


 c1  c2 
320 320 

In [33]:
lda_model.k2 <- train(k2 ~ ., 
                   data = train_balanced, 
                   method = 'lda', 
                   trControl = fit_control)
lda_model.k2

Linear Discriminant Analysis 

640 samples
 12 predictor
  2 classes: 'c1', 'c2' 

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times) 
Summary of sample sizes: 576, 576, 576, 576, 576, 576, ... 
Resampling results:

  Accuracy   Kappa    
  0.9339063  0.8678125


In [48]:
lda_preds.k2 <- predict(lda_model.k2, newdata = test_data)
confusionMatrix(lda_preds.k2, test_data$k2)

Confusion Matrix and Statistics

          Reference
Prediction  c1  c2
        c1  51   0
        c2   9 134
                                          
               Accuracy : 0.9536          
                 95% CI : (0.9138, 0.9786)
    No Information Rate : 0.6907          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8867          
                                          
 Mcnemar's Test P-Value : 0.007661        
                                          
            Sensitivity : 0.8500          
            Specificity : 1.0000          
         Pos Pred Value : 1.0000          
         Neg Pred Value : 0.9371          
             Prevalence : 0.3093          
         Detection Rate : 0.2629          
   Detection Prevalence : 0.2629          
      Balanced Accuracy : 0.9250          
                                          
       'Positive' Class : c1              
                              

In [60]:
CrossTable(test_data$k2 ,lda_preds.k2)


 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  194 

 
             | lda_preds.k2 
test_data$k2 |        c1 |        c2 | Row Total | 
-------------|-----------|-----------|-----------|
          c1 |        51 |         9 |        60 | 
             |    78.673 |    28.058 |           | 
             |     0.850 |     0.150 |     0.309 | 
             |     1.000 |     0.063 |           | 
             |     0.263 |     0.046 |           | 
-------------|-----------|-----------|-----------|
          c2 |         0 |       134 |       134 | 
             |    35.227 |    12.563 |           | 
             |     0.000 |     1.000 |     0.691 | 
             |     0.000 |     0.937 |           | 
             |     0.000 |     0.691 |           | 
-------------|-----------|-----------|--

In [49]:
paste('training error = ', round(mean(test_data$k2 != lda_preds.k2) * 100, 2), '%')

## - LDA K3

In [43]:
set.seed(100)
index <- createDataPartition(df.sc$k3, p = 0.70, list = FALSE)
train_data <- df.sc[index, ]
test_data <- df.sc[-index, ]
sprintf("The trainining set contains %d rows", nrow(train_data))
sprintf("The test set contains %d rows", nrow(test_data))

In [44]:
table(train_data$k3)
train_balanced <- upSample(x = train_data, y = train_data$k3)
train_balanced <- train_balanced[, !(colnames(train_balanced) %in% c("Class"))]
table(train_balanced$k3)


 c1  c2  c3 
225 112 119 


 c1  c2  c3 
225 225 225 

In [45]:
lda_model.k3 <- train(k3 ~ ., 
                   data = train_balanced, 
                   method = 'lda', 
                   trControl = fit_control)
lda_model.k3

Linear Discriminant Analysis 

675 samples
 12 predictor
  3 classes: 'c1', 'c2', 'c3' 

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times) 
Summary of sample sizes: 608, 606, 609, 606, 608, 608, ... 
Resampling results:

  Accuracy  Kappa   
  0.95126   0.926886


In [46]:
lda_preds.k3 <- predict(lda_model.k3, newdata = test_data)
confusionMatrix(lda_preds.k3, test_data$k3)

Confusion Matrix and Statistics

          Reference
Prediction c1 c2 c3
        c1 80  0  2
        c2 12 48  0
        c3  4  0 48

Overall Statistics
                                          
               Accuracy : 0.9072          
                 95% CI : (0.8573, 0.9441)
    No Information Rate : 0.4948          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.8562          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: c1 Class: c2 Class: c3
Sensitivity             0.8333    1.0000    0.9600
Specificity             0.9796    0.9178    0.9722
Pos Pred Value          0.9756    0.8000    0.9231
Neg Pred Value          0.8571    1.0000    0.9859
Prevalence              0.4948    0.2474    0.2577
Detection Rate          0.4124    0.2474    0.2474
Detection Prevalence    0.4227    0.3093    0.2680
Balanced Accuracy       0.906

In [50]:
paste('training error = ', round(mean(test_data$k3 != lda_preds.k3) * 100, 2), '%')