In [None]:
#install.packages("googledrive") #only need to install occasionally
#install.packages("httpuv") 
#install.packages("kohonen")
#install.packages("caret")

suppressMessages(install.packages("prediction"))
suppressMessages(install.packages("easystats"))
suppressMessages(install.packages("GGally"))
suppressMessages(install.packages("reshape2") )
suppressMessages(install.packages("naivebayes"))
suppressMessages(install.packages("e1071"))
suppressMessages(install.packages("randomForest"))
suppressMessages(install.packages("adabag"))
suppressMessages(install.packages("nnet"))
suppressMessages(install.packages("psych"))

In [None]:
#library(MASS)
#library(reshape) 
#library(caret)


suppressMessages(library(prediction))
suppressMessages(library(easystats))
suppressMessages(library(GGally))
suppressMessages(library(reshape2)) 
suppressMessages(library(ggplot2))
suppressMessages(library(stats))
suppressMessages(library(datasets))

suppressMessages(library(naivebayes))
suppressMessages(library(dplyr))
suppressMessages(library(psych))

suppressMessages(library(e1071))
suppressMessages(library(randomForest))
suppressMessages(library(adabag))
suppressMessages(library(nnet))

In [None]:
# Always check path to ensure current python version
#if (file.exists("/usr/local/lib/python3.8/dist-packages/google/colab/_ipython.py")) { #may update python version  
#                                       #occasionally
#  import_library("R.utils")
#  library("R.utils")
#  library("httr")
#  my_check <- function() {return(TRUE)}
#  reassignInPackage("is_interactive", pkgName = "httr", my_check) 
#  options(rlang_interactive=TRUE)
#} else {
#  print('Failed')
#}

In [1]:
# Download messidor_features.arff file to /content

download.file('https://archive.ics.uci.edu/ml/machine-learning-databases/00329/messidor_features.arff','/content/messidor_features.arff')

In [None]:
library(foreign)

loadData = function(path){
    
    df = read.arff(path)
    colnames(df) <- c(
        "q",      #  0 The binary result of quality assessment. 0 = bad quality 1 = sufficient quality
        "ps",     #  1 The binary result of pre-screening, 1 indicates severe retinal abnormality and 0 its lack
        "nma.a",  #  2 Number of MAs found at the confidence levels alpha = 0.5
        "nma.b",  #  3 Number of MAs found at the confidence levels alpha = 0.6
        "nma.c",  #  4 Number of MAs found at the confidence levels alpha = 0.7
        "nma.d",  #  5 Number of MAs found at the confidence levels alpha = 0.8
        "nma.e",  #  6 Number of MAs found at the confidence levels alpha = 0.9
        "nma.f",  #  7 Number of MAs found at the confidence levels alpha = 1.0
        "nex.a",  #  8 Number of Exudates found at the confidence levels alpha = 0.5
        "nex.b",  #  9 Number of Exudates found at the confidence levels alpha = 0.6
        "nex.c",  # 10 Number of Exudates found at the confidence levels alpha = 0.7
        "nex.d",  # 11 Number of Exudates found at the confidence levels alpha = 0.8
        "nex.e",  # 12 Number of Exudates found at the confidence levels alpha = 0.9
        "nex.g",  # 13 Number of Exudates found at the confidence levels alpha = 1.0
        "nex.f",  # 14 Number of Exudates found at the confidence levels alpha = 1.0
        "nex.h",  # 15 Number of Exudates found at the confidence levels alpha = 1.0
        "dd",     # 16 The euclidean distance of the center of the macula and the center of the optic disc
        "dm",     # 17 The diameter of the optic disc
        "amfm",   # 18 The binary result of the AM/FM-based classification
        "class"   # 19 Class label. 1 = contains signs of DR, 0 = no signs of DR
    )
    odf = df
    
    colnames(df)
    numericFeats = c(3:16)
    eyeFeats = c(17,18)
    df[, c(numericFeats, eyeFeats)] = scale(df[, c(numericFeats, eyeFeats)])
    
    df$class = as.numeric(df$class)
    #newList = list("nf" = numericFeats, "ef" = eyeFeats, "odf" = odf, "df" = df)
    return(df)
}

In [None]:
# Load file into variable df
df = loadData("./messidor_features.arff")

In [None]:
# Unpivots a DataFrame from wide format to long format
long = melt(df[,c(1:ncol(df)-1)])

No id variables; using all as measure variables



In [None]:
ggplot(long) + 
    geom_boxplot(aes(variable, value)) + 
    coord_flip() +
    labs(title="Unimodal feature distribution", x='Feature', y='Scaled value')

In [None]:
ggcorr(df) + labs(title="Feature covariance matrix")

In [None]:
ggplot(df) + geom_point(aes(nma.a, nma.b, color=class)) + facet_wrap(~amfm)

In [None]:
ggplot(df) + geom_point(aes(nma.a, nma.f, color=class)) + facet_wrap(~amfm)

In [None]:
ggplot(df) + geom_point(aes(nex.a, nex.b, color=class)) + facet_wrap(~amfm)

In [None]:
ggplot(df) + geom_point(aes(nex.a, nex.h, color=class)) + facet_wrap(~amfm)

In [None]:
#require(kohonen)
#cols = colnames(df)[1:(ncol(df)-1)]
#base = df[,cols]
#som_grid = somgrid(xdim=15, ydim=15, topo="hexagonal")
#som_model = som(as.matrix(base), grid=som_grid, rlen=200)

#plot(som_model, type="property", property = df$nma.a)
#title("SOM colored for nma.a feature")

In [None]:
f1 = as.formula(class ~ ps + 
                    nma.a + nma.b + nma.c + nma.d + nma.e + nma.f + 
                    nex.a + nex.b + nex.c + nex.d + nex.e + nex.g + nex.f + nex.h + 
                    dd + dm + amfm)

data = df
k = 10
ks = c(1:k)
folds = createFolds(data$class, k, list=FALSE)
result = rep(0, k)

In [None]:
# SVM
# svm = kfoldValidate(f1, data=df, learner=fitpredict.svm, performance=perf.auc)$mean,

kernel="polynomial"
degree=2
coef0=1

for (i in ks){
    trainingKs = ks[ks!=i]
    validationKs = ks[ks==i]
    trainingSet = data[which(folds %in% trainingKs), ]
    validationSet = data[which(folds %in% validationKs), ]
}

model = svm(f1, trainingSet, kernel=kernel, degree=degree, coef0=coef0)

#pred = as.integer(predict(model, validationSet))
#real = as.integer(validationSet[,all.vars(f1)[1]])

pred = as.integer(predict(model, validationSet))
real = as.integer(validationSet[,all.vars(f1)[1]])

#new_data = data.frame(pred=pred, real=real, ok=(pred==real))

performance(prediction(model, validationSet))@y.values[[1]]



“Objects of class `prediction` are not supported model objects.”


ERROR: ignored

In [None]:
helpers.result = function(pred, real){
    return(data.frame(pred=pred, real=real, ok=(pred==real)))
}

fitpredict.svm = function(formula, trainingSet, validationSet, kernel="polynomial", degree=2, coef0=1){
    model = svm(formula, trainingSet, kernel=kernel, degree=degree, coef0=coef0)
    pred = as.integer(predict(model, validationSet))
    real = as.integer(validationSet[,all.vars(formula)[1]])
    return(helpers.result(pred, real))
}

fitpredict.forest = function(formula, trainingSet, validationSet, ntree=100){
    model = randomForest(formula, trainingSet, ntree=ntree)
    pred = as.integer(predict(model, validationSet))
    real = as.integer(validationSet[,all.vars(formula)[1]])
    return(helpers.result(pred, real))
}

fitpredict.knn = function(formula, trainingSet, validationSet, k=5){
    cols = all.vars(formula)[2:length(all.vars(formula))]
    pred = as.integer(knn(trainingSet[,cols], validationSet[,cols], trainingSet$class, k=k))
    real = as.integer(validationSet[,all.vars(formula)[1]])
    return(helpers.result(pred, real))
}

fitpredict.adaboost = function(formula, trainingSet, validationSet, boos=TRUE, mfinal=10, coeflearn='Breiman'){
    model = boosting(formula, trainingSet, boos=boos, mfinal=mfinal, coeflearn=coeflearn)
    pred = as.integer(predict.boosting(model, validationSet)$class)+1
    real = as.integer(validationSet[,all.vars(formula)[1]])
    return(helpers.result(pred, real))
}

fitpredict.nnet = function(formula, trainingSet, validationSet, size=10){
    model = nnet(formula, trainingSet, size=size, trace=FALSE)
    pred = as.integer(round(predict(model, validationSet)))+1
    real = as.integer(validationSet[,all.vars(formula)[1]])
    return(helpers.result(pred, real))
}

fitpredict.naiveBayes = function(formula, trainingSet, validationSet){
    model = naiveBayes(formula, trainingSet)
    pred = as.integer(predict(model, validationSet))
    real = as.integer(validationSet[,all.vars(formula)[1]])
    return(helpers.result(pred, real))
}

fitpredict.lda = function(formula, trainingSet, validationSet){
    model = lda(formula, trainingSet)
    pred = as.integer(data.frame(predict(model, validationSet))$class)
    real = as.integer(validationSet[,all.vars(formula)[1]])
    return(helpers.result(pred, real))
}

In [None]:
perf.auc = function(yy){
    return(performance(prediction(yy$real, yy$pred), "auc")@y.values[[1]])
}

In [None]:
kfoldValidate = function(formula, data, learner, performance, ...){
    
    k=10
    ks = c(1:k)
    folds = createFolds(data$class, k, list=FALSE)
    result = rep(0, k)
    
    for (i in ks){
        
        trainingKs = ks[ks!=i]
        validationKs = ks[ks==i]
        trainingSet = data[which(folds %in% trainingKs), ]
        validationSet = data[which(folds %in% validationKs), ]
        result[i] = performance(learner(formula, trainingSet, validationSet, ...))
    }
    
    return(list(mean=mean(result), sd=sd(result), results=c(result)))
}

In [None]:
f1 = as.formula(class ~ ps + 
                    nma.a + nma.b + nma.c + nma.d + nma.e + nma.f + 
                    nex.a + nex.b + nex.c + nex.d + nex.e + nex.g + nex.f + nex.h + 
                    dd + dm + amfm)

resultsF1v = c(
    svm        = kfoldValidate(f1, data=df, learner=fitpredict.svm,        performance=perf.auc)$mean,
    forest     = kfoldValidate(f1, data=df, learner=fitpredict.forest,     performance=perf.auc)$mean,
    knn        = kfoldValidate(f1, data=df, learner=fitpredict.knn,        performance=perf.auc)$mean,
    adaboost   = kfoldValidate(f1, data=df, learner=fitpredict.adaboost,   performance=perf.auc)$mean,
    nnet       = kfoldValidate(f1, data=df, learner=fitpredict.nnet,       performance=perf.auc)$mean,
    naiveBayes = kfoldValidate(f1, data=df, learner=fitpredict.naiveBayes, performance=perf.auc)$mean,
    lda        = kfoldValidate(f1, data=df, learner=fitpredict.lda,        performance=perf.auc)$mean
)
resultsF1 = data.frame(f1_AUC=resultsF1v)
kable(resultsF1, caption="Model performance for F1")

ERROR: ignored