In [1]:
import os
import numpy as np
import pandas as pd
import plotly as plot
from sklearn.preprocessing import scale
from plotnine import *
from plotnine.data import *
from plotnine.data import economics
from plotnine import ggplot, aes, geom_line
from plotnine.geoms.geom_boxplot import geom_boxplot
from plotnine.geoms.geom_point import geom_point
from plotnine.geoms.geom_rug import coord_flip
from plotnine.labels import labs
from google.colab import drive

In [None]:
# Check if file already exists
matches = [match for match in os.listdir() if "messidor_features.arff" in match]

# If file is missing, then download it
if not matches:
  !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00329/messidor_features.arff

--2022-12-03 02:29:49--  https://archive.ics.uci.edu/ml/machine-learning-databases/00329/messidor_features.arff
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 117224 (114K) [application/x-httpd-php]
Saving to: ‘messidor_features.arff’


2022-12-03 02:29:49 (806 KB/s) - ‘messidor_features.arff’ saved [117224/117224]



In [None]:
# Load the ARFF file and convert its records to a DataFrame.
from scipy.io import arff

# The download cell checks for, and saves, the file in the current working
# directory, so read it from there rather than from a hard-coded '/content/' path.
data = arff.loadarff('messidor_features.arff')

# loadarff returns a (records, metadata) pair; only the records are used here.
df = pd.DataFrame(data[0])

In [None]:
# Column labels in file order: two leading columns, the nma.* and nex.* families,
# three further feature columns, and the class label last.
col_names = (
    ['q', 'ps']
    + ['nma.' + suffix for suffix in 'abcdef']
    + ['nex.' + suffix for suffix in 'abcdefgh']
    + ['dd', 'dm', 'amfm', 'class_label']
)

# Replace the labels that came from the ARFF header with the names above.
df.columns = col_names

In [None]:
# numericFeats = list(range(3,17))
# eyeFeats = list(range(17,19))

In [None]:
# Standardise the continuous feature columns in place.
# The R original scaled columns 3:16 and 17:18 (1-based), i.e. 'nma.a' through 'dm'.
# In 0-based, end-exclusive Python slicing that is 2:18; the earlier 2:17 slice
# stopped at 'dd' and left 'dm' unscaled.
df.iloc[:, 2:18] = scale(df.iloc[:, 2:18])

# Python counterpart of R's `df$class = as.factor(df$class)`:
# replace the class labels with integer codes assigned in sorted label order.
df["class_label"], _ = pd.factorize(df["class_label"], sort=True)

In [None]:
# Reshape every column except the last one (the class label) into long
# (variable, value) form for plotting.
feature_frame = df.iloc[:, :-1]
long = pd.melt(feature_frame, col_level=0)

In [None]:
# One horizontal box per feature, drawn from the long-form frame.
feature_boxplot = (
    ggplot(long)
    + geom_boxplot(aes(x='variable', y='value'))
    + coord_flip()
    + labs(title="Unimodal feature distribution", x='Feature', y='Scaled value')
)
feature_boxplot

In [None]:
# Pairwise column correlations, rendered as a colour-shaded table.
corr_matrix = df.corr()
corr_matrix.style.background_gradient(cmap='PuBu')


In [None]:
# nma.a against nma.b, coloured by class_label, one panel per amfm value.
(
    ggplot(df)
    + geom_point(aes(x="nma.a", y="nma.b", color="class_label"))
    + facet_wrap("~amfm")
)

In [None]:
# nma.a against nma.f, coloured by class_label, one panel per amfm value.
(
    ggplot(df)
    + geom_point(aes(x="nma.a", y="nma.f", color="class_label"))
    + facet_wrap("~amfm")
)

In [None]:
# nex.a against nex.b, coloured by class_label, one panel per amfm value.
(
    ggplot(df)
    + geom_point(aes(x="nex.a", y="nex.b", color="class_label"))
    + facet_wrap("~amfm")
)

In [None]:
# nex.a against nex.h, coloured by class_label, one panel per amfm value.
(
    ggplot(df)
    + geom_point(aes(x="nex.a", y="nex.h", color="class_label"))
    + facet_wrap("~amfm")
)

In [None]:
# nma.a against nex.h, coloured by class_label, one panel per amfm value.
(
    ggplot(df)
    + geom_point(aes(x="nma.a", y="nex.h", color="class_label"))
    + facet_wrap("~amfm")
)

In [None]:
# Pair predictions with ground truth.
#   pred - predicted labels
#   real - true labels
# Returns a data.frame with columns pred, real and ok (TRUE where pred == real).
helpers.result = function(pred, real){
    hit = pred == real
    data.frame(pred=pred, real=real, ok=hit)
}

# Fit an SVM on trainingSet and score validationSet.
#   formula                - response ~ predictors
#   trainingSet            - data frame used to fit the model
#   validationSet          - data frame to predict on
#   kernel, degree, coef0  - forwarded unchanged to svm()
# Returns helpers.result(pred, real), with both vectors converted by as.integer().
# NOTE(review): svm() is presumably e1071::svm - no library() call is visible here.
fitpredict.svm = function(formula, trainingSet, validationSet, kernel="polynomial", degree=2, coef0=1){
    target = all.vars(formula)[1]   # first variable of the formula = response column
    fitted = svm(formula, trainingSet, kernel=kernel, degree=degree, coef0=coef0)
    predicted = as.integer(predict(fitted, validationSet))
    observed = as.integer(validationSet[, target])
    helpers.result(predicted, observed)
}

# Fit a random forest on trainingSet and score validationSet.
#   formula       - response ~ predictors
#   trainingSet   - data frame used to fit the model
#   validationSet - data frame to predict on
#   ntree         - forwarded unchanged to randomForest()
# Returns helpers.result(pred, real), with both vectors converted by as.integer().
fitpredict.forest = function(formula, trainingSet, validationSet, ntree=100){
    target = all.vars(formula)[1]   # first variable of the formula = response column
    forest = randomForest(formula, trainingSet, ntree=ntree)
    predicted = as.integer(predict(forest, validationSet))
    observed = as.integer(validationSet[, target])
    helpers.result(predicted, observed)
}

# k-nearest-neighbour learner: classify validationSet rows against trainingSet.
#   formula       - response ~ predictors; only the variable names are used
#   trainingSet   - data frame supplying the reference rows and their labels
#   validationSet - data frame to classify
#   k             - neighbour count forwarded to knn()
# Returns helpers.result(pred, real), with both vectors converted by as.integer().
fitpredict.knn = function(formula, trainingSet, validationSet, k=5){
    vars = all.vars(formula)
    response = vars[1]
    cols = vars[-1]
    # Read the training labels from the formula's response column rather than a
    # hard-coded `class` column, matching how `real` is derived below and how
    # the other fitpredict.* learners locate the response.
    pred = as.integer(knn(trainingSet[,cols], validationSet[,cols], trainingSet[,response], k=k))
    real = as.integer(validationSet[,response])
    return(helpers.result(pred, real))
}

# Fit a boosted ensemble on trainingSet and score validationSet.
#   formula                  - response ~ predictors
#   trainingSet              - data frame used to fit the model
#   validationSet            - data frame to predict on
#   boos, mfinal, coeflearn  - forwarded unchanged to boosting()
# Returns helpers.result(pred, real).
fitpredict.adaboost = function(formula, trainingSet, validationSet, boos=TRUE, mfinal=10, coeflearn='Breiman'){
    target = all.vars(formula)[1]   # first variable of the formula = response column
    ensemble = boosting(formula, trainingSet, boos=boos, mfinal=mfinal, coeflearn=coeflearn)
    predictedClass = predict.boosting(ensemble, validationSet)$class
    # NOTE(review): the +1 presumably shifts 0/1 labels onto the 1/2 codes that
    # as.integer() yields for a two-level factor response - confirm.
    predicted = as.integer(predictedClass) + 1
    observed = as.integer(validationSet[, target])
    helpers.result(predicted, observed)
}

# Fit a neural network on trainingSet and score validationSet.
#   formula       - response ~ predictors
#   trainingSet   - data frame used to fit the model
#   validationSet - data frame to predict on
#   size          - forwarded unchanged to nnet()
# Returns helpers.result(pred, real).
fitpredict.nnet = function(formula, trainingSet, validationSet, size=10){
    target = all.vars(formula)[1]   # first variable of the formula = response column
    network = nnet(formula, trainingSet, size=size, trace=FALSE)
    # Round the model output to the nearest integer, then shift by one.
    # NOTE(review): the +1 presumably aligns 0/1 outputs with the 1/2 codes that
    # as.integer() yields for a two-level factor response - confirm.
    rounded = round(predict(network, validationSet))
    predicted = as.integer(rounded) + 1
    observed = as.integer(validationSet[, target])
    helpers.result(predicted, observed)
}

# Fit a naive Bayes model on trainingSet and score validationSet.
#   formula       - response ~ predictors
#   trainingSet   - data frame used to fit the model
#   validationSet - data frame to predict on
# Returns helpers.result(pred, real), with both vectors converted by as.integer().
fitpredict.naiveBayes = function(formula, trainingSet, validationSet){
    target = all.vars(formula)[1]   # first variable of the formula = response column
    bayes = naiveBayes(formula, trainingSet)
    predicted = as.integer(predict(bayes, validationSet))
    observed = as.integer(validationSet[, target])
    helpers.result(predicted, observed)
}

# Fit a linear discriminant model on trainingSet and score validationSet.
#   formula       - response ~ predictors
#   trainingSet   - data frame used to fit the model
#   validationSet - data frame to predict on
# Returns helpers.result(pred, real), with both vectors converted by as.integer().
fitpredict.lda = function(formula, trainingSet, validationSet){
    target = all.vars(formula)[1]   # first variable of the formula = response column
    discriminant = lda(formula, trainingSet)
    # The prediction is turned into a data frame and its `class` column is used.
    predictionFrame = data.frame(predict(discriminant, validationSet))
    predicted = as.integer(predictionFrame$class)
    observed = as.integer(validationSet[, target])
    helpers.result(predicted, observed)
}

SyntaxError: ignored

In [None]:
# Area under the ROC curve for one fitpredict.* result.
#   yy - data frame with columns `pred` (predicted labels) and `real` (true labels),
#        as produced by helpers.result().
# Returns the single AUC value stored in the performance object's y.values slot.
# NOTE(review): prediction()/performance() are presumably from ROCR - no library()
# call is visible here.
perf.auc = function(yy){
    # prediction() takes (predictions, labels): the model output goes first and the
    # ground truth second. The original call had the two swapped, which scored the
    # true labels against the predictions.
    scored = prediction(yy$pred, yy$real)
    return(performance(scored, "auc")@y.values[[1]])
}

In [None]:
# 10-fold cross-validation of one learner.
#   formula     - response ~ predictors, passed through to the learner
#   data        - data frame holding the response and predictor columns
#   learner     - function(formula, trainingSet, validationSet, ...) returning a
#                 result that `performance` can score
#   performance - function mapping a learner result to a single number
#   ...         - extra arguments forwarded to the learner (e.g. hyper-parameters)
# Returns list(mean, sd, results) over the per-fold scores.
kfoldValidate = function(formula, data, learner, performance, ...){

    # The fold count stays a local constant: `k` cannot become a parameter because
    # callers pass `k=` through `...` to the k-NN learner.
    k = 10

    # Build the folds on the formula's response column instead of a hard-coded
    # `data$class`, so they follow whatever response the formula names.
    response = all.vars(formula)[1]
    folds = createFolds(data[[response]], k, list=FALSE)
    result = rep(0, k)

    for (i in seq_len(k)){

        # Fold i is held out for validation; every other fold is used for training.
        heldOut = folds == i
        trainingSet = data[!heldOut, ]
        validationSet = data[heldOut, ]
        result[i] = performance(learner(formula, trainingSet, validationSet, ...))
    }

    return(list(mean=mean(result), sd=sd(result), results=c(result)))
}

In [None]:
# F1: the class explained by the original feature columns.
f1 = class ~ ps +
    nma.a + nma.b + nma.c + nma.d + nma.e + nma.f +
    nex.a + nex.b + nex.c + nex.d + nex.e + nex.g + nex.f + nex.h +
    dd + dm + amfm

# Learners to compare, in the order they are evaluated and reported.
cvLearners = list(
    svm        = fitpredict.svm,
    forest     = fitpredict.forest,
    knn        = fitpredict.knn,
    adaboost   = fitpredict.adaboost,
    nnet       = fitpredict.nnet,
    naiveBayes = fitpredict.naiveBayes,
    lda        = fitpredict.lda
)

# Mean cross-validated AUC per learner, as a numeric vector named after the learners.
resultsF1v = sapply(cvLearners, function(fitpredict){
    kfoldValidate(f1, data=df, learner=fitpredict, performance=perf.auc)$mean
})

resultsF1 = data.frame(f1_AUC=resultsF1v)
kable(resultsF1, caption="Model performance for F1")

In [None]:
# Principal components of the F1 predictors (every variable of f1 except the response).
cols = all.vars(f1)[-1]

# Spell out TRUE (T is an ordinary, reassignable variable) and use prcomp's real
# argument name `scale.` instead of relying on partial matching of `scale`.
fit = prcomp(df[,cols], center=TRUE, scale.=TRUE)

# Append the component scores (PC1, PC2, ...) to df. Columns with those names that a
# previous run of this cell already added are dropped first, so re-running the cell
# does not pile up duplicate PC columns.
df = cbind(df[, setdiff(colnames(df), colnames(fit$x)), drop=FALSE], fit$x)

In [None]:
# F2: the class explained by the first twelve principal components.
f2 = class ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12

# Learners to compare, in the order they are evaluated and reported.
cvLearners = list(
    svm        = fitpredict.svm,
    forest     = fitpredict.forest,
    knn        = fitpredict.knn,
    adaboost   = fitpredict.adaboost,
    nnet       = fitpredict.nnet,
    naiveBayes = fitpredict.naiveBayes,
    lda        = fitpredict.lda
)

# Mean cross-validated AUC per learner, as a numeric vector named after the learners.
resultsF2v = sapply(cvLearners, function(fitpredict){
    kfoldValidate(f2, data=df, learner=fitpredict, performance=perf.auc)$mean
})

# Put F2 next to the F1 scores and report the per-learner difference.
resultsF2 = data.frame(f2_AUC=resultsF2v)
results = cbind(resultsF2, resultsF1)
results$improvement = with(results, f2_AUC - f1_AUC)
kable(results, caption="Model performance for F1 vs F2")

In [None]:
# F3: the class explained by the original features plus the first twelve principal components.
f3 = class ~ ps +
    nma.a + nma.b + nma.c + nma.d + nma.e + nma.f +
    nex.a + nex.b + nex.c + nex.d + nex.e + nex.g + nex.f + nex.h +
    dd + dm + amfm +
    PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12

# Learners to compare, in the order they are evaluated and reported.
cvLearners = list(
    svm        = fitpredict.svm,
    forest     = fitpredict.forest,
    knn        = fitpredict.knn,
    adaboost   = fitpredict.adaboost,
    nnet       = fitpredict.nnet,
    naiveBayes = fitpredict.naiveBayes,
    lda        = fitpredict.lda
)

# Mean cross-validated AUC per learner, as a numeric vector named after the learners.
resultsF3v = sapply(cvLearners, function(fitpredict){
    kfoldValidate(f3, data=df, learner=fitpredict, performance=perf.auc)$mean
})

# Put F3 next to the F2 scores and report the per-learner difference.
resultsF3 = data.frame(f3_AUC=resultsF3v)
results = cbind(resultsF3, resultsF2)
results$improvement = with(results, f3_AUC - f2_AUC)
kable(results, caption="Model performance for F2 vs F3")

In [None]:
# Draw a 1 or a 2 for every row with equal probability; rows drawn as 1 form the
# training set and rows drawn as 2 the validation set.
ind = sample(2, nrow(df), replace=TRUE, prob=c(0.5, 0.5))
inTraining = ind == 1

trainingDf   = df[inTraining, ]
validationDf = df[!inTraining, ]

In [None]:
# Grid search: cross-validate the learner once per hyper-parameter row and keep the best row.
#   formula, data, learner, performance - forwarded to kfoldValidate()
#   hyperParameters - data frame with one column per hyper-parameter and one row per
#                     candidate combination, or NULL when there is nothing to tune
# Returns list(model, data):
#   model - the best-scoring combination as a named list (NULL when nothing to tune)
#   data  - hyperParameters plus a `performance` column (NULL when nothing to tune)
selectModel = function(formula, data, learner, performance, hyperParameters){

    # An empty grid is treated like a missing one; the original `1:nrow(...)` loop
    # would have iterated over c(1, 0) for a zero-row grid.
    if(is.null(hyperParameters) || NROW(hyperParameters) == 0){

        return(list(model=NULL, data=NULL))
    }

    results = cbind(hyperParameters, performance=rep(0, nrow(hyperParameters)))

    for (i in seq_len(nrow(results))){
        # drop=FALSE keeps the column name even for a one-column grid, so the value
        # reaches the learner as a named argument rather than by position.
        hyper = as.list(hyperParameters[i, , drop=FALSE])
        arguments = c(list(formula=formula, data=data, learner=learner, performance=performance), hyper)
        results[i, 'performance'] = do.call(kfoldValidate, arguments)$mean
    }

    # Row with the highest mean score, restricted to the hyper-parameter columns.
    selectedModel = results[which.max(results$performance), colnames(hyperParameters), drop=FALSE]
    names(selectedModel) = colnames(hyperParameters)

    return(list(model=as.list(selectedModel), data=results))
}

In [None]:
# Candidate hyper-parameter grids, one per tunable learner.
# NOTE(review): buildHyperDf is not defined in the visible part of the notebook;
# selectModel() expects a data frame with one row per combination - confirm.
candidateParameters = list(
    svm        = buildHyperDf(kernel=c("polynomial"), degree=c(1,2), coef0=seq(1, 3, by=1)),
    forest     = buildHyperDf(ntree=seq(10, 20, by=10)),
    knn        = buildHyperDf(k=seq(2, 4, by=1)),
    nnet       = buildHyperDf(size=seq(1, 3, by=1))
)

# Tune each learner on the training half with formula F3 and AUC as the score.
tuneOnTraining = function(learner, grid){
    selectModel(f3, trainingDf, learner, perf.auc, grid)
}

params = list(
    svm    = tuneOnTraining(fitpredict.svm,    candidateParameters$svm),
    forest = tuneOnTraining(fitpredict.forest, candidateParameters$forest),
    knn    = tuneOnTraining(fitpredict.knn,    candidateParameters$knn),
    nnet   = tuneOnTraining(fitpredict.nnet,   candidateParameters$nnet)
)

# SVM grid, best score first.
svmGrid = params$svm$data
kable(svmGrid[order(-svmGrid$performance),],
    caption="SVM performance for different variable combinations"
)

In [None]:
# Cross-validate the selected SVM settings on the validation half.
svmChoice = params$svm$model
svmCv = kfoldValidate(f3, validationDf, learner=fitpredict.svm, performance=perf.auc,
    kernel=svmChoice$kernel,
    degree=svmChoice$degree,
    coef0=svmChoice$coef0
)
svmResult = svmCv$mean

# Random forest grid, best score first.
forestGrid = params$forest$data
kable(forestGrid[order(-forestGrid$performance),],
    caption="Random Forest performance for different variable combinations"
)

In [None]:
# Cross-validate the selected random forest settings on the validation half.
forestChoice = params$forest$model
forestCv = kfoldValidate(f3, validationDf, learner=fitpredict.forest, performance=perf.auc,
    ntree=forestChoice$ntree
)
forestResult = forestCv$mean

# k-NN grid, best score first.
knnGrid = params$knn$data
kable(knnGrid[order(-knnGrid$performance),],
    caption="k-NN performance for different variable combinations"
)

In [None]:
# Cross-validate the selected k-NN setting on the validation half.
knnChoice = params$knn$model
knnCv = kfoldValidate(f3, validationDf, learner=fitpredict.knn, performance=perf.auc,
    k=knnChoice$k
)
knnResult = knnCv$mean

# Neural network grid, best score first.
nnetGrid = params$nnet$data
kable(nnetGrid[order(-nnetGrid$performance),],
    caption="ANN performance for different variable combinations"
)

In [None]:
# Cross-validate the selected neural network size on the validation half.
nnetResult = kfoldValidate(f3, validationDf, learner=fitpredict.nnet, performance=perf.auc,
    size=params$nnet$model$size
)$mean

# Summary of the four tuned learners (caption typo "paramenters" corrected).
kable(data.frame(
    model=c('SVM', 'Random Forest', 'k-NN', 'ANN'),
    auc=c(svmResult, forestResult, knnResult, nnetResult)),
    caption="Model performance for the best model parameters for each model class"
)

In [None]:
# Greedy backward feature elimination.
#
# Each call scores the full formula, then every formula with exactly one feature
# removed. If some reduced formula beats the full one (and more than two features
# remain), the best such feature is dropped and the search recurses on the reduced
# formula; otherwise the current formula is returned.
#
#   formula         - response ~ predictors to start from
#   data            - data frame passed to kfoldValidate()
#   learner         - fitpredict.* style learner
#   performance     - scoring function passed to kfoldValidate()
#   hyperParameters - optional grid for selectModel(); NULL means the learner is
#                     run with its default arguments
# Returns list(formula, params): the selected formula and the hyper-parameters
# chosen for it (NULL when no grid was given).
selectModelFormula = function(formula, data, learner, performance, hyperParameters=NULL){

    response = all.vars(formula)[1]
    features = all.vars(formula)[-1]

    # Hyper-parameters are tuned once per call, on the full formula of that call.
    if(!is.null(hyperParameters)){
        selectedParameters = selectModel(formula, data, learner, performance, hyperParameters)$model
    } else {
        selectedParameters = NULL
    }

    # Argument list for kfoldValidate() for a given candidate formula.
    # c(list, NULL) leaves the list unchanged, so this covers the untuned case too.
    cvArguments = function(candidate){
        c(
            list(formula=candidate, data=data, learner=learner, performance=performance),
            selectedParameters
        )
    }

    # A failure while scoring the full formula counts as score 0.
    bestRoundPerformance = tryCatch(
        do.call(kfoldValidate, cvArguments(formula))$mean,
        error=function(cond){ return(0) }
    )
    featureLeftOut = NULL

    for(i in seq_along(features)){

        # Best effort: a failing candidate is reported by try() and skipped.
        try({

            roundFormula = as.formula(paste(response, "~", paste(features[-c(i)], collapse="+")))

            # Always score the reduced formula. The original code scored the full
            # `formula` again when no hyper-parameters were given, so untuned
            # learners never had a feature removed.
            roundPerformance = do.call(kfoldValidate, cvArguments(roundFormula))$mean

            if(roundPerformance > bestRoundPerformance){

                bestRoundPerformance = roundPerformance
                featureLeftOut = i
            }
        })
    }

    if(!is.null(featureLeftOut) && length(features) > 2){

        selectedFormula = as.formula(paste(response, "~", paste(features[-c(featureLeftOut)], collapse="+")))
        return(selectModelFormula(selectedFormula, data, learner, performance, hyperParameters))

    } else {

        return(list(
            formula=formula,
            params=selectedParameters
        ))
    }
}

In [None]:
# Feature and hyper-parameter selection for k-NN on the training half.
# Note: this overwrites the earlier `params` list with a list(formula, params).
params = selectModelFormula(f3, trainingDf, fitpredict.knn, perf.auc, candidateParameters$knn)

# Every variable of the selected formula (all.vars() returns the response as well).
selectedVariables = data.frame(features=all.vars(params$formula))
kable(selectedVariables, caption="Selected features using leave-one-out algorithm")

In [None]:
# Hyper-parameters chosen together with the selected formula.
selectedParameters = data.frame(params$params)
kable(selectedParameters, caption="Selected model parameters")

In [None]:
# Cross-validate k-NN on the validation half with the selected formula and k.
# params$params is the named list returned by the selection step (list(k=...)),
# so pass its `k` element rather than the whole list.
result = kfoldValidate(params$formula, validationDf, learner=fitpredict.knn, performance=perf.auc,
    k=params$params$k
)$mean

kable(data.frame(auc=result), caption="k-NN performance for the optimized formula and model")

In [None]:
# Majority-vote ensemble with the same call shape as the other fitpredict.* learners.
#   formula       - response ~ predictors; names the response column and is the
#                   fallback formula for learners without their own
#   trainingSet   - data frame used to fit every member
#   validationSet - data frame every member predicts on
#   learners      - named list of fitpredict.* functions
#   params        - list keyed by learner name; each entry is list(formula, params)
#                   holding that learner's formula and extra arguments. May be NULL.
# Returns helpers.result(majority vote, real).
fitpredict.votingEnsemble = function(formula, trainingSet, validationSet, learners, params){

    learnerNames = names(learners)
    ensembleResults = data.frame(majority=rep(NA, nrow(validationSet)))

    for (name in learnerNames){

        # Per-learner settings; NULL when no params were supplied at all.
        settings = if(is.null(params)) NULL else params[[name]]

        # Fall back to the ensemble's formula when the learner has none of its own.
        # The original code read the formula from `params` even when `params` was
        # NULL, which handed a NULL formula to the learner.
        learnerFormula = if(is.null(settings$formula)) formula else settings$formula

        # c(list, NULL) leaves the list unchanged, so learners without extra
        # arguments are simply called with their defaults.
        arguments = c(
            list(learnerFormula, trainingSet=trainingSet, validationSet=validationSet),
            settings$params
        )

        modelResult = list(predicted=do.call(learners[[name]], arguments)$pred)
        names(modelResult) = c(name)
        ensembleResults = cbind(ensembleResults, modelResult)
    }

    # Row-wise majority vote. drop=FALSE keeps a one-learner ensemble two-dimensional
    # so apply() still works on it.
    ensembleResults$majority = apply(ensembleResults[, learnerNames, drop=FALSE], 1, FUN=function(x){
        as(names(which.max(table(x))), mode(x))
    })

    real = as.integer(validationSet[,all.vars(formula)[1]])
    return(helpers.result(ensembleResults$majority, real))
}

In [None]:
# Greedy backward elimination over the members of a voting ensemble.
#
# Each call scores the full ensemble, then every ensemble with exactly one member
# removed. If some reduced ensemble beats the full one (and more than two members
# remain), the best such member is dropped and the search recurses; otherwise the
# current member list is returned.
#
#   formula        - response ~ predictors passed to kfoldValidate()
#   data           - data frame passed to kfoldValidate()
#   performance    - scoring function passed to kfoldValidate()
#   learners       - named list of fitpredict.* functions
#   learnersParams - per-learner list(formula, params), in the same order as `learners`
# Returns the selected subset of `learners`.
selectVotingEnsemble = function(formula, data, performance, learners, learnersParams){

    # Score with the caller's `performance` function. The original code ignored the
    # argument and always used perf.auc.
    bestRoundPerformance = kfoldValidate(formula, data, fitpredict.votingEnsemble, performance=performance,
        learners=learners,
        params=learnersParams
    )$mean

    modelLeftOut = NULL

    for(i in seq_along(learners)){

        # Best effort: a failing candidate is reported by try() and skipped.
        try({

            roundPerformance = kfoldValidate(formula, data, fitpredict.votingEnsemble, performance=performance,
              learners=learners[-c(i)],
              params=learnersParams[-c(i)]
            )$mean

            if(roundPerformance > bestRoundPerformance){

                bestRoundPerformance = roundPerformance
                modelLeftOut = i
            }
        })
    }

    if(!is.null(modelLeftOut) && length(learners) > 2){

        return(selectVotingEnsemble(formula, data, performance,
            learners[-c(modelLeftOut)],
            learnersParams[-c(modelLeftOut)]
        ))

    } else {

        return(learners)
    }
}

In [None]:
# Members considered for the naive ensemble.
learners = list(
    svm        = fitpredict.svm,
    forest     = fitpredict.forest,
    knn        = fitpredict.knn,
    nnet       = fitpredict.nnet,
    naiveBayes = fitpredict.naiveBayes,
    lda        = fitpredict.lda
)

# Every member uses formula F3 and no extra arguments.
candidateParameters = sapply(
    names(learners),
    function(name){ list(formula=f3, params=NULL) },
    simplify=FALSE
)

# Drop members for as long as doing so improves the cross-validated score on the training half.
finalEnsemble = selectVotingEnsemble(f3, trainingDf, perf.auc, learners, candidateParameters)

keptModels = data.frame(models=names(finalEnsemble))
kable(keptModels, caption = "Models considered by the naive ensemble")

In [None]:
# Cross-validate the naive ensemble on the validation half. The members and their
# settings travel through kfoldValidate's `...` to fitpredict.votingEnsemble.
naiveCv = kfoldValidate(f3, validationDf, fitpredict.votingEnsemble, perf.auc,
    learners=finalEnsemble,
    params=candidateParameters[names(finalEnsemble)]
)
result = naiveCv$mean

kable(data.frame(auc=result), caption="Naive Ensemble Performance")

In [None]:
# Wider hyper-parameter grids for the final ensemble.
# NOTE(review): buildHyperDf is not defined in the visible part of the notebook.
candidateParameters = list(
    svm        = buildHyperDf(kernel=c("polynomial", "radial"), degree=c(1,2), coef0=c(1,10,50,100)),
    forest     = buildHyperDf(ntree=seq(100, 500, by=100)),
    knn        = buildHyperDf(k=seq(2, 30, by=2)),
    nnet       = buildHyperDf(size=seq(5, 20, by=5))
)

# Per-learner feature (and, where a grid exists, hyper-parameter) selection on the training half.
svmParams        = selectModelFormula(f3, trainingDf, fitpredict.svm,        perf.auc, candidateParameters$svm)
forestParams     = selectModelFormula(f3, trainingDf, fitpredict.forest,     perf.auc, candidateParameters$forest)
nnetParams       = selectModelFormula(f3, trainingDf, fitpredict.nnet,       perf.auc, candidateParameters$nnet)
knnParams        = selectModelFormula(f3, trainingDf, fitpredict.knn,        perf.auc, candidateParameters$knn)
naiveBayesParams = selectModelFormula(f3, trainingDf, fitpredict.naiveBayes, perf.auc)
ldaParams        = selectModelFormula(f3, trainingDf, fitpredict.lda,        perf.auc)

# AdaBoost is used as-is: full F3 formula, default arguments, no selection step.
adaboostParams   = list(
    formula=f3,
    params=NULL
)

learners = list(
    svm        = fitpredict.svm,
    forest     = fitpredict.forest,
    knn        = fitpredict.knn,
    nnet       = fitpredict.nnet,
    adaboost   = fitpredict.adaboost,
    naiveBayes = fitpredict.naiveBayes,
    lda        = fitpredict.lda
)

# Same names and order as `learners`, so the two lists can be indexed together.
candidateModels = list(
    svm        = svmParams,
    forest     = forestParams,
    knn        = knnParams,
    nnet       = nnetParams,
    adaboost   = adaboostParams,
    naiveBayes = naiveBayesParams,
    lda        = ldaParams
)

finalLearners = selectVotingEnsemble(f3, trainingDf, perf.auc, learners, candidateModels)

# Report the members just selected. The original printed names(finalEnsemble),
# which is the result of the earlier naive-ensemble cell.
kable(data.frame(models=names(finalLearners)), caption = "Models considered by the final ensemble")

In [None]:
# Mean cross-validated AUC of the final ensemble on a given data set. The members
# and their settings travel through kfoldValidate's `...` to fitpredict.votingEnsemble.
scoreFinalEnsemble = function(dataset){
    kfoldValidate(f3, dataset, fitpredict.votingEnsemble, perf.auc,
        learners=finalLearners,
        params=candidateModels[names(finalLearners)]
    )$mean
}

resultTraining   = scoreFinalEnsemble(trainingDf)
resultValidation = scoreFinalEnsemble(validationDf)
resultComplete   = scoreFinalEnsemble(df)

finalSummary = data.frame(
    set=c('Training', 'Validation', 'All'),
    auc=c(resultTraining, resultValidation, resultComplete)
)
kable(finalSummary, caption="Final ensemble performance")