# Analysis of Appendix B.3.2

In [1]:
library(dplyr)
library(ggplot2)
library(ggpubr)
source("helper.r")
theme_set(theme_pubr(legend = "none"))

"package 'dplyr' was built under R version 3.6.3"
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'GGally' was built under R version 3.6.3"Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2
"package 'gridExtra' was built under R version 3.6.3"
Attaching package: 'gridExtra'

The following object is masked from 'package:dplyr':

    combine



### Data Preparation 

In [38]:
# path to folder, with folders for dataset results
path = "../data/runs/mlp/"
datasets = list.files(path)

# create list with one list containing one dataframe per dataset
data.list = lapply(datasets, function(data){
  
  data.path = paste0(path, data, "/2_3_effects_and_trees/")
  objectives = list.files(data.path)
  
  for(i in 1:length(objectives)){
 
    res = readRDS(paste0(data.path, objectives[i]))
    df.sub = res$result[[1]]$eval
    df.sub$objective = res$objective
    
    if(i == 1) df = df.sub
    else df = rbind(df, df.sub)
  }
  
  return(df)
})
names(data.list) = datasets

ERROR: Error in rbind(deparse.level, ...): numbers of columns of arguments do not match


### Create Table  5

In [22]:
# relative confidence improvement on dataset level
df.datasets.conf = create_table_datasets(data.list, "SS_L2", "conf.rel", 7)

# count features with highest and lowest improvements (for Table 7)
df.feat.low      = as.data.frame(df.datasets.conf %>% group_by(feature.1) %>% summarise(n()))
df.feat.high     = as.data.frame(df.datasets.conf %>% group_by(feature) %>% summarise(n()))
df.feat.conf     = left_join(df.feat.high, df.feat.low, by = c("feature"= "feature.1"))

df.datasets.conf[,c("mean","sd","mean.1", "mean.2")] = round(df.datasets.conf[,c("mean","sd","mean.1", "mean.2")]*100,0)
names(df.datasets.conf)[4:7] = c("feat.high", "mean.high", "feat.low", "mean.low")
df.datasets.conf[,c(1:3,5,7)]

dataset,mean,sd,mean.high,mean.low
adult,34,6,38,25
airlines,49,20,61,3
albert,57,26,78,14
Amazon_employee_access,58,17,69,21
APSFailure,46,17,60,22
Australian,41,7,46,32
bank-marketing,29,13,45,15
blood-transfusion-service-center,34,20,39,13
car,44,17,51,32
christine,47,14,54,19


### Create Table 6

In [23]:
# relative negative loglikelihood improvement on dataset level
df.datasets.nll  = create_table_datasets(data.list, "SS_L2", "neg_loglik.rel", 7)

# count features with highest and lowest improvements (for Table 7)
df.feat.low      = as.data.frame(df.datasets.nll %>% group_by(feature.1) %>% summarise(n()))
df.feat.high     = as.data.frame(df.datasets.nll %>% group_by(feature) %>% summarise(n()))
df.feat.nll      = left_join(df.feat.high, df.feat.low, by = c("feature"= "feature.1"))

df.datasets.nll[,c("mean","sd","mean.1", "mean.2")] = round(df.datasets.nll[,c("mean","sd","mean.1", "mean.2")]*100,0)
names(df.datasets.nll)[4:7] = c("feat.high", "mean.high", "feat.low", "mean.low")
df.datasets.nll[,c(1:3,5,7)]


dataset,mean,sd,mean.high,mean.low
adult,13,6,23,8
airlines,17,9,23,1
albert,31,13,40,6
Amazon_employee_access,0,36,29,-35
APSFailure,15,7,23,6
Australian,12,14,23,-4
bank-marketing,7,9,17,-1
blood-transfusion-service-center,6,17,10,-8
car,26,32,35,10
christine,10,11,17,1


### Create Table 7 

In [25]:
df.feat = left_join(df.feat.conf, df.feat.nll, by = "feature")
names(df.feat) = c("hyperparameter","#MC.high", "#MC.low", "#NLL.high", "#NLL.low") 
df.feat

hyperparameter,#MC.high,#MC.low,#NLL.high,#NLL.low
batch_size,1,3.0,3,4.0
learning_rate,6,2.0,6,3.0
max_dropout,9,1.0,2,1.0
max_units,4,,7,
momentum,8,,7,3.0
num_layers,3,14.0,9,11.0
weight_decay,4,15.0,1,13.0


## Analyse different objective functions 

### Create Table 8 

In [29]:
targets = c("conf.rel", "conf.rel.opt.1", "neg_loglik.rel")
tab.obj = lapply(targets, function(target){
            tab.l2 = create_table_features(data.list = data.list, objective = "SS_L2", target = target, depth = 7)[,1:2]
            tab.ar = create_table_features(data.list = data.list, objective = "SS_area", target = target, depth = 7)[,1:2]
            tab.sd = create_table_features(data.list = data.list, objective = "SS_sd", target = target, depth = 7)[,1:2]
            tab = left_join(left_join(tab.l2, tab.ar, by = "feature"), tab.sd, by = "feature")
            tab[,2:4] = tab[,2:4]*100
            names(tab) = c("hyperparameter", "l2", "area", "sd")
            tab
        })
names(tab.obj) = targets
tab.obj


hyperparameter,l2,area,sd
batch_size,40.83266,39.82681,37.70826
learning_rate,50.15802,49.97155,49.62267
max_dropout,49.70382,49.10916,47.18126
max_units,51.08953,50.58538,50.40577
momentum,51.67007,51.1626,50.68335
num_layers,30.61865,30.07455,28.91955
weight_decay,36.26012,35.00402,33.95084

hyperparameter,l2,area,sd
batch_size,61.91107,57.49427,55.32652
learning_rate,57.62318,57.01493,56.91343
max_dropout,62.38183,60.56626,57.85173
max_units,58.62245,57.58086,57.60857
momentum,58.26209,57.07259,56.90484
num_layers,50.90655,46.13669,46.20734
weight_decay,61.03452,52.73897,50.70079

hyperparameter,l2,area,sd
batch_size,19.81016,19.23348,15.72165
learning_rate,17.88621,17.93516,17.50975
max_dropout,17.40855,17.5442,16.77193
max_units,24.62329,24.07848,24.79464
momentum,19.72032,20.3543,20.22457
num_layers,13.8333,15.33407,15.26577
weight_decay,11.94863,11.56474,11.12226
