# Analysis of Appendix B.3.2

In [1]:
library(dplyr)
library(ggplot2)
library(ggpubr)
source("helper.r")
theme_set(theme_pubr(legend = "none"))

"package 'dplyr' was built under R version 3.6.3"
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'ggpubr' was built under R version 3.6.3"

### Data Preparation 

In [2]:
# Path to the folder that holds one sub-folder of result files per dataset.
path <- "../data/mlp_eval/"
datasets <- list.files(path)

# Read every result file found in one dataset folder and stack the evaluation
# tables into a single data frame.
#
# data.path: folder path including the trailing "/" (files are addressed as
#            paste0(data.path, <file name>)).
# Returns a data frame with one `objective` column added per file, or NULL
# (with a warning) when the folder contains no files.
read_dataset_evals <- function(data.path) {
  objectives <- list.files(data.path)

  if (length(objectives) == 0L) {
    warning("No result files found in ", data.path, call. = FALSE)
    return(NULL)
  }

  evals <- lapply(objectives, function(file) {
    res <- readRDS(paste0(data.path, file))
    df.sub <- res$result[[1]]$eval
    # tag every row with the objective stored alongside the result
    df.sub$objective <- res$objective
    df.sub
  })

  # bind once instead of growing the data frame inside a loop
  do.call(rbind, evals)
}

# create list with one list containing one dataframe per dataset
data.list <- lapply(datasets, function(data) {
  read_dataset_evals(paste0(path, data, "/"))
})
names(data.list) <- datasets

In [7]:
# Confidence improvement: build the per-dataset table from helper.r, then
# count how often each feature occurs in the `feature` column (kept as
# df_feat_best) and in the `feature.1` column (kept as df_feat_worst).
df_datasets <- create_table_datasets(data.list, "SS_L2", "conf.rel", 7)

df_feat_worst <- df_datasets %>%
  group_by(feature.1) %>%
  summarise(n()) %>%
  as.data.frame()

df_feat_best <- df_datasets %>%
  group_by(feature) %>%
  summarise(n()) %>%
  as.data.frame()

# left join: only features present in df_feat_best end up in df_feat
df_feat <- left_join(
  df_feat_best,
  df_feat_worst,
  by = c("feature" = "feature.1")
)

# scale the summary columns by 100 and round to whole numbers
df_datasets[, c("mean", "sd", "mean.1", "mean.2")] <- round(
  df_datasets[, c("mean", "sd", "mean.1", "mean.2")] * 100,
  0
)
names(df_datasets)
df_feat

dataset,mean,sd,feature,mean.1,feature.1,mean.2
adult,34,6,momentum,38,weight_decay,25
airlines,49,20,max_units,61,weight_decay,3
albert,57,26,momentum,78,num_layers,14
Amazon_employee_access,58,17,learning_rate,69,num_layers,21
APSFailure,46,17,num_layers,60,weight_decay,22
Australian,41,7,weight_decay,46,learning_rate,32
bank-marketing,29,13,momentum,45,weight_decay,15
blood-transfusion-service-center,34,20,max_units,39,num_layers,13
car,44,17,learning_rate,51,num_layers,32
christine,47,14,momentum,54,num_layers,19


feature,n().x,n().y
batch_size,1,3.0
learning_rate,6,2.0
max_dropout,9,1.0
max_units,4,
momentum,8,
num_layers,3,14.0
weight_decay,4,15.0


In [None]:
# analysis of neg loglik improvement
# Uses `data.list` from the data-preparation cell; no visible cell of this
# notebook defines the `data.all` object that was referenced here before.
df_datasets <- create_table_datasets(data.list, "SS_L1", "neg_loglik.rel", 7)

# count how often each feature occurs in `feature.1` / `feature`
df_feat_worst <- as.data.frame(
  df_datasets %>% group_by(feature.1) %>% summarise(n())
)
df_feat_best <- as.data.frame(
  df_datasets %>% group_by(feature) %>% summarise(n())
)
df_feat_loglik <- left_join(
  df_feat_best,
  df_feat_worst,
  by = c("feature" = "feature.1")
)

# append the neg-loglik counts to the counts already held in df_feat
# (df_feat must exist from the confidence-improvement cell)
df_feat <- left_join(df_feat, df_feat_loglik, by = "feature")

# scale the summary columns by 100; rounding happens via digits = 0 below
df_datasets[, c("mean", "sd", "mean.1", "mean.2")] <-
  df_datasets[, c("mean", "sd", "mean.1", "mean.2")] * 100

# NOTE(review): xtable is not attached by the library() calls visible in this
# notebook -- presumably helper.r loads it; confirm.
# Columns 4 and 6 are dropped from the printed dataset table.
print(xtable(df_datasets[, -c(4, 6)], digits = 0), include.rownames = FALSE)
print(xtable(df_feat), include.rownames = FALSE)


In [None]:
# Stack the per-dataset data frames into one table. Uses `data.list` from the
# data-preparation cell; no visible cell of this notebook defines the
# `data.all` object that was referenced here before.
data_combined <- do.call("rbind", data.list)

# Boxplots of gt.abs per objective, filled by depth, one facet per feature.
ggplot(data_combined, aes_string(x = "objective", y = "gt.abs")) +
  geom_boxplot(aes(fill = as.factor(depth))) +
  facet_grid(. ~ feature)

# Put the conf.diff values of the other objectives next to the SS_L1 rows.
# NOTE(review): the column-wise copies below assume every objective subset has
# the same number of rows, in the same order, as the SS_L1 subset -- confirm.
data_SS_L1 <- data_combined[data_combined$objective == "SS_L1", ]
data_SS_sd <- data_combined[data_combined$objective == "SS_sd", ]
data_SS_L1$conf.diff_sd <- data_SS_sd$conf.diff
data_SS_L1$conf.diff_L2 <-
  data_combined[data_combined$objective == "SS_L2", ]$conf.diff
data_SS_L1$conf.diff_area <-
  data_combined[data_combined$objective == "SS_area", ]$conf.diff

# gt.abs of the SS_sd rows, stored alongside the SS_L1 rows
data_SS_L1$conf.diff.opt_sd <- data_SS_sd$gt.abs

# Create the rank columns up front instead of relying on element assignment
# into columns that do not exist yet.
n_rows <- nrow(data_SS_L1)
for (rank_col in c("L1_rank", "sd_rank", "L2_rank", "area_rank")) {
  data_SS_L1[[rank_col]] <- rep(NA_real_, n_rows)
}

# Row-wise ranking of conf.diff across all four objectives.
# seq_len() keeps the loop from running when there are no rows.
for (i in seq_len(n_rows)) {
  ranks <- rank(
    data_SS_L1[i, c("conf.diff", "conf.diff_sd", "conf.diff_L2", "conf.diff_area")]
  )
  data_SS_L1$L1_rank[i] <- ranks[1]
  data_SS_L1$sd_rank[i] <- ranks[2]
  data_SS_L1$L2_rank[i] <- ranks[3]
  data_SS_L1$area_rank[i] <- ranks[4]
}

# Pairwise ranking of SS_L1 against SS_sd only. This overwrites L1_rank and
# sd_rank from the four-way ranking above; L2_rank and area_rank keep their
# four-way values.
for (i in seq_len(n_rows)) {
  ranks <- rank(data_SS_L1[i, c("conf.diff", "conf.diff_sd")])
  data_SS_L1$L1_rank[i] <- ranks[1]
  data_SS_L1$sd_rank[i] <- ranks[2]
}

# NOTE(review): the object name ends in _7 but the filter selects depth == 8
# -- confirm which depth is intended.
data_SS_L1_7 <- data_SS_L1[data_SS_L1$depth == 8, ]

# Relative frequency of each sd_rank value within the selected depth.
ggplot(data = data_SS_L1_7, aes(x = sd_rank)) +
  geom_bar(aes(y = (..count..) / sum(..count..)))

summary(data_SS_L1$conf.diff_sd)
