In [1]:
if (!require(reticulate)) {
  install.packages("reticulate")
  library(reticulate)
}
if (!require(coin)) {
  install.packages("coin")
  library(coin)
}
use_virtualenv("env/", required = TRUE)
source_python("analysis/analysis_utils.py")

Lade nötiges Paket: reticulate

Lade nötiges Paket: coin

Lade nötiges Paket: survival



In [None]:
med_helper <- function(scores) {
  scores <- unlist(scores)
  median(scores)
}

range_helper <- function(scores) {
  scores <- unlist(scores)
  max(scores) - min(scores)
}

custom_snr <- function(median, range) {
  1 / (-(median * range))
}

preprocess_results <- function(results) {
  results$salinity <- sapply(
    sapply(results$setting, (\(x) x$salinity)), factor
  )
  results$crange <- sapply(results$setting, (\(x) x$conc_range))
  results$temp <- sapply(results$setting, (\(x) x$temperature))
  results$sorbent <- sapply(
    sapply(results$setting, (\(x) x$sorbent)), factor
  )
  results$model <- sapply(results$model, factor)
  results$fingerprint <- sapply(results$embedding, factor)
  results$score.median <- sapply(results$score, med_helper)
  results$score.range <- sapply(results$score, range_helper)
  results <- na.omit(results)
  results <- subset(results, !sapply(results$score, is.null))
  results$stat <- mapply(custom_snr, results$score.median, results$score.range)
  results$smf_inter <- with(
    results, interaction(sorbent, model, fingerprint), drop = TRUE
  )
  results$mf_inter <- with(
    results, interaction(model, fingerprint), drop = TRUE
  )
  results$stc_inter <- with(
    results, interaction(salinity, temp, crange), drop = TRUE
  )
  results
}

all_results <- preprocess_results(
  load_results(
    "models/", flatten = TRUE, r2_cutoff = 0.85
  )
)
all_results_kf <- all_results[all_results$target == "kf", ]
all_results_n <- all_results[all_results$target == "n", ]

In [5]:
print("Results for stat ~ model + sorbent + fingerprint | stc_inter")
it <- independence_test(
  stat ~ model + sorbent + fingerprint | stc_inter,
  data=all_results_kf,
  distribution = approximate(nresample = 100000),
  alternative = "two.sided"
)
print("Results for kf")
print(it)
print(pvalue(it, method = "step-down"))

it <- independence_test(
  stat ~ model + sorbent + fingerprint | stc_inter,
  data = all_results_n,
  distribution = approximate(nresample = 100000),
  alternative = "two.sided"
)
print("Results for n")
print(it)
print(pvalue(it, method="step-down"))


print("Results for stat ~ salinity + temp + crange | mf_inter")
it <- independence_test(
  stat ~ salinity + temp + crange | mf_inter,
  data = all_results_kf[all_results_kf$model != "pls",],
  distribution = approximate(nresample = 100000),
  alternative = "two.sided"
)
print("Results for kf")
print(it)
print(pvalue(it, method="step-down"))

it <- independence_test(
  stat ~ salinity + temp + crange | mf_inter,
  data = all_results_kf[all_results_kf$model != "pls",],
  distribution = approximate(nresample = 100000),
  alternative = "two.sided"
)
print("Results for n")
print(it)
print(pvalue(it, method="step-down"))

[1] "Results for stat ~ model + sorbent + fingerprint | stc_inter"
[1] "Results for kf"

	Approximative General Independence Test

data:  stat by
	 model, sorbent, fingerprint 
	 stratified by stc_inter
maxT = 13.284, p-value < 1e-05
alternative hypothesis: two.sided

                             
model.rf              0.00002
model.gp              0.99733
model.pls            <0.00001
model.krr            <0.00001
model.mlp             0.41668
sorbent.name          0.57654
fingerprint.mol2vec   0.16684
fingerprint.maccs     0.99733
fingerprint.pubchem   0.16684
fingerprint.rdkit     0.99733
fingerprint.abrahams  0.99733
[1] "Results for n"

	Approximative General Independence Test

data:  stat by
	 model, sorbent, fingerprint 
	 stratified by stc_inter
maxT = 12.655, p-value < 1e-05
alternative hypothesis: two.sided

                             
model.rf             <0.00001
model.gp              0.02948
model.pls            <0.00001
model.krr            <0.00001
model.mlp           

In [6]:
print("Results for stat ~ sorbent + fingerprint | stc_inter for the KRR models only")
it <- independence_test(
  stat ~ sorbent + fingerprint | stc_inter,
  data=all_results_kf[all_results_kf$model == "krr",],
  distribution = approximate(nresample = 100000),
  alternative = "two.sided"
)
print("Results for kf")
print(it)
print(pvalue(it, method = "step-down"))

it <- independence_test(
  stat ~ sorbent + fingerprint | stc_inter,
  data = all_results_n[all_results_kf$model == "krr",],
  distribution = approximate(nresample = 100000),
  alternative = "two.sided"
)
print("Results for n")
print(it)
print(pvalue(it, method="step-down"))


print("Results for stat ~ salinity + temp + crange | fingerprint for the KRR models only")
it <- independence_test(
  stat ~ salinity + temp + crange | fingerprint,
  data = all_results_kf[all_results_kf$model == "krr",],
  distribution = approximate(nresample = 100000),
  alternative = "two.sided"
)
print("Results for kf")
print(it)
print(pvalue(it, method="step-down"))

it <- independence_test(
  stat ~ salinity + temp + crange | fingerprint,
  data = all_results_kf[all_results_kf$model == "krr",],
  distribution = approximate(nresample = 100000),
  alternative = "two.sided"
)
print("Results for n")
print(it)
print(pvalue(it, method="step-down"))

[1] "Results for stat ~ sorbent + fingerprint | stc_inter for the KRR models only"
[1] "Results for kf"

	Approximative General Independence Test

data:  stat by
	 sorbent, fingerprint 
	 stratified by stc_inter
maxT = 2.7797, p-value = 0.03029
alternative hypothesis: two.sided

                            
sorbent.name         0.25762
fingerprint.mol2vec  0.25762
fingerprint.maccs    0.25762
fingerprint.pubchem  0.21141
fingerprint.rdkit    0.25762
fingerprint.abrahams 0.03029
[1] "Results for n"

	Approximative General Independence Test

data:  stat by
	 sorbent, fingerprint 
	 stratified by stc_inter
maxT = 1.6839, p-value = 0.4257
alternative hypothesis: two.sided

                            
sorbent.name         0.97445
fingerprint.mol2vec  0.93949
fingerprint.maccs    0.42567
fingerprint.pubchem  0.97445
fingerprint.rdkit    0.86545
fingerprint.abrahams 0.45816
[1] "Results for stat ~ salinity + temp + crange | fingerprint for the KRR models only"
[1] "Results for kf"

	Approxim

In [7]:
print("Results for stat ~ sorbent + fingerprint | stc_inter for the RF models only")
it <- independence_test(
  stat ~ sorbent + fingerprint | stc_inter,
  data=all_results_kf[all_results_kf$model == "rf",],
  distribution = approximate(nresample = 100000),
  alternative = "two.sided"
)
print("Results for kf")
print(it)
print(pvalue(it, method = "step-down"))

it <- independence_test(
  stat ~ sorbent + fingerprint | stc_inter,
  data = all_results_n[all_results_kf$model == "rf",],
  distribution = approximate(nresample = 100000),
  alternative = "two.sided"
)
print("Results for n")
print(it)
print(pvalue(it, method="step-down"))


print("Results for stat ~ salinity + temp + crange | fingerprint for the RF models only")
it <- independence_test(
  stat ~ salinity + temp + crange | fingerprint,
  data = all_results_kf[all_results_kf$model == "rf",],
  distribution = approximate(nresample = 100000),
  alternative = "two.sided"
)
print("Results for kf")
print(it)
print(pvalue(it, method="step-down"))

it <- independence_test(
  stat ~ salinity + temp + crange | fingerprint,
  data = all_results_kf[all_results_kf$model == "rf",],
  distribution = approximate(nresample = 100000),
  alternative = "two.sided"
)
print("Results for n")
print(it)
print(pvalue(it, method="step-down"))

[1] "Results for stat ~ sorbent + fingerprint | stc_inter for the RF models only"
[1] "Results for kf"

	Approximative General Independence Test

data:  stat by
	 sorbent, fingerprint 
	 stratified by stc_inter
maxT = 4.0368, p-value = 0.00034
alternative hypothesis: two.sided

                            
sorbent.name         0.93813
fingerprint.mol2vec  0.00034
fingerprint.maccs    0.52195
fingerprint.pubchem  0.00078
fingerprint.rdkit    0.93813
fingerprint.abrahams 0.52195
[1] "Results for n"

	Approximative General Independence Test

data:  stat by
	 sorbent, fingerprint 
	 stratified by stc_inter
maxT = 2.1075, p-value = 0.1789
alternative hypothesis: two.sided

                            
sorbent.name         0.48892
fingerprint.mol2vec  0.17893
fingerprint.maccs    0.26392
fingerprint.pubchem  0.26392
fingerprint.rdkit    0.60363
fingerprint.abrahams 0.26392
[1] "Results for stat ~ salinity + temp + crange | fingerprint for the RF models only"
[1] "Results for kf"

	Approximat