In [113]:
import pandas as pd
import math
from sklearn.metrics import mean_squared_error
from random import uniform, seed
# Seed Python's global `random` generator (the one behind `uniform`) so the
# random-guess baselines computed in this notebook are repeatable when the
# notebook is run top to bottom on a fresh kernel.
seed(42069)

In [114]:
# Load the per-participant metadata table of each dataset.
# Paths are relative to the directory this notebook is run from.
emip_metadata = pd.read_csv(
    "../../datasets/emip-fixations/metadata/emip_metadata.csv",
)
# The CSCW sheet is the only one read with a semicolon separator.
cscw_metadata = pd.read_csv(
    "../../datasets/cscw/metadata/CSCW - Sheet1.csv",
    sep=";",
)
fractions_metadata = pd.read_csv(
    "../../datasets/fractions/metadata/DataSet_PrePost.csv",
)

In [115]:
# Pick the raw label column for each dataset.
# EMIP has no single score column here, so its label is the element-wise sum
# of the two `correct_*` columns.
emip_labels = emip_metadata["correct_vehicle"].add(emip_metadata["correct_rectangle"])
cscw_labels = cscw_metadata.loc[:, "Posttest.Score"]
fractions_labels = fractions_metadata.loc[:, "Post_SumOfCorrect_NewSum"]

In [116]:
def min_max_normalize(values):
    """Rescale ``values`` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    values : pandas.Series (or any object with ``min``/``max`` and
        element-wise arithmetic)
        The raw scores to rescale.

    Returns
    -------
    Same type as ``values``
        ``(values - min) / (max - min)``.

    Raises
    ------
    ValueError
        If every value is identical (``max == min``). The plain formula would
        divide zero by zero and silently turn every label into NaN.
    """
    lowest = values.min()
    value_range = values.max() - lowest
    if value_range == 0:
        raise ValueError("cannot min-max normalize constant values (max == min)")
    return (values - lowest) / value_range


# Bring the three label sets onto a common [0, 1] scale so their errors are
# comparable and so they can be pooled with pd.concat.
emip_labels = min_max_normalize(emip_labels)
cscw_labels = min_max_normalize(cscw_labels)
fractions_labels = min_max_normalize(fractions_labels)

In [117]:
def get_mean_baseline(labels):
    """Return the RMSE of always predicting the mean of ``labels``.

    This function used to be a first definition of ``get_baseline`` that the
    later definition in this cell silently replaced; it now has its own name
    so both baselines can be called.

    Parameters
    ----------
    labels : pandas.Series (or any object with ``mean`` and element-wise
        arithmetic)
        The target values.

    Returns
    -------
    float
        Square root of the mean squared deviation from the mean.
    """
    deviation = labels - labels.mean()
    return math.sqrt((deviation ** 2).mean())


def get_random_prediction(labels):
    """Return the RMSE of one random guess per label.

    Every label is paired with an independent draw from ``uniform(0, 1)``,
    which matches the [0, 1] range the labels are min-max scaled to in this
    notebook. Each call advances the global ``random`` generator.

    Parameters
    ----------
    labels : sequence of numbers
        The target values.

    Returns
    -------
    float
        Root mean squared error between ``labels`` and the random guesses.
    """
    prediction = [uniform(0, 1) for _ in labels]
    # Take the square root ourselves instead of passing `squared=False`:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
    # while the plain MSE call works on every version.
    return math.sqrt(mean_squared_error(labels, prediction))


def get_baseline(labels, n_trials=1000):
    """Return the average RMSE of ``n_trials`` independent random guesses.

    Parameters
    ----------
    labels : sequence of numbers
        The target values, passed unchanged to ``get_random_prediction``.
    n_trials : int, default 1000
        Number of random predictions to average over.

    Returns
    -------
    float
        Mean of the per-trial RMSE values.

    Raises
    ------
    ValueError
        If ``n_trials`` is smaller than 1 (there would be nothing to average).
    """
    if n_trials < 1:
        raise ValueError("n_trials must be at least 1")
    rmses = [get_random_prediction(labels) for _ in range(n_trials)]
    return sum(rmses) / len(rmses)

In [118]:
# Print the random-guess baseline RMSE for each dataset and for each pooled
# combination of datasets. The order of the entries is kept as-is because
# every get_baseline call draws from the shared random generator.
for label_set, caption in (
    (cscw_labels, "CSCW baseline"),
    (emip_labels, "EMIP baseline"),
    (fractions_labels, "Fractions labels"),
    (pd.concat((emip_labels, fractions_labels)), "fractions and emip baseline"),
    (pd.concat((cscw_labels, fractions_labels)), "fractions and cscw baseline"),
    (pd.concat((emip_labels, cscw_labels)), "cscw and emip baseline"),
    (pd.concat((emip_labels, cscw_labels, fractions_labels)), "fractions, cscw and emip baseline"),
):
    print(get_baseline(label_set), caption)

0.35388904448897335 CSCW baseline
0.424636036995045 EMIP baseline
0.39366298809771244 Fractions labels
0.416377979296539 fractions and emip baseline
0.37585405622572404 fractions and cscw baseline
0.40853019640189875 cscw and emip baseline
0.40540125832240065 fractions, cscw and emip baseline


In [119]:
# Baseline RMSE per dataset and per pairwise pooled combination, keyed by a
# short name. Entries are evaluated in the listed order, one get_baseline
# call each.
errors = {
    name: get_baseline(label_set)
    for name, label_set in (
        ("cscw", cscw_labels),
        ("emip", emip_labels),
        ("fractions", fractions_labels),
        ("emip_fractions", pd.concat((emip_labels, fractions_labels))),
        ("fractions_cscw", pd.concat((cscw_labels, fractions_labels))),
        ("emip_cscw", pd.concat((emip_labels, cscw_labels))),
    )
}

In [120]:
# Show the baseline RMSE dictionary as the cell's output.
errors

{'cscw': 0.35408194888273653,
 'emip': 0.4241358322246148,
 'fractions': 0.39295164906188074,
 'emip_fractions': 0.4170846635199361,
 'fractions_cscw': 0.37619048780763514,
 'emip_cscw': 0.40841293420269603}