In [None]:
#%load_ext autoreload
#%autoreload 2

import pandas as pd
import pickle
from glob import glob
from sleep_eval import evaluation_summary
from scipy.stats import ttest_ind

# Task number; selects which summary CSV and results pickle are read below.
TASK = 2

# Algorithm groups. The names are used as row labels of `summary` and as
# top-level keys of `results` in the cells that follow.
# "binterval" is the "Manual Annotation alg" and "sleep" is the "device algo".
baselines = ["gt", "always1", "always0", "binterval", "sleep"]
# Left out of this group: "sazonov2", "time_based".
defaultalgs = ["sazonov", "cole", "sadeh", "oakley", "kripke", "webster"]
# Left out of this group: "SGD_huber".
defaultml = ["ExtraTrees", "SGD_perceptron", "SGD_log", "SGD_hinge"]
defaultdl = [
    "LSTM_20_raw", "LSTM_50_raw", "LSTM_100_raw",
    "CNN_20_raw", "CNN_50_raw", "CNN_100_raw",
]

# Summary columns reported in every table below.
metrics = ["Accuracy", "Specificity", "Precision", "Recall", "F1"]

# Per-algorithm summary table, indexed by the CSV's unnamed first column.
summary = pd.read_csv("./summaries/task%d_summary.csv" % (TASK)).set_index("Unnamed: 0")

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open result files produced by this project.
with open("./results/task%d_results.pkl" % (TASK), "rb") as results_file:
    results = pickle.load(results_file)

def get_group_average(t, metrics):
    """Render the group mean and standard deviation of each column as LaTeX.

    Every cell in the `metrics` columns is converted to a float by taking the
    text before the first "+-" (cells that are already numeric are accepted
    too). The column-wise mean and standard deviation are then formatted as
    "<mean> +- <std>" with one decimal and returned as a one-row table
    labelled "Group Average".

    Parameters
    ----------
    t : pandas.DataFrame
        One row per algorithm. The frame passed in is not modified.
        NOTE(review): cells presumably look like "<mean> +- <std>" as written
        by the summary CSV -- confirm against the code that produces it.
    metrics : iterable of str
        Names of the columns to convert before averaging.

    Returns
    -------
    str
        LaTeX source produced by `DataFrame.to_latex()`.
    """
    # Work on a copy so the caller's frame keeps its original string cells.
    values = t.copy()
    for m in metrics:
        # Builtin float replaces the removed `np.float` alias (and `np` was
        # never imported here); str() lets already-numeric cells pass through.
        values[m] = values[m].apply(lambda x: float(str(x).split("+-")[0]))

    stats = pd.concat((values.mean(), values.std()), axis=1)
    stats.columns = ["Mean", "Std"]
    formatted = stats.apply(
        lambda row: "%.1f +- %.1f" % (row["Mean"], row["Std"]), axis=1
    )
    return formatted.to_frame("Group Average").T.to_latex()

def pvalue(results, alg1, alg2, metric):
    """Return the p-value of `ttest_ind` between two algorithms on one metric.

    `results` is indexed first by algorithm name and then by metric name;
    the two selected samples are handed to `scipy.stats.ttest_ind` with its
    default settings and the second element of its result is returned.
    """
    first_sample = results[alg1][metric]
    second_sample = results[alg2][metric]
    outcome = ttest_ind(first_sample, second_sample)
    # Index 1 of the (statistic, p-value) result is the p-value.
    return outcome[1]



In [None]:
# Baseline rows of the summary, sorted by the "Accuracy" column in descending
# order and rendered as a LaTeX table.
# NOTE(review): if the summary cells are "<mean> +- <std>" strings, this sort
# is lexicographic rather than numeric -- confirm the ordering is as intended.
bls = list(baselines)

# print() call form works on both Python 2 and Python 3.
print(summary.loc[bls].sort_values(by="Accuracy", ascending=False)[metrics].to_latex())

In [None]:
# Traditional algorithms: per-algorithm table followed by the group average.
traditional = list(defaultalgs)

# Build the sorted metric table once and reuse it for both outputs.
t = summary.loc[traditional].sort_values(by="Accuracy", ascending=False)[metrics]
# print() call form works on both Python 2 and Python 3.
print(t.to_latex())
print(get_group_average(t, metrics))

In [None]:
# "resc_"-prefixed variants of the traditional algorithms: per-algorithm table
# followed by the group average.
traditionalresc = ["resc_%s" % alg for alg in defaultalgs]

# Build the sorted metric table once and reuse it for both outputs.
t = summary.loc[traditionalresc].sort_values(by="Accuracy", ascending=False)[metrics]
print(t.to_latex())
print(get_group_average(t, metrics))

In [None]:
# Machine-learning algorithms: per-algorithm table followed by the group average.
ml = list(defaultml)

# Build the sorted metric table once and reuse it for both outputs.
t = summary.loc[ml].sort_values(by="Accuracy", ascending=False)[metrics]
# print() call form works on both Python 2 and Python 3.
print(t.to_latex())
print(get_group_average(t, metrics))

In [None]:
# "resc_"-prefixed variants of the machine-learning algorithms: per-algorithm
# table followed by the group average.
mlresc = ["resc_" + alg for alg in defaultml]

# Build the sorted metric table once and reuse it for both outputs.
t = summary.loc[mlresc].sort_values(by="Accuracy", ascending=False)[metrics]
# print() call form works on both Python 2 and Python 3.
print(t.to_latex())
print(get_group_average(t, metrics))

In [None]:
# Deep-learning algorithms: per-algorithm table followed by the group average.
dl = list(defaultdl)

# Build the sorted metric table once and reuse it for both outputs.
t = summary.loc[dl].sort_values(by="Accuracy", ascending=False)[metrics]
# print() call form works on both Python 2 and Python 3.
print(t.to_latex())
print(get_group_average(t, metrics))

In [None]:
# "resc_"-prefixed variants of the deep-learning algorithms: per-algorithm
# table followed by the group average.
dlresc = ["resc_" + alg for alg in defaultdl]

# Build the sorted metric table once and reuse it for both outputs.
t = summary.loc[dlresc].sort_values(by="Accuracy", ascending=False)[metrics]
# print() call form works on both Python 2 and Python 3.
print(t.to_latex())
print(get_group_average(t, metrics))

In [None]:
# T-tests made for Task 1
# binterval is the manual algorithm
# sleep is the device algorithm
# NOTE(review): `results` is loaded for the task selected by TASK in the first
# cell; confirm TASK == 1 before quoting these numbers as Task 1 results.

# These three comparisons used to be bare expressions in the middle of the
# cell, so their values were computed and never shown; print them explicitly.
print("p=%.3f" % (pvalue(results, "sleep", "kripke", "F1")))
print("p=%.3f" % (pvalue(results, "kripke", "binterval", "F1")))
print("p=%.3f" % (pvalue(results, "SGD_hinge", "SGD_perceptron", "Recall")))

# ExtraTrees against the device algorithm, then against the manual one.
for baseline in ["sleep", "binterval"]:
    for m in ["F1", "Accuracy"]:
        print(pvalue(results, baseline, "ExtraTrees", m))

# Every deep-learning model against each baseline on F1.
for heading, baseline in [("Compare with device:", "sleep"),
                          ("Compare with manual:", "binterval")]:
    print(heading)
    for alg in defaultdl:
        print("Alg: %s, p=%.3f" % (alg, pvalue(results, baseline, alg, "F1")))

# Pairwise comparisons between deep-learning models, in reporting order.
dl_pairs = [
    ("LSTM_20_raw", "LSTM_50_raw", "Accuracy"),
    ("LSTM_50_raw", "LSTM_100_raw", "Accuracy"),
    ("LSTM_20_raw", "LSTM_100_raw", "Accuracy"),
    ("CNN_20_raw", "CNN_100_raw", "Accuracy"),
    ("LSTM_20_raw", "LSTM_100_raw", "F1"),
    ("CNN_20_raw", "CNN_100_raw", "F1"),
    ("CNN_100_raw", "LSTM_100_raw", "F1"),
    ("CNN_100_raw", "LSTM_100_raw", "Accuracy"),
]
for alg1, alg2, m in dl_pairs:
    print("p=%.3f" % (pvalue(results, alg1, alg2, m)))


In [None]:
# T-tests made for Task 2
# Every deep-learning model against the device ("sleep") and manual
# ("binterval") baselines, first on F1 and then on Accuracy.
# print() call form works on both Python 2 and Python 3.
for m in ["F1", "Accuracy"]:
    for heading, baseline in [("Compare with device:", "sleep"),
                              ("Compare with manual:", "binterval")]:
        print(heading)
        for alg in defaultdl:
            print("Alg: %s, p=%.3f" % (alg, pvalue(results, baseline, alg, m)))

# Largest CNN against largest LSTM on both metrics.
for m in ["F1", "Accuracy"]:
    print("p=%.3f" % (pvalue(results, "CNN_100_raw", "LSTM_100_raw", m)))
