In [1]:
import pandas as pd
from pandas.io.json import json_normalize 
import os
import json 

def read_jsonl(path):
    """Load a JSON Lines file into a flat DataFrame.

    Every non-blank line of the file is parsed as one JSON document. Nested
    objects are flattened into dot-separated column names by
    ``pd.json_normalize`` (e.g. ``{"scores": {"acc": 1}}`` becomes the
    column ``scores.acc``).

    Parameters
    ----------
    path : str or os.PathLike
        Location of the ``.jsonl`` file; it is read as UTF-8.

    Returns
    -------
    pandas.DataFrame
        One row per JSON document, in file order. An empty file yields an
        empty frame.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (typically left by a trailing newline) carry no record
        # and would make json.loads fail, so they are skipped.
        records = [json.loads(line) for line in f if line.strip()]

    # Use the public pandas entry point rather than the deprecated
    # pandas.io.json alias.
    return pd.json_normalize(records)

def nice_name(row):
    """Build a human-readable label for one experiment configuration.

    The label starts with ``row["model"]`` and is followed by the
    hyper-parameters relevant to that model family.

    Parameters
    ----------
    row : mapping
        Anything indexable by column name (e.g. a ``pandas.Series`` passed by
        ``DataFrame.apply(..., axis=1)`` or a plain ``dict``). Which keys are
        read depends on ``row["model"]``.

    Returns
    -------
    str
        The formatted label.

    Raises
    ------
    KeyError
        If a key needed for the selected model family is missing from ``row``.
    """
    model = row["model"]

    if model in ["RandomForestClassifier", "ExtraTreesClassifier", "GradientBoostingClassifier"]:
        model_name = "{} with T = {}".format(model, row["n_estimators"])
    elif model == "RiverModel":
        model_name = "{} with {}".format(model, row["river_model"])
    elif model == "BiasedProxEnsemble":
        if int(row["max_trees"]) == 0:
            # max_trees == 0 is reported without a tree count; the
            # regularisation strength is shown first instead.
            model_name = "{} with λ = {}, max_depth = {}, mode = {}, stepsize = {}".format(
                model, row["l_reg"], row["max_depth"], row["mode"], row["step_size"]
            )
        else:
            # Argument order must follow the placeholders: mode, then step size.
            model_name = "{} with T = {}, max_depth = {}, mode = {}, stepsize = {} with λ = {}".format(
                model, row["max_trees"], row["max_depth"], row["mode"], row["step_size"], row["l_reg"]
            )
    elif model == "JaxModel":
        model_name = "{} with T = {}, max_depth = {}, with temp_scaling = {}".format(
            model, row["n_trees"], row["max_depth"], row["temp_scaling"]
        )
    else:
        # Fallback for every other model name.
        model_name = "{} with T = {}, max_depth = {}, modes = {}/{}, stepsize = {}".format(
            model, row["max_trees"], row["max_depth"], row["train_mode"], row["next_mode"], row["step_size"]
        )

    return model_name

# Results of the "magic" dataset live under magic/results/<run folder>/.
dataset = os.path.join("magic", "results")

# Collect every sub-directory of the results folder (plain files are ignored).
all_subdirs = []
for entry in os.listdir(dataset):
    candidate = os.path.join(dataset, entry)
    if os.path.isdir(candidate):
        all_subdirs.append(candidate)
print(all_subdirs)

# The "latest" run is chosen by filesystem modification time, not by the
# folder name.
latest_folder = max(all_subdirs, key=os.path.getmtime)

# Load the run summary, attach a readable label per row and round for display.
results_file = os.path.join(latest_folder, "results.jsonl")
df = read_jsonl(results_file)
df["nice_name"] = df.apply(nice_name, axis=1)
df = df.round(decimals=3)
df

['magic/results/19-12-2020-00:29:11', 'magic/results/19-12-2020-00:30:20', 'magic/results/19-12-2020-00:29:00', 'magic/results/19-12-2020-00:44:39', 'magic/results/19-12-2020-00:29:59', 'magic/results/19-12-2020-00:46:17', 'magic/results/19-12-2020-00:28:09', 'magic/results/19-12-2020-00:28:39', 'magic/results/19-12-2020-00:32:42', 'magic/results/19-12-2020-00:41:45', 'magic/results/19-12-2020-00:32:08', 'magic/results/19-12-2020-00:31:24', 'magic/results/19-12-2020-00:29:27', 'magic/results/19-12-2020-00:30:42', 'magic/results/19-12-2020-00:31:46', 'magic/results/19-12-2020-00:44:17', 'magic/results/19-12-2020-00:34:05', 'magic/results/19-12-2020-00:30:12']


KeyError: 'mode'

In [37]:
from IPython.display import display, HTML
# Columns shown in the summary table: the readable label plus the averaged
# scores reported for each configuration.
summary_columns = [
    "nice_name",
    "scores.mean_test_accuracy",
    "scores.mean_n_estimators",
    "scores.mean_fit_time",
    "scores.mean_n_parameters",
]

# Best test accuracy first.
tabledf = df[summary_columns].sort_values(by=['scores.mean_test_accuracy'], ascending=False)

# Render through HTML so the full table is shown.
display(HTML(tabledf.to_html()))

Unnamed: 0,nice_name,scores.mean_test_accuracy,scores.mean_n_estimators,scores.mean_fit_time,scores.mean_n_parameters
17,BiasedProxEnsemble with T = 128 and max_depth = 7 and mode = trainwith λ = 0.02,86.072,127.2,31.963,64872.0
20,BiasedProxEnsemble with λ = 0.04 and max_depth = 7 and mode = train,85.977,466.6,63.423,237966.0
6,BiasedProxEnsemble with T = 128 and max_depth = 7 and mode = trainwith λ = 0.01,85.967,128.0,31.467,65280.0
33,BiasedProxEnsemble with T = 128 and max_depth = 7 and mode = trainwith λ = 0.04,85.656,127.8,31.262,65178.0
4,BiasedProxEnsemble with T = 128 and max_depth = 5 and mode = trainwith λ = 0.04,84.694,127.4,33.485,16052.4
11,BiasedProxEnsemble with T = 128 and max_depth = 3 and mode = trainwith λ = 0.02,84.137,127.2,38.048,3816.0
34,BiasedProxEnsemble with T = 128 and max_depth = 3 and mode = trainwith λ = 0.04,84.121,126.4,39.154,3792.0
31,BiasedProxEnsemble with T = 128 and max_depth = 5 and mode = trainwith λ = 0.06,83.643,44.8,23.971,5644.8
25,BiasedProxEnsemble with λ = 0.04 and max_depth = 5 and mode = train,82.896,25.0,21.273,3150.0
10,BiasedProxEnsemble with T = 128 and max_depth = 7 and mode = trainwith λ = 0.04,82.796,26.2,19.491,13362.0


In [34]:
import numpy as np
pd.set_option('expand_frame_repr', True)
pd.set_option('max_colwidth', 110)

# Number of per-run sub-folders (named "0", "1", ...) read for each experiment.
# TODO: derive this from the experiment folder instead of hard-coding it.
N_RUNS = 5

dff = df.sort_values(by=['scores.mean_test_accuracy'], ascending = False)

# Labels (as produced in the "nice_name" column) of the experiments to plot.
# NOTE(review): these labels are " and "-separated with float-formatted values,
# which differs from the ", "-separated labels built by nice_name in this
# notebook -- confirm they still match df["nice_name"].
methods = [
    "RiverModel with BaggingClassifier(ExtremelyFastDecisionTreeClassifier)",
    "BiasedProxEnsemble with T = 128.0 and max_depth = 7.0 and mode = trainwith λ = 0.03",
    "SGDEnsemble with T = 64.0 and max_depth = 5.0 and mode = train",
    "JaxModel with T = 5.0 and max_depth = 5.0 and with temp_scaling = 1.0"
]

traindfs = []
for m in methods:
    # Build the mask from the same frame that is being indexed, and fail with
    # a readable message when the label is unknown. A silent skip would
    # desynchronise `traindfs` from `methods`, which are zipped together later.
    matches = dff.loc[dff["nice_name"] == m, "out_path"]
    if matches.empty:
        raise ValueError(
            "No experiment labelled {!r} in the results; check `methods` against df['nice_name']".format(m)
        )
    # If several rows share the label, the best-scoring one (first in dff) wins.
    experiment_path = matches.values[0]

    accuracies = []
    losses = []
    num_nodes = []
    total_item_cnt = None
    for i in range(N_RUNS):
        path = os.path.join(experiment_path, str(i), "training.jsonl")
        tdf = read_jsonl(path)
        losses.append(tdf["test_loss"].values)
        accuracies.append(tdf["test_accuracy"].values)
        num_nodes.append(tdf["test_num_parameters"].values)
        # The x-axis is taken from the first run only.
        # assumes every run logs the same total_item_cnt sequence -- TODO confirm
        if total_item_cnt is None:
            total_item_cnt = tdf["total_item_cnt"]

    # Average each metric element-wise across the runs.
    # TODO add std
    traindf = pd.DataFrame({
        "total_item_cnt": total_item_cnt,
        "test_loss": np.mean(losses, axis=0),
        "test_accuracy": np.mean(accuracies, axis=0),
        "test_num_parameters": np.mean(num_nodes, axis=0),
    })

    traindfs.append(traindf)

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Qualitative "Paired" palette from ColorBrewer:
# https://colorbrewer2.org/#type=qualitative&scheme=Paired&n=9
paired = [
    '#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c',
    '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a', '#ffff99', '#b15928',
]
# One fixed color per method so the three panels agree; zip stops at the
# shorter of the two sequences.
colors = dict(zip(methods, paired))

fig = make_subplots(rows=3, cols=1, subplot_titles=["Magic"], horizontal_spacing = 0.03, vertical_spacing = 0.02)

# (subplot row, column to plot, y-axis title, extra trace options).
# Only the first panel contributes legend entries; the others suppress them
# so each method is listed once.
panels = [
    (1, "test_loss", "Test loss", {}),
    (2, "test_accuracy", "Test accuracy", {"showlegend": False}),
    (3, "test_num_parameters", "Numb of trainable parameters", {"showlegend": False}),
]

# Traces are added method by method, panel by panel.
for curve, method in zip(traindfs, methods):
    for row_idx, column, _, extra in panels:
        trace = go.Scatter(
            x=curve["total_item_cnt"],
            y=curve[column],
            mode="lines",
            name=method,
            marker=dict(color=colors[method]),
            **extra
        )
        fig = fig.add_trace(trace, row=row_idx, col=1)

# Only the bottom panel carries the x-axis title.
fig.update_xaxes(title_text="Number of items", row=3, col=1, title_font = {"size": 16})
for row_idx, _, axis_title, _ in panels:
    fig.update_yaxes(title_text=axis_title, row=row_idx, col=1, title_font = {"size": 16})

fig.update_layout(
    template="simple_white",
    legend=dict(orientation="h", yanchor="bottom", y=-0.2, xanchor="left", x=0.15),
    margin={'l': 5, 'r': 20, 't': 20, 'b': 5},
    height=900,
    width=1100,
)
fig.show()
    
