In [1]:
import json
import os

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from pandas.io.json import json_normalize
from plotly.subplots import make_subplots

def read_jsonl(path):
    """Load a JSON Lines file into a flat DataFrame.

    Every non-blank line of the file is parsed as one JSON document. Nested
    objects are flattened into dot-separated column names, e.g.
    ``{"scores": {"mean": 1}}`` ends up in a column called ``scores.mean``.

    Parameters
    ----------
    path : str or path-like
        Location of the UTF-8 encoded ``.jsonl`` file.

    Returns
    -------
    pandas.DataFrame
        One row per JSON document, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Blank / whitespace-only lines (e.g. a trailing empty line) hold no
        # record and would make json.loads raise, so they are skipped.
        records = [json.loads(line) for line in f if line.strip()]

    # pd.json_normalize is the public entry point since pandas 1.0; the
    # pandas.io.json.json_normalize alias is deprecated.
    return pd.json_normalize(records)

def nice_name(row):
    """Build a human-readable label for one experiment configuration.

    ``row`` is anything indexable by column name (a DataFrame row, a dict).
    Only the keys needed for the row's model type are read:

    * RandomForest / ExtraTrees / GradientBoosting classifiers -> ``n_estimators``
    * ``RiverModel``          -> ``river_model``
    * ``BiasedProxEnsemble``  -> ``l_reg``, plus ``max_trees`` when it is non-zero
    * ``JaxModel``            -> ``n_trees``
    * anything else           -> ``max_trees``

    Returns the label as a string.
    """
    model = row["model"]

    if model in ("RandomForestClassifier", "ExtraTreesClassifier", "GradientBoostingClassifier"):
        return "{} with T = {}".format(model, row["n_estimators"])

    if model == "RiverModel":
        return "{} with {}".format(model, row["river_model"])

    if model == "BiasedProxEnsemble":
        # max_trees == 0 gets the shorter label that omits the tree count.
        if int(row["max_trees"]) == 0:
            return "{} with λ = {}".format(model, row["l_reg"])
        return "{} with T = {} and with λ = {}".format(model, row["max_trees"], row["l_reg"])

    if model == "JaxModel":
        return "{} with T = {}".format(model, row["n_trees"])

    # Every remaining model type is labelled by its max_trees setting.
    return "{} with T = {}".format(model, row["max_trees"])

# Results of the "magic" dataset live in sub-directories of "magic/results".
dataset = os.path.join("magic", "results")

# Collect every sub-directory of the results folder (plain files are ignored).
all_subdirs = []
for entry in os.listdir(dataset):
    candidate = os.path.join(dataset, entry)
    if os.path.isdir(candidate):
        all_subdirs.append(candidate)
print(all_subdirs)

# The most recently modified directory is taken as the latest experiment run.
latest_folder = max(all_subdirs, key=os.path.getmtime)

results_file = os.path.join(latest_folder, "results.jsonl")
df = read_jsonl(results_file)

# Attach a readable label to every configuration, then round for display.
df["nice_name"] = df.apply(nice_name, axis=1)
df = df.round(decimals=3)
df

['magic/results/08-12-2020-22:45:42']


Unnamed: 0,X,Y,bootstrap,experiment_id,idx,loss,max_depth,max_samples,model,n_estimators,...,l_reg,max_trees,mode,seed,step_size,eval_loss,subsample,river_model,n_trees,nice_name
0,X,Y,True,0,idx,mse,5,32.0,ExtraTreesClassifier,16.0,...,,,,,,,,,,ExtraTreesClassifier with T = 16.0
1,X,Y,True,6,idx,mse,5,32.0,RandomForestClassifier,32.0,...,,,,,,,,,,RandomForestClassifier with T = 32.0
2,X,Y,,2,idx,mse,5,,SGDEnsemble,,...,0.06,16.0,train,12345.0,0.5,,,,,SGDEnsemble with T = 16.0
3,X,Y,True,9,idx,mse,5,32.0,ExtraTreesClassifier,32.0,...,,,,,,,,,,ExtraTreesClassifier with T = 32.0
4,X,Y,,3,idx,mse,5,,SGDEnsemble,,...,0.06,32.0,fully-random,12345.0,0.5,,,,,SGDEnsemble with T = 32.0
5,X,Y,,8,idx,mse,5,,SGDEnsemble,,...,0.06,16.0,fully-random,12345.0,0.5,,,,,SGDEnsemble with T = 16.0
6,X,Y,,4,idx,mse,5,,BiasedProxEnsemble,,...,0.04,0.0,random,12345.0,0.25,,,,,BiasedProxEnsemble with λ = 0.04
7,X,Y,,7,idx,mse,5,,BiasedProxEnsemble,,...,0.04,0.0,train,12345.0,0.25,,,,,BiasedProxEnsemble with λ = 0.04
8,X,Y,,12,idx,mse,5,,SGDEnsemble,,...,0.06,32.0,random,12345.0,0.5,,,,,SGDEnsemble with T = 32.0
9,X,Y,,13,idx,mse,5,,SGDEnsemble,,...,0.06,64.0,fully-random,12345.0,0.5,,,,,SGDEnsemble with T = 64.0


In [2]:
# Summary table: one row per configuration, highest mean test accuracy first.
summary_columns = [
    "nice_name",
    "scores.mean_test_accuracy",
    "scores.mean_n_estimators",
    "scores.mean_n_nodes",
    "scores.mean_fit_time",
]
tabledf = df[summary_columns].sort_values(by=['scores.mean_test_accuracy'], ascending=False)
tabledf

Unnamed: 0,nice_name,scores.mean_test_accuracy,scores.mean_n_estimators,scores.mean_n_nodes,scores.mean_fit_time
11,BiasedProxEnsemble with λ = 0.02,85.073,321.0,20223.0,119.954
7,BiasedProxEnsemble with λ = 0.04,84.326,210.6,13267.8,92.308
10,SGDEnsemble with T = 64.0,84.31,64.0,4032.0,54.238
25,BiasedProxEnsemble with λ = 0.06,84.147,158.8,10004.4,77.625
20,SGDEnsemble with T = 32.0,83.227,32.0,2016.0,49.548
9,SGDEnsemble with T = 64.0,82.339,64.0,4032.0,52.202
2,SGDEnsemble with T = 16.0,81.45,16.0,1008.0,43.192
27,RiverModel with ExtremelyFastDecisionTreeClass...,81.129,1.0,1498.2,563.218
13,BiasedProxEnsemble with λ = 0.02,80.446,153.2,9651.6,66.914
15,GradientBoostingClassifier with T = 16.0,80.304,16.0,503.2,0.073


In [30]:
pd.set_option('display.expand_frame_repr', True)
pd.set_option('display.max_colwidth', 110)

# Number of repetitions read per experiment; run i is stored in sub-directory str(i).
N_RUNS = 5


def _load_mean_training_curve(experiment_path, n_runs=N_RUNS):
    """Average the per-run training logs of one experiment.

    Reads ``<experiment_path>/<i>/training.jsonl`` for ``i`` in
    ``range(n_runs)`` and averages the ``test_loss``, ``test_accuracy`` and
    ``test_num_nodes`` columns element-wise across runs.

    Parameters
    ----------
    experiment_path : str
        Output directory of a single experiment.
    n_runs : int
        How many run sub-directories to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``total_item_cnt`` (taken from the first run), ``test_loss``,
        ``test_accuracy`` and ``test_num_nodes`` (means over the runs).
    """
    losses, accuracies, num_nodes = [], [], []
    total_item_cnt = None
    for run in range(n_runs):
        run_log = read_jsonl(os.path.join(experiment_path, str(run), "training.jsonl"))
        losses.append(run_log["test_loss"].values)
        accuracies.append(run_log["test_accuracy"].values)
        num_nodes.append(run_log["test_num_nodes"].values)
        if total_item_cnt is None:
            # The x-axis is taken from the first run only.
            total_item_cnt = run_log["total_item_cnt"]

    # TODO add std
    # NOTE(review): averaging along axis 0 assumes every run logged the same
    # number of rows -- confirm against the training code.
    return pd.DataFrame({
        "total_item_cnt": total_item_cnt,
        "test_loss": np.mean(losses, axis=0),
        "test_accuracy": np.mean(accuracies, axis=0),
        "test_num_nodes": np.mean(num_nodes, axis=0),
    })


dff = df.sort_values(by=['scores.mean_test_accuracy'], ascending=False)

methods = ["BiasedProxEnsemble with λ = 0.02", "SGDEnsemble with T = 64.0", "RiverModel with ExtremelyFastDecisionTreeClassifier", "JaxModel with T = 16.0"]
traindfs = []
for m in methods:
    # Several configurations can share one nice_name. dff is sorted by test
    # accuracy (descending), so the first match is the best-scoring one.
    matches = dff.loc[dff["nice_name"] == m, "out_path"]
    if matches.empty:
        raise ValueError("No experiment named {!r} in the loaded results".format(m))
    traindfs.append(_load_mean_training_curve(matches.iloc[0]))

# https://colorbrewer2.org/#type=qualitative&scheme=Paired&n=9
paired = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']
colors = dict(zip(methods, paired))

fig = make_subplots(rows=3, cols=1, subplot_titles=["Magic"], horizontal_spacing=0.03, vertical_spacing=0.02)

# One stacked panel per metric: (column to plot, subplot row, y-axis title).
panels = [
    ("test_loss", 1, "Test loss"),
    ("test_accuracy", 2, "Test accuracy"),
    ("test_num_nodes", 3, "Number of nodes"),
]

for tdf, m in zip(traindfs, methods):
    for column, panel_row, axis_title in panels:
        fig.add_trace(
            go.Scatter(
                x=tdf["total_item_cnt"],
                y=tdf[column],
                mode="lines",
                name=m,
                # Only the first panel contributes legend entries, so each
                # method is listed once.
                showlegend=(panel_row == 1),
                marker=dict(color=colors[m]),
            ),
            row=panel_row, col=1,
        )

fig.update_xaxes(title_text="Number of items", row=3, col=1, title_font={"size": 16})
for column, panel_row, axis_title in panels:
    fig.update_yaxes(title_text=axis_title, row=panel_row, col=1, title_font={"size": 16})

fig.update_layout(
    template="simple_white",
    legend=dict(orientation="h", yanchor="bottom", y=-0.1, xanchor="left", x=0.15),
    margin={'l': 5, 'r': 20, 't': 20, 'b': 5},
    height=900, width=1100
)
fig.show()
    
