In [None]:
import seml
import pandas as pd
from run_seml import run
from matplotlib import pyplot as plt

# Widen the notebook's DataFrame rendering: show up to 50 columns and 120 rows.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 120

In [None]:
# Load IPython's autoreload extension and set it to mode 2, so edited modules
# are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
# Fetch the runs of the 'rgnn_rpprgo_papers100M' seml collection as a DataFrame,
# restricted to the listed top-level fields.
seml_results = seml.get_results(
    'rgnn_rpprgo_papers100M',
    fields=['batch_id', 'slurm', 'config', 'result'],
    to_data_frame=True,
)

In [None]:
# Number of experiment rows that were loaded.
seml_results.shape[0]

In [None]:
# Columns shown in the overview tables below. Commented-out entries are further
# column names that can be re-enabled when needed.
relevant_columns = [
    # "_id",
    "config.model_params.label",
    "config.model_params.model",
    "config.dataset",
    "config.seed",
    "config.model_params.mean",
    "result.accuracy",
    "config.model_params.hidden_size",
    "config.model_params.nlayers",
    # "config.model_params.n_filters",
    # "config.model_params.gdc_params",
    # "config.model_params.svd_params",
    # "config.model_params.jaccard_params",
    "config.model_params.dropout",
    "config.model_params.alpha",
    "config.model_params.eps",
    "config.model_params.topk",
    "config.model_params.mean_kwargs.temperature",
    # "config.model_params.mean_kwargs.k",
    # "config.model_params.mean_kwargs.with_weight_correction",
    # "config.model_params.do_cache_adj_prep",
    "config.normalize",
    "config.model_params.ppr_normalization",
    "config.train_params.lr",
    "config.train_params.weight_decay",
    "config.train_params.patience",
    # "config.train_params.max_epochs",
    # "config.train_params.batch_mult_val",
    # "config.train_params.batch_size",
    # "config.binary_attr",
    # "config.artifact_dir",
    # "config.model_storage_type",
    # "config.device",
    # "config.display_steps",
    # "config.data_device",
    # "config.data_dir",
    # "result.trace_val",
    # "result.trace_train",
    # "result.model_path",
]

# Show every column that is actually present in the loaded frame.
seml_results.columns

In [None]:
# Overview of all runs, best accuracy first.
seml_results.loc[:, relevant_columns].sort_values(by="result.accuracy", ascending=False)

In [None]:
# Columns that identify one configuration; accuracy is averaged over all rows
# that share these values (e.g. runs that differ only in `config.seed`).
groups = [
    "config.dataset",
    "config.model_params.label",
    "config.model_params.model",
    # "alpha.quantile",
    # "config.model_params.mean",
    # "config.model_params.mean_kwargs.temperature",
    # "config.model_params.n_filters",
    # "config.binary_attr",
    # "config.normalize",
    "config.model_params.ppr_normalization",
    # "config.model_params.hidden_size",
    # "config.model_params.nlayers",
    "config.model_params.dropout",
    "config.model_params.alpha",
    # "config.model_params.topk",
    # "config.model_params.eps",
    # "config.train_params.lr",
    # "config.train_params.weight_decay",
]

# Optional row filters (kept for reference):
# seml_results = seml_results[seml_results["config.binary_attr"]==False]
# seml_results = seml_results[seml_results["config.normalize"]==False]
# seml_results = seml_results[seml_results['config.model_params.mean_kwargs.temperature']!=5.0]
# seml_results = seml_results[seml_results['config.model_params.label']=="Vanilla PPRGo"]

# Drop the runs with weight decay 0.05. NOTE: this rebinds `seml_results`, so
# every later cell sees the filtered frame. `.copy()` gives an independent frame
# so the column assignments below do not write into a slice of the original.
seml_results = seml_results[seml_results['config.train_params.weight_decay'] != 0.05].copy()

# Replace missing values with explicit placeholders. Assigning the filled column
# back (instead of `fillna(..., inplace=True)` on a column selection) is the
# form that keeps working under pandas Copy-on-Write.
seml_results["config.model_params.mean_kwargs.temperature"] = (
    seml_results["config.model_params.mean_kwargs.temperature"].fillna(0.0)
)
seml_results["config.model_params.mean"] = seml_results["config.model_params.mean"].fillna("None")
# seml_results['alpha.quantile'] = pd.qcut(seml_results['config.model_params.alpha'], q=15, precision=0)

# Optional relabelling of specific aggregation/temperature combinations:
# condition = (seml_results['config.model_params.mean'] == "soft_k_medoid") \
#             & (seml_results['config.model_params.mean_kwargs.temperature'] == 0.2)
# seml_results.loc[condition, "config.model_params.label"] = "Soft Medoid RPPRGo (T=0.2)"

# condition = (seml_results['config.model_params.mean'] == "soft_median") \
#             & (seml_results['config.model_params.mean_kwargs.temperature'] == 0.2)
# seml_results.loc[condition, "config.model_params.label"] = "Soft Median  RPPRGo (T=0.2)"

# Mean accuracy per configuration. The metric column is selected *before*
# aggregating so that non-numeric columns are never averaged (that raises a
# TypeError on pandas >= 2.0).
# NOTE(review): groupby drops rows whose grouping keys are NaN by default.
seml_results.groupby(groups)[["result.accuracy"]].mean()

In [None]:
# Distinct model labels present in the (filtered) results.
seml_results.loc[:, "config.model_params.label"].unique()

In [None]:
# Split the results by dataset name using boolean masks.
cora_results = seml_results.loc[seml_results["config.dataset"].eq("cora_ml")]
citeseer_results = seml_results.loc[seml_results["config.dataset"].eq("citeseer")]

In [None]:

# Per dataset, separate the runs by aggregation function
# (`config.model_params.mean` equal to "soft_median" or "soft_k_medoid").
cora_median_results = cora_results.loc[cora_results["config.model_params.mean"].eq("soft_median")]
cora_mediod_results = cora_results.loc[cora_results["config.model_params.mean"].eq("soft_k_medoid")]

citeseer_median_results = citeseer_results.loc[citeseer_results["config.model_params.mean"].eq("soft_median")]
citeseer_mediod_results = citeseer_results.loc[citeseer_results["config.model_params.mean"].eq("soft_k_medoid")]

# Row counts of the four subsets, in the order they were defined above.
tuple(map(len, (cora_median_results, cora_mediod_results, citeseer_median_results, citeseer_mediod_results)))

In [None]:
# For every model label keep the run(s) whose accuracy is within `treshold`
# of that label's best accuracy.
metric = "result.accuracy"
treshold = 1e-6  # tolerance for "ties with the best run" (name kept as spelled in the notebook)
model_labels = seml_results["config.model_params.label"].unique()

# Collect the per-label winners first and concatenate once; growing a frame
# with pd.concat inside the loop copies it on every iteration.
best_per_label = []
for label in model_labels:
    bylabel_results = seml_results[seml_results["config.model_params.label"] == label]
    gap_to_best = bylabel_results[metric].max() - bylabel_results[metric]
    best_per_label.append(bylabel_results[gap_to_best <= treshold])

# With no labels at all, fall back to an empty frame with the same columns
# instead of leaving `best_results` unset.
best_results = pd.concat(best_per_label) if best_per_label else seml_results.iloc[0:0]

best_results[relevant_columns].sort_values(["config.model_params.label", metric]).drop_duplicates()

In [None]:
# For each dataset/aggregation subset keep the runs whose accuracy is within
# `treshold` of that subset's best accuracy, and stack them into one frame.
# This rebinds `best_results` from the per-label selection above and uses a
# looser tolerance (1e-2 instead of 1e-6).
metric = "result.accuracy"
treshold = 1e-2

# One comprehension + a single concat replaces four copy-pasted selections;
# the row order (cora median, cora medoid, citeseer median, citeseer medoid)
# is unchanged.
best_results = pd.concat([
    subset[subset[metric].max() - subset[metric] <= treshold]
    for subset in (
        cora_median_results,
        cora_mediod_results,
        citeseer_median_results,
        citeseer_mediod_results,
    )
])

In [None]:
# Emit the selected runs as a Markdown table (plain text, ready to paste).
print(best_results.loc[:, relevant_columns].to_markdown())

In [None]:
def epoch_mean(trace, epoch_num):
    """Average a per-batch trace into per-epoch values.

    The trace is cut into ``epoch_num`` consecutive chunks of equal length
    (``len(trace) // epoch_num`` entries each) and the mean of every chunk is
    returned. Entries left over after the last full chunk are ignored.

    Args:
        trace: Sequence of numbers supporting ``len`` and slicing.
        epoch_num: Number of epochs the trace is assumed to span. Integral
            floats (e.g. ``30.0`` read from a DataFrame row) are accepted.

    Returns:
        List of per-epoch means. If the trace has fewer entries than
        ``epoch_num``, each entry is treated as its own epoch and the result
        has ``len(trace)`` items. An empty trace or a non-positive
        ``epoch_num`` yields an empty list.
    """
    num_epochs = int(epoch_num)
    if num_epochs <= 0:
        return []

    # At least one entry per chunk, so the division below can never be by zero.
    batches_per_epoch = max(1, len(trace) // num_epochs)
    # Only average chunks that are completely covered by the trace.
    full_epochs = min(num_epochs, len(trace) // batches_per_epoch)

    return [
        sum(trace[start:start + batches_per_epoch]) / batches_per_epoch
        for start in range(0, full_epochs * batches_per_epoch, batches_per_epoch)
    ]

In [None]:
# Index labels of the experiments whose learning curves are plotted below
# (looked up via `seml_results.loc[i]`).
# Earlier selections: list(range(12, 18)) and [2, 0, 10, 4, 8].
ids = [26, 30, 34, 28, 32]
ids

In [None]:
# Learning curves (per-epoch means of the batch traces) for the experiments
# listed in `ids`. `plt` comes from the import cell at the top of the notebook.
fig, axs = plt.subplots(2, 2, figsize=(25, 20))
max_epoch_plot = 30  # only the first 30 epochs are drawn

# One entry per panel: (axis, trace column, legend tag, title, y-axis label).
panels = [
    (axs[0, 0], "result.trace_train.loss", "train", "Training loss", "loss"),
    (axs[0, 1], "result.trace_val.loss", "validation", "Validation loss", "loss"),
    (axs[1, 0], "result.trace_train.acc", "train", "Training accuracy", "accuracy"),
    (axs[1, 1], "result.trace_val.acc", "validation", "Validation accuracy", "accuracy"),
]

for i in ids:
    # `exp` is intentionally left bound after the loop; the next cell inspects it.
    exp = seml_results.loc[i]
    lr = exp["config.train_params.lr"]
    weight_decay = exp["config.train_params.weight_decay"]
    alpha = exp["config.model_params.alpha"]
    nlayers = exp["config.model_params.nlayers"]
    hidden_size = exp["config.model_params.hidden_size"]
    temperature = exp["config.model_params.mean_kwargs.temperature"]
    dropout = exp["config.model_params.dropout"]
    name_suffix = f"ID{i:04d} LR{lr:.0e} WD{weight_decay:.0e} A{alpha:.02} L{nlayers} H{hidden_size:03d} T{temperature:.0e} D{dropout:.0e}"

    # NOTE(review): the traces are bucketed assuming the run lasted exactly
    # `max_epochs` epochs; if training can stop early (a `patience` setting
    # exists in the config) the buckets would not line up with real epochs —
    # confirm against the training loop.
    epoch_num = exp["config.train_params.max_epochs"]

    for ax, trace_column, split, _title, _ylabel in panels:
        curve = epoch_mean(exp[trace_column], epoch_num)
        ax.plot(curve[:max_epoch_plot], label=f"{name_suffix} {split} ")

# Distinct titles and axis labels so each panel can be read on its own.
for ax, _trace_column, _split, title, ylabel in panels:
    ax.set_title(title)
    ax.set_xlabel("epoch")
    ax.set_ylabel(ylabel)
    ax.legend(loc="best")
plt.show()

In [None]:
# Display the last experiment row handled by the plotting loop above
# (`exp` is the loop variable left over from that cell, so run it first).
exp
# Fields of interest: _id, config.model_params.nlayers, config.model_params.hidden_size,
# config.model_params.mean_kwargs.temperature, config.model_params.dropout