In [None]:
from typing import Callable, List, Optional, Iterable
from pathlib import Path
import os
from copy import deepcopy
import json
import glob
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import multiprocessing as mp
from tabulate import tabulate
import warnings

import scipy

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas diagnostics such as SettingWithCopyWarning.
warnings.filterwarnings("ignore")

# Directory that is globbed for per-experiment "TEST_*.json" result files.
DATA_DIR = "./expt_rslts/"
# Pickle cache of the concatenated results; delete it to force a rebuild from DATA_DIR.
PICKLE_NAME = "nn_rslts_df.pkl"
# Output directories, created eagerly so later cells can write into them.
IMG_OUTPUT_DIR = Path("imgs/")
IMG_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
TABLE_OUTPUT_DIR = Path("tables/")
TABLE_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Earlier, wider optimizer selections (kept for reference, currently disabled):
#OPT_OF_INTEREST = ("sgd", "kn", "fr", "pr", "hs", "dy", "bfgs")
#DISCRIMINATE_FGD = False
#if DISCRIMINATE_FGD:
#    OPT_OF_INTEREST = ("fgd", "sgd", "kn", "fr", "pr", "hs", "dy", "bfgs")

# Truncate the opts of interest...
# Only rows whose "specs.opt" value is in this tuple are kept for analysis.
OPT_OF_INTEREST = ("sgd", "fr", "bfgs")


# When True, get_summary_vals keeps only runs whose training batch size is greater than 2.
DROP_N_EQUALS_TWO = True

# Maps optimizer short codes (the values stored in "specs.opt") to display names.
NAME_DICT = {
    "sgd": "Stochastic Gradient Descent",
    "fgd": "Full Gradient Descent",
    "kn": "Krylov-Newton",
    "fr": "Fletcher-Reeves",
    "pr": "Polak-Ribiere",
    "hs": "Hestenes-Stiefel",
    "dy": "Dai-Yuan",
    "bfgs": "BFGS",
    "bfgsi": "BFGS Inverse",
    "dfp": "Davidon-Fletcher-Powell",
    "dfpi": "Davidon-Fletcher-Powell Inverse",
    "sr1": "Symmetric Rank-One",
    "sr1d": "Symmetric Rank-One Dual",
    "levenberg": "Levenberg"
}

def read_json_to_df(fname):
    """Load one JSON result file and flatten it into a DataFrame.

    Nested objects are flattened by ``pd.json_normalize`` into dotted column
    names (e.g. ``specs.opt``).

    Args:
        fname: Path of the JSON file to read.

    Returns:
        The flattened DataFrame, or ``None`` if the file could not be opened,
        parsed or normalised (best-effort: any ``Exception`` is swallowed so
        one bad file does not abort a bulk load).
    """
    try:
        with open(fname) as handle:
            payload = json.load(handle)
        return pd.json_normalize(payload)
    except Exception:
        return None

# Build the combined results frame once and cache it as a pickle; later runs
# just reload the cache. Delete PICKLE_NAME to force a rebuild from DATA_DIR.
if not os.path.exists(PICKLE_NAME):
    print("Creating dataframe!")
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not os.path.exists(DATA_DIR):
        raise FileNotFoundError(f"Results directory not found: {DATA_DIR}")
    f_list = glob.glob(os.path.join(DATA_DIR, "TEST_*.json"))
    print("Number of files:", len(f_list))
    # Parse files in parallel; read_json_to_df returns None for unreadable files.
    with mp.Pool() as p:
        dframes = p.map(read_json_to_df, f_list)
    dframes = [i for i in dframes if i is not None]
    if len(dframes) != len(f_list):
        print("Number of unreadable files skipped:", len(f_list) - len(dframes))
    if not dframes:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
        raise RuntimeError(f"No readable TEST_*.json result files found in {DATA_DIR}")
    monolith = pd.concat(dframes)
    monolith.to_pickle(PICKLE_NAME)

else:
    print("Reading dataframe!")
    # NOTE: unpickling executes arbitrary code; only load a cache you created yourself.
    monolith = pd.read_pickle(PICKLE_NAME)

# Take an explicit copy so the column assignment below writes to an independent
# frame rather than to a slice of `monolith` (chained-assignment hazard).
cifar = monolith.loc[monolith["specs.dataset"] == "cifar10"].copy()
cifar["specs.batch_size_train"] = cifar["specs.batch_size_train"].apply(int)

In [None]:
def max_acc_lambda(row):
    """Return the highest value in ``row["test_accuracy_list"]``.

    An empty list (``max`` raising ``ValueError``) yields ``0.0``.
    """
    accuracies = row["test_accuracy_list"]
    try:
        return max(accuracies)
    except ValueError:
        return 0.0

def final_acc_lambda(row):
    """Return the last entry of ``row["test_accuracy_list"]``.

    An empty list (``IndexError``) yields NaN.
    """
    accuracies = row["test_accuracy_list"]
    try:
        return accuracies[-1]
    except IndexError:
        return float("NaN")

def cumulative_time_lambda(row):
    """Return the running sum of ``row["time"]`` as a NumPy array."""
    return np.cumsum(row["time"])

def total_time_lambda(row):
    """Return the last entry of ``row["training_timestamps"]``.

    An empty sequence (``IndexError``) yields NaN.
    """
    timestamps = row["training_timestamps"]
    try:
        return timestamps[-1]
    except IndexError:
        return float("NaN")

def time_to_peak_lambda(row):
    """Return the training timestamp at which test accuracy first peaks.

    The index of the maximum of ``row["test_accuracy_list"]`` is looked up in
    ``row["training_timestamps"]``. If a ``ValueError`` is raised (e.g. argmax
    of an empty accuracy list) the result is ``np.inf``.
    """
    try:
        peak_idx = np.argmax(np.array(row["test_accuracy_list"]))
        return row["training_timestamps"][peak_idx]
    except ValueError:
        return np.inf


def fgd_lambda(row, full_batch_size: int = 50000):
    """Relabel full-batch SGD runs as ``"fgd"`` (full gradient descent).

    A row whose ``"specs.opt"`` is ``"sgd"`` and whose training batch size is at
    least ``full_batch_size`` is relabelled. The label is returned so the
    function can be used with ``df.apply(..., axis=1)`` / ``apply_lambda`` to
    rebuild the ``"specs.opt"`` column; previously it returned ``None``, which
    would have blanked that column.

    Args:
        row: Mapping-like row providing ``"specs.opt"`` and
            ``"specs.batch_size_train"``.
        full_batch_size: Batch size from which SGD counts as full-batch.
            Defaults to 50000 (presumably the training-set size -- confirm).

    Returns:
        ``"fgd"`` for full-batch SGD rows, otherwise the row's original
        optimizer name (also when the batch size cannot be parsed).
    """
    opt_name = row["specs.opt"]
    try:
        batch_size_train = int(row["specs.batch_size_train"])
    except (TypeError, ValueError):
        # Unparseable batch size: report it and leave the label untouched.
        print(f"row failed: {row['specs.opt']}")
        return opt_name

    if opt_name == "sgd" and batch_size_train >= full_batch_size:
        # Keep the historical in-place relabel for callers that pass a mutable row.
        row["specs.opt"] = "fgd"
        return "fgd"
    return opt_name


def apply_lambda(df: pd.DataFrame, colname: str, func: Callable):
    """Store the row-wise result of ``func`` in column ``colname`` of ``df``.

    ``func`` is called once per row (``axis=1``). The column is written onto
    the frame that was passed in, and that same frame is returned.
    """
    per_row_values = df.apply(func, axis=1)
    df[colname] = per_row_values.copy()
    return df

def get_summary_vals(df: pd.DataFrame):
    """Add per-run summary columns and drop unusable runs.

    Columns are added in order via ``apply_lambda`` (which writes onto the
    frame passed in): ``top_test_acc``, ``final_test_acc``,
    ``training_timestamps``, ``total_training_time``, ``time_to_peak_acc``.
    ``training_timestamps`` has to come before the last two because their
    lambdas read it.

    Rows with a missing ``total_training_time`` are then dropped, and when
    ``DROP_N_EQUALS_TWO`` is set only rows with a training batch size greater
    than 2 are kept.
    """
    derived_columns = (
        ("top_test_acc", max_acc_lambda),
        ("final_test_acc", final_acc_lambda),
        ("training_timestamps", cumulative_time_lambda),
        ("total_training_time", total_time_lambda),
        ("time_to_peak_acc", time_to_peak_lambda),
    )
    for column, row_func in derived_columns:
        df = apply_lambda(df, column, row_func)

    timed_runs = df.dropna(subset=["total_training_time"])
    if not DROP_N_EQUALS_TWO:
        return timed_runs
    return timed_runs[timed_runs["specs.batch_size_train"] > 2]

# Add the summary columns, then keep only the optimizers selected in OPT_OF_INTEREST.
cifar = get_summary_vals(cifar).loc[
    lambda frame: frame["specs.opt"].isin(OPT_OF_INTEREST)
]

def drop_fr_5k_failure_to_train(df: pd.DataFrame):
    """Handle the weird case of FR failing to train at 5k batch size.

    Returns ``df`` without the rows that are simultaneously optimizer ``"fr"``,
    batch size 5000, and peak test accuracy below 15. All other rows are kept.
    """
    is_fr = df["specs.opt"] == "fr"
    barely_trained = df["top_test_acc"] < 15
    at_5k_batch = df["specs.batch_size_train"] == 5000
    failed_fr_run = is_fr & barely_trained & at_5k_batch
    return df.loc[~failed_fr_run]

# Preview the first rows; as the cell's last expression this is what the notebook displays.
cifar.head()

In [None]:
# One (optimizer, frame) pair per optimizer of interest. The batch-size column
# is kept here because treat_peak_acc_outliers needs it for its FR filter.
opt_dfs = [
    (
        opt,
        cifar.loc[
            cifar["specs.opt"] == opt,
            ["time_to_peak_acc", "top_test_acc", "specs.batch_size_train"],
        ],
    )
    for opt in OPT_OF_INTEREST
]

In [None]:
def treat_peak_acc_outliers(opt: str, df: pd.DataFrame) -> pd.DataFrame:
    """Remove peak-test-accuracy outliers from one optimizer's runs.

    For ``opt == "fr"`` the runs that failed to train at batch size 5000
    (``top_test_acc`` below 15) are dropped first, matching
    ``drop_fr_5k_failure_to_train``. The previous expression,
    ``~(acc < 15) & (batch == 5000)``, negated only the accuracy test and so
    discarded every FR run that was not at batch size 5000.

    Rows whose ``top_test_acc`` lies outside the Tukey fences
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are then removed.

    Args:
        opt: Optimizer short code of the runs in ``df``.
        df: Frame with ``top_test_acc`` and ``specs.batch_size_train`` columns.

    Returns:
        The filtered frame. Because filtering goes through ``where`` followed
        by ``dropna``, rows that already contain a NaN are dropped as well and
        integer columns come back as floats.
    """
    if opt == "fr":
        failed_at_5k = (df["top_test_acc"] < 15) & (df["specs.batch_size_train"] == 5000)
        df = df.loc[~failed_at_5k]

    q1 = df["top_test_acc"].quantile(0.25)
    q3 = df["top_test_acc"].quantile(0.75)
    iqr = q3 - q1
    within_fences = (q1 - 1.5 * iqr <= df["top_test_acc"]) & (df["top_test_acc"] <= q3 + 1.5 * iqr)
    # Rows outside the fences become all-NaN and are removed by dropna().
    return df.where(within_fences).dropna()

# Outlier-treated and untreated views, both reduced to the two plotted columns.
# Note: `opt_dfs` is overwritten without its batch-size column, so this cell
# cannot be re-run without first re-running the cell that builds `opt_dfs`.
opt_dfs_treated = [
    (opt, treat_peak_acc_outliers(opt, df)[["top_test_acc", "time_to_peak_acc"]])
    for opt, df in opt_dfs
]
opt_dfs = [
    (opt, df[["top_test_acc", "time_to_peak_acc"]])
    for opt, df in opt_dfs
]
    

In [None]:
def plot_single_violin_plot(df: pd.DataFrame, ax: plt.Axes, opt_name: str, outlier_treatment: bool, color: str):
    """Draw log10 time-to-peak-accuracy violins for one optimizer, one per batch size.

    Args:
        df: Frame with ``specs.opt``, ``specs.batch_size_train``,
            ``time_to_peak_acc`` and ``top_test_acc`` columns.
        ax: Axes to draw on.
        opt_name: Optimizer short code to select from ``specs.opt``.
        outlier_treatment: When True, FR runs with peak accuracy below 15 are
            removed, each batch size is filtered to the Tukey fences
            (1.5 * IQR) of its peak accuracy, and the optimizer name is placed
            as a title below the axes.
        color: Fill colour for the violin bodies.

    Raises:
        RuntimeError: If no rows remain for ``opt_name``.

    Notes:
        Batch sizes with no usable data keep their tick (labelled ``N=0``) but
        get no violin; previously a missing batch size raised a ``TypeError``.
        Non-finite log10 times (e.g. the ``inf`` sentinel written by
        ``time_to_peak_lambda``) are excluded from violins and counts.
    """
    batch_sizes = [100, 1000, 5000, 10000, 25000, 50000]
    # One x position per batch size; these are violinplot's default positions.
    xticks = [*range(1, len(batch_sizes) + 1)]

    opt_df = df.loc[df["specs.opt"] == opt_name]
    if outlier_treatment and opt_name == "fr":
        # Remove the abject failures to train
        opt_df = opt_df.loc[opt_df["top_test_acc"] >= 15]
    if len(opt_df) == 0:
        raise RuntimeError(f"No data for optimizer {opt_name}!")
    # A fresh, unique index keeps the column-wise concat below well-defined.
    opt_df = opt_df.reset_index(drop=True)

    tmp_df = pd.concat(
        [opt_df["specs.batch_size_train"], opt_df["time_to_peak_acc"], opt_df["top_test_acc"]],
        axis=1,
        keys=["hparam", "ttp", "acc"],
    )

    def get_hparam_match(hparam_):
        # Always a DataFrame (possibly empty) so downstream code has one type to handle.
        return tmp_df.loc[tmp_df["hparam"] == hparam_, ["ttp", "acc"]]

    def filter_peak_acc_iqr(x: pd.DataFrame):
        # Keep rows whose peak accuracy lies within the Tukey fences.
        if len(x) == 0:
            return x
        Q1 = x["acc"].quantile(0.25)
        Q3 = x["acc"].quantile(0.75)
        IQR = Q3 - Q1
        filter_lower = (Q1 - 1.5 * IQR <= x["acc"])
        filter_upper = (x["acc"] <= Q3 + 1.5 * IQR)
        return x.loc[filter_lower & filter_upper].dropna()

    data = [get_hparam_match(i) for i in batch_sizes]
    if outlier_treatment:
        data = [filter_peak_acc_iqr(i) for i in data]

    # Work in log10 seconds and discard values a violin cannot represent.
    data = [np.log10(i["ttp"].to_numpy(dtype=float)) for i in data]
    data = [i[np.isfinite(i)] for i in data]
    counts = [len(i) for i in data]
    xticklabels = [f"{h}\nN={c}" for h, c in zip(batch_sizes, counts)]

    # violinplot cannot handle empty datasets, so draw only the populated ones
    # at their own x positions.
    populated = [(pos, vals) for pos, vals in zip(xticks, data) if len(vals) > 0]
    if populated:
        positions, datasets = zip(*populated)
        violins = ax.violinplot(list(datasets), positions=list(positions), showmeans=True)
        for pc in violins["bodies"]:
            pc.set_color(color)
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels)

    max_value = 5.5
    ax.set_ylim(0, max_value)
    max_value = int(np.ceil(max_value).item())
    ax.yaxis.set_major_locator(matplotlib.ticker.FixedLocator([*range(max_value + 1)]))
    # Minor ticks at log10(2..9) inside every decade. FixedLocator takes a flat
    # sequence of locations; the previous nested (N, 1) array is, to my
    # knowledge, rejected by recent Matplotlib releases.
    minor_ticks = [n + np.log10(i) for n in range(0, max_value) for i in range(2, 10)]
    ax.yaxis.set_minor_locator(matplotlib.ticker.FixedLocator(minor_ticks))

    # Digits (and minus) mapped to their Unicode superscript forms.
    superscript = str.maketrans(
        "0123456789-",
        "\u2070\u00B9\u00B2\u00B3\u2074\u2075\u2076\u2077\u2078\u2079\u207B",
    )

    def format_func(x, p):
        # Label a log10 tick value as a power of ten, e.g. 3 -> "10³".
        _ = p
        return f"10{str(int(x)).translate(superscript)}"

    major_formatter = matplotlib.ticker.FuncFormatter(format_func)
    ax.yaxis.set_major_formatter(major_formatter)
    if outlier_treatment:
        # y=0 with a negative pad places the title below the axes.
        subtitle = f"{opt_name.upper()}"
        ax.set_title(subtitle, y=0, pad=-75)


def multiple_violin_plots_time_to_peak(df: pd.DataFrame) -> None:
    """Plot a 2x3 grid of time-to-peak violins and save it as a PNG.

    The top row shows the full data and the bottom row the data with
    peak-test-accuracy outliers removed. Columns are SGD, BFGS and FR. The
    figure is written to ``ttp_violin_comparison.png`` in the current working
    directory at 300 dpi.
    """
    panels = (("sgd", "green"), ("bfgs", "blue"), ("fr", "red"))
    row_specs = (
        (False, "Full Data"),
        (True, "Peak Test Accuracy Outliers Removed"),
    )

    fig = plt.figure(figsize=(16, 9), constrained_layout=True)
    subfigs = fig.subfigures(nrows=2, ncols=1)
    for subfig, (remove_outliers, heading) in zip(subfigs, row_specs):
        subfig.suptitle(heading)
        axes = subfig.subplots(nrows=1, ncols=3, sharey=True)
        for axis, (optimizer, colour) in zip(axes, panels):
            plot_single_violin_plot(df, axis, optimizer, remove_outliers, colour)
        # Label only the middle x axis and the left y axis of each row.
        axes[1].set_xlabel("Batch Size and Experimental Sample Count")
        axes[0].set_ylabel("Time to Peak Accuracy, seconds")

    fig.savefig("ttp_violin_comparison.png", dpi=300)

# Render and save the violin comparison figure for the filtered CIFAR-10 runs.
multiple_violin_plots_time_to_peak(cifar)