In [None]:
from typing import Callable, List, Optional, Iterable
from pathlib import Path
import os
from copy import deepcopy
import json
import glob
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import multiprocessing as mp
from tabulate import tabulate
import warnings
warnings.filterwarnings("ignore")

# Folder holding the raw per-run JSON result files, and the file name of the
# cached, combined DataFrame built from them.
DATA_DIR = "./expt_rslts/primed/"
PICKLE_NAME = "nn_rslts_primed_df.pkl"
# Output folders for figures and tables; both are created here if missing.
IMG_OUTPUT_DIR = Path("imgs/")
IMG_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
TABLE_OUTPUT_DIR = Path("tables/")
TABLE_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Optimizer keys (values of the "specs.opt" column) to analyse.
OPT_OF_INTEREST = ("sgd", "kn", "fr", "pr", "hs", "dy", "bfgs")
# When True, "fgd" is added to the optimizers of interest and large-batch
# "sgd" runs are relabelled as "fgd" further down.
DISCRIMINATE_FGD = False
if DISCRIMINATE_FGD:
    OPT_OF_INTEREST = ("fgd", "sgd", "kn", "fr", "pr", "hs", "dy", "bfgs")

# Truncate the opts of interest...
# NOTE(review): this assignment unconditionally overrides both assignments
# above, including the DISCRIMINATE_FGD variant.
OPT_OF_INTEREST = ("kn", "fr", "bfgs")


# When True, get_summary_vals keeps only rows whose training batch size is
# greater than 2.
DROP_N_EQUALS_TWO = True

# Maps optimizer keys (as found in the "specs.opt" column) to the
# human-readable names used for plot titles and axis tick labels.
NAME_DICT = {
    "sgd": "Stochastic Gradient Descent",
    "fgd": "Full Gradient Descent",
    "kn": "Krylov-Newton",
    "fr": "Fletcher-Reeves",
    "pr": "Polak-Ribiere",
    "hs": "Hestenes-Stiefel",
    "dy": "Dai-Yuan",
    "bfgs": "BFGS",
    "bfgsi": "BFGS Inverse",
    "dfp": "Davidon-Fletcher-Powell",
    "dfpi": "Davidon-Fletcher-Powell Inverse",
    "sr1": "Symmetric Rank-One",
    "sr1d": "Symmetric Rank-One Dual",
    "levenberg": "Levenberg"
}

def read_json_to_df(fname):
    """Load one JSON result file and flatten it into a DataFrame.

    Nested keys are flattened by ``pd.json_normalize`` into dotted column
    names (e.g. ``{"specs": {"opt": ...}}`` becomes ``"specs.opt"``).

    Args:
        fname: Path of the JSON file to read.

    Returns:
        The flattened DataFrame, or ``None`` if the file could not be read or
        parsed. Loading is deliberately best-effort so a single bad file does
        not abort a bulk load, but every skipped file is reported.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(fname, encoding="utf-8") as f:
            return pd.json_normalize(json.load(f))
    except Exception as err:
        # Report the failure instead of dropping the file silently, so that
        # missing or corrupt results are visible when the frame is built.
        print(f"Skipping {fname}: {type(err).__name__}: {err}")
        return None

# Build the combined results frame once and cache it as a pickle; when the
# cache file already exists it is loaded instead of re-parsing the JSON files.
if not os.path.exists(PICKLE_NAME):
    print("Creating dataframe!")
    assert(os.path.exists(DATA_DIR))
    f_list = glob.glob(os.path.join(DATA_DIR, "TEST_priming_*.json"))
    print("Number of files:", len(f_list))
    # Parse the JSON files in parallel worker processes.
    with mp.Pool() as p:
        dframes = p.map(read_json_to_df, f_list)
    #dframes = [read_json_to_df(i) for i in f_list]
    # read_json_to_df returns None for files it could not load; drop those.
    dframes = [i for i in dframes if i is not None]
    # NOTE(review): pd.concat raises if dframes is empty (no readable files).
    monolith = pd.concat(dframes)
    monolith.to_pickle(PICKLE_NAME)

else:
    print("Reading dataframe!")
    # NOTE(review): unpickling can execute arbitrary code; only load a cache
    # file that was produced locally by this notebook.
    monolith = pd.read_pickle(PICKLE_NAME)

# Display the column names of the combined frame.
monolith.keys()

In [None]:
# Summarise the distinct hyper-parameter values present in the loaded results.
print("Optimizers:", list(set(monolith["specs.opt"])))
print("Learning rates:", list(set(monolith["specs.learning_rate"])))
# This line prints the momentum column, so label it as such.
print("Momentum values:", list(set(monolith["specs.momentum"])))

In [None]:
# Preview the first rows of the combined results frame.
monolith.head()


In [None]:
# Keep only the CIFAR-10 runs. ``.copy()`` makes ``cifar`` an independent
# frame, so later column assignments on it never write into (or warn about)
# a slice of ``monolith``.
cifar = monolith.loc[monolith["specs.dataset"] == "cifar10"].copy()

In [None]:
# Cast the training batch size to int. ``assign`` returns a new frame, so this
# does not write into ``cifar`` in place (it may be a slice of ``monolith``).
cifar = cifar.assign(
    **{"specs.batch_size_train": cifar["specs.batch_size_train"].apply(int)}
)
if DISCRIMINATE_FGD:
    # SGD runs with a batch of at least 50000 samples (presumably the full
    # CIFAR-10 training set -- confirm) are relabelled as full gradient descent.
    is_full_batch_sgd = (cifar["specs.opt"] == "sgd") & (cifar["specs.batch_size_train"] >= 50000)
    cifar.loc[is_full_batch_sgd, "specs.opt"] = "fgd"

In [None]:
def max_acc_lambda(row):
    """Return the highest value in ``row["test_accuracy_list"]``.

    Falls back to ``0.0`` when ``max`` raises ``ValueError`` (e.g. the list
    is empty).
    """
    accuracies = row["test_accuracy_list"]
    try:
        return max(accuracies)
    except ValueError:
        return 0.0

def final_acc_lambda(row):
    """Return the last entry of ``row["test_accuracy_list"]``.

    Returns NaN when the list has no entries (``IndexError``).
    """
    accuracies = row["test_accuracy_list"]
    try:
        return accuracies[-1]
    except IndexError:
        return float("NaN")

def cumulative_time_lambda(row):
    """Return the running total (``np.cumsum``) of the values in ``row["time"]``."""
    return np.cumsum(row["time"])

def total_time_lambda(row):
    """Return the last entry of ``row["training_timestamps"]``.

    Returns NaN when there are no timestamps (``IndexError``).
    """
    timestamps = row["training_timestamps"]
    try:
        return timestamps[-1]
    except IndexError:
        return float("NaN")

def time_to_peak_lambda(row):
    """Return the elapsed training time at which test accuracy first peaks.

    ``row["training_timestamps"]`` holds *cumulative* times (it is produced by
    ``np.cumsum`` over the ``"time"`` column in ``get_summary_vals``), so the
    time to the peak is the timestamp at the peak's index. Summing the
    timestamps up to that index would accumulate the elapsed time twice.

    Args:
        row: Mapping/row with ``"test_accuracy_list"`` and
            ``"training_timestamps"`` sequences.

    Returns:
        The timestamp at the first maximum of the accuracy list, as a float,
        or ``inf`` when no peak can be located (empty accuracy list, or no
        timestamp recorded for the peak's index).
    """
    try:
        # argmax returns the first index of the maximum; raises ValueError on
        # an empty sequence.
        max_idx = np.asarray(row["test_accuracy_list"]).argmax()
        time_to_peak = float(row["training_timestamps"][max_idx])
    except (ValueError, IndexError):
        time_to_peak = np.inf

    return time_to_peak


def fgd_lambda(row):
    """Relabel a full-batch SGD row as "fgd", in place.

    A row whose optimizer is "sgd" and whose training batch size, cast to int,
    is at least 50000 gets ``row["specs.opt"]`` set to "fgd". If the batch
    size cannot be cast, the failure is printed and the row is left as is.
    Returns ``None``.
    """
    opt_name = row["specs.opt"]
    try:
        is_full_batch = int(row["specs.batch_size_train"]) >= 50000
    except ValueError:
        print(f"row failed: {row['specs.opt']}")
        return
    if opt_name == "sgd" and is_full_batch:
        row["specs.opt"] = "fgd"


def apply_lambda(df: pd.DataFrame, colname: str, func: Callable):
    """Store the row-wise result of ``func`` in column ``colname`` of ``df``.

    ``func`` is called once per row (``axis=1``). The column is written onto
    ``df`` itself, and that same frame is returned.
    """
    df[colname] = df.apply(func, axis=1).copy()
    return df

def get_summary_vals(df: pd.DataFrame):
    """Add per-run summary columns to ``df`` and filter out unusable rows.

    The derived columns are computed in the order listed below; the order
    matters because ``total_training_time`` and ``time_to_peak_acc`` read the
    ``training_timestamps`` column written just before them. Rows without a
    total training time are dropped, and when ``DROP_N_EQUALS_TWO`` is set
    only rows with a training batch size greater than 2 are kept.
    """
    derived_columns = (
        ("top_test_acc", max_acc_lambda),
        ("final_test_acc", final_acc_lambda),
        ("training_timestamps", cumulative_time_lambda),
        ("total_training_time", total_time_lambda),
        ("time_to_peak_acc", time_to_peak_lambda),
    )
    for column, row_func in derived_columns:
        df = apply_lambda(df, column, row_func)

    df = df.dropna(subset=["total_training_time"])
    if not DROP_N_EQUALS_TWO:
        return df

    return df[df["specs.batch_size_train"] > 2]

# Replace ``cifar`` with the version carrying the derived summary columns
# (after get_summary_vals' row filtering).
cifar = get_summary_vals(cifar)

In [None]:
def get_n_best_acc(full_df, n, total_time_filter=100000000):
    """Return the ``n`` runs with the highest peak test accuracy.

    Args:
        full_df: Frame with ``"top_test_acc"`` and ``"total_training_time"``
            columns. It is not modified.
        n: Maximum number of rows to return.
        total_time_filter: Only runs whose total training time is strictly
            below this value are considered.

    Returns:
        A new frame with at most ``n`` rows, ordered by descending
        ``"top_test_acc"``.
    """
    # Sort into a new frame: an in-place sort would silently reorder the
    # caller's DataFrame.
    ranked = full_df.sort_values(["top_test_acc"], ascending=False)
    within_budget = ranked.loc[ranked["total_training_time"] < total_time_filter]
    return within_budget.head(n)

def get_n_best_final_acc(full_df, n, total_time_filter=10000):
    """Return the ``n`` runs with the highest final test accuracy.

    Args:
        full_df: Frame with ``"final_test_acc"`` and
            ``"total_training_time"`` columns. It is not modified.
        n: Maximum number of rows to return.
        total_time_filter: Only runs whose total training time is strictly
            below this value are considered.

    Returns:
        A new frame with at most ``n`` rows, ordered by descending
        ``"final_test_acc"``.
    """
    # Sort into a new frame: an in-place sort would silently reorder the
    # caller's DataFrame.
    ranked = full_df.sort_values(["final_test_acc"], ascending=False)
    within_budget = ranked.loc[ranked["total_training_time"] < total_time_filter]
    return within_budget.head(n)

def get_n_random(full_df: pd.DataFrame, n: int, random_state: Optional[int] = None):
    """Draw ``n`` rows at random from ``full_df``, with replacement.

    Because sampling is with replacement, the same run can appear more than
    once and ``n`` may exceed the number of rows in ``full_df``.

    Args:
        full_df: Frame to sample from.
        n: Number of rows to draw.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so a
            selection can be reproduced. ``None`` (the default) keeps the
            unseeded behaviour.

    Returns:
        A new frame with ``n`` sampled rows.
    """
    return full_df.sample(n=n, replace=True, random_state=random_state)

In [None]:
def _plot_run_curves(df, y_attr, x_attr, xlabel, ylabel, title, save_name, save, log_y=False):
    """Draw one curve per row of ``df`` on a new figure and optionally save it.

    ``y_attr`` (and ``x_attr`` when given) name the columns holding each run's
    sequence of values; with ``x_attr=None`` the curve is plotted against its
    index. Accuracy plots use a fixed 0-100 y-range, loss plots (``log_y``)
    use a logarithmic y-axis instead.
    """
    plt.figure(figsize=(16, 9))
    for row in df.itertuples():
        y_values = getattr(row, y_attr)
        if x_attr is None:
            plt.plot(y_values)
        else:
            plt.plot(getattr(row, x_attr), y_values)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if log_y:
        plt.yscale("log")
    else:
        plt.ylim((0, 100.0))
    plt.title(title)
    if save:
        plt.savefig(IMG_OUTPUT_DIR / save_name, facecolor="white", transparent=False)


def plot_rows(df: pd.DataFrame, best_column: str, opt_name: str, save=False, time=True, epoch=True, random=False):
    """Plot the accuracy and loss curves of every run (row) in ``df``.

    Args:
        df: One row per run, with ``test_accuracy_list``,
            ``training_timestamps``, ``train_loss_list`` and
            ``test_loss_list`` columns.
        best_column: Name of the column the runs were selected by; only used
            in titles and file names.
        opt_name: Optimizer key; only used in titles and file names.
        save: When True, each figure is also written to ``IMG_OUTPUT_DIR``.
        time: Draw the accuracy-vs-time figure.
        epoch: Draw the accuracy-, train-loss- and test-loss-vs-epoch figures.
        random: Whether the runs are a random sample (True) or the top runs
            (False); only affects titles and file names.
    """
    top_or_rand_str = "random" if random else "top"
    n_runs = len(df.index)
    title_prefix = f"{opt_name}, {top_or_rand_str.capitalize()} {n_runs} runs by {best_column}"
    file_prefix = f"{opt_name}_{top_or_rand_str}_{n_runs}_{best_column}"

    if epoch:
        _plot_run_curves(
            df, "test_accuracy_list", None, "Epoch", "Percent correct",
            f"{title_prefix}, Accuracy vs. Epoch",
            f"{file_prefix}_acc_vs_epoch.png", save,
        )

    if time:
        _plot_run_curves(
            df, "test_accuracy_list", "training_timestamps", "Time (s)", "Percent correct",
            f"{title_prefix}, Accuracy vs. Time",
            f"{file_prefix}_acc_vs_time.png", save,
        )

    if epoch:
        _plot_run_curves(
            df, "train_loss_list", None, "Epoch", "Training Loss, Log",
            f"{title_prefix}, Training Loss vs. Epoch",
            f"{file_prefix}_train_loss_vs_epoch.png", save, log_y=True,
        )
        _plot_run_curves(
            df, "test_loss_list", None, "Epoch", "Test Loss, Log",
            f"{title_prefix}, Test Loss vs. Epoch",
            f"{file_prefix}_test_loss_vs_epoch.png", save, log_y=True,
        )


def plot_performance_vs_batch_size(df: pd.DataFrame, opt_name: str, peak: bool, violin: bool=False):
    """Plot test accuracy against training batch size for one optimizer.

    The figure is always saved to ``IMG_OUTPUT_DIR``.

    Args:
        df: Results frame with ``"specs.opt"``, ``"specs.batch_size_train"``
            and the ``"top_test_acc"`` / ``"final_test_acc"`` columns.
        opt_name: Optimizer key to select from ``"specs.opt"``.
        peak: Plot peak accuracy when True, final accuracy otherwise.
        violin: Draw one violin per batch size in a fixed list instead of a
            scatter plot. In this mode nothing is drawn (a message is
            printed) when the optimizer has no rows.
    """
    batch_sizes = [100, 1000, 5000, 10000, 25000, 50000]
    opt_df = df.loc[df["specs.opt"] == opt_name]
    if peak:
        acc = opt_df["top_test_acc"]
        ylabel = "Peak Test Accuracy"
    else:
        acc = opt_df["final_test_acc"]
        ylabel = "Final Test Accuracy"

    x_axis = opt_df["specs.batch_size_train"]

    # Bail out before a figure is created so that no empty figure is left open.
    if violin and len(x_axis) == 0:
        print(f"No data for optimizer {opt_name}!")
        return

    fig = plt.figure(figsize=(16, 9))
    if violin:
        data = []
        xticklabels = []
        for batch_size in batch_sizes:
            matches = acc[(x_axis == batch_size).to_numpy()]
            count = len(matches)
            if count == 0:
                # Placeholder so the violin keeps its slot on the x-axis even
                # though there is nothing to draw for this batch size.
                matches = pd.Series(np.full(2, np.nan))
            data.append(matches)
            xticklabels.append(f"{batch_size}\nN={count}")

        ax = fig.add_subplot(111)
        ax.violinplot(data, showmedians=True)
        # Violins sit at positions 1..len(batch_sizes).
        ax.set_xticks([*range(1, len(batch_sizes) + 1)])
        ax.set_xticklabels(xticklabels)
    else:
        plt.scatter(x_axis, acc)

    plt.xlabel("Batch Size")
    plt.ylabel(ylabel)
    plt.ylim((0.0, 100.0))
    name = f"{opt_name}_{ylabel}_vs_batch_size".replace(" ", "_")
    name = name.replace("specs.", "")

    # Keep the optimizer key upper-cased in the title; applying str.title()
    # to the whole string would turn e.g. "BFGS" back into "Bfgs".
    plt.title(f"{opt_name.upper()} {ylabel} Vs Batch Size")
    plt.savefig(IMG_OUTPUT_DIR / (name + ".png"), facecolor="white", transparent=False)

# One peak-accuracy violin figure per optimizer of interest.
for optim in OPT_OF_INTEREST:
    plot_performance_vs_batch_size(cifar, optim, True, violin=True)

In [None]:
def plot_optimizer_of_interest(df: pd.DataFrame, opt_name: str, to_plot: int, time=True, epoch=True, random=False) -> None:
    """Plot and save the curves of ``to_plot`` runs of a single optimizer.

    The runs are either a random sample (``random=True``) or those with the
    best peak test accuracy. Figures are always saved (``save=True``).
    """
    opt_df = df.loc[df["specs.opt"] == opt_name]
    pick_runs = get_n_random if random else get_n_best_acc
    selected_runs = pick_runs(opt_df, to_plot)
    plot_rows(
        selected_runs,
        "top_test_acc",
        opt_name,
        save=True,
        time=time,
        epoch=epoch,
        random=random,
    )

In [None]:
# For every optimizer of interest, plot 100 randomly sampled runs.
for opt in OPT_OF_INTEREST:
    plot_optimizer_of_interest(cifar, opt, 100, epoch=True, random=True)

In [None]:
def violin_opt_comparison(
    full_df: pd.DataFrame,
    opts_of_interest: Iterable[str],
    save: bool=True,
) -> None:
    """Show side-by-side violins of peak test accuracy, one per optimizer.

    Args:
        full_df: Results frame with ``"specs.opt"`` and ``"top_test_acc"``.
        opts_of_interest: Optimizer keys to compare; each must be a key of
            ``NAME_DICT``, which supplies the tick labels.
        save: Accepted for interface compatibility; the figure is only shown,
            this function does not write it to disk.
    """
    labels = []
    dataset = []
    for opt_name in opts_of_interest:
        peak_acc = full_df.loc[full_df["specs.opt"] == opt_name, "top_test_acc"]
        labels.append(NAME_DICT[opt_name])
        dataset.append(peak_acc.reset_index(drop=True))

    plt.figure(figsize=(15, 9))
    plt.violinplot(dataset, showextrema=True, showmedians=True)
    plt.xticks([*range(1, len(dataset)+1)], labels=labels)
    plt.ylim((0.0, 100.0))
    # The optimizers run along the x-axis; labelling them via ylabel would be
    # overwritten by the accuracy label below.
    plt.xlabel("Optimizer")
    plt.ylabel("Peak Accuracy, %")
    plt.show()

# Compare the peak-accuracy distributions of the optimizers of interest.
violin_opt_comparison(cifar, OPT_OF_INTEREST)

In [None]:
def violin_plot_peak_acc(
    df: pd.DataFrame,
    opt_name: str,
    discriminate_fgd: bool=False,
    save=True
) -> None:
    """Show a violin plot of the peak test accuracy of one optimizer.

    Args:
        df: Results frame with ``"specs.opt"`` and ``"top_test_acc"``.
        opt_name: Optimizer key to select from ``"specs.opt"``.
        discriminate_fgd: When True and ``opt_name`` is "sgd", rows labelled
            "fgd" are included together with the "sgd" rows.
        save: When True, the figure is written to ``IMG_OUTPUT_DIR`` as
            ``<opt_name>_peak_accuracy_violin.png``.
    """
    opt_df = df.loc[df["specs.opt"] == opt_name]
    if discriminate_fgd and opt_name == "sgd":
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        opt_df = pd.concat([opt_df, df.loc[df["specs.opt"] == "fgd"]], ignore_index=True)

    peak_acc = opt_df["top_test_acc"]

    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(16,9), squeeze=False)
    ax = axes[0, 0]
    ax.set_ylim(bottom=0.0, top=100.0)
    ax.violinplot(peak_acc, showextrema=True, showmedians=True)
    ax.set_title(f"Distribution of Peak Testing Accuracy: {opt_name}")

    if save:
        # Name the file after this function's own argument, not after a
        # global variable that merely happens to exist in the calling loop.
        fig.savefig(IMG_OUTPUT_DIR / f"{opt_name}_peak_accuracy_violin.png")
    plt.show()

In [None]:
# One peak-accuracy violin figure per optimizer of interest, saved to disk.
for opt in OPT_OF_INTEREST:
    violin_plot_peak_acc(cifar, opt, save=True)

In [None]:
def _scatter_time_vs_peak(times, accs, title, peak, shortest_time, xlim=None):
    """Scatter peak accuracy against time to peak on a new figure.

    A horizontal line marks the best peak accuracy and a vertical line the
    shortest time at which it was reached. ``xlim`` optionally restricts the
    visible time range.
    """
    plt.figure(figsize=(16, 9))
    plt.scatter(times, accs)
    plt.ylim((0, 100))
    if xlim is not None:
        plt.xlim(xlim)
    plt.title(title)
    plt.axhline(y=peak, color='g', linestyle="-", label=f"Peak Accuracy: {peak}%")
    plt.axvline(x=shortest_time, color="r", linestyle="-", label=f"Shortest time to Peak Accuracy: {shortest_time:.0f} seconds")
    plt.xlabel("Time to peak accuracy (seconds)")
    plt.ylabel("Peak Test Accuracy (%)")
    plt.legend()


def time_to_peak_vs_peak_acc(df: pd.DataFrame, opt_name: str, plot_full: bool=False, hist: bool=True, window_xlim=(-30000, 650000)):
    """Plot peak accuracy against time to peak accuracy for one optimizer.

    Runs with an infinite or missing time to peak are ignored.

    Args:
        df: Results frame with ``"specs.opt"``, ``"top_test_acc"`` and
            ``"time_to_peak_acc"`` columns.
        opt_name: Optimizer key; must also be a key of ``NAME_DICT``.
        plot_full: Also draw the scatter plot without an x-axis window.
        hist: Also draw a 2-D histogram of the same data.
        window_xlim: x-axis limits (seconds) of the windowed scatter plot.

    Returns:
        ``(time_of_shortest_peak, peak)``: the best peak accuracy and the
        shortest time at which any run reached it.

    Raises:
        ValueError: If the optimizer has no run with a finite time to peak.
    """
    opt_df = df.loc[df["specs.opt"] == opt_name, ["top_test_acc", "time_to_peak_acc"]]
    # Filter instead of assigning NaN into a slice of ``df``.
    opt_df = opt_df[opt_df["time_to_peak_acc"] != np.inf].dropna()
    if opt_df.empty:
        raise ValueError(f"No runs with a finite time to peak accuracy for optimizer {opt_name!r}")

    time_to_peak_acc = opt_df["time_to_peak_acc"]
    top_test_acc = opt_df["top_test_acc"]

    peak = float(top_test_acc.max())
    # Several runs may share the best accuracy; report the fastest of them.
    time_of_shortest_peak = float(time_to_peak_acc[top_test_acc == peak].min())

    if plot_full:
        _scatter_time_vs_peak(
            time_to_peak_acc, top_test_acc,
            f"{NAME_DICT[opt_name]}, Time to Peak Accuracy vs. Peak Accuracy",
            peak, time_of_shortest_peak,
        )

    if hist:
        plt.figure(figsize=(16,9))
        plt.hist2d(time_to_peak_acc, top_test_acc, bins=50)
        plt.ylabel("Peak Test Accuracy, %")
        plt.xlabel("Time To Peak Accuracy, seconds")

    _scatter_time_vs_peak(
        time_to_peak_acc, top_test_acc,
        f"{NAME_DICT[opt_name]}, Time to Peak Accuracy vs. Peak Accuracy, Windowed",
        peak, time_of_shortest_peak, xlim=window_xlim,
    )
    plt.show()

    return time_of_shortest_peak, peak

def plot_solution_space(solution_space):
    """Scatter best peak accuracy against the time needed to reach it.

    ``solution_space`` is a sequence of entries indexed as
    ``(label, time, peak_accuracy)``; every point is annotated with its label.
    """
    times = [entry[1] for entry in solution_space]
    peaks = [entry[2] for entry in solution_space]

    plt.figure(figsize=(8,8))
    plt.scatter(times, peaks)
    for entry in solution_space:
        plt.annotate(entry[0], (entry[1], entry[2]))

    plt.xlabel("Time to Best Peak Accuracy (seconds)")
    plt.ylabel("Best Peak Accuracy (%)")

# Collect (optimizer, shortest time to best peak, best peak accuracy) for each
# optimizer of interest, then plot them together.
solution_space = []
for opt in OPT_OF_INTEREST:
    t, p = time_to_peak_vs_peak_acc(cifar, opt)
    solution_space.append((opt, t, p))

plot_solution_space(solution_space)


In [None]:
from scipy.stats import f_oneway, kruskal, alexandergovern

def conduct_test_intra_optimizer(df: pd.DataFrame, opt_name: str, test: Callable, batch_sizes: Optional[List[int]]=None):
    """Run ``test`` on one optimizer's peak accuracies, grouped by batch size.

    Args:
        df: Results frame with ``"specs.opt"``, ``"specs.batch_size_train"``
            and ``"top_test_acc"`` columns.
        opt_name: Optimizer key to select from ``"specs.opt"``.
        test: Callable that receives one sample (Series of peak accuracies)
            per batch size as positional arguments.
        batch_sizes: Batch sizes to compare; defaults to every batch size
            present for the optimizer, in ascending order.

    Returns:
        Whatever ``test`` returns.
    """
    opt_df = df.loc[df["specs.opt"] == opt_name]
    train_batch = opt_df["specs.batch_size_train"]
    if batch_sizes is None:
        batch_sizes = sorted(set(train_batch))

    samples = []
    for size in batch_sizes:
        samples.append(opt_df.loc[train_batch == size]["top_test_acc"])

    return test(*samples)

# Apply each significance test in turn (one-way ANOVA, Kruskal-Wallis,
# Alexander-Govern), printing the result for every optimizer of interest.
for stat_test in (f_oneway, kruskal, alexandergovern):
    for opt in OPT_OF_INTEREST:
        print(f"{opt}:", conduct_test_intra_optimizer(cifar, opt, stat_test))