In [None]:
from typing import Callable, List, Optional, Iterable
from pathlib import Path
import os
from copy import deepcopy
import json
import glob
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import multiprocessing as mp
from tabulate import tabulate
import warnings

import scipy

# Silence every warning category for the rest of the session. This is global,
# so warnings raised by the cells below are hidden as well.
warnings.filterwarnings("ignore")

# Directory that is globbed for the raw "TEST_*.json" result files.
DATA_DIR = "./expt_rslts/"
# Pickle cache of the concatenated results; when this file exists the JSON
# files are not re-read.
PICKLE_NAME = "nn_rslts_df.pkl"
# Output directories; both are created right here if they do not exist yet.
IMG_OUTPUT_DIR = Path("imgs/")
IMG_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
TABLE_OUTPUT_DIR = Path("tables/")
TABLE_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Earlier, wider optimizer selection (disabled), kept for reference:
#OPT_OF_INTEREST = ("sgd", "kn", "fr", "pr", "hs", "dy", "bfgs")
#DISCRIMINATE_FGD = False
#if DISCRIMINATE_FGD:
#    OPT_OF_INTEREST = ("fgd", "sgd", "kn", "fr", "pr", "hs", "dy", "bfgs")

# Truncated optimizer selection: only runs whose "specs.opt" value is in this
# tuple are kept for the analysis.
OPT_OF_INTEREST = ("sgd", "fr", "bfgs")


# When True, get_summary_vals() keeps only rows whose training batch size is
# greater than 2.
DROP_N_EQUALS_TWO = True

# Optimizer short code -> human-readable optimizer name.
# NOTE(review): not referenced by any cell visible in this part of the
# notebook; presumably used for plot/table labels elsewhere -- confirm.
NAME_DICT = {
    "sgd": "Stochastic Gradient Descent",
    "fgd": "Full Gradient Descent",
    "kn": "Krylov-Newton",
    "fr": "Fletcher-Reeves",
    "pr": "Polak-Ribiere",
    "hs": "Hestenes-Stiefel",
    "dy": "Dai-Yuan",
    "bfgs": "BFGS",
    "bfgsi": "BFGS Inverse",
    "dfp": "Davidon-Fletcher-Powell",
    "dfpi": "Davidon-Fletcher-Powell Inverse",
    "sr1": "Symmetric Rank-One",
    "sr1d": "Symmetric Rank-One Dual",
    "levenberg": "Levenberg"
}

def read_json_to_df(fname):
    """Load one JSON result file into a flattened DataFrame.

    The decoded JSON is passed through ``pd.json_normalize``, so nested keys
    become dotted column names (e.g. ``specs.opt``).

    Args:
        fname: Path of the JSON file to read.

    Returns:
        The normalized DataFrame, or ``None`` if opening, decoding or
        normalizing the file raised any ``Exception``. Callers must filter
        out the ``None`` results.
    """
    try:
        with open(fname) as handle:
            payload = json.load(handle)
        frame = pd.json_normalize(payload)
    except Exception:
        # Best effort: a file that cannot be loaded is reported as None.
        return None
    return frame

# Build the combined results frame once and cache it as a pickle; later runs
# load the cache instead of re-parsing every JSON file.
if not os.path.exists(PICKLE_NAME):
    print("Creating dataframe!")
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if not os.path.isdir(DATA_DIR):
        raise FileNotFoundError(f"Results directory not found: {DATA_DIR!r}")
    # glob order is filesystem dependent; sorting keeps the row order of the
    # cached frame reproducible.
    f_list = sorted(glob.glob(os.path.join(DATA_DIR, "TEST_*.json")))
    print("Number of files:", len(f_list))
    with mp.Pool() as p:
        dframes = p.map(read_json_to_df, f_list)
    #dframes = [read_json_to_df(i) for i in f_list]  # serial alternative to the pool
    # read_json_to_df returns None for files it could not load; report how
    # many were dropped instead of discarding them silently.
    n_total = len(dframes)
    dframes = [i for i in dframes if i is not None]
    if len(dframes) != n_total:
        print("Number of unreadable files skipped:", n_total - len(dframes))
    # pd.concat([]) would fail with an unhelpful message; fail with a clear one.
    if not dframes:
        raise ValueError(f"No readable TEST_*.json files found in {DATA_DIR!r}")
    monolith = pd.concat(dframes)
    monolith.to_pickle(PICKLE_NAME)

else:
    print("Reading dataframe!")
    # NOTE: unpickling can execute arbitrary code; only load a cache file that
    # was written by this notebook.
    monolith = pd.read_pickle(PICKLE_NAME)

# Restrict to the CIFAR-10 runs. The explicit copy makes `cifar` an independent
# frame, so the column assignment below never targets a slice of `monolith`.
cifar = monolith.loc[monolith["specs.dataset"] == "cifar10"].copy()
# Normalize batch sizes to int; later cells compare them numerically.
cifar["specs.batch_size_train"] = cifar["specs.batch_size_train"].apply(int)

In [None]:
def max_acc_lambda(row):
    """Return the largest entry of ``row["test_accuracy_list"]``.

    Falls back to ``0.0`` when ``max`` raises ``ValueError`` (e.g. the list
    is empty).
    """
    try:
        return max(row["test_accuracy_list"])
    except ValueError:
        return 0.0

def final_acc_lambda(row):
    """Return the last entry of ``row["test_accuracy_list"]``.

    Yields NaN when indexing raises ``IndexError`` (e.g. the list is empty).
    """
    try:
        return row["test_accuracy_list"][-1]
    except IndexError:
        return float("NaN")

def cumulative_time_lambda(row):
    """Return the running total of ``row["time"]`` as computed by ``np.cumsum``."""
    return np.cumsum(row["time"])

def total_time_lambda(row):
    """Return the last entry of ``row["training_timestamps"]``.

    Yields NaN when indexing raises ``IndexError`` (e.g. no timestamps).
    """
    try:
        return row["training_timestamps"][-1]
    except IndexError:
        return float("NaN")

def time_to_peak_lambda(row):
    """Return the training timestamp at which test accuracy peaked.

    The peak is the first maximum of ``row["test_accuracy_list"]``; the value
    at the same position of ``row["training_timestamps"]`` is returned.
    Yields ``np.inf`` when a ``ValueError`` is raised (e.g. the accuracy list
    is empty).
    """
    try:
        peak_idx = np.array(row["test_accuracy_list"]).argmax()
        return row["training_timestamps"][peak_idx]
    except ValueError:
        return np.inf


def fgd_lambda(row, full_batch_threshold: int = 50000):
    """Relabel full-batch SGD runs as "fgd" and return the optimizer name.

    A row counts as full gradient descent when its optimizer is "sgd" and its
    training batch size is at least ``full_batch_threshold``.

    The name is returned so the function can be used with a row-wise apply
    (e.g. ``apply_lambda(df, "specs.opt", fgd_lambda)``); mutating the row
    alone does not reach the DataFrame in that setting. The passed ``row`` is
    still updated in place, as before.

    Args:
        row: Mapping/Series with "specs.opt" and "specs.batch_size_train".
        full_batch_threshold: Smallest batch size treated as full-batch.
            The default of 50000 is presumably the training-set size of the
            dataset under study -- confirm.

    Returns:
        "fgd" for relabelled rows, otherwise the row's original optimizer
        name (also when the batch size cannot be converted to int).
    """
    opt_name = row["specs.opt"]
    try:
        batch_size_train = int(row["specs.batch_size_train"])
    except ValueError:
        print(f"row failed: {row['specs.opt']}")
        return opt_name

    if opt_name == "sgd" and batch_size_train >= full_batch_threshold:
        row["specs.opt"] = "fgd"
        return "fgd"
    return opt_name


def apply_lambda(df: pd.DataFrame, colname: str, func: Callable):
    """Store the row-wise result of ``func`` in column ``colname`` of ``df``.

    ``func`` is called once per row (``axis=1``). The column is written into
    the passed frame itself, and that same frame is returned.
    """
    df[colname] = df.apply(func, axis=1).copy()
    return df

def get_summary_vals(df: pd.DataFrame):
    """Add the per-run summary columns and drop unusable rows.

    Columns added: "top_test_acc", "final_test_acc", "training_timestamps",
    "total_training_time" and "time_to_peak_acc". Rows without a total
    training time are dropped; when ``DROP_N_EQUALS_TWO`` is set, only rows
    with a training batch size greater than 2 are kept.
    """
    # Order matters: "training_timestamps" has to exist before the two
    # columns that are derived from it.
    derived_columns = (
        ("top_test_acc", max_acc_lambda),
        ("final_test_acc", final_acc_lambda),
        ("training_timestamps", cumulative_time_lambda),
        ("total_training_time", total_time_lambda),
        ("time_to_peak_acc", time_to_peak_lambda),
    )
    for colname, func in derived_columns:
        df = apply_lambda(df, colname, func)

    df = df.dropna(subset=["total_training_time"])
    if not DROP_N_EQUALS_TWO:
        return df

    return df[df["specs.batch_size_train"] > 2]

# Add the derived summary columns, then keep only the optimizers under study.
cifar = get_summary_vals(cifar)
cifar = cifar[cifar["specs.opt"].isin(OPT_OF_INTEREST)]

def drop_fr_5k_failure_to_train(df: pd.DataFrame):
    """Drop the FR runs at batch size 5000 that failed to train.

    A row is removed only when all three conditions hold: optimizer "fr",
    training batch size 5000, and a top test accuracy below 15.
    """
    is_fr = df["specs.opt"] == "fr"
    is_5k_batch = df["specs.batch_size_train"] == 5000
    barely_trained = df["top_test_acc"] < 15
    failed_run = is_fr & barely_trained & is_5k_batch
    return df.loc[~failed_run]

# Remove the failed FR runs, then preview the first rows of the result.
cifar = cifar.pipe(drop_fr_5k_failure_to_train)
cifar.head()

In [None]:
# One (optimizer, Series) pair per optimizer of interest, for each timing metric.
ttp_dfs = [
    (opt, cifar.loc[cifar["specs.opt"] == opt, "time_to_peak_acc"])
    for opt in OPT_OF_INTEREST
]
total_time_dfs = [
    (opt, cifar.loc[cifar["specs.opt"] == opt, "total_training_time"])
    for opt in OPT_OF_INTEREST
]

In [None]:
# Pack the per-optimizer time-to-peak series into one frame, one column per
# optimizer. The series can differ in length, so shorter columns stay NaN-padded.
longest_data = max([len(series.values) for _, series in ttp_dfs])
# Size the column axis from ttp_dfs so it follows OPT_OF_INTEREST instead of
# assuming exactly three optimizers.
tmp_array = np.full((longest_data, len(ttp_dfs)), np.nan)
opts = []
for i, (opt, series) in enumerate(ttp_dfs):
    print(opt)
    print(series.shape)
    val = series.values
    length = val.shape[-1]
    tmp_array[:length, i] = val[:]
    opts.append(opt)

time_to_peak_df = pd.DataFrame(data=tmp_array, columns=opts)

In [None]:
# Pack the per-optimizer total-runtime series into one frame, one column per
# optimizer. The series can differ in length, so shorter columns stay NaN-padded.
longest_data = max([len(series.values) for _, series in total_time_dfs])
# Size the column axis from total_time_dfs so it follows OPT_OF_INTEREST
# instead of assuming exactly three optimizers.
tmp_array = np.full((longest_data, len(total_time_dfs)), np.nan)
opts = []
for i, (opt, series) in enumerate(total_time_dfs):
    print(opt)
    print(series.shape)
    val = series.values
    length = val.shape[-1]
    tmp_array[:length, i] = val[:]
    opts.append(opt)

total_time_df = pd.DataFrame(data=tmp_array, columns=opts)

In [None]:
# Plain-text dump of the NaN-padded time-to-peak frame (one column per optimizer).
print(time_to_peak_df)

In [None]:
# Plain-text dump of the NaN-padded total-runtime frame (one column per optimizer).
print(total_time_df)

In [None]:
def opt_time_comparison_boxplot(df: pd.DataFrame, title: str, ybounds) -> None:
    """Draw one box per column of ``df`` on a log-scaled time axis.

    Args:
        df: Frame with one column of times (seconds) per optimizer.
        title: Metric name inserted into the figure title (e.g. "TTP").
        ybounds: ``(lower, upper)`` limits for the y axis. A lower bound that
            is zero or negative cannot be shown on a log axis and is left
            unset, which keeps the limit the axis already had.
    """
    ax = df.plot(kind="box")
    ax.figure.set_size_inches(16, 9)
    ax.set_yscale("log")
    ax.set_ylabel("Log Time (seconds)")

    lower, upper = ybounds
    # Never hand a non-positive limit to a log axis; None means "keep current".
    if lower is not None and lower <= 0:
        lower = None
    ax.set_ylim(lower, upper)

    ax.set_xlabel("Optimizer")
    ax.set_title(f"Optimizer {title} Distributions, bad FR removed, no outlier treatment")

In [None]:
# Box plots of both timing metrics over the same y-range.
for frame, label in ((time_to_peak_df, "TTP"), (total_time_df, "Runtime")):
    opt_time_comparison_boxplot(frame, label, (0, 10**5.5))

In [None]:
def remove_outliers(series: pd.Series):
    """Replace values outside the 1.5 * IQR fences with NaN.

    A value is treated as an outlier when it lies above ``Q3 + 1.5 * IQR`` or
    below ``Q1 - 1.5 * IQR``. NaN entries are ignored when the quartiles are
    computed and are left untouched.

    Args:
        series: Values to clean. Modified in place.

    Returns:
        The same ``series`` object, with outliers set to NaN.
    """
    q1 = series.quantile(q=0.25)
    q3 = series.quantile(q=0.75)
    # Derive the IQR from the pandas quantiles, which skip NaN. An IQR routine
    # that propagates NaN would turn both fences into NaN on NaN-padded input,
    # and then no value would ever be flagged.
    fence = 1.5 * (q3 - q1)
    series.loc[(series > q3 + fence) | (series < q1 - fence)] = np.nan
    return series

def opt_time_comparison_boxplot_no_outliers(df: pd.DataFrame, title: str, ybounds) -> None:
    """Box-plot every column of ``df`` after outlier removal, on a log time axis.

    Works on a copy of ``df``; each column is passed through
    ``remove_outliers`` before plotting, so the caller's frame is unchanged.

    Args:
        df: Frame with one column of times (seconds) per optimizer.
        title: Metric name inserted into the figure title (e.g. "TTP").
        ybounds: ``(lower, upper)`` limits for the y axis. A lower bound that
            is zero or negative cannot be shown on a log axis and is left
            unset, which keeps the limit the axis already had.
    """
    tmp = df.copy()
    # Treat every column rather than a fixed list of optimizer names, so the
    # plot follows whatever optimizers the frame holds.
    for column in tmp.columns:
        tmp[column] = remove_outliers(tmp[column])

    ax = tmp.plot(kind="box")
    ax.figure.set_size_inches(16, 9)
    ax.set_yscale("log")
    ax.set_ylabel("Log Time (seconds)")

    lower, upper = ybounds
    # Never hand a non-positive limit to a log axis; None means "keep current".
    if lower is not None and lower <= 0:
        lower = None
    ax.set_ylim(lower, upper)

    ax.set_xlabel("Optimizer")
    ax.set_title(f"Optimizer {title} Distributions, bad FR removed, Outliers Treated")

In [None]:
# Outlier-treated box plots of both timing metrics over the same y-range.
for frame, label in ((time_to_peak_df, "TTP"), (total_time_df, "Runtime")):
    opt_time_comparison_boxplot_no_outliers(frame, label, (0, 10**5.5))

In [None]:
def batchsize_time_comparison_boxplot(df: pd.DataFrame, opt: str, val: str, ybounds) -> None:
    """Draw one box per column of ``df`` (one per batch size) on a log time axis.

    Args:
        df: Frame with one column of times (seconds) per batch size.
        opt: Optimizer label inserted into the figure title (e.g. "SGD").
        val: Metric name inserted into the figure title (e.g. "TTP").
        ybounds: ``(lower, upper)`` limits for the y axis.
    """
    ax = df.plot(kind="box")
    ax.figure.set_size_inches(16, 9)
    ax.set_yscale("log")
    ax.set_ylabel("Log Time (seconds)")
    ax.set_ylim(*ybounds)
    # The boxes are batch sizes for a single optimizer, so label the axis so.
    ax.set_xlabel("Batch Size")
    ax.set_title(f"{opt} Batch Size {val} Distributions, bad FR removed, no outlier treatment")

In [None]:
# Distinct training batch sizes in ascending order; sizes of 10 or less are
# left out of the per-batch-size comparison.
batch_sizes = [
    size
    for size in sorted(cifar["specs.batch_size_train"].unique())
    if int(size) > 10
]

# Per-optimizer views holding the batch size and both timing metrics.
sgd_batch_size_df = cifar.loc[
    cifar["specs.opt"] == "sgd",
    ["specs.batch_size_train", "total_training_time", "time_to_peak_acc"],
]
fr_batch_size_df = cifar.loc[
    cifar["specs.opt"] == "fr",
    ["specs.batch_size_train", "total_training_time", "time_to_peak_acc"],
]
bfgs_batch_size_df = cifar.loc[
    cifar["specs.opt"] == "bfgs",
    ["specs.batch_size_train", "total_training_time", "time_to_peak_acc"],
]

def get_the_dang_df(df: pd.DataFrame, opt: str, batch_sizes: Iterable[str], column: str):
    """Pivot one metric of one optimizer into a column-per-batch-size frame.

    Args:
        df: Results frame with "specs.opt", "specs.batch_size_train" and
            ``column``. The rows are taken from this frame, not from a global.
        opt: Optimizer code to select (compared against "specs.opt").
        batch_sizes: Batch sizes to extract; each is compared as ``int`` and
            used as a column label via ``str``.
        column: Name of the metric column to collect.

    Returns:
        DataFrame with one column per batch size (labelled by its string
        form). Columns with fewer values than the longest one are NaN-padded.
    """
    opt_rows = df.loc[df["specs.opt"] == opt, ["specs.batch_size_train", column]]
    per_batch = {}
    for batch_size in batch_sizes:
        matches = opt_rows["specs.batch_size_train"] == int(batch_size)
        # Re-wrap the raw values so every column gets a fresh 0..n-1 index.
        per_batch[str(batch_size)] = pd.Series(opt_rows.loc[matches, column].values)

    return pd.DataFrame(per_batch)

# Column-per-batch-size frames of time-to-peak, one per optimizer.
sgd_batch_vs_ttp, fr_batch_vs_ttp, bfgs_batch_vs_ttp = (
    get_the_dang_df(cifar, opt, batch_sizes, "time_to_peak_acc")
    for opt in ("sgd", "fr", "bfgs")
)

# Column-per-batch-size frames of total runtime, one per optimizer.
sgd_batch_vs_runtime, fr_batch_vs_runtime, bfgs_batch_vs_runtime = (
    get_the_dang_df(cifar, opt, batch_sizes, "total_training_time")
    for opt in ("sgd", "fr", "bfgs")
)

In [None]:
# Time-to-peak per batch size, one figure per optimizer, same y-range.
for frame, label in (
    (sgd_batch_vs_ttp, "SGD"),
    (bfgs_batch_vs_ttp, "BFGS"),
    (fr_batch_vs_ttp, "FR"),
):
    batchsize_time_comparison_boxplot(frame, label, "TTP", (1, 10**5.5))