In [None]:
from typing import Callable, List, Optional, Iterable
from pathlib import Path
import os
from copy import deepcopy
import json
import glob
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import multiprocessing as mp
from tabulate import tabulate

import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# not only the ones this notebook is known to trigger.
warnings.filterwarnings("ignore")

# Directory that is globbed for the raw per-run JSON result files.
DATA_DIR = "./expt_rslts/primed/"
# Pickle cache of the concatenated results; when this file already exists the
# JSON files are not re-read.
PICKLE_NAME = "nn_rslts_primed_df.pkl"
# Output directories, created on the spot if they do not exist yet.
IMG_OUTPUT_DIR = Path("imgs/")
IMG_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
TABLE_OUTPUT_DIR = Path("tables/")
TABLE_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Optimizer keys (as they appear in the "specs.opt" column) to keep.
OPT_OF_INTEREST = ("sgd", "kn", "fr", "pr", "hs", "dy", "bfgs")
# When True, the cleaning cell relabels "sgd" runs with a batch size of at
# least 50000 as "fgd", and "fgd" is added to the tuple below.
DISCRIMINATE_FGD = False
if DISCRIMINATE_FGD:
    OPT_OF_INTEREST = ("fgd", "sgd", "kn", "fr", "pr", "hs", "dy", "bfgs")

# Truncate the opts of interest...
# NOTE: this assignment is unconditional, so it overrides both assignments
# above; "fgd" is therefore filtered out even when DISCRIMINATE_FGD is True.
OPT_OF_INTEREST = ("sgd", "fr", "bfgs", "kn")


# NOTE(review): not referenced in any cell visible here.
DROP_N_EQUALS_TWO = True
# Rows with a training batch size below this value are dropped by
# get_summary_vals; set to None to disable that filter.
DROP_N_LESS_THAN = 100

# Display names for the optimizer keys.
NAME_DICT = {
    "sgd": "Stochastic Gradient Descent",
    "fgd": "Full Gradient Descent",
    "kn": "Krylov-Newton",
    "fr": "Fletcher-Reeves",
    "pr": "Polak-Ribiere",
    "hs": "Hestenes-Stiefel",
    "dy": "Dai-Yuan",
    "bfgs": "BFGS",
    "bfgsi": "BFGS Inverse",
    "dfp": "Davidon-Fletcher-Powell",
    "dfpi": "Davidon-Fletcher-Powell Inverse",
    "sr1": "Symmetric Rank-One",
    "sr1d": "Symmetric Rank-One Dual",
    "levenberg": "Levenberg"
}

def read_json_to_df(fname):
    """Load one JSON result file into a flattened DataFrame.

    The parsed JSON is passed through ``pd.json_normalize``.

    Args:
        fname: Path of the JSON file to read.

    Returns:
        The normalized DataFrame, or ``None`` if opening, parsing or
        normalizing the file raised any ``Exception`` (best effort: the
        error itself is discarded).
    """
    try:
        with open(fname) as json_file:
            payload = json.load(json_file)
            return pd.json_normalize(payload)
    except Exception:
        return None

# Build the combined results frame once and cache it as a pickle; later runs
# load the cache instead of re-parsing every JSON file.
if not os.path.exists(PICKLE_NAME):
    print("Creating dataframe!")
    # Explicit check rather than `assert`, which is removed under `python -O`.
    if not os.path.exists(DATA_DIR):
        raise FileNotFoundError(f"Results directory not found: {DATA_DIR}")
    f_list = glob.glob(os.path.join(DATA_DIR, "TEST_priming_*.json"))
    print("Number of files:", len(f_list))
    with mp.Pool() as p:
        dframes = p.map(read_json_to_df, f_list)
    # Serial alternative to the pool, kept for debugging:
    #dframes = [read_json_to_df(i) for i in f_list]
    # read_json_to_df returns None for files it could not load; report how
    # many were dropped instead of discarding them silently.
    n_attempted = len(dframes)
    dframes = [i for i in dframes if i is not None]
    if len(dframes) != n_attempted:
        print("Number of unreadable files skipped:", n_attempted - len(dframes))
    if not dframes:
        # pd.concat would raise the same exception type with a message that
        # does not say where the (missing) data was expected.
        raise ValueError(
            f"No readable 'TEST_priming_*.json' files found in {DATA_DIR}"
        )
    cifar = pd.concat(dframes)
    cifar.to_pickle(PICKLE_NAME)

else:
    print("Reading dataframe!")
    # NOTE(review): unpickling can execute arbitrary code; only load a cache
    # file that this notebook wrote itself.
    cifar = pd.read_pickle(PICKLE_NAME)

In [None]:
# Cell output: the column labels of the loaded results frame.
cifar.keys()

In [None]:
# Data cleaning/preprocessing
# Coerce batch sizes to int so the numeric comparisons on this column work.
cifar["specs.batch_size_train"] = cifar["specs.batch_size_train"].apply(int)
if DISCRIMINATE_FGD:
    # Relabel SGD runs whose batch size is at least 50000 as "fgd"
    # (presumably the full training-set size -- TODO confirm).
    cifar.loc[(cifar["specs.opt"] == "sgd") & (cifar["specs.batch_size_train"] >= 50000), "specs.opt"] = "fgd"

# Keep only the optimizers under study. The explicit copy makes the result an
# independent frame, because later cells add columns to it in place.
cifar = cifar[cifar["specs.opt"].isin(OPT_OF_INTEREST)].copy()

def max_acc_lambda(row):
    """Return the largest entry of ``row["test_accuracy_list"]``.

    Returns ``0.0`` when ``max`` raises ``ValueError``, which is what happens
    for an empty list.
    """
    try:
        return max(row["test_accuracy_list"])
    except ValueError:
        return 0.0

def final_acc_lambda(row):
    """Return the last entry of ``row["test_accuracy_list"]``.

    Returns NaN when the lookup raises ``IndexError`` (empty list).
    """
    try:
        return row["test_accuracy_list"][-1]
    except IndexError:
        return float("NaN")

def cumulative_time_lambda(row):
    """Return the running total of ``row["time"]`` as computed by ``np.cumsum``."""
    return np.cumsum(row["time"])

def total_time_lambda(row):
    """Return the last entry of ``row["training_timestamps"]``.

    Returns NaN when the lookup raises ``IndexError`` (no timestamps).
    """
    try:
        return row["training_timestamps"][-1]
    except IndexError:
        return float("NaN")

def time_to_peak_lambda(row):
    """Elapsed training time at which the best test accuracy was first reached.

    ``row["training_timestamps"]`` is treated as cumulative elapsed time: in
    this notebook ``get_summary_vals`` overwrites that column with the
    ``np.cumsum`` of ``row["time"]`` before this function runs. The time to
    peak is therefore the timestamp at the arg-max position; summing the
    timestamps up to that position would add up the elapsed time twice.

    Args:
        row: Mapping/Series with ``"test_accuracy_list"`` and cumulative
            ``"training_timestamps"``.

    Returns:
        The timestamp at the index of the first maximum of
        ``row["test_accuracy_list"]``, or ``np.inf`` when the accuracy list is
        empty or there is no timestamp at that index.
    """
    try:
        # argmax raises ValueError on an empty sequence.
        max_idx = np.array(row["test_accuracy_list"]).argmax()
        # IndexError: fewer timestamps than accuracy entries.
        time_to_peak = row["training_timestamps"][max_idx]
    except (ValueError, IndexError):
        time_to_peak = np.inf

    return time_to_peak


def fgd_lambda(row):
    """Relabel a large-batch "sgd" row as "fgd", in place.

    If ``row["specs.opt"]`` is "sgd" and ``row["specs.batch_size_train"]``
    converts to an int of at least 50000, ``row["specs.opt"]`` is set to
    "fgd". A ``ValueError`` from the conversion is reported with ``print``
    and the row is left unchanged.

    Returns:
        None. Only the ``row`` object passed in is modified.
    """
    try:
        optimizer = row["specs.opt"]
        n_train = int(row["specs.batch_size_train"])
        if optimizer != "sgd" or n_train < 50000:
            return
        row["specs.opt"] = "fgd"
    except ValueError:
        print(f"row failed: {row['specs.opt']}")


def apply_lambda(df: pd.DataFrame, colname: str, func: Callable):
    """Store the row-wise result of ``func`` in column ``colname`` of ``df``.

    Args:
        df: Frame to process; it is modified in place.
        colname: Name of the column to create or overwrite.
        func: Callable invoked once per row (``axis=1``).

    Returns:
        The same ``df`` object that was passed in.
    """
    df[colname] = df.apply(func, axis=1).copy()
    return df

def get_summary_vals(df: pd.DataFrame):
    """Add the per-run summary columns to ``df`` and drop unusable rows.

    The columns are written in a fixed order because later ones read earlier
    ones: ``training_timestamps`` has to be in place before
    ``total_training_time`` and ``time_to_peak_acc`` are computed.

    Rows with a missing ``total_training_time`` are removed and, unless
    ``DROP_N_LESS_THAN`` is ``None``, so are rows whose
    ``specs.batch_size_train`` is below that threshold.

    Returns:
        The filtered frame. The new columns are also written onto the frame
        that was passed in, since ``apply_lambda`` assigns in place.
    """
    derived_columns = (
        ("top_test_acc", max_acc_lambda),
        ("final_test_acc", final_acc_lambda),
        ("training_timestamps", cumulative_time_lambda),
        ("total_training_time", total_time_lambda),
        ("time_to_peak_acc", time_to_peak_lambda),
    )
    for column_name, row_func in derived_columns:
        df = apply_lambda(df, column_name, row_func)

    with_time = df.dropna(subset=["total_training_time"])
    if DROP_N_LESS_THAN is None:
        return with_time

    return with_time[with_time["specs.batch_size_train"] >= DROP_N_LESS_THAN]

# Derive the per-run summary columns and apply the row filters.
# NOTE(review): `cifar` is reassigned here and its "specs.*" columns are
# renamed below, so re-running this cell without reloading the data fails in
# get_summary_vals, which still looks up "specs.batch_size_train".
cifar = get_summary_vals(cifar)
# Column names used by the plots and the model formula in the later cells.
rename_dict = {
    "specs.opt": "Optimizer",
    "specs.batch_size_train": "TrainingBatchSize",
    "top_test_acc": "TopTestAccuracy"
}
cifar = cifar.rename(columns=rename_dict)
# Cell output: the distinct batch sizes that remain after filtering.
cifar["TrainingBatchSize"].unique()

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.graphics.factorplots import interaction_plot
from scipy import stats

def my_interaction_plot(df, func, xscale="log"):
    """Draw an interaction plot of top test accuracy on a new 16x9 inch figure.

    ``TrainingBatchSize`` is passed as the x factor, ``Optimizer`` as the
    trace factor and ``TopTestAccuracy`` as the response.

    Args:
        df: Frame with "TrainingBatchSize", "Optimizer" and
            "TopTestAccuracy" columns.
        func: Aggregate handed to ``interaction_plot`` (e.g. ``np.mean``).
        xscale: Scale applied to the x axis before plotting.
    """
    fig, ax = plt.subplots()
    fig.set_size_inches(16, 9)
    ax.set_xscale(xscale)

    batch_sizes = df["TrainingBatchSize"]
    optimizers = df["Optimizer"]
    accuracies = df["TopTestAccuracy"]
    interaction_plot(batch_sizes, optimizers, accuracies, func=func, ax=ax)

# One interaction plot per aggregate of TopTestAccuracy, in this order.
for aggregate in (np.mean, np.median, np.max, np.min, np.var):
    my_interaction_plot(cifar, aggregate)



In [None]:
# OLS fit of TopTestAccuracy on batch size and optimizer, both wrapped in
# C(...) so they enter as categorical factors, plus their interaction term.
model = ols("TopTestAccuracy ~ C(TrainingBatchSize) + C(Optimizer) + C(TrainingBatchSize):C(Optimizer)", data=cifar).fit()
# ANOVA table for the fitted model; typ=2 requests the Type II ANOVA.
table = sm.stats.anova_lm(model, typ=2)


In [None]:
# Cell output: the ANOVA table computed in the previous cell.
table

In [None]:
def dump_data(
    df: pd.DataFrame,
    opts_of_interest: Iterable[str],
    cols: Iterable[str],
    out_path: str = "subselected_data_priming.csv",
):
    """Write selected columns of the selected optimizers' rows to a CSV file.

    Args:
        df: Frame with an "Optimizer" column.
        opts_of_interest: Optimizer names to keep; a row is kept when its
            "Optimizer" value is one of them. A single string is treated as
            one name.
        cols: Names of the columns to write, in output order. A single string
            is treated as one column name.
        out_path: Destination file, overwritten if it exists. Defaults to the
            file name this function has always written to.

    Raises:
        KeyError: If "Optimizer" or one of ``cols`` is not a column of ``df``.
    """
    # Materialize both arguments as lists: a tuple passed to ``df[...]`` is
    # looked up as one single key, and ``isin`` rejects non-list-like input.
    wanted_opts = [opts_of_interest] if isinstance(opts_of_interest, str) else list(opts_of_interest)
    wanted_cols = [cols] if isinstance(cols, str) else list(cols)

    # Filter rows on optimizer
    opt_df = df[df["Optimizer"].isin(wanted_opts)]
    # Subselect columns
    hyperparam_subset = opt_df[wanted_cols]
    # newline="" leaves line endings to the CSV writer, as pandas asks for
    # when it is handed a text-mode file object.
    with open(out_path, "w", encoding="UTF-8", newline="") as ofile:
        hyperparam_subset.to_csv(ofile, index=False)
    
# Export the three columns used in the model formula, for the optimizers
# listed in OPT_OF_INTEREST.
dump_data(cifar, OPT_OF_INTEREST, ["Optimizer", "TrainingBatchSize", "TopTestAccuracy"])
