# Analysis for hyperparameter search

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Apply seaborn's default plot theme to every figure drawn below.
sns.set()

First I'm just going to load exploding blocks data & use that for all my analysis. Later on, I'll move things into functions & execute them separately.

Also, why doesn't Jupyter have support for this kind of repeated analysis? It would be nice to have scoped, parameterised cells that could play a similar role to functions, but make it easier to fiddle with values, etc.

In [None]:
# Tuning-result CSVs this notebook can be pointed at, keyed by the suffix of
# the file name. Switch datasets by changing RESULTS_KEY rather than by
# commenting lines in and out.
RESULTS_FILES = {
    'exbw': './tune-results-exbw.csv',
    'gm': './tune-results-gm.csv',
    'mbw': './tune-results-mbw.csv',
    'pbw-exw-huge': './tune-results-pbw-exw-huge.csv',
    'huge-gm-mbw': './tune-results-huge-gm-mbw.csv',
}
RESULTS_KEY = 'pbw-exw-huge'

# One row per tuning run; every later cell reads from this frame.
prob_data = pd.read_csv(RESULTS_FILES[RESULTS_KEY])

The rest of this notebook should be problem-agnostic: it only relies on the columns of `prob_data`, not on which results file was loaded.

In [None]:
# Hyperparameter columns are the ones whose name starts with 'config:'; the
# text after the first colon is the bare variable name.
conf_cols = []
conf_names = []
for column in prob_data.columns:
    if not column.startswith('config:'):
        continue
    conf_cols.append(column)
    conf_names.append(column.split(':', 1)[1])

print('Configuration variables:')
for idx, conf_col in enumerate(conf_cols):
    print('  [%02d] %s (%s)' % (idx, conf_names[idx], conf_col))

# Coverage values with the NaN entries removed.
coverage = prob_data['coverage']
cov_series = coverage.dropna()
print("Found %d runs, of which %d have non-NaN coverage" % (len(prob_data), len(cov_series)))

# NOTE: this is a second name for the same DataFrame object, not a copy, so
# it also sees every column added to prob_data afterwards.
prob_data_with_nans = prob_data

# Replace NaN coverage with a small negative sentinel (-0.01) so that those
# runs have a finite value sitting just below zero.
prob_data['coverage_clean'] = coverage.fillna(-0.01)

In [None]:
# Histogram of coverage over the runs that reported a (non-NaN) coverage.
# kde=False keeps the y-axis in raw counts: with distplot's default kde=True
# the histogram is normalised to a density, which contradicts the
# "Number of trials" axis label below.
# NOTE(review): distplot is deprecated in seaborn >= 0.11; once the
# environment's seaborn supports it, sns.histplot(cov_series, bins=20) is the
# count-based replacement.
sns.distplot(cov_series, bins=20, kde=False)
plt.title("Coverage of successfully completed tuning runs")
plt.xlabel("Coverage (fraction of runs reaching goal)")
plt.ylabel("Number of trials with coverage")
plt.show()

## Plotting effect of individual variables

In [None]:
# Scatter coverage against every configuration variable, one panel each.

# Variables that get a logarithmic x-axis.
log_scale_vars = {'l1_reg', 'l2_reg'}

# Store log10 of each of those variables under both a 'config:'-prefixed
# column and a bare column name.
for ls_var in sorted(log_scale_vars):
    log_values = np.log10(prob_data[ls_var])
    prob_data[f'config:{ls_var}_log10'] = log_values
    prob_data[f'{ls_var}_log10'] = log_values

# A run is flagged as a success when its coverage is not NaN.
prob_data['success'] = prob_data['coverage'].notna()

# Grid layout: three panels per row, with as many rows as needed
# (integer ceiling division).
plot_cols = 3
plot_rows = (len(conf_names) + plot_cols - 1) // plot_cols

fig = plt.figure(figsize=(20, 5 * plot_rows))
for panel_idx, conf_name in enumerate(conf_names):
    ax = fig.add_subplot(plot_rows, plot_cols, panel_idx + 1)
    if conf_name in log_scale_vars:
        # the log scale has to be set before sns.scatterplot() draws,
        # or it will mess up the x-axis range
        ax.set_xscale('log')
    sns.scatterplot(x=conf_name, y='coverage_clean', hue='success', data=prob_data, ax=ax)
plt.show()

## Plotting effect of pairs of vars

In [None]:
# Variables to compare pairwise.
pair_vars = ['l2_reg_log10', 'dropout', 'hidden_size']  #, 'l1_reg_log10',]
# NOTE(review): not used by the plot below; kept in case another cell refers
# to it.
pair_col_names = [f'config:{n}' for n in pair_vars]

# sns.pairplot() builds its own figure, so no plt.figure() call precedes it:
# a figure opened here would stay empty and show up as a blank extra output.
# Panel size is controlled through pairplot's `height` / `aspect` arguments.
# The palette gets one colour per distinct coverage value, since coverage is
# used as the hue.
sns.pairplot(
    data=prob_data, hue='coverage_clean', diag_kind='hist', vars=pair_vars,
    palette=sns.cubehelix_palette(len(set(prob_data['coverage_clean']))))
plt.show()

## Figuring out what the best configs are

In [None]:
# Keep the 20 rows with the highest cleaned coverage and scatter each
# configuration variable against coverage for just those rows.
best_few = prob_data.nlargest(n=20, columns='coverage_clean')

# Same grid shape as the full per-variable plot (plot_rows x plot_cols).
fig = plt.figure(figsize=(20, 5 * plot_rows))
for panel_idx, conf_name in enumerate(conf_names):
    ax = fig.add_subplot(plot_rows, plot_cols, panel_idx + 1)
    if conf_name in log_scale_vars:
        # set the log scale before the points are drawn
        ax.set_xscale('log')
    sns.scatterplot(x=conf_name, y='coverage_clean', data=best_few, ax=ax)
plt.show()

In [None]:
# Print the configuration values of each selected run, numbered from 1 in the
# order the rows appear in best_few.
for row_num in range(1, len(best_few) + 1):
    row = best_few.iloc[row_num - 1]
    header = "Config #%d (coverage=%f):" % (row_num, row['coverage_clean'])
    print(header)
    for conf_name in conf_names:
        print(f"  {conf_name} = {row[conf_name]}")