In [41]:
import os, time, shutil, itertools, json
import numpy as np
import scipy.stats
from collections import defaultdict
from tqdm import tqdm, tqdm_notebook


def crawl_directory(dirname):
    """Yield the path of every file found below ``dirname``.

    The tree is traversed with ``os.walk``. Files whose name ends with
    ``.DS_Store`` are skipped; every other file is yielded as the walked
    directory joined with the file name.
    """
    for folder, _, filenames in os.walk(dirname):
        yield from (
            os.path.join(folder, fname)
            for fname in filenames
            if not fname.endswith('.DS_Store')
        )


def remove_empty_dirs(path):
    """Try to delete every directory below ``path``, deepest levels first.

    The walk is bottom-up (``topdown=False``), so a directory that only held
    empty directories is itself empty by the time it is attempted. Each
    candidate is resolved with ``os.path.realpath`` and handed to
    ``remove_empty_dir``. ``path`` itself is never attempted.
    """
    for parent, subdirs, _ in os.walk(path, topdown=False):
        targets = (os.path.realpath(os.path.join(parent, sub)) for sub in subdirs)
        for target in targets:
            remove_empty_dir(target)


def remove_empty_dir(path):
    """Best-effort ``os.rmdir(path)``: any ``OSError`` is silently ignored.

    ``os.rmdir`` only removes empty directories, so a directory that still
    has content is left in place. Always returns ``None``.
    """
    try:
        os.rmdir(path)
    except OSError:
        # The directory could not be removed (e.g. it is not empty): leave it.
        return
    

def nested_pickle_dict():
    """Return an arbitrarily nestable ``defaultdict``.

    Missing keys are filled with another dictionary of the same kind. The
    default factory is this named function itself rather than a lambda,
    which is what the original author relied on for picklability.
    """
    factory = nested_pickle_dict
    return defaultdict(factory)


def format_e(n):
    """Format ``n`` in compact, lowercase scientific notation.

    The number is rendered with ``'%E'``, trailing zeros (and a dangling
    decimal point) are stripped from the mantissa, and the result is
    lowercased, e.g. ``0.002 -> '2e-03'`` and ``12345 -> '1.2345e+04'``.

    Non-finite values have no exponent part in ``'%E'`` output ('NAN',
    'INF'); they are returned lowercased ('nan', 'inf', '-inf') instead of
    raising ``IndexError``.
    """
    text = '%E' % n
    mantissa, separator, exponent = text.partition('E')
    if not separator:
        # nan / inf: there is nothing to trim, just normalise the case.
        return text.lower()
    return (mantissa.rstrip('0').rstrip('.') + 'E' + exponent).lower()


def mean_confidence_interval(data, axis=0, confidence=0.95):
    """Student-t confidence interval of the mean along ``axis``, ignoring NaNs.

    Parameters
    ----------
    data : array-like
        Samples; NaN entries are treated as missing.
    axis : int
        Axis along which the samples are reduced.
    confidence : float
        Two-sided confidence level.

    Returns
    -------
    (mu, h, low, high)
        NaN-aware mean, interval half-width, ``mu - h`` and ``mu + h``.
    """
    data = np.asarray(data)

    # The mean and the standard error both skip NaNs, so the degrees of
    # freedom must be based on the number of values actually used rather
    # than on the raw axis length.
    n = np.sum(~np.isnan(data), axis=axis)

    mu = np.nanmean(data, axis=axis)
    sem = scipy.stats.sem(data, axis=axis, nan_policy='omit')

    # sem may come back as a masked array when nan_policy='omit'.
    h = np.ma.getdata(sem) * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)

    return mu, h, mu - h, mu + h

def load_best(filename):
    """Open ``filename`` for reading and return its decoded JSON content."""
    with open(filename, 'r') as handle:
        return json.load(handle)

In [38]:
""" Get best performance according to hyperparameter mean minimum """

# Root folder holding one sub-directory per dims/samples configuration.
dirname = '/Users/sob/Desktop/gan_results/hypertuning/multivariate/'

# One entry per configuration directory (macOS metadata file excluded).
resultnames = [entry for entry in os.listdir(dirname) if entry != '.DS_Store']

# The tuning file names are the same for all configurations, so they are listed once from a
# reference trial. The path is derived from `dirname` so the two can no longer drift apart.
tuningnames = [
    entry
    for entry in os.listdir(os.path.join(dirname, '16_dims_1000_samples', 'trial_1'))
    if '1024' in entry
]

In [9]:
# Load the cached best-setting summaries for one dimensionality at three sample sizes.
# NOTE(review): `d` is not assigned in any cell visible in this notebook; this cell depends on
# leftover kernel state and will raise NameError on a fresh Restart & Run All.
optimal_1000 = load_best('/Users/sob/Desktop/gan_results/best_1024/multivariate/{0}_dims_1000_samples/data.json'.format(d))
optimal_10000 = load_best('/Users/sob/Desktop/gan_results/best_1024/multivariate/{0}_dims_10000_samples/data.json'.format(d))
optimal_100000 = load_best('/Users/sob/Desktop/gan_results/best_1024/multivariate/{0}_dims_100000_samples/data.json'.format(d))

In [42]:
# """ Find best results """
# for name in tqdm_notebook(resultnames):
    
#     print(name)
    
#     best_path = '/Users/sob/Desktop/gan_results/best_1024/multivariate/{0}/'.format(name)  # was .format(t): `t` is only bound by the loop below
#     global_optimal = nested_pickle_dict()

#     # Initialize best dictionary    
#     for t in tqdm_notebook(tuningnames):

#         optimal = nested_pickle_dict()
#         results = []

#         # Load in the results from each trial
#         for trial in range(1, 21):
#             path = '/Users/sob/Desktop/gan_results/hypertuning/multivariate/{0}/trial_{1}/{2}'.format(name, trial, t)
                
#             data = []
#             with open(path) as f:
#                 for line in f:
#                     data.append(json.loads(line))

#             results.append(data[0])

#         # Go through each one and append the results
#         for result in results:
#             for model, distributions in result.items():
#                 for distribution, metrics in distributions.items():
#                     for metric, values in metrics.items():
#                         if metric in ["LR", "HDIM", 'GLoss', 'DLoss', "BSIZE", "Energy-Distance"]:
#                             continue
#                         else:
#                             # If metric is seen for the first time, initialize it
#                             if 'values' not in optimal[model][distribution][metric]:
#                                 optimal[model][distribution][metric]["values"] = []

#                             # Otherwise, compare it the presently considered value
#                             optimal[model][distribution][metric]["values"].append(values)

#         for model, distributions in result.items():
#             for distribution, metrics in distributions.items():
#                 for metric, values in metrics.items():
#                     if metric in ["LR", "HDIM", 'GLoss', 'DLoss', "BSIZE", "Energy-Distance"]:
#                         continue
#                     else:
#                         data_min = np.nanmean(np.nanmin(np.array(optimal[model][distribution][metric]["values"]), axis=1))
                        
#                         # Init global optimal
#                         if 'best' not in global_optimal[model][distribution][metric]:
#                             global_optimal[model][distribution][metric]['best'] = 1e10     

#                         if data_min < global_optimal[model][distribution][metric]['best']:
# #                             print(model, distribution, metric, '\n', global_optimal[model][distribution][metric]['best'], '-->', data_min, '\n')
#                             global_optimal[model][distribution][metric]['best'] = data_min
#                             global_optimal[model][distribution][metric]['parameters'] = [metrics["LR"], metrics["HDIM"], metrics["BSIZE"]]
#                             global_optimal[model][distribution][metric]["values"] = optimal[model][distribution][metric]["values"]
            
#                             mean, h, low, high = mean_confidence_interval(np.array(optimal[model][distribution][metric]["values"]), axis=0)
                
#                             global_optimal[model][distribution][metric]['low'] = list(low)
#                             global_optimal[model][distribution][metric]['h'] = list(h)
#                             global_optimal[model][distribution][metric]['mean'] = list(mean)
#                             global_optimal[model][distribution][metric]['high'] = list(high)

# #     if not os.path.exists(best_path):
# #         os.makedirs(best_path)
    
# #     with open(best_path + 'data.json', 'w') as outfile:
# #         json.dump(global_optimal, outfile)

In [52]:
# Reload the cached summary for the configuration currently bound to `name`.
# NOTE(review): `name` is only assigned by loops in other cells (one of them commented out),
# so this cell relies on leftover kernel state.
global_optimal = load_best('/Users/sob/Desktop/gan_results/best_1024/multivariate/{0}/data.json'.format(name))

In [75]:
# Per-row minimum (NaNs ignored) of the stored "values" for the model / distribution / metric
# currently bound in the kernel (these are loop variables leaked from another cell).
np.nanmin(np.array(global_optimal[model][distribution][metric]["values"]), axis=1)

array([0.20618419, 0.19593402, 0.19403938, 0.19630246, 0.17052982,
       0.20884451, 0.19485955, 0.21225419, 0.19609026, 0.18250627,
       0.20626169, 0.21324213, 0.19362725, 0.18573761, 0.20808232,
       0.20553133, 0.18600923, 0.22414468, 0.22308071, 0.20108958])

In [81]:
# Average of the stored "mean" list for the same model / distribution / metric entry.
np.mean(global_optimal[model][distribution][metric]["mean"])

0.22269121198918806

In [85]:
# Show the configuration directory names collected from `dirname`.
resultnames

['128_dims_100000_samples',
 '128_dims_10000_samples',
 '128_dims_1000_samples',
 '16_dims_100000_samples',
 '16_dims_10000_samples',
 '16_dims_1000_samples',
 '32_dims_100000_samples',
 '32_dims_10000_samples',
 '32_dims_1000_samples',
 '64_dims_100000_samples',
 '64_dims_10000_samples',
 '64_dims_1000_samples']

In [105]:
""" Find number of other settings within its confidence interval """
# robust[model][name][distribution][metric] and robust[model]['all'][...] hold hit counts:
# a "hit" is a tuning file whose statistic falls inside the interval computed from the
# stored best setting for the same model / distribution / metric.
robust = nested_pickle_dict()
# j counts every (tuning file, model, distribution, metric) comparison performed below.
j = 0

for name in tqdm_notebook(resultnames):
    
    print(name)
    # Cached best-setting summary for this configuration.
    global_optimal = load_best('/Users/sob/Desktop/gan_results/best_1024/multivariate/{0}/data.json'.format(name))

    # Initialize best dictionary    
    for t in tqdm_notebook(tuningnames):

        # Per-tuning-file accumulator: optimal[model][distribution][metric]["values"] gets one
        # entry per trial.
        optimal = nested_pickle_dict()
        results = []

        # Load in the results from each trial
        for trial in range(1, 21):
            path = '/Users/sob/Desktop/gan_results/hypertuning/multivariate/{0}/trial_{1}/{2}'.format(name, trial, t)
                
            # Every line of the file is parsed as JSON, but only the first record is kept.
            data = []
            with open(path) as f:
                for line in f:
                    data.append(json.loads(line))

            results.append(data[0])

        # Go through each one and append the results
        for result in results:
            for model, distributions in result.items():
                for distribution, metrics in distributions.items():
                    for metric, values in metrics.items():
                        # Skip hyperparameter fields, losses and Energy-Distance.
                        if metric in ["LR", "HDIM", 'GLoss', 'DLoss', "BSIZE", "Energy-Distance"]:
                            continue
                        else:
                                                        
                            # If metric is seen for the first time, initialize it
                            if 'values' not in optimal[model][distribution][metric]:
                                optimal[model][distribution][metric]["values"] = []

                            # Otherwise, compare it the presently considered value
                            optimal[model][distribution][metric]["values"].append(values)

        # NOTE(review): `result` here is the variable leaked from the loop above (the last
        # trial); it is only used to enumerate the model / distribution / metric keys.
        for model, distributions in result.items():
            for distribution, metrics in distributions.items():
                for metric, values in metrics.items():
                    if metric in ["LR", "HDIM", 'GLoss', 'DLoss', "BSIZE", "Energy-Distance"]:
                        continue
                    else: 
                        
                        # Make sure every counter that may be incremented exists and starts at 0.
                        if metric not in robust[model][name][distribution]:
                            robust[model][name][distribution][metric] = 0
                            
                        if 'total' not in robust[model]['all']:
                            robust[model]['all']['total'] = 0
                            
                        if metric not in robust[model]['all']:
                            robust[model]['all'][metric] = 0
                            
                        j += 1
                        
                        # NOTE(review): the reference interval is built from the entries of the
                        # stored 'mean' list, while the candidate statistic is the mean of the
                        # per-row minima of this file's values — confirm that these two
                        # quantities are meant to be compared.
                        _, _, global_low, global_high = mean_confidence_interval(np.array(global_optimal[model][distribution][metric]['mean']))
                        data_mean, _, data_low, data_high = mean_confidence_interval(np.nanmin(np.array(optimal[model][distribution][metric]["values"]), axis=1))
                        
                        # Count the tuning file as a hit when its statistic lies inside the interval.
                        if global_low <= data_mean <= global_high:
                            
                            robust[model][name][distribution][metric] += 1
                            robust[model]['all']['total'] += 1
                            robust[model]['all'][metric] += 1

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

128_dims_100000_samples


HBox(children=(IntProgress(value=0, max=15), HTML(value='')))

128_dims_10000_samples


HBox(children=(IntProgress(value=0, max=15), HTML(value='')))

KeyboardInterrupt: 

In [106]:
# Print, for every model, the overall hit count followed by the hit count per metric.
for i in robust.keys():
    print(i)
    print('TOTAL:', robust[i]['all']['total'])
    # The metric names are listed explicitly, so 'total' can never appear here; the former
    # `if k == 'total': continue` guard was unreachable and has been dropped.
    for k in ['Jensen-Shannon', 'KL-Divergence', 'Wasserstein-Distance']:
        print(k, robust[i]['all'][k])

    print('\n')

wgan
TOTAL: 23
Jensen-Shannon 10
KL-Divergence 4
Wasserstein-Distance 9


began
TOTAL: 38
Jensen-Shannon 10
KL-Divergence 3
Wasserstein-Distance 25


lsgan
TOTAL: 31
Jensen-Shannon 9
KL-Divergence 4
Wasserstein-Distance 18


dragan
TOTAL: 59
Jensen-Shannon 17
KL-Divergence 11
Wasserstein-Distance 31


ragan
TOTAL: 34
Jensen-Shannon 3
KL-Divergence 10
Wasserstein-Distance 21


wgpgan
TOTAL: 78
Jensen-Shannon 21
KL-Divergence 16
Wasserstein-Distance 41


fgan_forward_kl
TOTAL: 21
Jensen-Shannon 2
KL-Divergence 4
Wasserstein-Distance 15


nsgan
TOTAL: 13
Jensen-Shannon 3
KL-Divergence 5
Wasserstein-Distance 5


infogan
TOTAL: 46
Jensen-Shannon 13
KL-Divergence 4
Wasserstein-Distance 29


fishergan
TOTAL: 23
Jensen-Shannon 1
KL-Divergence 6
Wasserstein-Distance 16


fgan_hellinger
TOTAL: 23
Jensen-Shannon 5
KL-Divergence 2
Wasserstein-Distance 16


fgan_reverse_kl
TOTAL: 6
Jensen-Shannon 0
KL-Divergence 2
Wasserstein-Distance 4


fgan_total_var
TOTAL: 24
Jensen-Shannon 4
KL-Divergence 5
Wa

In [107]:
# Number of comparisons counted by the robustness loop. The recorded run of that loop ended
# with a KeyboardInterrupt, so this is a partial count.
j

4320

In [79]:
# Length of the stored 'mean' list for one example entry.
len(global_optimal['wgan']['beta']['Jensen-Shannon']['mean'])

25

In [11]:
# Re-read the file at `path` as JSON lines, one parsed record per line.
# NOTE(review): `path` is whatever the last executed loop left behind in the kernel.
data = []
with open(path) as f:
    for line in f:
        data.append(json.loads(line))

In [59]:
#     try:
# Scratch copy of the body of mean_confidence_interval, run on the kernel's current `data`
# with axis=0 and a 0.95 confidence level.
# NOTE(review): `data` must be a NumPy array here (it needs `.shape`); the cell that reads
# `path` builds a plain list, so `data` was rebound in a cell that is not shown.
axis = 0
n = data.shape[axis]
#     except IndexError:
#         axis = 0
#         n = data.shape[axis]

mu, std = np.nanmean(data, axis=axis), scipy.stats.sem(data, axis=axis, nan_policy='omit')
h = np.ma.getdata(std) * scipy.stats.t.ppf((1 + 0.95) / 2., n-1)   

In [64]:
# Shape of the array used in the scratch confidence-interval computation.
data.shape

(20, 25)

In [70]:
# First column of `data`, i.e. one value per row.
data[:, 0]

array([0.1930042 , 0.2066939 , 0.19153797, 0.19525308, 0.20647096,
       0.19276595, 0.20471301, 0.19250052, 0.19584609, 0.17902637,
       0.20165009, 0.19579905, 0.17772294, 0.21042508, 0.22041611,
       0.17814663, 0.18736884, 0.19345119, 0.17377413, 0.18209787])

In [67]:
# NaN-aware mean of `data` along axis 0 (one value per column), from the scratch cell.
mu

array([0.1939332 , 0.21364977, 0.20867878, 0.20379185, 0.20591492,
       0.21686991, 0.20523758, 0.20114066, 0.20765131, 0.20959509,
       0.20937313, 0.20580058, 0.20463819, 0.20934051, 0.20512742,
       0.20654916, 0.20383895, 0.20391237, 0.20536536, 0.20604479,
       0.20699752, 0.20883711, 0.20583202, 0.20704382, 0.20659206])

In [None]:
# Stored 'mean' list of the KL-Divergence metric for the model / distribution currently bound
# in the kernel.
global_optimal[model][distribution]['KL-Divergence']["mean"]

In [None]:
# Write one best-performance summary (trial_<k>.json) per entry of `dirname`.
# NOTE(review): get_best_performance_mnist is defined in a later cell, so that cell must have
# been run before this one.
best_path = '../best/' + '/'.join(dirname.split('/')[-3:])
if not os.path.exists(best_path):
    os.makedirs(best_path)

files = os.listdir(dirname)
files = [f for f in files if f != '.DS_Store']

# Use the `tqdm_notebook` name imported in the first cell: `tqdm.tqdm_notebook` only resolves
# after a later `import tqdm` has rebound `tqdm` from the class to the module. Wrapping the
# list (not the enumerate object) also lets the progress bar know its total.
for idx, f in enumerate(tqdm_notebook(files)):

    optimal = get_best_performance_mnist(dirname + f + '/')
    # Report entries that hold fewer than 60 result files.
    if len(os.listdir(dirname + f + '/')) < 60:
        print(f, len(os.listdir(dirname + f + '/')))

    with open(best_path + '/trial_{0}.json'.format(idx+1), 'w') as outfile:
        json.dump(optimal, outfile)


In [None]:
# NOTE(review): `global_optimums` is not defined in any visible cell (the other cells use
# `global_optimal`); this lookup will raise NameError on a fresh kernel.
global_optimums[model][distribution]['Wasserstein-Distance']

In [None]:
def get_best_performance_multivariate(mypath):
    """For one trial directory, pick the best run per (model, distribution, metric).

    Every file in ``mypath`` except ``.DS_Store`` is read as a JSON document
    of the form ``{model: {distribution: {metric: values, "LR": ..., "HDIM":
    ..., "BSIZE": ...}}}``. For each metric the run whose ``min(values)`` is
    smallest wins; on a tie the run read first is kept.

    Parameters
    ----------
    mypath : str
        Directory holding the result files; a trailing slash is optional.

    Returns
    -------
    Nested dictionary ``optimal[model][distribution][metric]`` with keys
    ``"value"`` (the winning values) and ``"parameters"``
    (``[LR, HDIM, BSIZE]`` of the winning run).
    """
    hyperparameter_keys = ("LR", "HDIM", "BSIZE")

    # os.listdir returns entries in arbitrary order; sorting makes the
    # "first seen wins" tie-breaking reproducible.
    results = []
    for file in sorted(os.listdir(mypath)):
        if file == '.DS_Store':
            continue

        # os.path.join works with or without a trailing separator on mypath.
        with open(os.path.join(mypath, file), 'r') as f:
            results.append(json.load(f))

    optimal = nested_pickle_dict()

    for result in results:
        for model, distributions in result.items():
            for distribution, metrics in distributions.items():
                best = optimal[model][distribution]
                for metric, values in metrics.items():
                    if metric in hyperparameter_keys:
                        continue

                    # A metric seen for the first time is the best so far;
                    # otherwise it must strictly beat the recorded minimum.
                    first_time = metric not in best
                    if first_time or min(best[metric]["value"]) > min(values):
                        best[metric]["value"] = values
                        best[metric]["parameters"] = [metrics["LR"], metrics["HDIM"], metrics["BSIZE"]]

    return optimal


def get_best_performance_mnist(*args):
    """Alias of get_best_performance_multivariate: forwards all positional arguments and returns its result."""
    return get_best_performance_multivariate(*args)


def multivariate_hypertuning2best(dirname='/Users/sob/Desktop/gan_results/hypertuning/multivariate/64_dims_100000_samples/'):
    """Write one best-performance summary per entry of a hypertuning directory.

    For every entry of ``dirname`` (``.DS_Store`` excluded) the best runs are
    computed with ``get_best_performance_mnist`` and dumped as JSON to
    ``../best/<last three '/'-separated components of dirname>/trial_<k>.json``.
    Nothing is moved or deleted. Entries holding fewer than 60 files are
    printed together with their file count.

    ``dirname`` must end with ``/``. ``k`` is the 1-based position of the entry
    in the ``os.listdir`` result, not a number taken from the entry's name.
    """
    best_path = '../best/' + '/'.join(dirname.split('/')[-3:])
    os.makedirs(best_path, exist_ok=True)

    files = [f for f in os.listdir(dirname) if f != '.DS_Store']

    # Use the imported `tqdm_notebook` name directly: `tqdm.tqdm_notebook`
    # only resolves when `tqdm` has been rebound to the module by a separate
    # `import tqdm`. Wrapping the list (rather than the enumerate object)
    # lets the progress bar know its total.
    for idx, f in enumerate(tqdm_notebook(files)):
        trial_dir = dirname + f + '/'

        optimal = get_best_performance_mnist(trial_dir)

        n_files = len(os.listdir(trial_dir))
        if n_files < 60:
            print(f, n_files)

        with open(best_path + '/trial_{0}.json'.format(idx+1), 'w') as outfile:
            json.dump(optimal, outfile)


def merge_mixture(dirname):
    """Regroup every file found below ``dirname`` into ``trial_<n>/`` folders.

    For each file below each entry of ``dirname`` an output directory is
    chosen from the '/'-separated components of the file's path, and the file
    is moved into the first ``trial_<n>/`` folder of that directory that does
    not already hold a file of the same name. Empty directories below
    ``dirname`` are pruned at the end.

    ``dirname`` must end with ``/``.
    NOTE(review): components 7 and 8 are positions in the full path, so the
    routing only works for the directory depth the author used — confirm
    before reuse.
    """
    outdir = dirname
    for file in os.listdir(dirname):

        if '.DS_Store' in file:
            continue

        for nest in crawl_directory(dirname + file):
            parts = nest.split('/')
            index = 1

            if 'dims' not in parts[7]:
                outdir = '/'.join(parts[:7] + parts[8:9]) + '/'
            else:
                outdir = dirname

            # Initialize directory
            trial_dir = outdir + 'trial_{0}/'.format(index)
            if not os.path.exists(trial_dir):
                os.makedirs(trial_dir)

            try:
                shutil.move(nest, trial_dir)
            except shutil.Error:
                # shutil.move refuses to overwrite an existing entry inside a
                # destination directory: look for the first trial folder that
                # does not hold this file name yet. Only this case is handled
                # (the former bare `except:` also hid KeyboardInterrupt).
                extension = parts[-1]
                while os.path.exists(outdir + 'trial_{0}/'.format(index) + extension):
                    index += 1

                trial_dir = outdir + 'trial_{0}/'.format(index)
                if not os.path.exists(trial_dir):
                    os.makedirs(trial_dir)

                shutil.move(nest, trial_dir)

    remove_empty_dirs(dirname)
    
    
def merge_multivariate(dirname):
    """Regroup every file found below ``dirname`` into ``trial_<n>/`` folders.

    For each file below each entry of ``dirname`` an output directory is
    chosen from the '/'-separated components of the file's path, and the file
    is moved into the first ``trial_<n>/`` folder of that directory that does
    not already hold a file of the same name. Empty directories below
    ``dirname`` are pruned at the end.

    ``dirname`` must end with ``/``.
    NOTE(review): component indices 6, 7 and 8 are positions in the full path,
    so the routing only works for the directory depth the author used. The
    test looks at component 6 while the first branch rebuilds the path from
    components [:7] + [8:9] — confirm this is intended.
    """
    outdir = dirname
    for file in os.listdir(dirname):

        if '.DS_Store' in file:
            continue

        for nest in crawl_directory(dirname + file):
            parts = nest.split('/')
            index = 1

            if 'dims' not in parts[6]:
                outdir = '/'.join(parts[:7] + parts[8:9]) + '/'
            else:
                # Uncomment the + for mixture
                outdir = dirname + parts[6] + '/'

            # Initialize directory
            trial_dir = outdir + 'trial_{0}/'.format(index)
            if not os.path.exists(trial_dir):
                os.makedirs(trial_dir)

            try:
                shutil.move(nest, trial_dir)
            except shutil.Error:
                # shutil.move refuses to overwrite an existing entry inside a
                # destination directory: look for the first trial folder that
                # does not hold this file name yet. Only this case is handled
                # (the former bare `except:` also hid KeyboardInterrupt).
                extension = parts[-1]
                while os.path.exists(outdir + 'trial_{0}/'.format(index) + extension):
                    index += 1

                trial_dir = outdir + 'trial_{0}/'.format(index)
                if not os.path.exists(trial_dir):
                    os.makedirs(trial_dir)

                shutil.move(nest, trial_dir)

    remove_empty_dirs(dirname)
    
    
def merge_mnist(dirname):
    """Regroup every file found below ``dirname`` into ``trial_<n>/`` folders.

    For each file below each entry of ``dirname`` an output directory is
    chosen from the '/'-separated components of the file's path, and the file
    is moved into the first ``trial_<n>/`` folder of that directory that does
    not already hold a file of the same name. Empty directories below
    ``dirname`` are pruned at the end.

    ``dirname`` must end with ``/``.
    NOTE(review): component indices 5 and 7 are positions in the full path, so
    the routing only works for the directory depth the author used — confirm
    before reuse.
    """
    outdir = dirname
    for file in os.listdir(dirname):

        if '.DS_Store' in file:
            continue

        for nest in crawl_directory(dirname + file):
            parts = nest.split('/')
            index = 1

            if 'dims' in parts[5]:
                outdir = '/'.join(parts[:6]) + '/'
            else:
                # Uncomment the + for mixture
                outdir = dirname + parts[7] + '/'

            # Initialize directory
            trial_dir = outdir + 'trial_{0}/'.format(index)
            if not os.path.exists(trial_dir):
                os.makedirs(trial_dir)

            try:
                shutil.move(nest, trial_dir)
            except shutil.Error:
                # shutil.move refuses to overwrite an existing entry inside a
                # destination directory: look for the first trial folder that
                # does not hold this file name yet. Only this case is handled
                # (the former bare `except:` also hid KeyboardInterrupt).
                extension = parts[-1]
                while os.path.exists(outdir + 'trial_{0}/'.format(index) + extension):
                    index += 1

                trial_dir = outdir + 'trial_{0}/'.format(index)
                if not os.path.exists(trial_dir):
                    os.makedirs(trial_dir)

                shutil.move(nest, trial_dir)

    remove_empty_dirs(dirname)
    

def get_stats(dirname):
    """List the hyperparameter settings whose result file is missing.

    The expected file names are built from the full grid of learning rates,
    hidden sizes and batch sizes; in the file name the learning rate is
    scaled by ``min(batch_sizes) / bsize``. Each entry of ``dirname`` is then
    inspected:

    * if its children are directories, every child with fewer than 60 files
      contributes its missing settings, and the number of complete children
      is printed as ``<count>/20``;
    * if its children are plain files (``NotADirectoryError``), the entry
      itself is treated as one set of results.

    ``dirname`` must end with ``/``. Returns a list of
    ``(learning rate, hidden size, batch size)`` string triples, where the
    learning rate is the unscaled value formatted with ``format_e``.
    """
    hidden_dims = [32, 64, 128, 256, 512]
    batch_sizes = [128, 256, 512, 1024]
    learning_rates = [2e-1, 2e-2, 2e-3]

    # (expected file name, label reported when that file is missing)
    expected = []
    for lr, hdim, bsize in itertools.product(learning_rates, hidden_dims, batch_sizes):
        scaled = (lr * min(batch_sizes)/bsize, hdim, bsize)
        fname = 'results_{0}.json'.format("_".join(str(part) for part in scaled))
        label = (str(format_e(lr)), str(hdim), str(bsize))
        expected.append((fname, label))

    TODO = []
    for entry in os.listdir(dirname):
        if '.DS_Store' in entry:
            continue

        entry_path = dirname + entry
        print(entry, len(os.listdir(entry_path)))
        complete = 0
        try:
            for child in os.listdir(entry_path):
                if '.DS_Store' in child:
                    continue

                present = os.listdir(entry_path + '/' + child)
                count = len(present)
                print(child, count)

                if count < 60:
                    TODO.extend(label for fname, label in expected if fname not in present)
                else:
                    complete += 1

            print('{0}/20'.format(complete))
            print('\n')
        except NotADirectoryError:
            # The entry holds result files directly instead of sub-directories.
            present = os.listdir(entry_path)
            TODO.extend(label for fname, label in expected if fname not in present)

    return TODO

In [None]:
# Regroup every file below `dirname` into trial_<n>/ folders (inline variant of the merge_*
# helpers). NOTE: this rebinds the global `dirname` that other cells also read.
dirname = '/Users/sob/Desktop/apple/64_dims_100000_samples/'
outdir = dirname
for idx, file in enumerate(os.listdir(dirname)):

    if '.DS_Store' in file:
        continue

    for nest in crawl_directory(dirname + file):

        index = 1

        if 'dims' not in nest.split('/')[6]:
            outdir = '/'.join(nest.split('/')[:6]) + '/'
        else:
            # The trailing '/' is required: without it 'trial_<n>/' was glued straight onto
            # the directory name (merge_multivariate already appends it).
            outdir = dirname + nest.split('/')[6] + '/'

        print(outdir)

        # Initialize directory
        if not os.path.exists(outdir + 'trial_{0}/'.format(index)):
            os.makedirs(outdir + 'trial_{0}/'.format(index))

        try:
            shutil.move(nest, outdir + 'trial_{0}/'.format(index))
        except shutil.Error:
            # shutil.move refuses to overwrite an existing entry inside a destination
            # directory: use the first trial folder that does not hold this file name yet.
            # Only this case is handled (the former bare `except:` also hid KeyboardInterrupt).
            extension = nest.split('/')[-1]
            while os.path.exists(outdir + 'trial_{0}/'.format(index) + extension):
                index += 1

            if not os.path.exists(outdir + 'trial_{0}/'.format(index)):
                os.makedirs(outdir + 'trial_{0}/'.format(index))

            shutil.move(nest, outdir + 'trial_{0}/'.format(index))

remove_empty_dirs(dirname)

In [None]:
""" MOVE HYPERTUNING RESULTS TO BEST FOLDER """
# NOTE(review): this import rebinds the name `tqdm` from the class imported in the first cell
# (`from tqdm import tqdm, tqdm_notebook`) to the module. After this cell runs, `tqdm(...)`
# calls fail, while `tqdm.tqdm_notebook` starts to resolve.
import tqdm
# Output folder for the per-trial summaries and input folder holding the hypertuning runs.
# NOTE: this rebinds the global `dirname` that other cells also read.
best_path = '/Users/sob/Desktop/gan_results/best/multivariate/64_dims_100000_samples/'
dirname = '/Users/sob/Desktop/gan_results/hypertuning/multivariate/64_dims_100000_samples/'
if not os.path.exists(best_path):
    os.makedirs(best_path)

files = os.listdir(dirname)
files = [f for f in files if f != '.DS_Store']
# The progress bar wraps the enumerate object, so it cannot know the total number of entries.
for idx, f in tqdm.tqdm_notebook(enumerate(files)):
    
    # Best run per model / distribution / metric for this entry.
    optimal = get_best_performance_mnist(dirname + f + '/')
    # Report entries that hold fewer than 60 result files.
    if len(os.listdir(dirname + f + '/')) < 60:
        print(f, len(os.listdir(dirname + f + '/')))
    
    # The summary index is the 1-based position in the os.listdir result.
    with open(best_path + '/trial_{0}.json'.format(idx+1), 'w') as outfile:
        json.dump(optimal, outfile)