In [112]:
import submitit

import torch
import random
import numpy as np

import pickle
import itertools
import argparse
import logging
import os
import pathlib
import time
import json
import math
import matplotlib.pyplot as plt
from torch.utils import data
from fvcore.common.config import CfgNode
from submitit.core.utils import FailedJobError
from itertools import cycle

In [71]:
from multi_objective.main import main, get_config

In [72]:
# Executor used by the later cells to submit jobs; created with "tmp/submitit_hpo" as its folder.
executor = submitit.AutoExecutor(folder="tmp/submitit_hpo")

# Prepare the globals

In [182]:
# One base config per dataset key, each loaded from its YAML file under configs/.
configs = {
    key: get_config(f'configs/{stem}.yaml')
    for key, stem in [
        ('adult', 'adult'),
        ('compass', 'compass'),
        ('credit', 'credit'),
        ('mm', 'multi_mnist'),
        ('mf', 'multi_fashion'),
        ('mfm', 'multi_fashion_mnist'),
    ]
}

# Number of hyperparameter settings that are sampled.
num_evaluations = 100
# Size of the second axis of the sampled `alphas` array.
J = 2

# Per-dataset threshold handed to `get_optimal_cfg`, which ignores jobs whose
# second result value exceeds it.
max_dist = dict(
    adult=0.7,
    compass=0.7,
    credit=1.,
    mm=0.2,
    mf=0.2,
    mfm=0.2,
)

# Epoch budgets applied by `execute` to the tabular and the MNIST-style datasets.
epochs_tabular = 20
epochs_mnist = 50

In [183]:
# Continuous hyperparameters are described by (low, high) bounds in natural-log space.
lr_range = tuple(np.log(bound) for bound in (1e-4, 1e-2))
lamda_range = tuple(np.log(bound) for bound in (0.00001, 10))
alpha_range = tuple(np.log(bound) for bound in (.2, 5))

# Categorical hyperparameters are described by their list of options.
cosmos_norm_choice = [True, False]
scheduler_choice = ['none', 'MultiStep', 'CosineAnnealing']
mgda_norm_choice = ['none', 'l2', 'loss', 'loss+']
phn_solver = ['linear', 'epo']

Sample the hyperparameters

In [184]:
# Draw all hyperparameter samples from a fixed seed. The draws consume one shared
# random stream, so their order must stay as it is to reproduce the same values.
np.random.seed(1)

# Continuous values: uniform in log space, then mapped back with exp.
learning_rates = np.exp(np.random.uniform(*lr_range, size=num_evaluations))
lamdas = np.exp(np.random.uniform(*lamda_range, size=num_evaluations))
alphas = np.exp(np.random.uniform(*alpha_range, size=(num_evaluations, J)))

# Categorical values: one uniformly chosen option per evaluation.
cosmos_norms = np.random.choice(cosmos_norm_choice, size=num_evaluations)
schedulers = np.random.choice(scheduler_choice, size=num_evaluations)
mgda_norms = np.random.choice(mgda_norm_choice, size=num_evaluations)
phn_solvers = np.random.choice(phn_solver, size=num_evaluations)

print(f"Num evals: {len(learning_rates)}")

Num evals: 100


In [185]:
def convert_hp(array, arg, dtype):
    """Pair sampled values with the config key they are meant to override.

    Args:
        array: Sized sequence of sampled values. Entries that are `np.ndarray`
            instances are converted element-wise into plain lists.
        arg: Config key that is repeated once per entry of `array`.
        dtype: Callable applied to every value (e.g. `float`, `str`, `bool`).

    Returns:
        A `(keys, values)` pair of lists, both of length `len(array)`.
    """
    count = len(array)
    values = []
    for entry in array:
        if isinstance(entry, np.ndarray):
            values.append([dtype(item) for item in entry])
        else:
            values.append(dtype(entry))
    keys = [arg] * count
    return keys, values

In [186]:
# Pair every sampled array with the config key it overrides; each result is a
# (keys, values) pair as returned by `convert_hp`.
cosmos_lrs = convert_hp(learning_rates, 'cosmos.lr', float)
cosmos_scheds = convert_hp(schedulers, 'cosmos.lr_scheduler', str)
cosmos_lamdas = convert_hp(lamdas, 'cosmos.lamda', float)
cosmos_alphas = convert_hp(alphas, 'cosmos.alpha', float)
cosmos_norm = convert_hp(cosmos_norms, 'cosmos.normalize', bool)

mgda_lrs = convert_hp(learning_rates, 'mgda.lr', float)
mgda_scheds = convert_hp(schedulers, 'mgda.lr_scheduler', str)
# `mgda_norms` is rebound from the sampled array to its converted (keys, values)
# pair. Convert only while it is still the raw array: re-running this cell would
# otherwise pass the already converted pair through `convert_hp` again and
# silently replace the search space with two stringified lists.
if isinstance(mgda_norms, np.ndarray):
    mgda_norms = convert_hp(mgda_norms, 'mgda.normalization_type', str)

pmtl_lrs = convert_hp(learning_rates, 'pmtl.lr', float)
pmtl_scheds = convert_hp(schedulers, 'pmtl.lr_scheduler', str)



In [187]:
# Search space: method -> dataset -> list of flat (key, value, key, value, ...)
# tuples, one tuple per sampled setting.
hpo_space = {
    'cosmos': {
        # Tabular datasets: only lr and alpha are searched (lamda=0).
        **{
            name: list(zip(*cosmos_lrs, *cosmos_alphas))
            for name in ('adult', 'compass', 'credit')
        },
        # Image datasets: lamda and the lr scheduler are searched as well.
        **{
            name: list(zip(*cosmos_lrs, *cosmos_alphas, *cosmos_lamdas, *cosmos_scheds))
            for name in ('mm', 'mf', 'mfm')
        },
    },
    'mgda': {
        name: list(zip(*mgda_lrs, *mgda_norms))
        for name in ('adult', 'compass', 'credit')
    },
}

In [188]:
def percent_finished(jobs):
    """Return the fraction of `jobs` whose `done()` is true.

    Returns `None` when `jobs` is empty.
    """
    total = len(jobs)
    if not total:
        return None
    finished = 0
    for job in jobs:
        finished += job.done()
    return finished / total


def save_jobs(method, dataset, jobs, tag='hpo', force=False):
    """Pickle `jobs` to ``pickles/{method}_{dataset}_{tag}.pickle``.

    Args:
        method: Method name used in the file name.
        dataset: Dataset key used in the file name.
        jobs: Object to pickle (the list of submitted jobs).
        tag: Suffix that distinguishes different runs of the same method/dataset.
        force: When true, overwrite an existing file.

    An existing file for the same method/dataset/tag is left untouched and
    'skipping' is printed, unless `force` is true.
    """
    path = f'pickles/{method}_{dataset}_{tag}.pickle'
    # Check the file belonging to *this* tag; checking the default tag would skip
    # (or overwrite) the wrong file whenever a non-default tag is used.
    if os.path.exists(path) and not force:
        print('skipping')
        return
    # The target directory may not exist yet on a fresh checkout.
    os.makedirs('pickles', exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(jobs, f)

        
def load_jobs(method, dataset, tag='hpo'):
    """Load and return the object pickled at ``pickles/{method}_{dataset}_{tag}.pickle``.

    Raises whatever `open` raises if the file does not exist.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by this notebook.
    pickle_path = f'pickles/{method}_{dataset}_{tag}.pickle'
    with open(pickle_path, 'rb') as handle:
        stored = pickle.load(handle)
    return stored


def jobs_failed(jobs):
    """Return the positions of all entries in `jobs` whose `state` equals 'FAILED'."""
    return [position for position, job in enumerate(jobs) if job.state == 'FAILED']


def results_exist(method, dataset, tag='hpo'):
    """Tell whether ``pickles/{method}_{dataset}_{tag}.pickle`` exists."""
    pickle_path = f'pickles/{method}_{dataset}_{tag}.pickle'
    return os.path.exists(pickle_path)


def get_optimal_cfg(jobs, max_dist=None):
    """Return the config of the job with the largest first result value.

    Every job's `result()` is unpacked as `(hv, dist)`. Jobs that raise
    `FailedJobError` are ignored, and so are jobs whose `dist` exceeds
    `max_dist` (when a limit is given). Only values strictly greater than 0
    can win; on ties the earliest job is kept.

    Args:
        jobs: Sequence of submitted jobs.
        max_dist: Optional upper bound on `dist`; `None` disables the filter.

    Returns:
        The fourth positional argument of the winning job's submission, i.e.
        the cfg that `execute` passes to `main`.

    Raises:
        AssertionError: If no job qualifies.
    """
    best_index, best_hv = None, 0

    for index, job in enumerate(jobs):
        try:
            hv, dist = job.result()
        except FailedJobError:
            # A failed job has no result to compare.
            continue

        too_far = max_dist is not None and dist > max_dist
        if too_far or not hv > best_hv:
            continue

        # Log every improvement so the progression of the search is visible.
        print(hv, dist, job.job_id)
        best_hv, best_index = hv, index

    assert best_index is not None, "No optimal cfg found"
    winner = jobs[best_index]
    print(f'Best job: {winner.job_id}')
    return winner.submission().args[3]

In [189]:
def execute(method, dataset, force=False):
    """Submit one job per sampled hyperparameter setting of `method` on `dataset`.

    Args:
        method: Key into `hpo_space`; also passed to `main` as the method name.
        dataset: Key into `configs` and `hpo_space[method]`.
        force: Submit even if a results pickle already exists for this
            method/dataset.

    Returns:
        Whatever `executor.map_array` returns (the submitted jobs), or an empty
        list when the run is skipped because results already exist.

    Raises:
        KeyError: If `dataset` has no base config or `method`/`dataset` has no
            entry in `hpo_space`.
    """
    if not force and results_exist(method, dataset):
        print('skipping. Use force=True to enforce execution')
        return []

    base_cfg = configs[dataset].clone()
    # Evaluate every `epochs` epochs — presumably so that evaluation happens once,
    # after the final epoch; confirm against `main`.
    if dataset in ['adult', 'compass', 'credit']:
        base_cfg.epochs = epochs_tabular
        base_cfg.eval_every = epochs_tabular
    elif dataset in ['mm', 'mf', 'mfm']:
        base_cfg.epochs = epochs_mnist
        base_cfg.eval_every = epochs_mnist

    hpos = hpo_space[method][dataset]

    # Derive every trial from the untouched base config. Cloning the previous
    # trial instead would carry its overrides into all later trials whenever a
    # trial does not set exactly the same keys.
    cfgs = []
    for overrides in hpos:
        trial_cfg = base_cfg.clone()
        trial_cfg.merge_from_list(overrides)
        cfgs.append(trial_cfg)

    # Positional arguments of `main`: rank, world_size, method_name, cfg, tag.
    # The constant arguments are endless iterators; only `cfgs` is finite.
    return executor.map_array(main, cycle([0]), cycle([1]), cycle([method]), cfgs, cycle(['hpo']))

In [190]:
# Settings for the jobs submitted through `executor`: 20-minute timeout, the Slurm
# partition, the job name 'hpo' and one GPU per node.
executor.update_parameters(timeout_min=20, slurm_partition="alldlc_gpu-rtx2080", name='hpo', gpus_per_node=1)
# NOTE(review): presumably caps how many jobs of an array run at once — confirm against the submitit docs.
executor.update_parameters(slurm_array_parallelism=100)

## COSMOS

#### Adult

In [191]:
# Select the method/dataset of this section; the cells below reuse both names.
method, dataset = 'cosmos', 'adult'
# force=True submits even when a results pickle already exists.
jobs = execute(method, dataset, force=True)

In [193]:
# Fraction of finished jobs, followed by the indices of failed jobs.
print(percent_finished(jobs), jobs_failed(jobs))

1.0 []


In [27]:
# Persist the jobs only when no pickle exists yet for this method/dataset.
if not results_exist(method, dataset):
    save_jobs(method, dataset, jobs)

#### Compass

In [33]:
# Update `method`/`dataset` together with the submission: the save cell of this
# section uses these names, and stale values would file the jobs under the
# previous dataset (or skip saving them altogether).
method = 'cosmos'
dataset = 'compass'
jobs = execute(method, dataset)

In [36]:
# Fraction of finished jobs, followed by the indices of failed jobs.
print(percent_finished(jobs), jobs_failed(jobs))

1.0


[]

In [37]:
# Pickle the jobs under the current `method`/`dataset` names; prints 'skipping' if that file already exists.
save_jobs(method, dataset, jobs)

#### Credit

In [64]:
# Update `method`/`dataset` together with the submission: the save cell of this
# section uses these names, and stale values would file the jobs under the
# previous dataset (or skip saving them altogether).
method = 'cosmos'
dataset = 'credit'
jobs = execute(method, dataset)

In [66]:
# Fraction of finished jobs (printed), then the indices of failed jobs (cell output).
print(percent_finished(jobs))
jobs_failed(jobs)

1.0


[]

In [67]:
# Pickle the jobs under the current `method`/`dataset` names; prints 'skipping' if that file already exists.
save_jobs(method, dataset, jobs)

#### Multi MNIST

In [149]:
# Select the method/dataset of this section; `execute` returns [] if results already exist.
method, dataset = 'cosmos', 'mm'
jobs = execute(method, dataset)

skipping. Use force=True to enforce execution


In [133]:
# Fraction of finished jobs (None when `jobs` is empty), followed by the indices of failed jobs.
print(percent_finished(jobs), jobs_failed(jobs))

None


[]

In [134]:
# Pickle the jobs under the current `method`/`dataset` names; prints 'skipping' if that file already exists.
save_jobs(method, dataset, jobs)

#### Multi Fashion

In [136]:
# Select the method/dataset of this section; `execute` returns [] if results already exist.
method, dataset = 'cosmos', 'mf'
jobs = execute(method, dataset)

In [142]:
# Fraction of finished jobs, followed by the indices of failed jobs.
print(percent_finished(jobs), jobs_failed(jobs))

1.0 []


In [143]:
# Pickle the jobs under the current `method`/`dataset` names; prints 'skipping' if that file already exists.
save_jobs(method, dataset, jobs)

#### Multi Fashion+MNIST

In [151]:
# Select the method/dataset of this section; `execute` returns [] if results already exist.
method, dataset = 'cosmos', 'mfm'
jobs = execute(method, dataset)

In [153]:
# Fraction of finished jobs, followed by the indices of failed jobs.
print(percent_finished(jobs), jobs_failed(jobs))

1.0 []


In [154]:
# Pickle the jobs under the current `method`/`dataset` names; prints 'skipping' if that file already exists.
save_jobs(method, dataset, jobs)

In [194]:
# Pick the best config among `jobs`, ignoring results whose second value exceeds
# the current dataset's `max_dist` entry.
# Uncomment the next line to use previously pickled jobs instead of the in-memory ones.
# jobs = load_jobs(method, dataset)
cfg = get_optimal_cfg(jobs, max_dist[dataset])

3.325317963661113 0.6643087703627651 5881654_0
3.32860757027072 0.6664323006000586 5881654_16
3.3319845076147216 0.6774339598285042 5881654_33
3.3371375879367178 0.6668008493196068 5881654_48
3.3388226971885597 0.6733755906510289 5881654_84
Best job: 5881654_84


In [171]:
# Point the section variables at the COSMOS / adult run.
method, dataset = 'cosmos', 'adult'

In [195]:
# Display the config selected by `get_optimal_cfg`.
cfg

CfgNode({'dataset': 'adult', 'dim': (88,), 'objectives': ['BinaryCrossEntropyLoss', 'ddp'], 'task_ids': [], 'ignore_index': -100, 'epochs': 20, 'num_workers': 4, 'checkpoint_every': 0, 'lr_scheduler': 'none', 'lr': 0.001, 'batch_size': 256, 'pmtl': CfgNode({'num_starts': 5, 'lr_scheduler': 'none', 'lr': 0.001}), 'mgda': CfgNode({'approximate_norm_solution': False, 'normalization_type': 'loss', 'lr_scheduler': 'none', 'lr': 0.001}), 'phn': CfgNode({'alpha': 0.2, 'internal_solver': 'linear', 'lr_scheduler': 'none', 'lr': 0.001}), 'single_task': CfgNode({'task_id': None, 'lr_scheduler': 'none', 'lr': 0.001}), 'cosmos': CfgNode({'alpha': [1.0887843512723165, 0.3092787487483168], 'lamda': 0.0, 'normalize': False, 'instances': False, 'lr_scheduler': 'none', 'lr': 0.00034662172842139723}), 'seed': 1, 'logdir': 'results', 'n_partitions': 24, 'eval_every': 20, 'train_eval_every': 0, 'reference_point': [2, 2], 'device': 'cuda', 'metrics': None})

In [104]:
# Display the config job 33 was submitted with. `execute` passes the config to
# `main` as the fourth positional argument, so it is found in `args`, not `kwargs`.
jobs[33].submission().args[3]

CfgNode({'dataset': 'adult', 'dim': (88,), 'objectives': ['BinaryCrossEntropyLoss', 'ddp'], 'task_ids': [], 'ignore_index': -100, 'epochs': 20, 'num_workers': 4, 'checkpoint_every': 0, 'lr_scheduler': 'none', 'lr': 0.001, 'batch_size': 256, 'pmtl': CfgNode({'num_starts': 5, 'lr_scheduler': 'none', 'lr': 0.001}), 'mgda': CfgNode({'approximate_norm_solution': False, 'normalization_type': 'loss', 'lr_scheduler': 'none', 'lr': 0.001}), 'phn': CfgNode({'alpha': 0.2, 'internal_solver': 'linear', 'lr_scheduler': 'none', 'lr': 0.001}), 'single_task': CfgNode({'task_id': None, 'lr_scheduler': 'none', 'lr': 0.001}), 'cosmos': CfgNode({'alpha': [1.7328161368791897, 0.853738192417888], 'lamda': 0.0, 'normalize': False, 'instances': False, 'lr_scheduler': 'none', 'lr': 0.0011650124592172252}), 'seed': 1, 'logdir': 'results', 'n_partitions': 24, 'eval_every': 20, 'train_eval_every': 0, 'reference_point': [2, 2], 'device': 'cuda', 'metrics': None})

In [32]:
# NOTE(review): `jobs_l` is not defined in any visible cell — presumably a leftover
# from an earlier session; confirm, or replace it with `jobs`.
jobs_l[0].result()

(3.02712881565094, 0)

In [13]:
def add(a, b):
    """Return `a + b`; used below as a minimal function to submit through the executor."""
    total = a + b
    return total

In [14]:
# Submit a single trivial job through the same executor — presumably a smoke test of the setup.
job = executor.submit(add, 5, 7)  # will compute add(5, 7)

In [15]:
# Show the identifier assigned to the submitted job.
print(job.job_id)  # ID of your job

5359691


In [16]:
# Display the value returned by the submitted `add` call.
job.result()

In [17]:
# Display the positional arguments recorded for the job's submission.
job.submission().args

12