In [1]:
import submitit

import torch
import random
import numpy as np

import pickle
import itertools
import argparse
import logging
import os
import pathlib
import time
import json
import math
import matplotlib.pyplot as plt
from torch.utils import data
from fvcore.common.config import CfgNode
from submitit.core.utils import FailedJobError, UncompletedJobError
from itertools import cycle
from time import sleep

In [2]:
from multi_objective.main import main, get_config

In [3]:
executor = submitit.AutoExecutor(folder="tmp/submitit_hpo")

# Prepare the globals

In [4]:
num_evaluations = 150

In [6]:
lr           = [0.01, 0.0075, 0.005, 0.0025, 0.001, 0.00075, 0.0005, 0.00025, 0.0001]
weight_decay = [0.1, 0.25, 0.05, 0.075, 0.01, 0.0075, 0.005, 0.0025, 0.001, 0.00075, 0.0005, 0.00025, 0.0001]
scheduler    = ['none', 'MultiStep', 'CosineAnnealing']

# phn
solver = ['linear', 'epo']

# mgda
norm = ['none', 'loss', 'loss+', 'l2']

# pmtl & cosmos
mildening = [0.8, 0.5, 0.3, 0.]

# cosmos
clipping  = [10., 5., 2., 1., 0.5]
dampening = [0.1, 0.01, 0.001, 0.0001, 0]

Sample the hyperparameters

In [9]:
np.random.seed(1)

lr_samples = {'lr': np.random.choice(lr, num_evaluations)}
weight_decay_samples = {'weight_decay': np.random.choice(weight_decay, num_evaluations)}
scheduler_samples = {'lr_scheduler': np.random.choice(scheduler, num_evaluations)}
solver_samples = {'internal_solver_phn': np.random.choice(solver, num_evaluations)}
norm_samples = {'normalization_type': np.random.choice(norm, num_evaluations)}
mildening_samples = {'train_ray_mildening': np.random.choice(mildening, num_evaluations)}
clipping_samples = {'lambda_clipping': np.random.choice(clipping, num_evaluations)}
dampening_samples = {'dampening': np.random.choice(dampening, num_evaluations)}

In [20]:
def percent_finished(jobs):
    if len(jobs):
        return sum(job.done() for job in jobs) / len(jobs)


def save_jobs(method, dataset, jobs, tag='hpo', force=False):
    if not results_exist(method, dataset, tag) or force:
        with open(f'pickles/{method}_{dataset}_{tag}.pickle', 'wb') as f:
            pickle.dump(jobs, f)
    else:
        print('skipping')

        
def load_jobs(method, dataset, tag='hpo'):
    with open(f'pickles/{method}_{dataset}_{tag}.pickle', 'rb') as f:
        return pickle.load(f)


def jobs_failed(jobs):
    idxs = []
    for i, job in enumerate(jobs):
        if job.state == 'FAILED':
            idxs.append(i)
    return idxs


def results_exist(method, dataset, tag='hpo'):
    return os.path.exists(f'pickles/{method}_{dataset}_{tag}.pickle')


def get_optimal_cfg(jobs, max_dist=None):
    idx_best = None
    hv_best = 0
    
    for i, job in enumerate(jobs):
        if job.state == 'FAILED' or job.state == 'TIMEOUT':
            continue

        hv, dist = job.result()
        
        if max_dist is not None and dist > max_dist:
            continue
        
        if hv > hv_best:
            print(hv, dist, job.job_id)
            hv_best = hv
            idx_best = i
    assert idx_best is not None, "No optimal cfg found"
    print(f'Best job: {jobs[idx_best].job_id}')
    cfg = jobs[idx_best].submission().args[2]
    return {
        'lr': cfg.lr,
        'weight_decay': cfg.weight_decay,
        'scheduler': cfg.lr_scheduler,
        'solver': cfg.internal_solver_phn,
        'norm': cfg.normalization_type,
        'mildening': cfg.train_ray_mildening,
        'clipping': cfg.lambda_clipping,
        'dampening': cfg.dampening,
    }

In [22]:
def execute(config, hp_samples, force=False, tag='hpo'):
    if not force and results_exist(config.method, config.dataset, tag):
        print('skipping. Use force=True to enforce execution')
        return []
    
    
    cfg = config.clone()
    cfg.eval_every = 1   # early stopping
    hpos = [list(zip(cycle(s.keys()), list(s.values())[0])) for s in hp_samples]
    hpos = np.hstack(hpos).tolist()
    
    cfgs = []
    for args in hpos:
        cfg = cfg.clone()
        cfg.merge_from_list(args)
        cfgs.append(cfg)
    
    tags = [f"{tag}_{i :02d}" for i in range(len(cfgs))]
    
    # func, rank, world_size, cfg, tag
    return executor.map_array(main, cycle([0]), cycle([1]), cfgs, tags)

In [27]:
def execute_and_save(config, hp_samples, force=False, tag='hpo'):
    jobs = execute(config, hp_samples, force, tag)
    done = len(jobs) == 0
    while not done:
        done = percent_finished(jobs) == 1
        sleep(10)
    
    method_name = config.method + "_" + config.task_id if config.method == 'single_task' else config.method
    
    save_jobs(method_name, config.dataset, jobs, force=force, tag=tag)
    return jobs

In [28]:
executor.update_parameters(timeout_min=20, slurm_partition="alldlc_gpu-rtx2080", name='hpo', gpus_per_node=1)
executor.update_parameters(slurm_array_parallelism=150)

In [29]:
executor.update_parameters(slurm_exclude='dlcgpu12')

## Analyze

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/cosmos.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
        mildening_samples,
        clipping_samples,
        dampening_samples,
    ],
    tag='hpo_capacity', force=True)

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/single_task.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
        mildening_samples,
        clipping_samples,
        dampening_samples,
    ],
    tag='hpo_capacity', force=True)

In [20]:
jobs = load_jobs('single_task', 'multi_fashion_mnist', 'hpo_capacity')

In [34]:
get_optimal_cfg(jobs)

0.5008392256130811 16 6835378_0
0.5790844750420077 98 6835378_6
Best job: 6835378_6


{'lr': 0.00025,
 'weight_decay': 0.05,
 'scheduler': 'CosineAnnealing',
 'solver': 'linear',
 'norm': 'none',
 'mildening': 0.5,
 'clipping': 5.0,
 'dampening': 0.2}

## COSMOS

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/cosmos.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
        mildening_samples,
        clipping_samples,
        dampening_samples,
    ])

In [61]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/cosmos.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
        mildening_samples,
        clipping_samples,
        dampening_samples,
    ])

In [64]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/cosmos.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
        mildening_samples,
        clipping_samples,
        dampening_samples,
    ])

## mgda

In [78]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/mgda.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
        norm_samples,
    ])

In [79]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/mgda.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
        norm_samples,
    ])

In [80]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/mgda.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
        norm_samples,
    ])

## phn

In [75]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/phn.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
        alpha_samples,
        solver_samples,
    ])

In [76]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/phn.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
        alpha_samples,
        solver_samples,
    ])

In [77]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/phn.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
        alpha_samples,
        solver_samples,
    ])

## pmtl

In [144]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/pmtl.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
        mildening_samples,
    ])

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/pmtl.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
        mildening_samples,
    ])

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/pmtl.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
        mildening_samples,
    ])

## Single task

In [31]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/single_task_1.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
    ])

In [33]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/single_task_2.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
    ])

In [65]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/single_task.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
    ])

In [70]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/single_task.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
    ])

## Uniform

In [67]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/uniform.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
    ])

In [68]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/uniform.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
    ])

In [69]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/uniform.yaml'), 
    hp_samples=[
        lr_samples, 
        weight_decay_samples, 
        scheduler_samples,
    ])

#### Adult

In [191]:
method = 'cosmos'
dataset = 'adult'
jobs = execute('cosmos', 'adult', force=True)

In [193]:
print(percent_finished(jobs), jobs_failed(jobs))

1.0 []


In [27]:
if not results_exist(method, dataset):
    save_jobs(method, dataset, jobs)

#### Compass

In [33]:
jobs = execute('cosmos', 'compass')

In [36]:
print(percent_finished(jobs), jobs_failed(jobs))

1.0


[]

In [37]:
save_jobs(method, dataset, jobs)

#### Credit

In [64]:
jobs = execute('cosmos', 'credit')

In [66]:
print(percent_finished(jobs))
jobs_failed(jobs)

1.0


[]

In [67]:
save_jobs(method, dataset, jobs)

#### Multi MNIST

In [149]:
method = 'cosmos'
dataset = 'mm'
jobs = execute(method, dataset)

skipping. Use force=True to enforce execution


In [133]:
print(percent_finished(jobs), jobs_failed(jobs))

None


[]

In [134]:
save_jobs(method, dataset, jobs)

#### Multi Fashion

In [136]:
method = 'cosmos'
dataset = 'mf'
jobs = execute(method, dataset)

In [142]:
print(percent_finished(jobs), jobs_failed(jobs))

1.0 []


In [143]:
save_jobs(method, dataset, jobs)

#### Multi Fastion+Mnist

In [151]:
method = 'cosmos'
dataset = 'mfm'
jobs = execute(method, dataset)

In [153]:
print(percent_finished(jobs), jobs_failed(jobs))

1.0 []


In [154]:
save_jobs(method, dataset, jobs)

In [194]:
# jobs = load_jobs(method, dataset)
cfg = get_optimal_cfg(jobs, max_dist[dataset])

3.325317963661113 0.6643087703627651 5881654_0
3.32860757027072 0.6664323006000586 5881654_16
3.3319845076147216 0.6774339598285042 5881654_33
3.3371375879367178 0.6668008493196068 5881654_48
3.3388226971885597 0.6733755906510289 5881654_84
Best job: 5881654_84


In [171]:
method = 'cosmos'
dataset = 'adult'

In [195]:
cfg

CfgNode({'dataset': 'adult', 'dim': (88,), 'objectives': ['BinaryCrossEntropyLoss', 'ddp'], 'task_ids': [], 'ignore_index': -100, 'epochs': 20, 'num_workers': 4, 'checkpoint_every': 0, 'lr_scheduler': 'none', 'lr': 0.001, 'batch_size': 256, 'pmtl': CfgNode({'num_starts': 5, 'lr_scheduler': 'none', 'lr': 0.001}), 'mgda': CfgNode({'approximate_norm_solution': False, 'normalization_type': 'loss', 'lr_scheduler': 'none', 'lr': 0.001}), 'phn': CfgNode({'alpha': 0.2, 'internal_solver': 'linear', 'lr_scheduler': 'none', 'lr': 0.001}), 'single_task': CfgNode({'task_id': None, 'lr_scheduler': 'none', 'lr': 0.001}), 'cosmos': CfgNode({'alpha': [1.0887843512723165, 0.3092787487483168], 'lamda': 0.0, 'normalize': False, 'instances': False, 'lr_scheduler': 'none', 'lr': 0.00034662172842139723}), 'seed': 1, 'logdir': 'results', 'n_partitions': 24, 'eval_every': 20, 'train_eval_every': 0, 'reference_point': [2, 2], 'device': 'cuda', 'metrics': None})

In [104]:
jobs[33].submission().kwargs['cfg']

CfgNode({'dataset': 'adult', 'dim': (88,), 'objectives': ['BinaryCrossEntropyLoss', 'ddp'], 'task_ids': [], 'ignore_index': -100, 'epochs': 20, 'num_workers': 4, 'checkpoint_every': 0, 'lr_scheduler': 'none', 'lr': 0.001, 'batch_size': 256, 'pmtl': CfgNode({'num_starts': 5, 'lr_scheduler': 'none', 'lr': 0.001}), 'mgda': CfgNode({'approximate_norm_solution': False, 'normalization_type': 'loss', 'lr_scheduler': 'none', 'lr': 0.001}), 'phn': CfgNode({'alpha': 0.2, 'internal_solver': 'linear', 'lr_scheduler': 'none', 'lr': 0.001}), 'single_task': CfgNode({'task_id': None, 'lr_scheduler': 'none', 'lr': 0.001}), 'cosmos': CfgNode({'alpha': [1.7328161368791897, 0.853738192417888], 'lamda': 0.0, 'normalize': False, 'instances': False, 'lr_scheduler': 'none', 'lr': 0.0011650124592172252}), 'seed': 1, 'logdir': 'results', 'n_partitions': 24, 'eval_every': 20, 'train_eval_every': 0, 'reference_point': [2, 2], 'device': 'cuda', 'metrics': None})

In [32]:
jobs_l[0].result()

(3.02712881565094, 0)

In [13]:
def add(a, b):
    return a + b

In [14]:
job = executor.submit(add, 5, 7)  # will compute add(5, 7)

In [15]:
print(job.job_id)  # ID of your job

5359691


In [16]:
job.result()

In [17]:
job.submission().args

12