In [1]:
import submitit

import torch
import random
import numpy as np

import pickle
import itertools
import argparse
import logging
import os
import pathlib
import time
import json
import math
import matplotlib.pyplot as plt
from torch.utils import data
from fvcore.common.config import CfgNode
from submitit.core.utils import FailedJobError

In [2]:
from multi_objective.main import main, get_config

In [3]:
# Submitit executor shared by every search below; job logs are written under this folder.
executor = submitit.AutoExecutor(folder="tmp/submitit_hpo")

# Prepare the globals

In [61]:
# Base configurations, one per dataset, loaded from the project's YAML files.
adult_cfg = get_config('configs/adult.yaml')
compass_cfg = get_config('configs/compass.yaml')
credit_cfg = get_config('configs/credit.yaml')
mm_cfg = get_config('configs/multi_mnist.yaml')
mf_cfg = get_config('configs/multi_fashion.yaml')
mfm_cfg = get_config('configs/multi_fashion_mnist.yaml')

# NOTE(review): `methods` and `WORLDSIZE` are not referenced by any cell visible in this notebook.
methods = ['cosmos', 'mgda']
WORLDSIZE = 1

# Number of hyperparameter configurations sampled per search.
num_evaluations = 100
# Length of each sampled alpha vector (presumably the number of objectives — confirm).
J = 2

# Per-dataset upper bound on the `dist` value a job returns; it is passed to
# get_optimal_cfg, which discards runs above it.
# NOTE(review): there are no entries for the multi_fashion / multi_fashion_mnist configs loaded above.
max_dist = {
    'adult': 0.7,
    'compass': 0.7,
    'credit': 1.,
    'mm': 0.005
}

# Training length used for the tabular datasets and for the MNIST-style datasets.
epochs_tabular = 20
epochs_mnist = 50

In [8]:
# Continuous hyperparameters (lr, lamda, alpha) are sampled uniformly in log space,
# so each range holds the natural log of its (min, max) bounds.
lr_range = (np.log(1e-4), np.log(1e-2))
lamda_range = (np.log(0.00001), np.log(10))
alpha_range = (np.log(.2), np.log(5))
# Candidate values for the categorical hyperparameters.
cosmos_norm_choice = [True, False]
scheduler_choice = ['none', 'MultiStep', 'CosineAnnealing']
mgda_norm_choice = ['none', 'l2', 'loss', 'loss+']
phn_solver = ['linear', 'epo']

Sample the hyperparameters

In [9]:
# sampling
# Fixed seed so the same hyperparameter draws are produced on every run of this cell.
np.random.seed(1)

# Draw uniformly between the log-bounds, then exponentiate back to the original scale.
learning_rates = np.exp(np.random.uniform(*lr_range ,[num_evaluations]))
lamdas = np.exp(np.random.uniform(*lamda_range ,[num_evaluations]))
# One alpha vector of length J per evaluation.
alphas = np.exp(np.random.uniform(*alpha_range ,[num_evaluations, J]))
# Categorical hyperparameters: one draw per evaluation from each candidate list.
cosmos_norms = np.random.choice(cosmos_norm_choice, num_evaluations)
schedulers = np.random.choice(scheduler_choice, num_evaluations)
mgda_norms = np.random.choice(mgda_norm_choice, num_evaluations)
phn_solvers = np.random.choice(phn_solver, num_evaluations)

print(f"Num evals: {len(learning_rates)}")

Num evals: 100


In [10]:
def convert_hp(array, arg, dtype):
    """Pair a config key with a sequence of sampled values cast to plain Python types.

    Args:
        array: sequence of sampled values; entries that are ``np.ndarray`` are
            converted element-wise into lists.
        arg: config key (e.g. ``'cosmos.lr'``) the values belong to.
        dtype: callable used to cast every value (``float``, ``str``, ``bool``, ...).

    Returns:
        A tuple ``(args, values)`` where ``args`` is ``arg`` repeated once per
        entry of ``array`` and ``values`` is the list of cast entries.
    """
    count = len(array)

    def _cast(entry):
        # Vector-valued hyperparameters become lists of cast scalars.
        if isinstance(entry, np.ndarray):
            return [dtype(item) for item in entry]
        return dtype(entry)

    values = [_cast(entry) for entry in array]
    keys = [arg] * count
    return keys, values

In [11]:
# Each convert_hp call returns a (keys, values) pair. The pairs are later unpacked with `*`
# into zip() to build one flat (key, value, key, value, ...) override list per evaluation,
# which is what cfg.merge_from_list receives.
cosmos_lrs = convert_hp(learning_rates, 'cosmos.lr', float)
cosmos_scheds = convert_hp(schedulers, 'cosmos.lr_scheduler', str)
cosmos_lamdas = convert_hp(lamdas, 'cosmos.lamda', float)
cosmos_alphas = convert_hp(alphas, 'cosmos.alpha', float)
cosmos_norm = convert_hp(cosmos_norms, 'cosmos.normalize', bool)

mgda_lrs = convert_hp(learning_rates, 'mgda.lr', float)
mgda_scheds = convert_hp(schedulers, 'mgda.lr_scheduler', str)
# NOTE(review): this rebinds `mgda_norms` from the sampled array to the converted pair, so
# re-running this cell without first re-running the sampling cell converts the wrong object.
mgda_norms = convert_hp(mgda_norms, 'mgda.normalization_type', str)

pmtl_lrs = convert_hp(learning_rates, 'pmtl.lr', float)
pmtl_scheds = convert_hp(schedulers, 'pmtl.lr_scheduler', str)



In [24]:
def percent_finished(jobs):
    """Return the fraction of jobs that report being done.

    Despite the name, the value is a fraction in ``[0.0, 1.0]``, not a percentage.

    Args:
        jobs: sized collection of objects exposing a ``done()`` method.

    Returns:
        Number of jobs whose ``done()`` is truthy divided by the number of jobs;
        ``0.0`` for an empty collection (instead of raising ZeroDivisionError).
    """
    total = len(jobs)
    if total == 0:
        return 0.0
    return sum(job.done() for job in jobs) / total


def save_jobs(method, dataset, jobs, tag='hpo'):
    """Pickle ``jobs`` to ``pickles/{method}_{dataset}_{tag}.pickle``.

    The target directory is created when missing, so the first save on a fresh
    checkout does not fail with FileNotFoundError.

    Args:
        method: method name used in the file name.
        dataset: dataset name used in the file name.
        jobs: picklable object (here: the list of submitted jobs).
        tag: suffix distinguishing different kinds of runs.
    """
    target = f'pickles/{method}_{dataset}_{tag}.pickle'
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, 'wb') as f:
        pickle.dump(jobs, f)

        
def load_jobs(method, dataset, tag='hpo'):
    """Read back the object stored in ``pickles/{method}_{dataset}_{tag}.pickle``.

    Unpickling can execute arbitrary code, so only load files written by this
    notebook.

    Args:
        method: method name used in the file name.
        dataset: dataset name used in the file name.
        tag: suffix distinguishing different kinds of runs.

    Returns:
        The unpickled object.
    """
    source = f'pickles/{method}_{dataset}_{tag}.pickle'
    with open(source, 'rb') as handle:
        stored = pickle.load(handle)
    return stored


def jobs_failed(jobs):
    """Return the list positions of all jobs whose ``state`` equals ``'FAILED'``."""
    return [
        position
        for position, candidate in enumerate(jobs)
        if candidate.state == 'FAILED'
    ]


def results_exist(method, dataset, tag='hpo'):
    """Tell whether ``pickles/{method}_{dataset}_{tag}.pickle`` is already on disk.

    Prints ``skipping.`` when the file exists.

    Returns:
        ``True`` if the pickle exists, ``False`` otherwise.
    """
    pickle_path = f'pickles/{method}_{dataset}_{tag}.pickle'
    if not os.path.exists(pickle_path):
        return False
    print('skipping.')
    return True

def get_optimal_cfg(jobs, max_dist=None):
    """Return the config of the job with the largest ``hv`` among the admissible jobs.

    Every job result is unpacked as an ``(hv, dist)`` pair (``hv`` is presumably
    the hypervolume — confirm against ``main``). Jobs whose ``result()`` raises
    ``FailedJobError`` are skipped, as are jobs whose ``dist`` exceeds
    ``max_dist`` when a bound is given. Each new best candidate is printed.

    Args:
        jobs: sequence of job objects exposing ``result()``, ``job_id`` and
            ``submission()``.
        max_dist: optional upper bound on ``dist``; ``None`` disables the filter.

    Returns:
        The ``cfg`` keyword argument the best job was submitted with.

    Raises:
        AssertionError: if no job is admissible.
    """
    idx_best = None
    # Start below every real value so that a run with hv <= 0 can still be
    # selected; starting at 0 made such runs look like "no result at all".
    hv_best = float('-inf')

    for i, job in enumerate(jobs):
        try:
            hv, dist = job.result()
        except FailedJobError:
            continue
        if max_dist is not None and dist > max_dist:
            continue

        if hv > hv_best:
            print(hv, dist, job.job_id)
            hv_best = hv
            idx_best = i

    # Raised explicitly (same exception type as before) so the check is not
    # stripped when Python runs with -O.
    if idx_best is None:
        raise AssertionError("No optimal cfg found")
    print(f'Best job: {jobs[idx_best].job_id}')
    return jobs[idx_best].submission().kwargs['cfg']

In [13]:
# Slurm resources requested for every job: 20-minute timeout, one GPU, on the given partition.
executor.update_parameters(timeout_min=20, slurm_partition="ml_gpu-rtx2080", name='hpo', gpus_per_node=1)
# Cap on how many jobs of a batched submission (Slurm job array) run at the same time.
executor.update_parameters(slurm_array_parallelism=35)

In [19]:
# Do not schedule jobs on node mlgpu05.
# NOTE(review): presumably excluded because of CUDA initialization failures like the
# traceback recorded further down in this notebook — confirm.
executor.update_parameters(slurm_exclude='mlgpu05')

## COSMOS

In [15]:
# One flat (key, value, key, value, ...) override tuple per evaluation, covering
# cosmos.lr, cosmos.alpha and cosmos.normalize. cosmos_lamdas is left out, so
# cosmos.lamda keeps the value from the base config.
hpo_cosmos = list(zip(*cosmos_lrs, *cosmos_alphas, *cosmos_norm))   # lamda=0
method = 'cosmos'

#### Adult

In [16]:
# Work on a copy so the Adult base config stays untouched.
cfg = adult_cfg.clone()
cfg.epochs = epochs_tabular
# eval_every equals epochs — presumably so evaluation runs once, at the end of training (confirm in main).
cfg.eval_every = epochs_tabular
# `dataset` selects the pickle file name and the max_dist entry used by the cells below.
dataset = 'adult'
hpos = hpo_cosmos

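Run the search: submit one job per sampled hyperparameter configuration (skipped when pickled results already exist).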

In [20]:
if results_exist(method, dataset):
    # A previous run already pickled its jobs: reload them so `jobs` is defined
    # for the cells below instead of being undefined or left over from another dataset.
    jobs = load_jobs(method, dataset)
else:
    jobs = []
    # All submissions inside the batch context are sent as a single job array.
    with executor.batch():
        for args in hpos:
            # Clone the base config per job instead of rebinding `cfg`, so the
            # base config is not modified and this cell can be re-run safely.
            job_cfg = cfg.clone()
            job_cfg.merge_from_list(args)
            job = executor.submit(main, rank=0, world_size=1, method_name=method, cfg=job_cfg, tag='hpo')
            jobs.append(job)

In [26]:
print(percent_finished(jobs))
jobs_failed(jobs)

1.0


In [27]:
if not results_exist(method, dataset):
    save_jobs(method, dataset, jobs)

In [30]:
jobs = load_jobs(method, dataset)
get_optimal_cfg(jobs, max_dist[dataset])

3.325317963661113 0.6643087703627651 5842949_0
3.32860757027072 0.6664323006000586 5842949_16
3.3319845076147216 0.6774339598285042 5842949_33
3.3371375879367178 0.6668008493196068 5842949_48
3.3388226971885597 0.6733755906510289 5842949_84
Best job: 5842949_84


CfgNode({'dataset': 'adult', 'dim': (88,), 'objectives': ['BinaryCrossEntropyLoss', 'ddp'], 'task_ids': [], 'ignore_index': -100, 'epochs': 20, 'num_workers': 4, 'checkpoint_every': 0, 'lr_scheduler': 'none', 'lr': 0.001, 'batch_size': 256, 'pmtl': CfgNode({'num_starts': 5, 'lr_scheduler': 'none', 'lr': 0.001}), 'mgda': CfgNode({'approximate_norm_solution': False, 'normalization_type': 'loss', 'lr_scheduler': 'none', 'lr': 0.001}), 'phn': CfgNode({'alpha': 0.2, 'internal_solver': 'linear', 'lr_scheduler': 'none', 'lr': 0.001}), 'single_task': CfgNode({'task_id': None, 'lr_scheduler': 'none', 'lr': 0.001}), 'cosmos': CfgNode({'alpha': [1.0887843512723165, 0.3092787487483168], 'lamda': 0.0, 'normalize': True, 'instances': False, 'lr_scheduler': 'none', 'lr': 0.00034662172842139723}), 'seed': 1, 'logdir': 'results', 'n_partitions': 24, 'eval_every': 20, 'train_eval_every': 0, 'reference_point': [2, 2], 'device': 'cuda', 'metrics': None})

#### Compass

In [31]:
# Work on a copy so the Compass base config stays untouched.
cfg = compass_cfg.clone()
cfg.epochs = epochs_tabular
# eval_every equals epochs — presumably so evaluation runs once, at the end of training (confirm in main).
cfg.eval_every = epochs_tabular
# `dataset` selects the pickle file name and the max_dist entry used by the cells below.
dataset = 'compass'
hpos = hpo_cosmos

In [33]:
if results_exist(method, dataset):
    # A previous run already pickled its jobs: reload them so `jobs` is defined
    # for the cells below instead of being undefined or left over from another dataset.
    jobs = load_jobs(method, dataset)
else:
    jobs = []
    # All submissions inside the batch context are sent as a single job array.
    with executor.batch():
        for args in hpos:
            # Clone the base config per job instead of rebinding `cfg`, so the
            # base config is not modified and this cell can be re-run safely.
            job_cfg = cfg.clone()
            job_cfg.merge_from_list(args)
            job = executor.submit(main, rank=0, world_size=1, method_name=method, cfg=job_cfg, tag='hpo')
            jobs.append(job)

In [36]:
print(percent_finished(jobs))
jobs_failed(jobs)

1.0


[]

In [37]:
if not results_exist(method, dataset):
    save_jobs(method, dataset, jobs)

In [50]:
jobs = load_jobs(method, dataset)
get_optimal_cfg(jobs, max_dist[dataset])

3.1536010422661493 0.5258129107968698 5843785_0
3.154472635262012 0.5275914213931486 5843785_1
3.16375872914587 0.5347681938254868 5843785_21
3.2447071564995342 0.5677434719156779 5843785_76
3.3910956813849893 0.6618628130080276 5843785_80
Best job: 5843785_80


CfgNode({'dataset': 'compass', 'dim': (20,), 'objectives': ['BinaryCrossEntropyLoss', 'ddp'], 'task_ids': [], 'ignore_index': -100, 'epochs': 20, 'num_workers': 4, 'checkpoint_every': 0, 'lr_scheduler': 'none', 'lr': 0.001, 'batch_size': 256, 'pmtl': CfgNode({'num_starts': 5, 'lr_scheduler': 'none', 'lr': 0.001}), 'mgda': CfgNode({'approximate_norm_solution': False, 'normalization_type': 'loss', 'lr_scheduler': 'none', 'lr': 0.001}), 'phn': CfgNode({'alpha': 0.2, 'internal_solver': 'linear', 'lr_scheduler': 'none', 'lr': 0.001}), 'single_task': CfgNode({'task_id': None, 'lr_scheduler': 'none', 'lr': 0.001}), 'cosmos': CfgNode({'alpha': [0.5257256444318705, 0.3169155612008374], 'lamda': 0.0, 'normalize': False, 'instances': False, 'lr_scheduler': 'none', 'lr': 0.005842681096339225}), 'seed': 1, 'logdir': 'results', 'n_partitions': 24, 'eval_every': 20, 'train_eval_every': 0, 'reference_point': [2, 2], 'device': 'cuda', 'metrics': None})

#### Credit

In [62]:
# Work on a copy so the Credit base config stays untouched.
cfg = credit_cfg.clone()
cfg.epochs = epochs_tabular
# eval_every equals epochs — presumably so evaluation runs once, at the end of training (confirm in main).
cfg.eval_every = epochs_tabular
# `dataset` selects the pickle file name and the max_dist entry used by the cells below.
dataset = 'credit'
hpos = hpo_cosmos

In [64]:
if results_exist(method, dataset):
    # A previous run already pickled its jobs: reload them so `jobs` is defined
    # for the cells below instead of being undefined or left over from another dataset.
    jobs = load_jobs(method, dataset)
else:
    jobs = []
    # All submissions inside the batch context are sent as a single job array.
    with executor.batch():
        for args in hpos:
            # Clone the base config per job instead of rebinding `cfg`, so the
            # base config is not modified and this cell can be re-run safely.
            job_cfg = cfg.clone()
            job_cfg.merge_from_list(args)
            job = executor.submit(main, rank=0, world_size=1, method_name=method, cfg=job_cfg, tag='hpo')
            jobs.append(job)

In [66]:
print(percent_finished(jobs))
jobs_failed(jobs)

1.0


[]

In [67]:
if not results_exist(method, dataset):
    save_jobs(method, dataset, jobs)

In [69]:
jobs = load_jobs(method, dataset)
get_optimal_cfg(jobs, max_dist[dataset])

3.1319100380976765 0.6987541223870782 5844367_0
3.1356159286184395 0.7085256784660161 5844367_10
3.140301658808066 0.7033411714036643 5844367_13
3.1478612104709147 0.7112588979654137 5844367_34
3.1484955621693826 0.7270503770037596 5844367_66
3.154545607583958 0.7136027191485462 5844367_76
Best job: 5844367_76


CfgNode({'dataset': 'credit', 'dim': (90,), 'objectives': ['BinaryCrossEntropyLoss', 'ddp'], 'task_ids': [], 'ignore_index': -100, 'epochs': 20, 'num_workers': 4, 'checkpoint_every': 0, 'lr_scheduler': 'none', 'lr': 0.001, 'batch_size': 256, 'pmtl': CfgNode({'num_starts': 5, 'lr_scheduler': 'none', 'lr': 0.001}), 'mgda': CfgNode({'approximate_norm_solution': False, 'normalization_type': 'loss', 'lr_scheduler': 'none', 'lr': 0.001}), 'phn': CfgNode({'alpha': 0.2, 'internal_solver': 'linear', 'lr_scheduler': 'none', 'lr': 0.001}), 'single_task': CfgNode({'task_id': None, 'lr_scheduler': 'none', 'lr': 0.001}), 'cosmos': CfgNode({'alpha': [0.461394878836116, 0.4575066947045334], 'lamda': 0.0, 'normalize': False, 'instances': False, 'lr_scheduler': 'none', 'lr': 0.007161717085944733}), 'seed': 1, 'logdir': 'results', 'n_partitions': 24, 'eval_every': 20, 'train_eval_every': 0, 'reference_point': [2, 2], 'device': 'cuda', 'metrics': None})

#### Multi MNIST

In [75]:
# Work on a copy so the Multi-MNIST base config stays untouched.
cfg = mm_cfg.clone()
cfg.epochs = epochs_mnist
# eval_every equals epochs — presumably so evaluation runs once, at the end of training (confirm in main).
cfg.eval_every = epochs_mnist

# For Multi-MNIST the learning-rate scheduler is searched in addition to lr, alpha and
# normalize. cosmos.lamda is not part of the search and keeps its config value; add
# `*cosmos_lamdas` to the zip to include it.
# NOTE: this rebinds `hpo_cosmos`, so cells above that use it see this variant if re-run afterwards.
hpo_cosmos = list(zip(*cosmos_lrs, *cosmos_alphas, *cosmos_scheds, *cosmos_norm))
method = 'cosmos'
dataset = 'mm'
hpos = hpo_cosmos

In [50]:
if results_exist(method, dataset):
    # A previous run already pickled its jobs: reload them so `jobs` is defined
    # for the cells below instead of being undefined or left over from another dataset.
    jobs = load_jobs(method, dataset)
else:
    jobs = []
    # All submissions inside the batch context are sent as a single job array.
    with executor.batch():
        for args in hpos:
            # Clone the base config per job instead of rebinding `cfg`, so the
            # base config is not modified and this cell can be re-run safely.
            job_cfg = cfg.clone()
            job_cfg.merge_from_list(args)
            # Same keyword call as the tabular searches: get_optimal_cfg reads
            # submission().kwargs['cfg'], which a positional call would not populate.
            job = executor.submit(main, rank=0, world_size=1, method_name=method, cfg=job_cfg, tag='hpo')
            jobs.append(job)

In [86]:
print(percent_finished(jobs))

1.0


In [57]:
if not results_exist(method, dataset):
    save_jobs(method, dataset, jobs)

In [58]:
cosmos_mm_jobs = load_jobs(method, dataset)

In [35]:
jobs_failed(jobs)

[]

In [38]:
jobs[53].result()

FailedJobError: Job (task=0) failed during processing with trace:
----------------------
Traceback (most recent call last):
  File "/home/ruchtem/dev/venvs/base/lib/python3.8/site-packages/submitit/core/submission.py", line 53, in process_job
    result = delayed.result()
  File "/home/ruchtem/dev/venvs/base/lib/python3.8/site-packages/submitit/core/utils.py", line 128, in result
    self._result = self.function(*self.args, **self.kwargs)
  File "/home/ruchtem/dev/moo/multi_objective/main.py", line 192, in main
    torch.cuda.set_device(rank)
  File "/home/ruchtem/dev/venvs/base/lib/python3.8/site-packages/torch/cuda/__init__.py", line 263, in set_device
    torch._C._cuda_setDevice(device)
  File "/home/ruchtem/dev/venvs/base/lib/python3.8/site-packages/torch/cuda/__init__.py", line 172, in _lazy_init
    torch._C._cuda_init()
RuntimeError: CUDA driver initialization failed, you might not have a CUDA gpu.

----------------------
You can check full logs with 'job.stderr(0)' and 'job.stdout(0)'or at paths:
  - /home/ruchtem/dev/moo/tmp/submitit_hpo/5840962_53_0_log.err
  - /home/ruchtem/dev/moo/tmp/submitit_hpo/5840962_53_0_log.out

In [137]:
# NOTE(review): 0.7 is hard-coded here, whereas the other sections pass max_dist[dataset].
# The recorded output is an 'adult' config, so `jobs` was presumably left over from
# another section when this cell ran — confirm before relying on it.
get_optimal_cfg(jobs, 0.7)

3.3448198535973948 0.6927948936538749 5792962_0
3.3507989340281177 0.6999448141847925 5792962_16
3.353983636374194 0.6528753636249982 5792962_80
Best job: 5792962_80


CfgNode({'dataset': 'adult', 'dim': (88,), 'objectives': ['BinaryCrossEntropyLoss', 'ddp'], 'task_ids': [], 'ignore_index': -100, 'epochs': 20, 'num_workers': 4, 'checkpoint_every': 0, 'lr_scheduler': 'none', 'lr': 0.001, 'batch_size': 256, 'pmtl': CfgNode({'num_starts': 5, 'lr_scheduler': 'none', 'lr': 0.001}), 'mgda': CfgNode({'approximate_norm_solution': False, 'normalization_type': 'loss', 'lr_scheduler': 'none', 'lr': 0.001}), 'phn': CfgNode({'alpha': 0.2, 'internal_solver': 'linear', 'lr_scheduler': 'none', 'lr': 0.001}), 'single_task': CfgNode({'task_id': None, 'lr_scheduler': 'none', 'lr': 0.001}), 'cosmos': CfgNode({'alpha': [0.5257256444318705, 0.3169155612008374], 'lamda': 0.0, 'normalize': False, 'instances': True, 'lr_scheduler': 'none', 'lr': 0.005842681096339225}), 'seed': 1, 'logdir': 'results', 'n_partitions': 24, 'eval_every': 20, 'train_eval_every': 0, 'reference_point': [2, 2], 'device': 'cuda', 'metrics': None})

In [104]:
jobs[33].submission().kwargs['cfg']

CfgNode({'dataset': 'adult', 'dim': (88,), 'objectives': ['BinaryCrossEntropyLoss', 'ddp'], 'task_ids': [], 'ignore_index': -100, 'epochs': 20, 'num_workers': 4, 'checkpoint_every': 0, 'lr_scheduler': 'none', 'lr': 0.001, 'batch_size': 256, 'pmtl': CfgNode({'num_starts': 5, 'lr_scheduler': 'none', 'lr': 0.001}), 'mgda': CfgNode({'approximate_norm_solution': False, 'normalization_type': 'loss', 'lr_scheduler': 'none', 'lr': 0.001}), 'phn': CfgNode({'alpha': 0.2, 'internal_solver': 'linear', 'lr_scheduler': 'none', 'lr': 0.001}), 'single_task': CfgNode({'task_id': None, 'lr_scheduler': 'none', 'lr': 0.001}), 'cosmos': CfgNode({'alpha': [1.7328161368791897, 0.853738192417888], 'lamda': 0.0, 'normalize': False, 'instances': False, 'lr_scheduler': 'none', 'lr': 0.0011650124592172252}), 'seed': 1, 'logdir': 'results', 'n_partitions': 24, 'eval_every': 20, 'train_eval_every': 0, 'reference_point': [2, 2], 'device': 'cuda', 'metrics': None})

In [32]:
# NOTE(review): `jobs_l` is not defined in any cell visible in this notebook; this cell
# fails on a fresh kernel unless the name is defined elsewhere.
jobs_l[0].result()

(3.02712881565094, 0)

In [13]:
def add(a, b):
    """Return ``a + b``; a minimal payload for trying out job submission."""
    total = a + b
    return total

In [14]:
job = executor.submit(add, 5, 7)  # will compute add(5, 7)

In [15]:
print(job.job_id)  # ID of your job

5359691


In [16]:
job.result()

In [17]:
job.submission().args

12