In [52]:
import submitit

import torch
import random
import numpy as np

import pickle
import itertools
import argparse
import logging
import os
import pathlib
import time
import json
import math
import matplotlib.pyplot as plt
from torch.utils import data
from fvcore.common.config import CfgNode

In [53]:
from multi_objective.main import main, get_config

In [59]:
executor = submitit.AutoExecutor(folder="tmp/submitit_hpo")

# Prepare the globals

In [31]:
# Base configuration for each dataset, read from the YAML files in configs/.
adult_cfg = get_config('configs/adult.yaml')
compass_cfg = get_config('configs/compass.yaml')
mm_cfg = get_config('configs/multi_mnist.yaml')
mf_cfg = get_config('configs/multi_fashion.yaml')
mfm_cfg = get_config('configs/multi_fashion_mnist.yaml')

# Methods covered by the search.
methods = ['cosmos', 'mgda']

# num_evaluations: how many hyperparameter configurations are sampled.
# J: how many alpha values are sampled per configuration
#    (presumably the number of objectives -- TODO confirm).
num_evaluations, J = 100, 2

# Per-dataset thresholds handed to get_optimal_cfg as `min_angle`;
# runs whose angle is at or below the threshold are not eligible.
min_angles = dict(adult=0.02, compass=0.0055, mm=0.005)

# Training length per dataset family.
epochs_tabular, epochs_mnist = 20, 50

In [5]:
# Search space.
# lr and lamda bounds are stored in log space (the sampling cell draws uniformly
# between them and exponentiates); alpha bounds are used as-is.
lr_range = (np.log(0.0001), np.log(0.01))
lamda_range = (np.log(0.2), np.log(5))
alpha_range = (0.2, 1.5)

# Categorical options.
scheduler_choice = ['none', 'MultiStep', 'CosineAnnealing']
mgda_norm_choice = ['none', 'l2', 'loss', 'loss+']
phn_solver = ['linear', 'epo']

Sample the hyperparameters

In [6]:
# Draw every hyperparameter from the global NumPy RNG with a fixed seed.
# The order of the draws below determines the sampled values, so keep it stable.
np.random.seed(1)

# Continuous hyperparameters: lr and lamda are drawn uniformly in log space and
# mapped back with exp; alphas are drawn uniformly, J values per configuration.
learning_rates = np.exp(np.random.uniform(lr_range[0], lr_range[1], size=(num_evaluations,)))
lamdas = np.exp(np.random.uniform(lamda_range[0], lamda_range[1], size=(num_evaluations,)))
alphas = np.random.uniform(alpha_range[0], alpha_range[1], size=(num_evaluations, J))

# Categorical hyperparameters: one independent pick per configuration.
schedulers = np.random.choice(scheduler_choice, size=num_evaluations)
mgda_norms = np.random.choice(mgda_norm_choice, size=num_evaluations)
phn_solvers = np.random.choice(phn_solver, size=num_evaluations)

print(f"Num evals: {len(learning_rates)}")

Num evals: 100


In [7]:
def convert_hp(array, arg, dtype):
    """Pair a config key with sampled values cast to plain Python types.

    Args:
        array: Sequence of sampled values. Entries that are ``np.ndarray``
            (e.g. rows of a 2-D array) are converted element-wise into lists.
        arg: Config key that every value belongs to, e.g. ``'cosmos.lr'``.
        dtype: Callable applied to each scalar, e.g. ``float`` or ``str``.

    Returns:
        A tuple ``(keys, values)``: ``keys`` is ``arg`` repeated
        ``len(array)`` times and ``values`` holds the converted entries.
    """
    count = len(array)

    def _cast(entry):
        # Array-valued entries become lists of converted scalars.
        if isinstance(entry, np.ndarray):
            return [dtype(item) for item in entry]
        return dtype(entry)

    values = [_cast(entry) for entry in array]
    keys = [arg] * count
    return keys, values

In [8]:
# Each convert_hp call returns a (keys, values) pair for one config entry,
# with the sampled values cast to plain Python types.
cosmos_lrs = convert_hp(learning_rates, 'cosmos.lr', float)
cosmos_scheds = convert_hp(schedulers, 'cosmos.lr_scheduler', str)
cosmos_lamdas = convert_hp(lamdas, 'cosmos.lamda', float)
cosmos_alphas = convert_hp(alphas, 'cosmos.alpha', float)

mgda_lrs = convert_hp(learning_rates, 'mgda.lr', float)
mgda_scheds = convert_hp(schedulers, 'mgda.lr_scheduler', str)
# `mgda_norms` is rebound from the sampled ndarray to the converted pair.
# Convert only while it is still the raw array: re-running this cell would
# otherwise pass the already-converted pair back into convert_hp and silently
# produce stringified lists.
if isinstance(mgda_norms, np.ndarray):
    mgda_norms = convert_hp(mgda_norms, 'mgda.normalization_type', str)

pmtl_lrs = convert_hp(learning_rates, 'pmtl.lr', float)
pmtl_scheds = convert_hp(schedulers, 'pmtl.lr_scheduler', str)



In [37]:
def percent_finished(jobs):
    """Return the fraction of ``jobs`` whose ``done()`` is truthy.

    Despite the name, the result is a fraction between 0 and 1, not a
    percentage. Raises ``ZeroDivisionError`` when ``jobs`` is empty.
    """
    finished = 0
    for job in jobs:
        finished += job.done()
    return finished / len(jobs)


def save_jobs(method, dataset, jobs):
    """Pickle ``jobs`` to ``{method}_{dataset}_hpo.pickle`` in the working directory.

    An existing file with the same name is overwritten.
    """
    target = f'{method}_{dataset}_hpo.pickle'
    with open(target, 'wb') as handle:
        pickle.dump(jobs, handle)

        
def load_jobs(method, dataset):
    """Load and return the object pickled in ``{method}_{dataset}_hpo.pickle``.

    The file is read from the working directory; a missing file raises the
    usual ``open`` error.
    """
    source = f'{method}_{dataset}_hpo.pickle'
    # NOTE(review): unpickling runs arbitrary code; only load files written by
    # this notebook's own save_jobs.
    with open(source, 'rb') as handle:
        restored = pickle.load(handle)
    return restored


def jobs_failed(jobs):
    """Return the positions in ``jobs`` of every job whose ``state`` equals ``'FAILED'``."""
    return [position for position, job in enumerate(jobs) if job.state == 'FAILED']


def results_exist(method, dataset):
    """Report whether ``{method}_{dataset}_hpo.pickle`` exists in the working directory.

    Prints ``skipping.`` as a side effect when the file is present.
    """
    if os.path.exists(f'{method}_{dataset}_hpo.pickle'):
        print('skipping.')
        return True
    return False

In [10]:
# Per-job settings: 20-minute timeout, one GPU per node, job name 'hpo',
# submitted to the "ml_gpu-rtx2080" SLURM partition (cluster-specific value).
executor.update_parameters(timeout_min=20, slurm_partition="ml_gpu-rtx2080", name='hpo', gpus_per_node=1)
# Presumably caps how many tasks of a job array run at once -- confirm in the submitit docs.
executor.update_parameters(slurm_array_parallelism=35)

## COSMOS

In [11]:
# Each converted hyperparameter is a (keys, values) pair. Unpacking the pairs
# into zip interleaves them, so every entry is one flat override tuple:
# (key, value, key, value, ...), one entry per sampled configuration.
method = 'cosmos'
hpo_cosmos = list(
    zip(
        *cosmos_lrs,
        *cosmos_lamdas,
        *cosmos_alphas,
    )
)

#### Adult

In [39]:
# Search target for this section.
dataset = 'adult'
hpos = hpo_cosmos

# Work on a copy of the base config; eval_every is set equal to epochs.
cfg = adult_cfg.clone()
cfg.epochs = cfg.eval_every = epochs_tabular

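Run the search: submit one job per sampled hyperparameter configuration. Submission is skipped when a pickle with the results for this method and dataset already exists.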

In [42]:
if results_exist(method, dataset):
    # Jobs for this method/dataset were pickled by an earlier run. Reload them
    # so `jobs` is defined and matches this section; otherwise later cells see
    # an undefined name or the previous section's jobs.
    jobs = load_jobs(method, dataset)
else:
    jobs = []
    # Submissions made inside the batch context are sent together.
    with executor.batch():
        for args in hpos:
            # Clone from the untouched base cfg for every job, so overrides do
            # not accumulate across iterations or re-runs of this cell.
            job_cfg = cfg.clone()
            job_cfg.merge_from_list(args)
            jobs.append(executor.submit(main, method, job_cfg, 'hpo'))

skipping.


In [None]:
print(percent_finished(jobs))

In [40]:
if not results_exist(method, dataset):
    save_jobs(method, dataset, jobs)

skipping.


In [41]:
cosmos_adult_jobs = load_jobs(method, dataset)

#### Compass

In [44]:
# Search target for this section.
dataset = 'compass'
hpos = hpo_cosmos

# Work on a copy of the base config; eval_every is set equal to epochs.
cfg = compass_cfg.clone()
cfg.epochs = cfg.eval_every = epochs_tabular

In [33]:
if results_exist(method, dataset):
    # Jobs for this method/dataset were pickled by an earlier run. Reload them
    # so `jobs` is defined and matches this section; otherwise later cells see
    # an undefined name or the previous section's jobs.
    jobs = load_jobs(method, dataset)
else:
    jobs = []
    # Submissions made inside the batch context are sent together.
    with executor.batch():
        for args in hpos:
            # Clone from the untouched base cfg for every job, so overrides do
            # not accumulate across iterations or re-runs of this cell.
            job_cfg = cfg.clone()
            job_cfg.merge_from_list(args)
            jobs.append(executor.submit(main, method, job_cfg, 'hpo'))

In [19]:
print(percent_finished(jobs))

NameError: name 'cosmos_compass_jobs' is not defined

In [40]:
if not results_exist(method, dataset):
    save_jobs(method, dataset, jobs)

In [None]:
cosmos_compass_jobs = load_jobs(method, dataset)

#### Multi MNIST

In [48]:
# Search target for this section.
method = 'cosmos'
dataset = 'mm'

# Work on a copy of the base config; eval_every is set equal to epochs.
cfg = mm_cfg.clone()
cfg.epochs = cfg.eval_every = epochs_mnist

# For this dataset the lr scheduler is searched as well, so `hpo_cosmos` is
# rebuilt with the scheduler pair appended to every override tuple.
hpo_cosmos = list(
    zip(
        *cosmos_lrs,
        *cosmos_lamdas,
        *cosmos_alphas,
        *cosmos_scheds,
    )
)
hpos = hpo_cosmos

In [15]:
if results_exist(method, dataset):
    # Jobs for this method/dataset were pickled by an earlier run. Reload them
    # so `jobs` is defined and matches this section; otherwise later cells see
    # an undefined name or the previous section's jobs.
    jobs = load_jobs(method, dataset)
else:
    jobs = []
    # Submissions made inside the batch context are sent together.
    with executor.batch():
        for args in hpos:
            # Clone from the untouched base cfg for every job, so overrides do
            # not accumulate across iterations or re-runs of this cell.
            job_cfg = cfg.clone()
            job_cfg.merge_from_list(args)
            jobs.append(executor.submit(main, method, job_cfg, 'hpo'))

In [21]:
print(percent_finished(jobs))

1.0


In [27]:
if not results_exist(method, dataset):
    save_jobs(method, dataset, jobs)

In [55]:
cosmos_mm_jobs = load_jobs(method, dataset)

In [56]:
jobs_failed(cosmos_mm_jobs)

[]

In [57]:
def get_optimal_cfg(jobs, min_angle=None):
    """Return the config of the job with the highest hypervolume.

    Each job's ``result()`` is unpacked as ``(hv, angle)``. When ``min_angle``
    is given, jobs with ``angle <= min_angle`` are not eligible. A job only
    becomes the current best if its ``hv`` is strictly greater than the best
    seen so far, starting from 0; ties keep the earlier job. Every improvement
    is printed as ``hv angle job_id``.

    Args:
        jobs: Indexable sequence of job objects providing ``result()``,
            ``job_id`` and ``submission()``.
        min_angle: Optional exclusive lower bound on the angle.

    Returns:
        ``submission().args[1]`` of the best job, i.e. the second positional
        argument the job was submitted with (the cfg for jobs submitted as
        ``main, method, cfg, 'hpo'``).

    Raises:
        AssertionError: If no eligible job has ``hv > 0``.
        Errors raised by ``job.result()`` propagate unchanged.
    """
    best_index, best_hv = None, 0

    for index, job in enumerate(jobs):
        hv, angle = job.result()
        rejected = min_angle is not None and angle <= min_angle
        if not rejected and hv > best_hv:
            print(hv, angle, job.job_id)
            best_index, best_hv = index, hv

    assert best_index is not None, "No optimal cfg found"
    winner = jobs[best_index]
    print(f'Best job: {winner.job_id}')
    return winner.submission().args[1]

In [58]:
get_optimal_cfg(cosmos_mm_jobs, min_angles['mm'])

UncompletedJobError: Job 5524323_0 (task: 0) with path /home/ruchtem/dev/moo/tmp/5524323_0_0_result.pkl
has not produced any output (state: COMPLETED)
No output/error stream produced ! Check: /home/ruchtem/dev/moo/tmp/5524323_0_0_log.out

In [32]:
# NOTE(review): `jobs_l` is not defined in any visible cell -- presumably
# leftover kernel state from a deleted cell; define it or remove this cell.
jobs_l[0].result()

(3.02712881565094, 0)

In [13]:
def add(a, b):
    """Return ``a + b``; a minimal function for trying out job submission."""
    total = a + b
    return total

In [14]:
job = executor.submit(add, 5, 7)  # will compute add(5, 7)

In [15]:
print(job.job_id)  # ID of your job

5359691


In [16]:
job.result()

In [17]:
job.submission().args

12