In [31]:
import submitit

import torch
import random
import numpy as np

import pickle
import itertools
import argparse
import logging
import os
import pathlib
import time
import json
import math
import matplotlib.pyplot as plt
from torch.utils import data
from fvcore.common.config import CfgNode
from submitit.core.utils import FailedJobError, UncompletedJobError
from itertools import cycle
from time import sleep

# Fix the seed of Python's `random` module so that the random.sample() draw of
# hyperparameter configurations in execute() is the same on every fresh run.
random.seed(1)

In [32]:
from multi_objective.main import main, get_config

In [33]:
# Shared submitit executor used by execute(); "tmp/submitit_hpo" is the folder
# handed to submitit for this hyperparameter search.
executor = submitit.AutoExecutor(folder="tmp/submitit_hpo")

# Prepare the globals

In [34]:
# Number of configurations execute() draws from the search grid; it is read as
# a global there and is re-assigned further down in the notebook.
num_evaluations = 100

In [35]:
# Search spaces. Each one is a single-entry dict {config key: candidate values};
# execute() reads the first key and the first value list of every dict and
# builds the cartesian product of the selected spaces.
lr           = {'lr': [0.01, 0.0075, 0.005, 0.0025, 0.001, 0.00075, 0.0005, 0.00025, 0.0001]}
weight_decay = {'weight_decay': [0.1, 0.25, 0.05, 0.075, 0.01, 0.0075, 0.005, 0.0025, 0.001, 0.00075, 0.0005, 0.00025, 0.0001]}
scheduler    = {'lr_scheduler': ['none', 'MultiStep', 'CosineAnnealing']}

# phn (alpha is also part of the search space of the cosmos runs below)
solver = {'internal_solver_phn': ['linear', 'epo']}
alpha  = {'alpha': [0.1, 0.2, 0.5, 1., 1.2, 1.5]} 

# mgda
norm = {'normalization_type': ['none', 'loss', 'loss+', 'l2']}

# cosmos
lamda = {'lamda': [1., 2., 4., 8., 16.]}

In [36]:
def percent_finished(jobs):
    """Return the fraction of ``jobs`` that are done.

    Despite the name, the value is a fraction in ``[0, 1]``, not a percentage.

    Args:
        jobs: Sized iterable of objects exposing a ``done()`` method.

    Returns:
        ``(number of jobs whose done() is truthy) / len(jobs)``. An empty
        collection is reported as fully finished (``1.0``) instead of falling
        through and returning ``None``, so callers can always compare the
        result with a number.
    """
    if not len(jobs):
        # Nothing to wait for: treat "no jobs" as complete.
        return 1.0
    return sum(job.done() for job in jobs) / len(jobs)


def save_jobs(method, dataset, jobs, tag='hpo', force=False):
    """Pickle ``jobs`` to ``pickles/{method}_{dataset}_{tag}.pickle``.

    Args:
        method: Method name used as the first part of the file name.
        dataset: Dataset name used as the second part of the file name.
        jobs: Object to pickle (the list of submitted jobs).
        tag: Suffix of the file name.
        force: When true, overwrite an existing pickle.

    If the pickle already exists and ``force`` is false, nothing is written and
    ``'skipping'`` is printed.
    """
    if results_exist(method, dataset, tag) and not force:
        print('skipping')
        return

    # open(..., 'wb') does not create parent directories. Without this, a
    # missing pickles/ folder raises only after all jobs have finished, and the
    # job handles are lost.
    os.makedirs('pickles', exist_ok=True)
    with open(f'pickles/{method}_{dataset}_{tag}.pickle', 'wb') as f:
        pickle.dump(jobs, f)

        
def load_jobs(method, dataset, tag='hpo'):
    """Read back the object stored in ``pickles/{method}_{dataset}_{tag}.pickle``.

    Args:
        method: Method part of the file name.
        dataset: Dataset part of the file name.
        tag: Suffix of the file name.

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code; only load files written by this
        notebook.
    """
    pickle_path = f'pickles/{method}_{dataset}_{tag}.pickle'
    with open(pickle_path, 'rb') as handle:
        stored = pickle.load(handle)
    return stored


def jobs_failed(jobs):
    """Return the positions of all jobs whose ``state`` equals ``'FAILED'``.

    Args:
        jobs: Iterable of objects exposing a ``state`` attribute.

    Returns:
        List of zero-based indices, in iteration order.
    """
    return [position for position, job in enumerate(jobs) if job.state == 'FAILED']


def results_exist(method, dataset, tag='hpo'):
    """Tell whether ``pickles/{method}_{dataset}_{tag}.pickle`` is present on disk."""
    pickle_path = f'pickles/{method}_{dataset}_{tag}.pickle'
    return os.path.exists(pickle_path)


def get_optimal_cfg(jobs, max_dist=None):
    """Select the job with the largest ``hv`` and return its hyperparameters.

    Each usable job's ``result()`` is unpacked as ``(hv, dist)``. The job with
    the largest ``hv`` wins; ``hv`` must be strictly greater than 0 to be
    considered. Every time a new best is found, ``hv``, ``dist`` and the job id
    are printed.

    Args:
        jobs: Sequence of job objects exposing ``state``, ``result()``,
            ``job_id`` and ``submission()``.
        max_dist: Optional upper bound; jobs whose ``dist`` exceeds it are
            ignored. ``None`` disables the filter.

    Returns:
        Dict with the keys ``scheduler``, ``lr``, ``weight_decay``, ``solver``,
        ``norm``, ``alpha``, ``lamda`` (``None`` if the config has no such
        key) and ``channel_multiplier``, read from the winning job's config.

    Raises:
        AssertionError: If no job qualifies.
    """
    # States in which a job has no result to read. The original check covered
    # only FAILED and TIMEOUT, so e.g. a cancelled job raised in job.result()
    # and aborted the whole analysis.
    states_without_result = {'FAILED', 'TIMEOUT', 'CANCELLED', 'NODE_FAIL', 'OUT_OF_MEMORY'}

    idx_best = None
    hv_best = 0

    for i, job in enumerate(jobs):
        # Compare on the first word only: Slurm may report a cancelled job as
        # "CANCELLED by <uid>" -- NOTE(review): confirm against the cluster's
        # sacct output.
        state_words = str(job.state).split()
        if state_words and state_words[0] in states_without_result:
            continue

        hv, dist = job.result()

        if max_dist is not None and dist > max_dist:
            continue

        if hv > hv_best:
            print(hv, dist, job.job_id)
            hv_best = hv
            idx_best = i
    assert idx_best is not None, "No optimal cfg found"
    print(f'Best job: {jobs[idx_best].job_id}')
    # execute() submits main(rank, world_size, cfg, tag), so the config is the
    # third positional argument of the submission.
    cfg = jobs[idx_best].submission().args[2]
    return {
        'scheduler': cfg.lr_scheduler,
        'lr': cfg.lr,
        'weight_decay': cfg.weight_decay,
        'solver': cfg.internal_solver_phn,
        'norm': cfg.normalization_type,
        'alpha': cfg.alpha,
        'lamda': cfg.lamda if 'lamda' in cfg else None,
        'channel_multiplier': cfg.channel_multiplier,
    }

In [37]:
def execute(config, hp_spaces, force=False, tag='hpo'):
    """Submit a random subset of the hyperparameter grid as a job array.

    Args:
        config: Base config; it is cloned, never modified. ``config.method``
            and ``config.dataset`` identify the stored results, and
            ``config.task_id`` is read for single-task runs.
        hp_spaces: List of single-entry dicts ``{config key: candidate values}``.
            Their cartesian product forms the grid.
        force: Submit even if results for this run are already stored.
        tag: Results tag; also the prefix of each job's tag
            (``"{tag}_000"``, ``"{tag}_001"``, ...).

    Returns:
        The list of jobs returned by ``executor.map_array``, or ``[]`` when the
        run is skipped because results already exist.

    Relies on the notebook globals ``executor``, ``num_evaluations`` and
    ``main``.
    """
    # Single-task results are stored as "<method>_<task_id>" (see the naming in
    # execute_and_save). Look them up under that same name; with the bare
    # method name the check never matched for single-task runs.
    method_name = config.method
    if method_name == 'single_task':
        method_name = config.method + "_" + config.task_id

    if not force and results_exist(method_name, config.dataset, tag):
        print('skipping. Use force=True to enforce execution')
        return []

    base_cfg = config.clone()
    base_cfg.eval_every = 100  # NOTE(review): unit of eval_every is not visible here -- confirm

    # Every space contributes its first key and that key's list of candidates.
    names = [next(iter(space)) for space in hp_spaces]
    candidates = [space[name] for name, space in zip(names, hp_spaces)]

    # Flat [key, value, key, value, ...] lists, the format merge_from_list takes.
    configurations = []
    for values in itertools.product(*candidates):
        flat = []
        for name, value in zip(names, values):
            flat.extend((name, value))
        configurations.append(flat)

    # random.sample raises ValueError when asked for more items than exist;
    # fall back to the full grid in that case.
    sample_size = min(num_evaluations, len(configurations))
    configurations = random.sample(configurations, sample_size)

    cfgs = []
    for args in configurations:
        # Always start from the base config, not from the previous trial.
        trial_cfg = base_cfg.clone()
        trial_cfg.merge_from_list(args)
        cfgs.append(trial_cfg)

    tags = [f"{tag}_{i :03d}" for i in range(len(cfgs))]

    # func, rank, world_size, cfg, tag
    return executor.map_array(main, cycle([0]), cycle([1]), cfgs, tags)

In [38]:
def execute_and_save(config, hp_spaces, force=False, tag='hpo'):
    """Run the search for ``config``, wait for all jobs, and pickle them.

    Args:
        config: Base config, passed on to ``execute``. ``config.method``,
            ``config.dataset`` and, for single-task runs, ``config.task_id``
            determine the name of the results file.
        hp_spaces: List of single-entry search-space dicts, passed to ``execute``.
        force: Re-run and overwrite even if results are already stored.
        tag: Results tag, passed to ``execute`` and ``save_jobs``.

    Returns:
        The list of submitted jobs, or ``[]`` if the run was skipped.
    """
    method_name = config.method + "_" + config.task_id if config.method == 'single_task' else config.method

    # Check for stored results under the name they are saved with. Otherwise
    # single-task runs were executed in full and then dropped by save_jobs
    # with 'skipping'.
    if not force and results_exist(method_name, config.dataset, tag):
        print('skipping. Use force=True to enforce execution')
        return []

    jobs = execute(config, hp_spaces, force, tag)
    if not jobs:
        # execute() skipped the run; never pickle an empty job list.
        return jobs

    # Poll every 10 seconds until every job reports done.
    while percent_finished(jobs) < 1:
        sleep(10)

    save_jobs(method_name, config.dataset, jobs, force=force, tag=tag)
    return jobs

In [39]:
# Default resources for every HPO job: 35-minute time limit, one GPU, the
# "alldlc_gpu-rtx2080" Slurm partition, job name "hpo".
executor.update_parameters(timeout_min=35, slurm_partition="alldlc_gpu-rtx2080", name='hpo', gpus_per_node=1)
# Array parallelism of 100 matches num_evaluations above.
# NOTE(review): presumably this caps concurrently running array tasks -- confirm in the submitit docs.
executor.update_parameters(slurm_array_parallelism=100)

In [10]:
# executor.update_parameters(slurm_exclude='dlcgpu09')

## Analyze

In [29]:
jobs = load_jobs('single_task_r', 'multi_mnist', 'hpo_size_10')
get_optimal_cfg(jobs)

0.3452180310415267 99 7244714_0
0.48006205162478555 99 7244714_1
0.4875842712136915 99 7244714_3
0.5395834475087122 99 7244714_6
0.5446465594140557 99 7244714_7
0.5749713747625675 99 7244714_12
0.5839264198008023 99 7244714_26
Best job: 7244714_26


{'scheduler': 'MultiStep',
 'lr': 0.00025,
 'weight_decay': 0.01,
 'solver': 'linear',
 'norm': 'none',
 'alpha': 0.2,
 'lamda': 0.0,
 'channel_multiplier': 10.0}

#### COSMOS original

In [11]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/cosmos_orig.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

In [18]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/cosmos_orig.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

In [19]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/cosmos_orig.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

#### COSMOS

In [17]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/cosmos.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
        alpha,
        lamda,
    ])

In [20]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/cosmos.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
        alpha,
        lamda,
    ])

In [21]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/cosmos.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
        alpha,
        lamda,
    ])

#### MGDA

In [10]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/mgda.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
        norm,
    ])

In [11]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/mgda.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
        norm,
    ])

In [12]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/mgda.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
        norm,
    ])

#### PHN original

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/phn_orig.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/phn_orig.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/phn_orig.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

#### PHN

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/phn.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
        solver,
        alpha,
    ])

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/phn.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
        solver,
        alpha,
    ])

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/phn.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
        solver,
        alpha,
    ])

#### PMTL

In [None]:
# The pmtl runs below get a 120-minute limit instead of the 35 minutes set earlier.
# The executor is shared, so this stays in effect for later cells until changed again.
executor.update_parameters(timeout_min=120)

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/pmtl.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/pmtl.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/pmtl.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

#### Single task

In [28]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/single_task_1.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

In [29]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/single_task_2.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

In [30]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/single_task_1.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

In [31]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/single_task_2.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

In [32]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/single_task_1.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

In [33]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/single_task_2.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

#### Uniform

In [59]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/uniform.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

In [60]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/uniform.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

In [61]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/uniform.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ])

# Different size

In [40]:
# Settings for the model-size experiments below: 400-minute time limit per job.
executor.update_parameters(timeout_min=400)
# Keep jobs off node dlcgpu09 (the reason is not recorded in this notebook).
executor.update_parameters(slurm_exclude='dlcgpu09')
# Number of configurations execute() samples per run.
num_evaluations = 100

In [46]:
jobs = load_jobs('uniform', 'multi_mnist', 'hpo_size_50')
get_optimal_cfg(jobs)

0.355660399600751 99 7404787_0
0.4104817393035205 99 7404787_3
0.5714953496038699 99 7404787_6
0.6041196119346615 99 7404787_7
0.6187506913460388 99 7404787_11
0.6310281884425972 99 7404787_42
Best job: 7404787_42


{'scheduler': 'MultiStep',
 'lr': 0.0001,
 'weight_decay': 0.0075,
 'solver': 'linear',
 'norm': 'none',
 'alpha': 0.2,
 'lamda': 0.0,
 'channel_multiplier': 50.0}

# size 50
#### single task

In [None]:
jobs = execute_and_save(
    get_config('size_50_configs/multi_fashion/single_task_1.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_50')

In [None]:
jobs = execute_and_save(
    get_config('size_50_configs/multi_fashion/single_task_2.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_50')

In [None]:
jobs = execute_and_save(
    get_config('size_50_configs/multi_fashion_mnist/single_task_1.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_50')

In [None]:
jobs = execute_and_save(
    get_config('size_50_configs/multi_fashion_mnist/single_task_2.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_50')

In [41]:
jobs = execute_and_save(
    get_config('size_50_configs/multi_mnist/single_task_1.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_50')

In [None]:
jobs = execute_and_save(
    get_config('size_50_configs/multi_mnist/single_task_2.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_50')

#### Uniform

In [None]:
jobs = execute_and_save(
    get_config('size_50_configs/multi_fashion/uniform.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_50')

In [None]:
jobs = execute_and_save(
    get_config('size_50_configs/multi_fashion_mnist/uniform.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_50')

In [None]:
jobs = execute_and_save(
    get_config('size_50_configs/multi_mnist/uniform.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_50')

# size 0.5
#### single task

In [22]:
jobs = execute_and_save(
    get_config('size_0.5_configs/multi_fashion/single_task_1.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_0.5')

In [23]:
jobs = execute_and_save(
    get_config('size_0.5_configs/multi_fashion/single_task_2.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_0.5')

In [31]:
jobs = execute_and_save(
    get_config('size_0.5_configs/multi_fashion_mnist/single_task_1.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_0.5')

In [32]:
jobs = execute_and_save(
    get_config('size_0.5_configs/multi_fashion_mnist/single_task_2.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_0.5')

In [33]:
jobs = execute_and_save(
    get_config('size_0.5_configs/multi_mnist/single_task_1.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_0.5')

In [34]:
jobs = execute_and_save(
    get_config('size_0.5_configs/multi_mnist/single_task_2.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_0.5')

#### Uniform

In [24]:
jobs = execute_and_save(
    get_config('size_0.5_configs/multi_fashion/uniform.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_0.5')

In [59]:
jobs = execute_and_save(
    get_config('size_0.5_configs/multi_fashion_mnist/uniform.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_0.5')

In [60]:
jobs = execute_and_save(
    get_config('size_0.5_configs/multi_mnist/uniform.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_0.5')

# size 10
#### single task

In [None]:
jobs = execute_and_save(
    get_config('size_10_configs/multi_fashion/single_task_1.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_10')

In [None]:
jobs = execute_and_save(
    get_config('size_10_configs/multi_fashion/single_task_2.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_10')

In [None]:
jobs = execute_and_save(
    get_config('size_10_configs/multi_fashion_mnist/single_task_1.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_10')

In [None]:
jobs = execute_and_save(
    get_config('size_10_configs/multi_fashion_mnist/single_task_2.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_10')

In [67]:
jobs = execute_and_save(
    get_config('size_10_configs/multi_mnist/single_task_1.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_10')

In [68]:
jobs = execute_and_save(
    get_config('size_10_configs/multi_mnist/single_task_2.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_10')

#### Uniform

In [None]:
jobs = execute_and_save(
    get_config('size_10_configs/multi_fashion/uniform.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_10')

In [None]:
jobs = execute_and_save(
    get_config('size_10_configs/multi_fashion_mnist/uniform.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_10')

In [69]:
jobs = execute_and_save(
    get_config('size_10_configs/multi_mnist/uniform.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_size_10')

# Grid search

In [4]:
# 351 = 9 lr values x 13 weight_decay values x 3 schedulers, so sampling 351
# configurations from the (lr, weight_decay, scheduler) grid covers all of it.
num_evaluations = 351

#### Single task

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/single_task_1.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_grid')

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/single_task_2.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_grid')

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/single_task_1.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_grid')

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/single_task_2.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_grid')

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/single_task_1.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_grid')

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/single_task_2.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_grid')

#### Uniform

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_mnist/uniform.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_grid')

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion/uniform.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_grid')

In [None]:
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/uniform.yaml'), 
    hp_spaces=[
        lr, 
        weight_decay, 
        scheduler,
    ],
    tag='hpo_grid')