In [1]:
import submitit

import torch
import random
import numpy as np

import pickle
import itertools
import argparse
import logging
import os
import pathlib
import time
import json
import math
import matplotlib.pyplot as plt
from torch.utils import data
from fvcore.common.config import CfgNode
from submitit.core.utils import FailedJobError, UncompletedJobError
from itertools import cycle
from time import sleep

# Seed Python's `random` module so that the random.sample() call in execute()
# draws the same subset of configurations on every fresh run.
# Only `random` is seeded here; NumPy and torch are not.
random.seed(1)

In [2]:
from multi_objective.main import main, get_config

In [3]:
# Shared submitit executor used by execute() to submit all jobs;
# "tmp/submitit_hpo" is the folder handed to submitit for its job files.
executor = submitit.AutoExecutor(folder="tmp/submitit_hpo")

# Prepare the globals

In [4]:
# Number of hyperparameter configurations that execute() samples (without
# replacement) from the full grid for each search.
num_evaluations = 150

In [5]:
# Search spaces shared by all methods. Each one is a single-key dict
# {config_key: candidate_values}; execute() reads the first key as the config
# entry to override and the first value as the list of candidates.
lr           = {'lr': [0.01, 0.0075, 0.005, 0.0025, 0.001, 0.00075, 0.0005, 0.00025, 0.0001]}
weight_decay = {'weight_decay': [0.1, 0.25, 0.05, 0.075, 0.01, 0.0075, 0.005, 0.0025, 0.001, 0.00075, 0.0005, 0.00025, 0.0001]}
scheduler    = {'lr_scheduler': ['none', 'MultiStep', 'CosineAnnealing']}

# phn: choice of the internal solver
solver = {'internal_solver_phn': ['linear', 'epo']}

# mgda: choice of the normalization type
norm = {'normalization_type': ['none', 'loss', 'loss+', 'l2']}

# pmtl & cosmos
# NOTE(review): `mildening`, `clipping` and `dampening` are bare value lists,
# not {config_key: values} dicts, so they cannot be passed to execute() as-is;
# they have to be wrapped with their config key first.
mildening = [0.8, 0.5, 0.3, 0.]

# cosmos
clipping  = [10., 5., 2., 1., 0.5]
dampening = [0.1, 0.01, 0.001, 0.0001, 0]

In [6]:
def percent_finished(jobs):
    """Return the fraction (0.0 to 1.0) of `jobs` that report being done.

    Despite the name, the value is a fraction, not a percentage.

    Args:
        jobs: sized collection of objects exposing a ``done()`` method.

    Returns:
        ``finished / total`` as a float. An empty collection is treated as
        completely finished and yields ``1.0`` (previously ``None`` was
        returned implicitly, which never compares equal to 1).
    """
    total = len(jobs)
    if total == 0:
        # Nothing to wait for: vacuously complete.
        return 1.0
    return sum(job.done() for job in jobs) / total


def save_jobs(method, dataset, jobs, tag='hpo', force=False):
    """Pickle `jobs` to ``pickles/{method}_{dataset}_{tag}.pickle``.

    Args:
        method: method name used in the file name.
        dataset: dataset name used in the file name.
        jobs: object to pickle (the list of submitted jobs).
        tag: suffix used in the file name.
        force: when True, overwrite an existing pickle; otherwise an existing
            file is left untouched and 'skipping' is printed.
    """
    if force or not results_exist(method, dataset, tag):
        target = f'pickles/{method}_{dataset}_{tag}.pickle'
        # Create the output directory on first use instead of failing with
        # FileNotFoundError when 'pickles/' does not exist yet.
        os.makedirs(os.path.dirname(target), exist_ok=True)
        with open(target, 'wb') as f:
            pickle.dump(jobs, f)
    else:
        print('skipping')

        
def load_jobs(method, dataset, tag='hpo'):
    """Load and return the object pickled at ``pickles/{method}_{dataset}_{tag}.pickle``.

    Raises whatever ``open`` / ``pickle.load`` raise, e.g. FileNotFoundError
    when no such pickle exists.
    """
    pickle_path = f'pickles/{method}_{dataset}_{tag}.pickle'
    # NOTE: unpickling executes arbitrary code; only load files you created yourself.
    with open(pickle_path, 'rb') as handle:
        stored = pickle.load(handle)
    return stored


def jobs_failed(jobs):
    """Return the indices (in iteration order) of all jobs whose ``state`` is 'FAILED'."""
    return [position for position, job in enumerate(jobs) if job.state == 'FAILED']


def results_exist(method, dataset, tag='hpo'):
    """Tell whether ``pickles/{method}_{dataset}_{tag}.pickle`` is present on disk."""
    pickle_path = f'pickles/{method}_{dataset}_{tag}.pickle'
    return os.path.exists(pickle_path)


def get_optimal_cfg(jobs, max_dist=None):
    """Return the hyperparameters of the job with the highest first result value.

    Each job's ``result()`` is unpacked as ``(hv, dist)``. Jobs in state
    'FAILED' or 'TIMEOUT' are ignored, as are jobs whose ``dist`` exceeds
    ``max_dist`` (when given). A job only becomes the new best if its ``hv`` is
    strictly greater than the best seen so far (starting from 0); every
    improvement is printed.

    Args:
        jobs: indexable sequence of job objects exposing ``state``, ``result()``,
            ``job_id`` and ``submission()``.
        max_dist: optional upper bound on ``dist``; ``None`` disables the filter.

    Returns:
        dict with the keys 'lr', 'weight_decay', 'scheduler', 'solver', 'norm',
        'mildening', 'clipping' and 'dampening', read from the config that was
        the third positional argument of the winning job's submission.

    Raises:
        AssertionError: if no job qualified.
    """
    ignored_states = ('FAILED', 'TIMEOUT')
    best_index, best_hv = None, 0

    for index, job in enumerate(jobs):
        if job.state in ignored_states:
            continue

        hv, dist = job.result()

        beyond_limit = max_dist is not None and dist > max_dist
        if beyond_limit or not hv > best_hv:
            continue

        print(hv, dist, job.job_id)
        best_hv, best_index = hv, index

    assert best_index is not None, "No optimal cfg found"
    winner = jobs[best_index]
    print(f'Best job: {winner.job_id}')

    # Submission arguments are (rank, world_size, cfg, tag); the config is at position 2.
    cfg = winner.submission().args[2]
    reported_fields = (
        ('lr', 'lr'),
        ('weight_decay', 'weight_decay'),
        ('scheduler', 'lr_scheduler'),
        ('solver', 'internal_solver_phn'),
        ('norm', 'normalization_type'),
        ('mildening', 'train_ray_mildening'),
        ('clipping', 'lambda_clipping'),
        ('dampening', 'dampening'),
    )
    return {key: getattr(cfg, attribute) for key, attribute in reported_fields}

In [7]:
def execute(config, hp_spaces, force=False, tag='hpo'):
    """Submit a random hyperparameter search as one submitit job array.

    The full grid is the Cartesian product of all value lists in `hp_spaces`;
    up to `num_evaluations` grid points are drawn from it without replacement
    and each one is submitted as ``main(0, 1, cfg, tag_i)``.

    Args:
        config: base config node; it is cloned, never modified.
        hp_spaces: list of single-key dicts ``{config_key: [candidate values]}``.
        force: submit even if a result pickle already exists for
            ``(config.method, config.dataset, tag)``.
        tag: prefix of the per-job tag (``f"{tag}_{i:03d}"``) and part of the
            pickle name used for the skip check.

    Returns:
        The list of submitted jobs, or ``[]`` when the search was skipped.
    """
    if not force and results_exist(config.method, config.dataset, tag):
        print('skipping. Use force=True to enforce execution')
        return []

    base_cfg = config.clone()
    base_cfg.eval_every = 1   # early stopping

    # Each space is a single-key dict: the key is the config entry, the value its candidates.
    names = [next(iter(space)) for space in hp_spaces]
    candidates = [space[name] for name, space in zip(names, hp_spaces)]

    # One flat [key1, value1, key2, value2, ...] override list per grid point,
    # which is the format merge_from_list() consumes.
    configurations = [
        [item for pair in zip(names, combination) for item in pair]
        for combination in itertools.product(*candidates)
    ]

    # random.sample() raises ValueError when asked for more items than exist,
    # so small grids are evaluated exhaustively instead of crashing.
    n_samples = min(num_evaluations, len(configurations))
    configurations = random.sample(configurations, n_samples)

    cfgs = []
    for overrides in configurations:
        # Always start from the untouched base so trials cannot inherit
        # settings from one another.
        trial_cfg = base_cfg.clone()
        trial_cfg.merge_from_list(overrides)
        cfgs.append(trial_cfg)

    tags = [f"{tag}_{i :03d}" for i in range(len(cfgs))]

    # func, rank, world_size, cfg, tag
    return executor.map_array(main, cycle([0]), cycle([1]), cfgs, tags)

In [8]:
def execute_and_save(config, hp_spaces, force=False, tag='hpo'):
    """Run a hyperparameter search, block until all jobs finish, and pickle the jobs.

    Results are stored under the method name, except for 'single_task', where
    the task id is appended (``single_task_<task_id>``) so that each task gets
    its own pickle.

    Args:
        config: base config node (reads ``method``, ``dataset`` and, for
            single-task runs, ``task_id``).
        hp_spaces: list of single-key dicts ``{config_key: [candidate values]}``.
        force: rerun and overwrite even if a result pickle already exists.
        tag: suffix of the pickle name and prefix of the per-job tags.

    Returns:
        The list of finished jobs, or ``[]`` if the search was skipped.
    """
    # f-string instead of '+' so a non-string task_id cannot raise a TypeError;
    # for string task ids the resulting name is unchanged.
    if config.method == 'single_task':
        method_name = f"{config.method}_{config.task_id}"
    else:
        method_name = config.method

    # Check for existing results under the name that is actually used for
    # saving. execute() only checks `config.method`, which never matches the
    # per-task pickle of a single-task run and would resubmit the whole search.
    if not force and results_exist(method_name, config.dataset, tag):
        print('skipping. Use force=True to enforce execution')
        return []

    jobs = execute(config, hp_spaces, force, tag)
    if not jobs:
        # Nothing was submitted, so there is nothing to wait for or to store.
        return jobs

    # Poll every 10 seconds; no extra sleep once everything has finished.
    while percent_finished(jobs) != 1:
        sleep(10)

    save_jobs(method_name, config.dataset, jobs, force=force, tag=tag)
    return jobs

In [24]:
# Resource request applied to jobs submitted through `executor` afterwards:
# 120-minute timeout, Slurm partition "alldlc_gpu-rtx2080", job name 'hpo', one GPU per node.
executor.update_parameters(timeout_min=120, slurm_partition="alldlc_gpu-rtx2080", name='hpo', gpus_per_node=1)
# Array parallelism of 150 equals num_evaluations, so presumably a whole
# search may run concurrently (confirm against the cluster limits).
executor.update_parameters(slurm_array_parallelism=150)

In [10]:
# Keep jobs off the node 'dlcgpu12'.
# NOTE(review): this cell's execution count (10) is lower than that of the
# preceding cell (24), i.e. the notebook was not run top to bottom.
executor.update_parameters(slurm_exclude='dlcgpu12')

## Analyze

In [19]:
# Reload the pickled jobs of the PMTL search on multi_mnist (tag 'hpo').
jobs = load_jobs(method='pmtl', dataset='multi_mnist', tag='hpo')

In [20]:
# Show the best configuration among the loaded jobs, without a distance filter.
get_optimal_cfg(jobs, max_dist=None)

0.5054907642862094 99 7085549_0
0.5156304269139014 99 7085549_94
Best job: 7085549_94


{'lr': 0.00075,
 'weight_decay': 0.05,
 'scheduler': 'CosineAnnealing',
 'solver': 'linear',
 'norm': 'none',
 'mildening': 0.5,
 'clipping': 5.0,
 'dampening': 0.2}

## COSMOS

In [None]:
# COSMOS search for 'configs/multi_mnist/cosmos.yaml'.
# execute_and_save() takes `hp_spaces`, a list of single-key
# {config_key: values} dicts. The bare value lists `mildening`, `clipping` and
# `dampening` are therefore wrapped with the config keys that
# get_optimal_cfg() reads back.
jobs = execute_and_save(
    get_config('configs/multi_mnist/cosmos.yaml'),
    hp_spaces=[
        lr,
        weight_decay,
        scheduler,
        {'train_ray_mildening': mildening},
        {'lambda_clipping': clipping},
        {'dampening': dampening},
    ])

In [61]:
# COSMOS search for 'configs/multi_fashion/cosmos.yaml'.
# execute_and_save() takes `hp_spaces`, a list of single-key
# {config_key: values} dicts. The bare value lists `mildening`, `clipping` and
# `dampening` are therefore wrapped with the config keys that
# get_optimal_cfg() reads back.
jobs = execute_and_save(
    get_config('configs/multi_fashion/cosmos.yaml'),
    hp_spaces=[
        lr,
        weight_decay,
        scheduler,
        {'train_ray_mildening': mildening},
        {'lambda_clipping': clipping},
        {'dampening': dampening},
    ])

In [64]:
# COSMOS search for 'configs/multi_fashion_mnist/cosmos.yaml'.
# execute_and_save() takes `hp_spaces`, a list of single-key
# {config_key: values} dicts. The bare value lists `mildening`, `clipping` and
# `dampening` are therefore wrapped with the config keys that
# get_optimal_cfg() reads back.
jobs = execute_and_save(
    get_config('configs/multi_fashion_mnist/cosmos.yaml'),
    hp_spaces=[
        lr,
        weight_decay,
        scheduler,
        {'train_ray_mildening': mildening},
        {'lambda_clipping': clipping},
        {'dampening': dampening},
    ])

## MGDA

In [26]:
# MGDA search for 'configs/multi_mnist/mgda.yaml' over learning rate,
# weight decay, LR scheduler and normalization type.
jobs = execute_and_save(
    config=get_config('configs/multi_mnist/mgda.yaml'),
    hp_spaces=[lr, weight_decay, scheduler, norm],
)

In [28]:
# MGDA search for 'configs/multi_fashion/mgda.yaml' over learning rate,
# weight decay, LR scheduler and normalization type.
jobs = execute_and_save(
    config=get_config('configs/multi_fashion/mgda.yaml'),
    hp_spaces=[lr, weight_decay, scheduler, norm],
)

In [30]:
# MGDA search for 'configs/multi_fashion_mnist/mgda.yaml' over learning rate,
# weight decay, LR scheduler and normalization type.
jobs = execute_and_save(
    config=get_config('configs/multi_fashion_mnist/mgda.yaml'),
    hp_spaces=[lr, weight_decay, scheduler, norm],
)

## PHN

In [12]:
# PHN search for 'configs/multi_mnist/phn.yaml' over learning rate,
# weight decay, LR scheduler and internal solver.
jobs = execute_and_save(
    config=get_config('configs/multi_mnist/phn.yaml'),
    hp_spaces=[lr, weight_decay, scheduler, solver],
)

In [13]:
# PHN search for 'configs/multi_fashion/phn.yaml' over learning rate,
# weight decay, LR scheduler and internal solver.
jobs = execute_and_save(
    config=get_config('configs/multi_fashion/phn.yaml'),
    hp_spaces=[lr, weight_decay, scheduler, solver],
)

In [14]:
# PHN search for 'configs/multi_fashion_mnist/phn.yaml' over learning rate,
# weight decay, LR scheduler and internal solver.
jobs = execute_and_save(
    config=get_config('configs/multi_fashion_mnist/phn.yaml'),
    hp_spaces=[lr, weight_decay, scheduler, solver],
)

## PMTL

In [25]:
# PMTL search for 'configs/multi_mnist/pmtl.yaml' over learning rate,
# weight decay and LR scheduler.
jobs = execute_and_save(
    config=get_config('configs/multi_mnist/pmtl.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [26]:
# PMTL search for 'configs/multi_fashion/pmtl.yaml' over learning rate,
# weight decay and LR scheduler.
jobs = execute_and_save(
    config=get_config('configs/multi_fashion/pmtl.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [None]:
# PMTL search for 'configs/multi_fashion_mnist/pmtl.yaml' over learning rate,
# weight decay and LR scheduler.
jobs = execute_and_save(
    config=get_config('configs/multi_fashion_mnist/pmtl.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

## Single task

In [26]:
# Single-task search for 'configs/multi_mnist/single_task_1.yaml' over
# learning rate, weight decay and LR scheduler.
jobs = execute_and_save(
    config=get_config('configs/multi_mnist/single_task_1.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [19]:
# Single-task search for 'configs/multi_mnist/single_task_2.yaml' over
# learning rate, weight decay and LR scheduler.
jobs = execute_and_save(
    config=get_config('configs/multi_mnist/single_task_2.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [28]:
# Single-task search for 'configs/multi_fashion/single_task_1.yaml' over
# learning rate, weight decay and LR scheduler.
jobs = execute_and_save(
    config=get_config('configs/multi_fashion/single_task_1.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [11]:
# Single-task search for 'configs/multi_fashion/single_task_2.yaml' over
# learning rate, weight decay and LR scheduler.
jobs = execute_and_save(
    config=get_config('configs/multi_fashion/single_task_2.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [13]:
# Single-task search for 'configs/multi_fashion_mnist/single_task_1.yaml' over
# learning rate, weight decay and LR scheduler.
jobs = execute_and_save(
    config=get_config('configs/multi_fashion_mnist/single_task_1.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [16]:
# Single-task search for 'configs/multi_fashion_mnist/single_task_2.yaml' over
# learning rate, weight decay and LR scheduler.
jobs = execute_and_save(
    config=get_config('configs/multi_fashion_mnist/single_task_2.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

## Uniform

In [25]:
# Search for 'configs/multi_mnist/uniform.yaml' over learning rate,
# weight decay and LR scheduler.
jobs = execute_and_save(
    config=get_config('configs/multi_mnist/uniform.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [18]:
# Search for 'configs/multi_fashion/uniform.yaml' over learning rate,
# weight decay and LR scheduler.
jobs = execute_and_save(
    config=get_config('configs/multi_fashion/uniform.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [22]:
# Search for 'configs/multi_fashion_mnist/uniform.yaml' over learning rate,
# weight decay and LR scheduler.
jobs = execute_and_save(
    config=get_config('configs/multi_fashion_mnist/uniform.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

# Different size

#### single task

In [26]:
# Single-task search for 'size_2_configs/multi_mnist/single_task_1.yaml' over
# learning rate, weight decay and LR scheduler.
# NOTE(review): uses the default tag 'hpo'; if this config has the same
# method/dataset as its 'configs/' counterpart, both map to the same pickle — confirm.
jobs = execute_and_save(
    config=get_config('size_2_configs/multi_mnist/single_task_1.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [19]:
# Single-task search for 'size_2_configs/multi_mnist/single_task_2.yaml' over
# learning rate, weight decay and LR scheduler.
# NOTE(review): uses the default tag 'hpo'; if this config has the same
# method/dataset as its 'configs/' counterpart, both map to the same pickle — confirm.
jobs = execute_and_save(
    config=get_config('size_2_configs/multi_mnist/single_task_2.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [28]:
# Single-task search for 'size_2_configs/multi_fashion/single_task_1.yaml' over
# learning rate, weight decay and LR scheduler.
# NOTE(review): uses the default tag 'hpo'; if this config has the same
# method/dataset as its 'configs/' counterpart, both map to the same pickle — confirm.
jobs = execute_and_save(
    config=get_config('size_2_configs/multi_fashion/single_task_1.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [11]:
# Single-task search for 'size_2_configs/multi_fashion/single_task_2.yaml' over
# learning rate, weight decay and LR scheduler.
# NOTE(review): uses the default tag 'hpo'; if this config has the same
# method/dataset as its 'configs/' counterpart, both map to the same pickle — confirm.
jobs = execute_and_save(
    config=get_config('size_2_configs/multi_fashion/single_task_2.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [13]:
# Single-task search for 'size_2_configs/multi_fashion_mnist/single_task_1.yaml'
# over learning rate, weight decay and LR scheduler.
# NOTE(review): uses the default tag 'hpo'; if this config has the same
# method/dataset as its 'configs/' counterpart, both map to the same pickle — confirm.
jobs = execute_and_save(
    config=get_config('size_2_configs/multi_fashion_mnist/single_task_1.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [16]:
# Single-task search for 'size_2_configs/multi_fashion_mnist/single_task_2.yaml'
# over learning rate, weight decay and LR scheduler.
# NOTE(review): uses the default tag 'hpo'; if this config has the same
# method/dataset as its 'configs/' counterpart, both map to the same pickle — confirm.
jobs = execute_and_save(
    config=get_config('size_2_configs/multi_fashion_mnist/single_task_2.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

#### Uniform

In [25]:
# Search for 'size_2_configs/multi_mnist/uniform.yaml' over learning rate,
# weight decay and LR scheduler.
# NOTE(review): uses the default tag 'hpo'; if this config has the same
# method/dataset as its 'configs/' counterpart, both map to the same pickle — confirm.
jobs = execute_and_save(
    config=get_config('size_2_configs/multi_mnist/uniform.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [18]:
# Search for 'size_2_configs/multi_fashion/uniform.yaml' over learning rate,
# weight decay and LR scheduler.
# NOTE(review): uses the default tag 'hpo'; if this config has the same
# method/dataset as its 'configs/' counterpart, both map to the same pickle — confirm.
jobs = execute_and_save(
    config=get_config('size_2_configs/multi_fashion/uniform.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)

In [22]:
# Search for 'size_2_configs/multi_fashion_mnist/uniform.yaml' over learning
# rate, weight decay and LR scheduler.
# NOTE(review): uses the default tag 'hpo'; if this config has the same
# method/dataset as its 'configs/' counterpart, both map to the same pickle — confirm.
jobs = execute_and_save(
    config=get_config('size_2_configs/multi_fashion_mnist/uniform.yaml'),
    hp_spaces=[lr, weight_decay, scheduler],
)