In [24]:
import submitit

import torch
import random
import numpy as np

# Pin the global RNGs of NumPy, the stdlib `random` module and PyTorch to the
# same seed (1), in that order.
for _seed_rng in (np.random.seed, random.seed, torch.manual_seed):
    _seed_rng(1)

import pickle
import itertools
import argparse
import logging
import os
import pathlib
import time
import json
import math
import matplotlib.pyplot as plt
from torch.utils import data
from fvcore.common.config import CfgNode
from rtb import log_every_n_seconds, log_first_n, setup_logger

In [37]:
from multi_objective.main import get_config, evaluate, method_from_name
from multi_objective import utils, defaults
from multi_objective.objectives import from_name

from multi_objective.methods import HypernetMethod, ParetoMTLMethod, SingleTaskMethod, COSMOSMethod, MGDAMethod, UniformScalingMethod
from multi_objective.scores import from_objectives

Prepare submissions

In [49]:
# Executor used to submit the evaluation jobs below; submitit writes its
# per-job files (pickled submission, logs) into the given folder.
executor = submitit.AutoExecutor(
    folder="tmp/submitit",
)

In [50]:
# Per-job resources: 20 minute time limit, one GPU per node, on the given
# Slurm partition; jobs are named 'debug'.
executor.update_parameters(
    timeout_min=20,
    slurm_partition="testdlc_gpu-rtx2080",
    name='debug',
    gpus_per_node=1,
)
# Limit how many jobs of a job array may run at the same time.
executor.update_parameters(
    slurm_array_parallelism=2,
)

In [41]:
def eval(method_name, cfg, checkpoint_dir, tag='eval', last_only=True, splits=('val',)):
    """Evaluate saved checkpoints of a method on the requested data splits.

    NOTE: this function shadows the built-in ``eval``. The name is kept
    because other cells of the notebook submit it by that name.

    Args:
        method_name: Name of the method; passed to ``method_from_name`` and
            used as a component of the log directory.
        cfg: Experiment configuration. It is frozen in place, indexed for
            ``logdir``, ``dataset``, ``seed`` and ``device``, and unpacked as
            keyword arguments into the objective/score/loader/model factories.
        checkpoint_dir: Directory that is searched recursively for files
            named ``c_<start>-<epoch>.pth``.
        tag: Suffix appended to the run name to form the log directory.
        last_only: If true, only the last checkpoint (highest epoch within the
            last directory/start group) is evaluated; otherwise all of them.
        splits: Collection containing any of ``'train'``, ``'val'``,
            ``'test'``; only the listed splits are evaluated.

    Returns:
        The result entry stored under ``['start_0'][f'epoch_{e}']`` for the
        last evaluated checkpoint. The ``'val'`` results are returned when
        ``'val'`` was requested, otherwise ``'test'``, otherwise ``'train'``.
        ``None`` if none of these splits was requested.

    Raises:
        FileNotFoundError: If no usable checkpoint is found in
            ``checkpoint_dir``.
    """
    cfg.freeze()

    # create the experiment folders
    logdir = os.path.join(cfg['logdir'], method_name, cfg['dataset'], utils.get_runname(cfg) + f'_{tag}')
    pathlib.Path(logdir).mkdir(parents=True, exist_ok=True)

    logger = setup_logger(os.path.join(logdir, 'exp.log'), name=__name__)
    logger.info(f"start experiment with settings {cfg}")

    # prepare
    utils.set_seed(cfg['seed'])
    objectives = from_name(**cfg)
    scores = from_objectives(objectives, **cfg)

    train_loader, val_loader, test_loader = utils.loaders_from_name(**cfg)

    model = utils.model_from_dataset(**cfg).to(cfg.device)
    method = method_from_name(method_name, objectives, model, cfg)

    # Only a single start (index 0) is evaluated; results are filed under it.
    start = 0
    num_parameters = utils.num_parameters(method.model_params())

    # Splits in the order they are evaluated for every checkpoint.
    loaders = (('val', val_loader), ('test', test_loader), ('train', train_loader))
    results = {
        split: {'settings': cfg, 'num_parameters': num_parameters, f'start_{start}': {}}
        for split, _ in loaders
    }

    # Collect the checkpoints together with their parsed epoch. The epoch is
    # compared numerically so that e.g. epoch 200 sorts after epoch 95 even
    # when the file names are not zero-padded.
    checkpoints = []
    for path in pathlib.Path(checkpoint_dir).glob('**/c_*.pth'):
        try:
            start_str, epoch_str = path.stem.replace('c_', '').split('-')
            epoch = int(epoch_str)
        except ValueError:
            logger.warning(f"skipping checkpoint with unexpected name: {path}")
            continue
        checkpoints.append(((str(path.parent), start_str, epoch), epoch, path))

    if not checkpoints:
        raise FileNotFoundError(f"no checkpoints matching 'c_*.pth' found in {checkpoint_dir}")

    checkpoints.sort(key=lambda item: item[0])
    if last_only:
        # Eval only last checkpoint
        checkpoints = checkpoints[-1:]

    for _, epoch, path in checkpoints:
        print("checkpoint", path)

        # torch.load unpickles the file: only load checkpoints you trust.
        # map_location lets a checkpoint saved on another device be restored.
        method.model.load_state_dict(torch.load(path, map_location=cfg.device))

        for split, loader in loaders:
            if split not in splits:
                continue
            results[split] = evaluate(start, epoch, method, scores, loader,
                    split=split,
                    result_dict=results[split],
                    logdir=logdir,
                    train_time=0,
                    cfg=cfg,
                    logger=logger,)

    # Report the last evaluated epoch of the first split that was requested.
    last_epoch = checkpoints[-1][1]
    for split, _ in loaders:
        if split in splits:
            return results[split][f'start_{start}'][f'epoch_{last_epoch}']
    return None

Checkpoints

In [7]:
def get_dirs(method, dataset, root='results'):
    """Return the sorted entries found directly under ``root/method/dataset``.

    Args:
        method: Name of the method sub-directory.
        dataset: Name of the dataset sub-directory.
        root: Directory that contains the per-method result folders.

    Returns:
        A sorted list of ``pathlib.Path`` objects, one per entry matched by
        ``*`` in ``root/method/dataset``.
    """
    base = pathlib.Path(os.path.join(root, method, dataset))
    entries = [entry for entry in base.glob('*')]
    entries.sort()
    return entries

In [11]:
# List the result folders available for MGDA on Cityscapes.
get_dirs(method='mgda', dataset='cityscapes')

[PosixPath('results/mgda/cityscapes/5006462_full_approx'),
 PosixPath('results/mgda/cityscapes/5006467_full_no_approx'),
 PosixPath('results/mgda/cityscapes/5301812')]

In [16]:
# Choose which trained run to evaluate: the method/dataset pair, the matching
# dataset config, and the first result folder in sorted order.
method, dataset = 'mgda', 'cityscapes'
cfg = get_config('configs/{}.yaml'.format(dataset))
available_runs = get_dirs(method, dataset)
checkpoint_dir = available_runs[0]

In [51]:
# Submit the notebook's eval() function (not the built-in) as a job that
# evaluates only the last checkpoint on the validation split.
job = executor.submit(
    eval,
    method,
    cfg,
    checkpoint_dir,
    tag='eval',
    last_only=True,
    splits=['val'],
)

In [52]:
# Wait for the submitted job and display the value returned by eval().
job.result()

{'loss': {'center_ray': [0.48630033590291677,
   25.883729181791608,
   0.05021902134543971]},
 'metrics': {'center_ray': [0.4110914431449433,
   25.883729181791608,
   0.05021902134543971]},
 'training_time_so_far': 0}

In [53]:
# Display the text the job wrote to stdout (submitit messages and the experiment log).
job.stdout()

"submitit INFO (2021-04-21 15:32:50,926) - Starting with JobEnvironment(job_id=5527721, hostname=dlcgpu01, local_rank=0(1), node=0(1), global_rank=0(1))\nsubmitit INFO (2021-04-21 15:32:50,926) - Loading pickle: /home/ruchtem/dev/moo/tmp/submitit/5527721_submitted.pkl\n\x1b[32m[04/21 15:32:56 __main__]: \x1b[0mstart experiment with settings batch_size: 8\ncheckpoint_every: 0\ncosmos:\n  alpha: [1.3]\n  lamda: 0.5\n  lr: 0.0005\n  lr_scheduler: MultiStep\ndataset: cityscapes\ndevice: cuda\ndim: (3, 256, 512)\nepochs: 200\neval_every: 5\nlogdir: results\nlr: 0.001\nlr_scheduler: None\nmetrics: ['mIoU', 'L1Loss', 'L1Loss']\nmgda:\n  approximate_norm_solution: True\n  lr: 0.0005\n  lr_scheduler: MultiStep\n  normalization_type: l2\nn_partitions: 5\nnum_workers: 4\nobjectives: ['CrossEntropyLoss', 'L1Loss', 'L1Loss']\nphn:\n  alpha: 0.2\n  internal_solver: linear\n  lr: 0.001\n  lr_scheduler: None\npmtl:\n  lr: 0.001\n  lr_scheduler: None\n  num_starts: 5\nreference_point: [2, 2]\nseed: 1\n