In [1]:
import submitit

import torch
import random
import numpy as np

# Fix the seeds of the NumPy, Python and PyTorch random number generators.
np.random.seed(1)
random.seed(1)
torch.manual_seed(1)

import pickle
import itertools
import argparse
import logging
import os
import pathlib
import time
import json
import math
import matplotlib.pyplot as plt
from torch.utils import data
from fvcore.common.config import CfgNode
from rtb import log_every_n_seconds, log_first_n, setup_logger

import torch.multiprocessing as mp

In [2]:
from multi_objective.main import get_config, evaluate, method_from_name, main
from multi_objective import utils, defaults
from multi_objective.objectives import from_name

from multi_objective.methods import HypernetMethod, ParetoMTLMethod, SingleTaskMethod, COSMOSMethod, MGDAMethod, UniformScalingMethod
from multi_objective.scores import from_objectives

Prepare submissions

In [3]:
# Executor that keeps its job files (e.g. the per-job logs) under tmp/submitit.
executor = submitit.AutoExecutor(folder="tmp/submitit")

In [4]:
# Debug job settings: 20-minute timeout and one GPU per node on the
# testdlc_gpu-rtx2080 partition.
executor.update_parameters(timeout_min=20, slurm_partition="testdlc_gpu-rtx2080", name='debug', gpus_per_node=1)
# NOTE(review): presumably the cap on concurrently running job-array tasks — confirm against the submitit docs.
executor.update_parameters(slurm_array_parallelism=2)

In [5]:
def evalu(method_name, cfg, checkpoint_dir, tag='eval', last_only=True, splits=('val',)):
    """Evaluate saved checkpoints of a method on the requested data splits.

    Args:
        method_name: Name handed to ``method_from_name``; also used as a
            sub-folder of the log directory.
        cfg: Experiment configuration. It is frozen in place.
        checkpoint_dir: Directory that is searched recursively for
            ``c_*.pth`` checkpoint files.
        tag: Suffix appended to the run name of the log directory.
        last_only: If true, evaluate only the checkpoint with the highest
            epoch number; otherwise evaluate all of them in epoch order.
        splits: Collection containing any of ``'val'``, ``'test'`` and
            ``'train'``. Other entries are ignored.

    Returns:
        A dict mapping every requested split to the entry that ``evaluate``
        stored for the last evaluated epoch.

    Raises:
        FileNotFoundError: If ``checkpoint_dir`` contains no checkpoint.
        ValueError: If a checkpoint file name does not have the form
            ``c_<start>-<epoch>.pth``.
    """

    def checkpoint_epoch(path):
        # File names look like ``c_<start>-<epoch>.pth``; only the epoch is used.
        _, epoch = path.stem.replace('c_', '').split('-')
        return int(epoch)

    cfg.freeze()
    # create the experiment folders
    logdir = os.path.join(cfg['logdir'], method_name, cfg['dataset'], utils.get_runname(cfg) + f'_{tag}')
    pathlib.Path(logdir).mkdir(parents=True, exist_ok=True)

    logger = setup_logger(os.path.join(logdir, 'exp.log'), name=__name__)
    logger.info(f"start experiment with settings {cfg}")

    # Order checkpoints by their numeric epoch. A plain path sort is
    # lexicographic and would place 'c_0-10' before 'c_0-9', so the "last"
    # checkpoint could be the wrong one. The path breaks ties deterministically.
    checkpoints = sorted(
        pathlib.Path(checkpoint_dir).glob('**/c_*.pth'),
        key=lambda p: (checkpoint_epoch(p), str(p)),
    )
    if not checkpoints:
        # Fail with a clear message before any data or model is loaded.
        raise FileNotFoundError(f"no checkpoints matching 'c_*.pth' found under {checkpoint_dir}")
    if last_only:
        # Eval only last checkpoint
        checkpoints = checkpoints[-1:]

    # prepare
    utils.set_seed(cfg['seed'])
    objectives = from_name(**cfg)
    scores = from_objectives(objectives, **cfg)

    train_loader, val_loader, test_loader, _ = utils.loaders_from_name(**cfg)
    # Insertion order is the order in which the splits are evaluated.
    loaders = {'val': val_loader, 'test': test_loader, 'train': train_loader}

    model = utils.model_from_dataset(**cfg).to(cfg.device)
    method = method_from_name(method_name, objectives, model, cfg)

    # Only start index 0 is evaluated.
    start = 0
    num_parameters = utils.num_parameters(method.model_params())
    results = {
        split: {'settings': cfg, 'num_parameters': num_parameters, f"start_{start}": {}}
        for split in loaders
    }

    for c in checkpoints:
        print("checkpoint", c)
        e = checkpoint_epoch(c)

        # map_location lets a checkpoint saved on another device be restored
        # onto the device the model was moved to.
        method.model.load_state_dict(torch.load(c, map_location=cfg.device))

        for split, loader in loaders.items():
            if split not in splits:
                continue
            results[split] = evaluate(start, e, method, scores, loader,
                    split=split,
                    result_dict=results[split],
                    logdir=logdir,
                    train_time=0,
                    cfg=cfg,
                    logger=logger,)

    # `e` is the epoch of the last evaluated checkpoint; the loop ran at least once.
    return {
        split: results[split]['start_0'][f'epoch_{e}']
        for split in ('val', 'train', 'test')
        if split in splits
    }

Checkpoints

In [6]:
def get_dirs(method, dataset, root='results'):
    """Return the sorted entries directly inside ``<root>/<method>/<dataset>``."""
    base = pathlib.Path(root, method, dataset)
    entries = list(base.glob('*'))
    entries.sort()
    return entries

In [7]:
# List the run directories stored for MGDA on Cityscapes.
get_dirs('mgda', 'cityscapes')

[PosixPath('results/mgda/cityscapes/5006462_full_approx'),
 PosixPath('results/mgda/cityscapes/5006467_full_no_approx'),
 PosixPath('results/mgda/cityscapes/5607578_lossfix'),
 PosixPath('results/mgda/cityscapes/5744156_eval'),
 PosixPath('results/mgda/cityscapes/5744159_eval'),
 PosixPath('results/mgda/cityscapes/5793458_eval')]

In [8]:
method = 'mgda'
dataset = 'cityscapes'
cfg = get_config(f'configs/{dataset}.yaml')

# Pick the run by directory name instead of by list position: the listing
# grows whenever new run or eval folders are written, so a fixed index can
# silently start pointing at a different run.
run_name = '5607578_lossfix'
matching_dirs = [d for d in get_dirs(method, dataset) if d.name == run_name]
assert matching_dirs, f"run directory {run_name!r} not found for {method}/{dataset}"
checkpoint_dir = matching_dirs[0]

In [9]:
# Submit evalu as a job: evaluate only the last checkpoint on the validation split.
job = executor.submit(evalu, method, cfg, checkpoint_dir, tag='eval', last_only=True, splits=['val'])

In [10]:
# Fetch the value returned by evalu from the job.
job.result()

{'val': {'loss': {'center_ray': [0.27116543878065913,
    2.7455966441254867,
    29.06497473465769]},
  'metrics': {'center_ray': [0.4946948167252065,
    2.7455966441254867,
    29.06497473465769]},
  'training_time_so_far': 0}}

In [24]:
# New executor for the distributed run: 8 GPUs per node on the ml_gpu-rtx2080
# partition with a 20-minute timeout. This rebinds `executor` from the cells above.
executor = submitit.AutoExecutor(folder="tmp/submitit")
executor.update_parameters(timeout_min=20, slurm_partition="ml_gpu-rtx2080", name='debug', gpus_per_node=8)
# NOTE(review): presumably the cap on concurrently running job-array tasks — confirm against the submitit docs.
executor.update_parameters(slurm_array_parallelism=32)

In [25]:
def run_distributed(world_size, method, cfg, tag):
    """Run ``main`` in ``world_size`` spawned processes and wait for all of them.

    ``mp.spawn`` passes the process index as the first argument, so every
    worker is invoked as ``main(index, world_size, method, cfg, tag)``.
    """
    worker_args = (world_size, method, cfg, tag)
    mp.spawn(main, args=worker_args, nprocs=world_size, join=True)

In [26]:
# Settings for the distributed COSMOS run. This rebinds `method`, `dataset`
# and `cfg` from the MGDA evaluation cells above.
method = 'cosmos'
dataset = 'cityscapes'
cfg = get_config(f'configs/{dataset}.yaml')
tag = 'test'
# NOTE(review): presumably one process per GPU requested via gpus_per_node=8 — keep the two values in sync.
world_size = 8

In [36]:
# Submit the multi-process run as a single job.
job = executor.submit(run_distributed, world_size, method, cfg, tag)

In [46]:
# Fetch the job's result; a failed job raises FailedJobError here with the worker traceback.
job.result()

FailedJobError: Job (task=0) failed during processing with trace:
----------------------
Traceback (most recent call last):
  File "/home/ruchtem/dev/venvs/base/lib/python3.8/site-packages/submitit/core/submission.py", line 53, in process_job
    result = delayed.result()
  File "/home/ruchtem/dev/venvs/base/lib/python3.8/site-packages/submitit/core/utils.py", line 128, in result
    self._result = self.function(*self.args, **self.kwargs)
  File "<ipython-input-25-97c057ba50a8>", line 2, in run_distributed
  File "/home/ruchtem/dev/venvs/base/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/ruchtem/dev/venvs/base/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
    while not context.join():
  File "/home/ruchtem/dev/venvs/base/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 118, in join
    raise Exception(msg)
Exception: 

-- Process 1 terminated with the following error:
Traceback (most recent call last):
  File "/home/ruchtem/dev/venvs/base/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
    fn(i, *args)
  File "/home/ruchtem/dev/moo/multi_objective/main.py", line 244, in main
    assert not math.isnan(loss) and not math.isnan(sim)
AssertionError


----------------------
You can check full logs with 'job.stderr(0)' and 'job.stdout(0)'or at paths:
  - /home/ruchtem/dev/moo/tmp/submitit/5616032_0_log.err
  - /home/ruchtem/dev/moo/tmp/submitit/5616032_0_log.out