From e6691bca5a7886b359f0ca0a5acaaa9f1b36cf17 Mon Sep 17 00:00:00 2001
From: FrAnCOisCokELaER
Date: Wed, 3 Feb 2021 22:16:25 +0100
Subject: [PATCH 01/19] Recall/Precision metrics for ddp : average == false and multilabel == true

---
 ignite/metrics/precision.py            | 10 ++++++----
 tests/ignite/metrics/test_precision.py |  2 +-
 tests/ignite/metrics/test_recall.py    |  2 +-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/ignite/metrics/precision.py b/ignite/metrics/precision.py
index 912ff813cd43..5ef35f3d7dca 100644
--- a/ignite/metrics/precision.py
+++ b/ignite/metrics/precision.py
@@ -53,12 +53,14 @@ def compute(self) -> Union[torch.Tensor, float]:
             raise NotComputableError(
                 f"{self.__class__.__name__} must have at least one example before it can be computed."
             )
-
-        if not (self._type == "multilabel" and not self._average):
-            if not self._is_reduced:
+        if not self._is_reduced:
+            if not (self._type == "multilabel" and not self._average):
                 self._true_positives = idist.all_reduce(self._true_positives)  # type: ignore[assignment]
                 self._positives = idist.all_reduce(self._positives)  # type: ignore[assignment]
-            self._is_reduced = True  # type: bool
+            else:
+                self._true_positives = cast(torch.Tensor, idist.all_gather(self._true_positives))
+                self._positives = cast(torch.Tensor, idist.all_gather(self._positives))
+            self._is_reduced = True  # type: bool

         result = self._true_positives / (self._positives + self.eps)

diff --git a/tests/ignite/metrics/test_precision.py b/tests/ignite/metrics/test_precision.py
index 3c5e4af784a5..5db50442db38 100644
--- a/tests/ignite/metrics/test_precision.py
+++ b/tests/ignite/metrics/test_precision.py
@@ -837,7 +837,7 @@ def update(engine, i):
         pr.update((y_pred, y))
     pr_compute1 = pr.compute()
     pr_compute2 = pr.compute()
-    assert len(pr_compute1) == 4 * 6 * 8
+    assert len(pr_compute1) == idist.get_world_size() * 4 * 6 * 8
     assert (pr_compute1 == pr_compute2).all()

diff --git a/tests/ignite/metrics/test_recall.py b/tests/ignite/metrics/test_recall.py
index 537cea38fb79..ca6407e0c84d 100644
--- a/tests/ignite/metrics/test_recall.py
+++ b/tests/ignite/metrics/test_recall.py
@@ -837,7 +837,7 @@ def update(engine, i):
         re.update((y_pred, y))
     re_compute1 = re.compute()
     re_compute2 = re.compute()
-    assert len(re_compute1) == 4 * 6 * 8
+    assert len(re_compute1) == idist.get_world_size() * 4 * 6 * 8
     assert (re_compute1 == re_compute2).all()
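Why patch 01 switches from ``all_reduce`` to ``all_gather`` for the ``multilabel=True, average=False`` case: each rank then holds one score per *sample*, so an element-wise sum across ranks would silently mix counts from unrelated samples, while concatenation keeps one entry per sample. A minimal sketch of the two collective semantics (tensor values are made up for illustration, and a world size of 2 is assumed):

```python
import torch

world_size = 2

# Per-sample true-positive counts held by each of the two ranks:
tp_rank0 = torch.tensor([1.0, 0.0, 2.0])
tp_rank1 = torch.tensor([0.0, 3.0, 1.0])

# all_reduce semantics: element-wise SUM -- mixes sample 0 of rank 0
# with sample 0 of rank 1, which are different samples entirely.
reduced = tp_rank0 + tp_rank1  # tensor([1., 3., 3.]) -- meaningless per sample

# all_gather semantics: CONCATENATION -- one entry per sample, per rank.
gathered = torch.cat([tp_rank0, tp_rank1])
assert len(gathered) == world_size * 3  # matches the updated test assertion
```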
From b57d89d262a7cb44cdd4fe46f306c558e49826c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20COKELAER?=
Date: Fri, 5 Feb 2021 17:43:59 +0100
Subject: [PATCH 02/19] =?UTF-8?q?For=20v0.4.3=20-=20Add=20more=20versionad?=
 =?UTF-8?q?ded,=20versionchanged=20tags=20-=20Change=20v0.5=E2=80=A6=20(#1?=
 =?UTF-8?q?612)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* For v0.4.3 - Add more versionadded, versionchanged tags - Change v0.5.0 to v0.4.3

* Update ignite/contrib/metrics/regression/canberra_metric.py

Co-authored-by: vfdev

* Update ignite/contrib/metrics/regression/manhattan_distance.py

Co-authored-by: vfdev

* Update ignite/contrib/metrics/regression/r2_score.py

Co-authored-by: vfdev

* Update ignite/handlers/checkpoint.py

Co-authored-by: vfdev

* address PR comments

Co-authored-by: vfdev
---
 ignite/contrib/handlers/base_logger.py                  | 3 +++
 ignite/contrib/metrics/regression/canberra_metric.py    | 4 ++++
 ignite/contrib/metrics/regression/manhattan_distance.py | 4 ++++
 ignite/contrib/metrics/regression/r2_score.py           | 3 +++
 ignite/distributed/auto.py                              | 3 +++
 ignite/handlers/checkpoint.py                           | 4 +++-
 ignite/metrics/metric.py                                | 3 +++
 ignite/metrics/psnr.py                                  | 2 +-
 ignite/utils.py                                         | 2 ++
 9 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/ignite/contrib/handlers/base_logger.py b/ignite/contrib/handlers/base_logger.py
index c3776835449d..6fd9949b274d 100644
--- a/ignite/contrib/handlers/base_logger.py
+++ b/ignite/contrib/handlers/base_logger.py
@@ -207,6 +207,9 @@ def attach_opt_params_handler(
         Returns:
             :class:`~ignite.engine.RemovableEventHandle`, which can be used to remove the handler.
+
+        .. versionchanged:: 0.4.3
+            Added missing return statement.
         """
         return self.attach(engine, self._create_opt_params_handler(*args, **kwargs), event_name=event_name)

diff --git a/ignite/contrib/metrics/regression/canberra_metric.py b/ignite/contrib/metrics/regression/canberra_metric.py
index 9f995370ed4b..5bb3cb8cca67 100644
--- a/ignite/contrib/metrics/regression/canberra_metric.py
+++ b/ignite/contrib/metrics/regression/canberra_metric.py
@@ -22,6 +22,10 @@ class CanberraMetric(_BaseRegression):
     .. _scikit-learn distance metrics:
         https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html
+    .. versionchanged:: 0.4.3
+
+        - Fixed implementation: ``abs`` in denominator.
+        - Works with DDP.
     """

     def __init__(

diff --git a/ignite/contrib/metrics/regression/manhattan_distance.py b/ignite/contrib/metrics/regression/manhattan_distance.py
index 1abf3cffa17c..9d884196f17b 100644
--- a/ignite/contrib/metrics/regression/manhattan_distance.py
+++ b/ignite/contrib/metrics/regression/manhattan_distance.py
@@ -21,6 +21,10 @@ class ManhattanDistance(_BaseRegression):
     __ https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html
+    .. versionchanged:: 0.4.3
+
+        - Fixed sklearn compatibility.
+        - Works with DDP.
     """

     def __init__(

diff --git a/ignite/contrib/metrics/regression/r2_score.py b/ignite/contrib/metrics/regression/r2_score.py
index d7f735d5b696..f48adf22e33d 100644
--- a/ignite/contrib/metrics/regression/r2_score.py
+++ b/ignite/contrib/metrics/regression/r2_score.py
@@ -19,6 +19,9 @@ class R2Score(_BaseRegression):
     - ``update`` must receive output of the form ``(y_pred, y)`` or ``{'y_pred': y_pred, 'y': y}``.
     - `y` and `y_pred` must be of same shape `(N, )` or `(N, 1)` and of type `float32`.
+
+    .. versionchanged:: 0.4.3
+        Works with DDP.
     """

     def __init__(

diff --git a/ignite/distributed/auto.py b/ignite/distributed/auto.py
index 6cef0ed5bae5..9764e38db6f7 100644
--- a/ignite/distributed/auto.py
+++ b/ignite/distributed/auto.py
@@ -174,6 +174,9 @@ def auto_model(model: nn.Module, sync_bn: bool = False, **kwargs: Any) -> nn.Mod
         - Added Horovod distributed framework.
         - Added ``sync_bn`` argument.
+
+    .. versionchanged:: 0.4.3
+        Added kwargs to ``idist.auto_model``.
     """
     logger = setup_logger(__name__ + ".auto_model")

diff --git a/ignite/handlers/checkpoint.py b/ignite/handlers/checkpoint.py
index 016bf9433075..41a567eecd56 100644
--- a/ignite/handlers/checkpoint.py
+++ b/ignite/handlers/checkpoint.py
@@ -248,7 +248,9 @@ def score_function(engine):
         > ["best_model_9_val_acc=0.77.pt", "best_model_10_val_acc=0.78.pt", ]

     .. versionchanged:: 0.4.3
-        Added ``greater_or_equal`` parameter.
+
+        - Checkpoint can save model with same filename.
+        - Added ``greater_or_equal`` argument.
     """

     Item = NamedTuple("Item", [("priority", int), ("filename", str)])

diff --git a/ignite/metrics/metric.py b/ignite/metrics/metric.py
index 4b602092ef4c..053d7d0224ba 100644
--- a/ignite/metrics/metric.py
+++ b/ignite/metrics/metric.py
@@ -302,6 +302,9 @@ def completed(self, engine: Engine, name: str) -> None:
         Args:
             engine (Engine): the engine to which the metric must be attached
             name (str): the name of the metric used as key in dict `engine.state.metrics`
+
+        .. versionchanged:: 0.4.3
+            Added dict in metrics results.
         """
         result = self.compute()
         if isinstance(result, Mapping):

diff --git a/ignite/metrics/psnr.py b/ignite/metrics/psnr.py
index 64f5cd195c94..8c00dba78725 100644
--- a/ignite/metrics/psnr.py
+++ b/ignite/metrics/psnr.py
@@ -64,7 +64,7 @@ def get_y_channel(output):
         state = engine.run(data)
         print(f"PSNR: {state.metrics['psrn']}")

-    .. versionadded:: 0.5.0
+    .. versionadded:: 0.4.3
     """

     def __init__(

diff --git a/ignite/utils.py b/ignite/utils.py
index 91a9b8f4982c..fcee45a166d2 100644
--- a/ignite/utils.py
+++ b/ignite/utils.py
@@ -156,6 +156,8 @@ def manual_seed(seed: int) -> None:
     Args:
         seed (int): Random state seed

+    .. versionchanged:: 0.4.3
+        Added ``torch.cuda.manual_seed_all(seed)``.
     """
     random.seed(seed)
     torch.manual_seed(seed)
From 4575e9212fd6111f7a6fe64b08914499a03b7c96 Mon Sep 17 00:00:00 2001
From: Ahmed Omar <40790298+ahmedo42@users.noreply.github.com>
Date: Sat, 6 Feb 2021 09:26:03 +0200
Subject: [PATCH 03/19] added TimeLimit handler with its test and doc (#1611)

* added TimeLimit handler with its test and doc

* fixed documentation

* fixed docstring and formatting

* flake8 fix trailing whitespace :)

* modified class logger , default value and tests

* changed rounding to nearest integer

* tests refactored , docs modified

* fixed default value , removed global logger

* fixing formatting

* Added versionadded

* added test for engine termination

Co-authored-by: vfdev
---
 docs/source/handlers.rst                 |  3 ++
 ignite/handlers/__init__.py              |  2 ++
 ignite/handlers/time_limit.py            | 46 ++++++++++++++++++++++++
 tests/ignite/handlers/test_time_limit.py | 39 ++++++++++++++++++++
 4 files changed, 90 insertions(+)
 create mode 100644 ignite/handlers/time_limit.py
 create mode 100644 tests/ignite/handlers/test_time_limit.py

diff --git a/docs/source/handlers.rst b/docs/source/handlers.rst
index 42545fefbfe8..31828b5ac60d 100644
--- a/docs/source/handlers.rst
+++ b/docs/source/handlers.rst
@@ -28,3 +28,6 @@ Complete list of handlers
 .. autoclass:: TerminateOnNan

 .. autofunction:: global_step_from_engine
+
+.. autoclass:: TimeLimit
+   :members:
\ No newline at end of file

diff --git a/ignite/handlers/__init__.py b/ignite/handlers/__init__.py
index cb37d8ce431f..749ec4bcb408 100644
--- a/ignite/handlers/__init__.py
+++ b/ignite/handlers/__init__.py
@@ -5,6 +5,7 @@
 from ignite.handlers.checkpoint import Checkpoint, DiskSaver, ModelCheckpoint
 from ignite.handlers.early_stopping import EarlyStopping
 from ignite.handlers.terminate_on_nan import TerminateOnNan
+from ignite.handlers.time_limit import TimeLimit
 from ignite.handlers.timing import Timer

 __all__ = [
@@ -15,6 +16,7 @@
     "EarlyStopping",
     "TerminateOnNan",
     "global_step_from_engine",
+    "TimeLimit",
 ]

diff --git a/ignite/handlers/time_limit.py b/ignite/handlers/time_limit.py
new file mode 100644
index 000000000000..bef9cf6ad7e7
--- /dev/null
+++ b/ignite/handlers/time_limit.py
@@ -0,0 +1,46 @@
+import logging
+import time
+from typing import Optional
+
+from ignite.engine import Engine
+
+__all__ = ["TimeLimit"]
+
+
+class TimeLimit:
+    """TimeLimit handler can be used to control training time for computing environments where session time is limited.
+    The timer starts when the handler is created, not when training starts. This handler gracefully
+    terminates the training if the time spent in training exceeds the limit.
+
+    Args:
+        limit_sec (int, optional): Maximum time before training terminates (in seconds). Defaults to 28800.
+
+    Examples:
+
+    .. code-block:: python
+
+        from ignite.engine import Events
+        from ignite.handlers import TimeLimit
+
+        handler = TimeLimit()  # 8 hours of training
+        trainer.add_event_handler(Events.ITERATION_COMPLETED, handler)
+
+    .. versionadded:: 0.4.3
+    """
+
+    def __init__(self, limit_sec: Optional[int] = 28800):
+
+        if not isinstance(limit_sec, int):
+            raise TypeError("Argument limit_sec should be an integer.")
+        if limit_sec <= 0:
+            raise ValueError("Argument limit_sec should be a positive integer.")
+
+        self.limit_sec = limit_sec
+        self.start_time = time.time()
+        self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
+
+    def __call__(self, engine: Engine) -> None:
+        elapsed_time = time.time() - self.start_time
+        if elapsed_time > self.limit_sec:
+            self.logger.info("Reached the time limit: {} sec. Stop training".format(self.limit_sec))
+            engine.terminate()

diff --git a/tests/ignite/handlers/test_time_limit.py b/tests/ignite/handlers/test_time_limit.py
new file mode 100644
index 000000000000..2965603f758e
--- /dev/null
+++ b/tests/ignite/handlers/test_time_limit.py
@@ -0,0 +1,39 @@
+import time
+
+import pytest
+
+from ignite.engine import Engine, Events
+from ignite.handlers import TimeLimit
+
+
+def test_arg_validation():
+
+    with pytest.raises(ValueError, match=r"Argument limit_sec should be a positive integer."):
+        TimeLimit(limit_sec=-5)
+
+    with pytest.raises(TypeError, match=r"Argument limit_sec should be an integer."):
+        TimeLimit(limit_sec="abc")
+
+
+def test_terminate_on_time_limit():
+    def _train_func(engine, batch):
+        time.sleep(1)
+
+    def _test(n_iters, limit):
+        started = time.time()
+        trainer = Engine(_train_func)
+
+        @trainer.on(Events.TERMINATE)
+        def _():
+            trainer.state.is_terminated = True
+
+        trainer.add_event_handler(Events.ITERATION_COMPLETED, TimeLimit(limit))
+        trainer.state.is_terminated = False
+
+        trainer.run(range(n_iters))
+        elapsed = round(time.time() - started)
+        assert elapsed <= limit + 1
+        assert trainer.state.is_terminated == (n_iters > limit)
+
+    _test(20, 10)
+    _test(5, 10)
From 8935f4c33cfcc2cf22a922370c9eb72606e4f82c Mon Sep 17 00:00:00 2001
From: Pradyumna Rahul
Date: Sat, 6 Feb 2021 18:44:49 +0530
Subject: [PATCH 04/19] Update handlers to use setup_logger (#1617)

* Fixes #1614
- Updated handlers EarlyStopping and TerminateOnNan
- Replaced `logging.getLogger` with `setup_logger` in the mentioned handlers

* Updated `TimeLimit` handler. Replaced use of `logger.getLogger` with `setup_logger` from `ignite.utils`

Co-authored-by: Pradyumna Rahul K
Co-authored-by: Sylvain Desroziers
---
 ignite/handlers/early_stopping.py   | 4 ++--
 ignite/handlers/terminate_on_nan.py | 4 ++--
 ignite/handlers/time_limit.py       | 5 +++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/ignite/handlers/early_stopping.py b/ignite/handlers/early_stopping.py
index b414883a2bd6..9b7e37793c70 100644
--- a/ignite/handlers/early_stopping.py
+++ b/ignite/handlers/early_stopping.py
@@ -1,9 +1,9 @@
-import logging
 from collections import OrderedDict
 from typing import Callable, Mapping, Optional, cast

 from ignite.base import Serializable
 from ignite.engine import Engine
+from ignite.utils import setup_logger

 __all__ = ["EarlyStopping"]

@@ -76,7 +76,7 @@ def __init__(
         self.trainer = trainer
         self.counter = 0
         self.best_score = None  # type: Optional[float]
-        self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
+        self.logger = setup_logger(__name__ + "." + self.__class__.__name__)

     def __call__(self, engine: Engine) -> None:
         score = self.score_function(engine)

diff --git a/ignite/handlers/terminate_on_nan.py b/ignite/handlers/terminate_on_nan.py
index 9b54fbee2a2d..cc103b5a0b62 100644
--- a/ignite/handlers/terminate_on_nan.py
+++ b/ignite/handlers/terminate_on_nan.py
@@ -5,7 +5,7 @@
 import torch

 from ignite.engine import Engine
-from ignite.utils import apply_to_type
+from ignite.utils import apply_to_type, setup_logger

 __all__ = ["TerminateOnNan"]

@@ -33,7 +33,7 @@ class TerminateOnNan:
     """

     def __init__(self, output_transform: Callable = lambda x: x):
-        self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
+        self.logger = setup_logger(__name__ + "." + self.__class__.__name__)
         self.logger.addHandler(logging.StreamHandler())
         self._output_transform = output_transform

diff --git a/ignite/handlers/time_limit.py b/ignite/handlers/time_limit.py
index bef9cf6ad7e7..a59677bc6294 100644
--- a/ignite/handlers/time_limit.py
+++ b/ignite/handlers/time_limit.py
@@ -1,4 +1,3 @@
-import logging
 import time
 from typing import Optional

@@ -6,6 +5,8 @@

 __all__ = ["TimeLimit"]

+from ignite.utils import setup_logger
+

 class TimeLimit:
     """TimeLimit handler can be used to control training time for computing environments where session time is limited.
@@ -37,7 +38,7 @@ def __init__(self, limit_sec: Optional[int] = 28800):

         self.limit_sec = limit_sec
         self.start_time = time.time()
-        self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
+        self.logger = setup_logger(__name__ + "." + self.__class__.__name__)

     def __call__(self, engine: Engine) -> None:
         elapsed_time = time.time() - self.start_time
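The practical effect of this refactor is visibility: ``ignite.utils.setup_logger`` returns a ``logging.Logger`` that already has a stream handler, level, and format attached, so handler messages print without any logging boilerplate from the application. A minimal sketch (the logger name below is chosen arbitrarily):

```python
from ignite.utils import setup_logger

# With a bare logging.getLogger(...) this INFO message would usually be
# dropped unless the application configured logging itself; setup_logger
# wires up a handler and level so it is emitted directly.
logger = setup_logger("my_app.TimeLimit")
logger.info("Reached the time limit: 3600 sec. Stop training")
```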
From d01793a4b586ae1b6e8bbcae473812a394a66e95 Mon Sep 17 00:00:00 2001
From: Devanshu Shah <56106207+Devanshu24@users.noreply.github.com>
Date: Sat, 6 Feb 2021 20:42:57 +0530
Subject: [PATCH 05/19] Managing Deprecation using decorators (#1585)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Starter code for managing deprecation

* Make functions deprecated using the `@deprecated` decorator

* Add arguments to the @deprecated decorator to customize it for each function

* Improve `@deprecated` decorator and add tests

* Replaced the `raise` keyword with added `warnings`

* Added tests several possibilities of the decorator usage

* Removing the test deprecation to check tests

* Add static typing, fix mypy errors

* Make `@deprecated` to raise Exceptions or Warning

* The `@deprecated` decorator will now always emit warning unless explicitly asked to raise an Exception

* Fix mypy errors

* Fix mypy errors (hopefully)

* Fix the test `test_deprecated_setup_any_logging`

* Change the test to work with the `@deprecated` decorator

* Change to snake_case, handle mypy ignores

* Improve Type Annotations

* Update common.py

* For v0.4.3 - Add more versionadded, versionchanged tags - Change v0.5… (#1612)

* For v0.4.3 - Add more versionadded, versionchanged tags - Change v0.5.0 to v0.4.3

* Update ignite/contrib/metrics/regression/canberra_metric.py

Co-authored-by: vfdev

* Update ignite/contrib/metrics/regression/manhattan_distance.py

Co-authored-by: vfdev

* Update ignite/contrib/metrics/regression/r2_score.py

Co-authored-by: vfdev

* Update ignite/handlers/checkpoint.py

Co-authored-by: vfdev

* address PR comments

Co-authored-by: vfdev

* `version` -> version

Co-authored-by: vfdev
Co-authored-by: François COKELAER
Co-authored-by: Sylvain Desroziers
---
 ignite/contrib/engines/common.py            | 21 ++++--
 ignite/utils.py                             | 35 +++++++++-
 tests/ignite/contrib/engines/test_common.py |  2 +-
 tests/ignite/test_utils.py                  | 72 ++++++++++++++++++++-
 4 files changed, 122 insertions(+), 8 deletions(-)

diff --git a/ignite/contrib/engines/common.py b/ignite/contrib/engines/common.py
index c38f6f4dc8bd..97fedbcbc16e 100644
--- a/ignite/contrib/engines/common.py
+++ b/ignite/contrib/engines/common.py
@@ -30,6 +30,7 @@
 from ignite.handlers import Checkpoint, DiskSaver, EarlyStopping, TerminateOnNan
 from ignite.handlers.checkpoint import BaseSaveHandler
 from ignite.metrics import RunningAverage
+from ignite.utils import deprecated


 def setup_common_training_handlers(
@@ -274,11 +275,21 @@ def empty_cuda_cache(_: Engine) -> None:
     gc.collect()


-def setup_any_logging(logger, logger_module, trainer, optimizers, evaluators, log_every_iters) -> None:  # type: ignore
-    raise DeprecationWarning(
-        "ignite.contrib.engines.common.setup_any_logging is deprecated since 0.4.0. and will be remove in 0.6.0. "
-        "Please use instead: setup_tb_logging, setup_visdom_logging or setup_mlflow_logging etc."
-    )
+@deprecated(
+    "0.4.0",
+    "0.6.0",
+    ("Please use instead: setup_tb_logging, setup_visdom_logging or setup_mlflow_logging etc.",),
+    raise_exception=True,
+)
+def setup_any_logging(
+    logger: BaseLogger,
+    logger_module: Any,
+    trainer: Engine,
+    optimizers: Optional[Union[Optimizer, Dict[str, Optimizer], Dict[None, Optimizer]]],
+    evaluators: Optional[Union[Engine, Dict[str, Engine]]],
+    log_every_iters: int,
+) -> None:
+    pass


 def _setup_logging(

diff --git a/ignite/utils.py b/ignite/utils.py
index fcee45a166d2..b21f9c79b3b0 100644
--- a/ignite/utils.py
+++ b/ignite/utils.py
@@ -1,8 +1,10 @@
 import collections.abc as collections
+import functools
 import logging
 import random
 import sys
-from typing import Any, Callable, Optional, TextIO, Tuple, Type, Union, cast
+import warnings
+from typing import Any, Callable, Dict, Optional, TextIO, Tuple, Type, TypeVar, Union, cast

 import torch

@@ -171,3 +173,34 @@ def manual_seed(seed: int) -> None:
         np.random.seed(seed)
     except ImportError:
         pass
+
+
+def deprecated(
+    deprecated_in: str, removed_in: str = "", reasons: Tuple[str, ...] = (), raise_exception: bool = False
+) -> Callable:
+
+    F = TypeVar("F", bound=Callable[..., Any])
+
+    def decorator(func: F) -> F:
+        func_doc = func.__doc__ if func.__doc__ else ""
+        deprecation_warning = (
+            f"This function has been deprecated since version {deprecated_in}"
+            + (f" and will be removed in version {removed_in}" if removed_in else "")
+            + ".\n Please refer to the documentation for more details."
+        )
+
+        @functools.wraps(func)
+        def wrapper(*args: Any, **kwargs: Dict[str, Any]) -> Callable:
+            if raise_exception:
+                raise DeprecationWarning(deprecation_warning)
+            warnings.warn(deprecation_warning, DeprecationWarning, stacklevel=2)
+            return func(*args, **kwargs)
+
+        appended_doc = f".. deprecated:: {deprecated_in}" + ("\n\n\t" if len(reasons) else "")
+
+        for reason in reasons:
+            appended_doc += "\n\t- " + reason
+        wrapper.__doc__ = f"**Deprecated function**.\n\n {func_doc}{appended_doc}"
+        return cast(F, wrapper)
+
+    return decorator

diff --git a/tests/ignite/contrib/engines/test_common.py b/tests/ignite/contrib/engines/test_common.py
index f87122cced4b..fcdc3831e1ba 100644
--- a/tests/ignite/contrib/engines/test_common.py
+++ b/tests/ignite/contrib/engines/test_common.py
@@ -274,7 +274,7 @@ def set_eval_metric(engine):


 def test_deprecated_setup_any_logging():
-    with pytest.raises(DeprecationWarning, match=r"is deprecated since 0\.4\.0\."):
+    with pytest.raises(DeprecationWarning, match=r"deprecated since version 0.4.0"):
         setup_any_logging(None, None, None, None, None, None)

diff --git a/tests/ignite/test_utils.py b/tests/ignite/test_utils.py
index 1b523e8691d6..21d8ae5b4671 100644
--- a/tests/ignite/test_utils.py
+++ b/tests/ignite/test_utils.py
@@ -1,13 +1,14 @@
 import logging
 import os
 import sys
+import warnings
 from collections import namedtuple

 import pytest
 import torch

 from ignite.engine import Engine, Events
-from ignite.utils import convert_tensor, setup_logger, to_onehot
+from ignite.utils import convert_tensor, deprecated, setup_logger, to_onehot


 def test_convert_tensor():
@@ -150,3 +151,72 @@ def _(_):

     # Needed by windows to release FileHandler in the loggers
     logging.shutdown()
+
+
+def test_deprecated():
+
+    # Test on function without docs, @deprecated without reasons
+    @deprecated("0.4.2", "0.6.0")
+    def func_no_docs():
+        return 24
+
+    assert func_no_docs.__doc__ == "**Deprecated function**.\n\n .. deprecated:: 0.4.2"
+
+    # Test on function with docs, @deprecated without reasons
+    @deprecated("0.4.2", "0.6.0")
+    def func_no_reasons():
+        """Docs are cool
+        """
+        return 24
+
+    assert func_no_reasons.__doc__ == "**Deprecated function**.\n\n Docs are cool\n .. deprecated:: 0.4.2"
+
+    # Test on function with docs, @deprecated with reasons
+    @deprecated("0.4.2", "0.6.0", reasons=("r1", "r2"))
+    def func_no_warnings():
+        """Docs are very cool
+        """
+        return 24
+
+    assert (
+        func_no_warnings.__doc__
+        == "**Deprecated function**.\n\n Docs are very cool\n .. deprecated:: 0.4.2\n\n\t\n\t- r1\n\t- r2"
+    )
+
+    # Tests that the function emits DeprecationWarning
+    @deprecated("0.4.2", "0.6.0", reasons=("r1", "r2"))
+    def func_check_warning():
+        """Docs are very ...
+        """
+        return 24
+
+    with pytest.deprecated_call():
+        func_check_warning()
+    assert func_check_warning() == 24
+    with warnings.catch_warnings(record=True) as w:
+        # Cause all warnings to always be triggered.
+        warnings.simplefilter("always")
+        # Trigger a warning.
+        func_check_warning()
+        # Verify some things
+        assert len(w) == 1
+        assert issubclass(w[-1].category, DeprecationWarning)
+        assert (
+            "This function has been deprecated since version 0.4.2 and will be removed in version 0.6.0."
+            + "\n Please refer to the documentation for more details."
+            in str(w[-1].message)
+        )
+
+    # Test that the function raises Exception
+    @deprecated("0.4.2", "0.6.0", reasons=("reason1", "reason2"), raise_exception=True)
+    def func_with_everything():
+        return 1
+
+    with pytest.raises(Exception) as exec_info:
+        func_with_everything()
+
+    assert (
+        str(exec_info.value)
+        == "This function has been deprecated since version 0.4.2 and will be removed in version 0.6.0."
+        + "\n Please refer to the documentation for more details."
+    )
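A quick sketch of the decorator from patch 05 in use; the decorated function below is made up for illustration, only ``deprecated`` itself comes from the patch:

```python
from ignite.utils import deprecated


@deprecated("0.4.3", "0.6.0", reasons=("please use new_helper instead",))
def old_helper():
    """Returns a constant."""
    return 24


old_helper()               # emits a DeprecationWarning, still returns 24
print(old_helper.__doc__)  # docstring now starts with "**Deprecated function**."
```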
From 3c0b68f2ac95b6187666ebd75cf9aac07f01832e Mon Sep 17 00:00:00 2001
From: vfdev
Date: Sat, 6 Feb 2021 22:02:13 +0100
Subject: [PATCH 06/19] Create documentation.md

---
 .github/ISSUE_TEMPLATE/documentation.md | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/documentation.md

diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md
new file mode 100644
index 000000000000..aca759b24643
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/documentation.md
@@ -0,0 +1,10 @@
+---
+name: "\U0001F4DA Documentation"
+about: Report an issue, comment or suggestion related to project's docs
+labels: 'docs'
+
+---
+
+## 📚 Documentation
+
+

From 4a52ebc09aac4ffcbd46513a8fe8e1a4d8a1c0bf Mon Sep 17 00:00:00 2001
From: Ahmed Omar <40790298+ahmedo42@users.noreply.github.com>
Date: Sun, 7 Feb 2021 01:09:51 +0200
Subject: [PATCH 07/19] Distributed tests on Windows should be skipped until fixed. (#1620)

* modified CONTRIBUTING.md

* bash instead of sh
---
 CONTRIBUTING.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2d6c835fee12..ce56045a8e31 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -156,6 +156,11 @@ To run all tests with coverage (assuming installed `pytest-cov`):
 bash tests/run_cpu_tests.sh
 ```

+On Windows, distributed tests should be skipped
+```bash
+SKIP_DISTRIB_TESTS=1 bash tests/run_cpu_tests.sh
+```
+
 #### Run Mypy checks:

 To run mypy to check the optional static type:

From e4571ae734cd3a0a9418b65313f104737bc5546a Mon Sep 17 00:00:00 2001
From: vfdev
Date: Sun, 7 Feb 2021 23:29:13 +0100
Subject: [PATCH 08/19] Added Checkpoint.get_default_score_fn (#1621)

* Added Checkpoint.get_default_score_fn to simplify best_model_handler creation

* Added score_sign argument

* Updated docs
---
 docs/source/handlers.rst                 |  2 +-
 ignite/contrib/engines/common.py         |  9 +---
 ignite/handlers/checkpoint.py            | 60 +++++++++++++++++++++++-
 tests/ignite/handlers/test_checkpoint.py | 18 +++++++
 4 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/docs/source/handlers.rst b/docs/source/handlers.rst
index 31828b5ac60d..870776032680 100644
--- a/docs/source/handlers.rst
+++ b/docs/source/handlers.rst
@@ -11,7 +11,7 @@ Complete list of handlers
     :autolist:

 .. autoclass:: Checkpoint
-    :members: load_objects
+    :members: reset, setup_filename_pattern, load_objects, state_dict, load_state_dict, get_default_score_fn

 .. autoclass:: ignite.handlers.checkpoint.BaseSaveHandler
     :members: __call__, remove

diff --git a/ignite/contrib/engines/common.py b/ignite/contrib/engines/common.py
index 97fedbcbc16e..7b769ea71bf7 100644
--- a/ignite/contrib/engines/common.py
+++ b/ignite/contrib/engines/common.py
@@ -569,12 +569,7 @@ def setup_trains_logging(
     return setup_clearml_logging(trainer, optimizers, evaluators, log_every_iters, **kwargs)


-def get_default_score_fn(metric_name: str) -> Any:
-    def wrapper(engine: Engine) -> Any:
-        score = engine.state.metrics[metric_name]
-        return score
-
-    return wrapper
+get_default_score_fn = Checkpoint.get_default_score_fn


 def gen_save_best_models_by_val_score(
@@ -628,7 +623,7 @@ def gen_save_best_models_by_val_score(
         n_saved=n_saved,
         global_step_transform=global_step_transform,
         score_name=f"{tag}_{metric_name.lower()}",
-        score_function=get_default_score_fn(metric_name),
+        score_function=Checkpoint.get_default_score_fn(metric_name),
         **kwargs,
     )
     evaluator.add_event_handler(Events.COMPLETED, best_model_handler)

diff --git a/ignite/handlers/checkpoint.py b/ignite/handlers/checkpoint.py
index 41a567eecd56..3fefa45347f5 100644
--- a/ignite/handlers/checkpoint.py
+++ b/ignite/handlers/checkpoint.py
@@ -231,8 +231,7 @@ class Checkpoint(Serializable):
             # Run evaluation on epoch completed event
             # ...

-            def score_function(engine):
-                return engine.state.metrics['accuracy']
+            score_function = Checkpoint.get_default_score_fn("accuracy")

             to_save = {'model': model}
             handler = Checkpoint(
@@ -329,6 +328,7 @@ def reset(self) -> None:
             trainer.run(data1, max_epochs=max_epochs)
             print("Last checkpoint:", checkpointer.last_checkpoint)

+        .. versionadded:: 0.4.3
         """
         self._saved = []

@@ -463,6 +463,8 @@ def setup_filename_pattern(
             print(filename_pattern)
             > "{filename_prefix}_{name}_{global_step}_{score_name}={score}.{ext}"
+
+        .. versionadded:: 0.4.3
         """
         filename_pattern = "{name}"

@@ -561,12 +563,66 @@ def load_objects(to_load: Mapping, checkpoint: Mapping, **kwargs: Any) -> None:
                 obj.load_state_dict(checkpoint[k])

     def state_dict(self) -> "OrderedDict[str, List[Tuple[int, str]]]":
+        """Method returns state dict with saved items: list of ``(priority, filename)`` pairs.
+        Can be used to save internal state of the class.
+        """
         return OrderedDict([("saved", [(p, f) for p, f in self._saved])])

     def load_state_dict(self, state_dict: Mapping) -> None:
+        """Method replaces internal state of the class with provided state dict data.
+
+        Args:
+            state_dict (Mapping): a dict with "saved" key and list of ``(priority, filename)`` pairs as values.
+        """
         super().load_state_dict(state_dict)
         self._saved = [Checkpoint.Item(p, f) for p, f in state_dict["saved"]]

+    @staticmethod
+    def get_default_score_fn(metric_name: str, score_sign: float = 1.0) -> Callable:
+        """Helper method to get default score function based on the metric name.
+
+        Args:
+            metric_name (str): metric name to get the value from ``engine.state.metrics``.
+                Engine is the one to which :class:`~ignite.handlers.checkpoint.Checkpoint` handler is added.
+            score_sign (float): sign of the score: 1.0 or -1.0. For error-like metrics, e.g. smaller is better,
+                a negative score sign should be used (objects with larger score are retained). Default, 1.0.
+
+        Examples:
+
+        .. code-block:: python
+
+            from ignite.handlers import Checkpoint
+
+            best_acc_score = Checkpoint.get_default_score_fn("accuracy")
+
+            best_model_handler = Checkpoint(
+                to_save, save_handler, score_name="val_accuracy", score_function=best_acc_score
+            )
+            evaluator.add_event_handler(Events.COMPLETED, best_model_handler)

+        Usage with error-like metric:
+
+        .. code-block:: python
+
+            from ignite.handlers import Checkpoint
+
+            neg_loss_score = Checkpoint.get_default_score_fn("loss", -1.0)
+
+            best_model_handler = Checkpoint(
+                to_save, save_handler, score_name="val_neg_loss", score_function=neg_loss_score
+            )
+            evaluator.add_event_handler(Events.COMPLETED, best_model_handler)
+
+        .. versionadded:: 0.4.3
+        """
+        if score_sign not in (1.0, -1.0):
+            raise ValueError("Argument score_sign should be 1 or -1")
+
+        def wrapper(engine: Engine) -> float:
+            return score_sign * engine.state.metrics[metric_name]
+
+        return wrapper
+

 class DiskSaver(BaseSaveHandler):
     """Handler that saves input checkpoint on a disk.

diff --git a/tests/ignite/handlers/test_checkpoint.py b/tests/ignite/handlers/test_checkpoint.py
index bd36f16b7539..2f1a3ff901ca 100644
--- a/tests/ignite/handlers/test_checkpoint.py
+++ b/tests/ignite/handlers/test_checkpoint.py
@@ -1556,3 +1556,21 @@ def __call__(self, c, f, m):
     for _ in range(4):
         checkpointer(trainer)
     assert handler.counter == 4
+
+
+def test_get_default_score_fn():
+
+    with pytest.raises(ValueError, match=r"Argument score_sign should be 1 or -1"):
+        Checkpoint.get_default_score_fn("acc", 2.0)
+
+    engine = Engine(lambda e, b: None)
+    engine.state.metrics["acc"] = 0.9
+    engine.state.metrics["loss"] = 0.123
+
+    score_fn = Checkpoint.get_default_score_fn("acc")
+    score = score_fn(engine)
+    assert score == 0.9
+
+    score_fn = Checkpoint.get_default_score_fn("loss", -1)
+    score = score_fn(engine)
+    assert score == -0.123
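As the new test makes explicit, ``get_default_score_fn`` is just a small closure over ``engine.state.metrics``; a sketch of the hand-written equivalent for an error-like metric:

```python
from ignite.handlers import Checkpoint

# Retain checkpoints with the LOWEST loss: flip the sign so that a larger
# score means a smaller loss.
neg_loss_score = Checkpoint.get_default_score_fn("loss", -1.0)


# Hand-written score function with identical behavior:
def neg_loss_score_manual(engine):
    return -engine.state.metrics["loss"]
```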
code-block:: python + + from ignite.handlers import Checkpoint + + best_acc_score = Checkpoint.get_default_score_fn("accuracy") + + best_model_handler = Checkpoint( + to_save, save_handler, score_name="val_accuracy", score_function=best_acc_score + ) + evaluator.add_event_handler(Events.COMPLETED, best_model_handler) + + Usage with error-like metric: + + .. code-block:: python + + from ignite.handlers import Checkpoint + + neg_loss_score = Checkpoint.get_default_score_fn("loss", -1.0) + + best_model_handler = Checkpoint( + to_save, save_handler, score_name="val_neg_loss", score_function=neg_loss_score + ) + evaluator.add_event_handler(Events.COMPLETED, best_model_handler) + + .. versionadded:: 0.4.3 + """ + if score_sign not in (1.0, -1.0): + raise ValueError("Argument score_sign should be 1 or -1") + + def wrapper(engine: Engine) -> float: + return score_sign * engine.state.metrics[metric_name] + + return wrapper + class DiskSaver(BaseSaveHandler): """Handler that saves input checkpoint on a disk. diff --git a/tests/ignite/handlers/test_checkpoint.py b/tests/ignite/handlers/test_checkpoint.py index bd36f16b7539..2f1a3ff901ca 100644 --- a/tests/ignite/handlers/test_checkpoint.py +++ b/tests/ignite/handlers/test_checkpoint.py @@ -1556,3 +1556,21 @@ def __call__(self, c, f, m): for _ in range(4): checkpointer(trainer) assert handler.counter == 4 + + +def test_get_default_score_fn(): + + with pytest.raises(ValueError, match=r"Argument score_sign should be 1 or -1"): + Checkpoint.get_default_score_fn("acc", 2.0) + + engine = Engine(lambda e, b: None) + engine.state.metrics["acc"] = 0.9 + engine.state.metrics["loss"] = 0.123 + + score_fn = Checkpoint.get_default_score_fn("acc") + score = score_fn(engine) + assert score == 0.9 + + score_fn = Checkpoint.get_default_score_fn("loss", -1) + score = score_fn(engine) + assert score == -0.123 From 6e8dd3d0634d8041e6ac6162b41d650546352d75 Mon Sep 17 00:00:00 2001 From: vfdev Date: Mon, 8 Feb 2021 10:26:23 +0100 Subject: [PATCH 09/19] Update about.rst --- docs/source/about.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/about.rst b/docs/source/about.rst index e7376acfd521..da952078176c 100644 --- a/docs/source/about.rst +++ b/docs/source/about.rst @@ -26,7 +26,6 @@ Authors The following people are currently core contributors to PyTorch-Ignite’s development and maintenance: - Victor Fomin `@vfdev-5 `_ -- Anmol Joshi `@anmolsjoshi `_ - Sylvain Desroziers `@sdesrozis `_ @@ -37,7 +36,7 @@ The following people have been active contributors in the past, but are no longe - Alykhan Tejani `@alykhantejani `_ - Jason Kriss `@jasonkriss `_ - +- Anmol Joshi `@anmolsjoshi `_ Join Core Team -------------- From 2c83380b990f36a86609f4c87b0b22ba34275852 Mon Sep 17 00:00:00 2001 From: Devanshu Shah <56106207+Devanshu24@users.noreply.github.com> Date: Mon, 8 Feb 2021 22:03:10 +0530 Subject: [PATCH 10/19] Update pre-commit hooks and CONTRIBUTING.md (#1622) * Change pre-commit config and CONTRIBUTING.md - Update hook versions - Remove seed-isort-config - Add black profile to isort * Fix files based on new pre-commit config * Add meaningful exclusions to prettier - Also update actions workflow files to match local pre-commit --- .circleci/config.yml | 4 +- .github/FUNDING.yml | 2 +- .github/ISSUE_TEMPLATE/bug-report.md | 13 ++--- .github/ISSUE_TEMPLATE/documentation.md | 3 +- .github/ISSUE_TEMPLATE/feature-request.md | 1 - .../ISSUE_TEMPLATE/questions-help-support.md | 2 +- .github/ISSUE_TEMPLATE/user-feedback.md | 9 ++- 
.github/PULL_REQUEST_TEMPLATE.md | 10 ++-- .github/workflows/code-style.yml | 2 +- .github/workflows/unit-tests.yml | 2 +- .pre-commit-config.yaml | 26 ++++----- CODE_OF_CONDUCT.md | 26 ++++----- CONTRIBUTING.md | 3 +- README.md | 8 +-- assets/logo/ignite_logo_guidelines.md | 12 +++- docs/source/faq.rst | 2 +- docs/source/handlers.rst | 2 +- docs/source/metrics.rst | 9 ++- examples/contrib/cifar10/.gitignore | 2 +- examples/contrib/cifar10_qat/.gitignore | 2 +- examples/contrib/cifar10_qat/README.md | 19 +++++-- examples/contrib/mnist/README.md | 5 ++ examples/fast_neural_style/README.md | 56 ++++++++++--------- examples/gan/README.md | 1 + examples/mnist/README.md | 39 +++++++------ examples/references/README.md | 11 ++-- .../classification/imagenet/.gitignore | 2 +- .../classification/imagenet/NOTES_MLflow.md | 31 ++++++---- .../classification/imagenet/NOTES_Polyaxon.md | 21 ++++--- .../classification/imagenet/README.md | 6 +- .../imagenet/experiments/mlflow/conda.yaml | 16 +++--- .../segmentation/pascal_voc2012/.gitignore | 2 +- .../pascal_voc2012/NOTES_MLflow.md | 30 ++++++---- .../pascal_voc2012/NOTES_Polyaxon.md | 22 +++++--- .../segmentation/pascal_voc2012/README.md | 12 ++-- .../experiments/mlflow/conda.yaml | 16 +++--- examples/reinforcement_learning/README.md | 1 - 37 files changed, 240 insertions(+), 190 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bb1601f10081..b62b52c3dd78 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -353,13 +353,13 @@ jobs: docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}" build_publish_docker_images: - machine: + machine: # https://circleci.com/docs/2.0/configuration-reference/#available-machine-images image: ubuntu-2004:202010-01 docker_layer_caching: true # https://circleci.com/docs/2.0/configuration-reference/#machine-executor-linux resource_class: 2xlarge - + working_directory: << pipeline.parameters.workingdir >> steps: - checkout diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index 6085b08d69ce..5a3e0bffd17e 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,6 +1,6 @@ # These are supported funding model platforms -github: [vfdev-5, ] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] +github: [vfdev-5] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] patreon: # Replace with a single Patreon username open_collective: # Replace with a single Open Collective username ko_fi: # Replace with a single Ko-fi username diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md index dbc7d53f426e..2618df26085c 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.md +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -1,7 +1,6 @@ --- name: "\U0001F41B Bug Report" about: Submit a bug report to help us improve Ignite - --- ## 🐛 Bug description @@ -13,9 +12,9 @@ about: Submit a bug report to help us improve Ignite ## Environment - - PyTorch Version (e.g., 1.4): - - Ignite Version (e.g., 0.3.0): - - OS (e.g., Linux): - - How you installed Ignite (`conda`, `pip`, source): - - Python version: - - Any other relevant information: +- PyTorch Version (e.g., 1.4): +- Ignite Version (e.g., 0.3.0): +- OS (e.g., Linux): +- How you installed Ignite (`conda`, `pip`, source): +- Python version: +- Any other relevant information: diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md index aca759b24643..6333a7546ade 100644 --- a/.github/ISSUE_TEMPLATE/documentation.md +++ 
b/.github/ISSUE_TEMPLATE/documentation.md @@ -1,8 +1,7 @@ --- name: "\U0001F4DA Documentation" about: Report an issue, comment or suggestion related to project's docs -labels: 'docs' - +labels: "docs" --- ## 📚 Documentation diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md index cf407d2111bb..d5fdfd5b8ad9 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.md +++ b/.github/ISSUE_TEMPLATE/feature-request.md @@ -1,7 +1,6 @@ --- name: "\U0001F680 Feature Request" about: Submit a proposal/request for a new Ingite feature - --- ## 🚀 Feature diff --git a/.github/ISSUE_TEMPLATE/questions-help-support.md b/.github/ISSUE_TEMPLATE/questions-help-support.md index 3394f1445c0f..5cf3d3b0d77c 100644 --- a/.github/ISSUE_TEMPLATE/questions-help-support.md +++ b/.github/ISSUE_TEMPLATE/questions-help-support.md @@ -1,7 +1,7 @@ --- name: "❓Questions/Help/Support" about: Do you have a question? -labels: 'question' +labels: "question" --- ## ❓ Questions/Help/Support diff --git a/.github/ISSUE_TEMPLATE/user-feedback.md b/.github/ISSUE_TEMPLATE/user-feedback.md index 8b1972d9c4d0..1d2b6e9ba13b 100644 --- a/.github/ISSUE_TEMPLATE/user-feedback.md +++ b/.github/ISSUE_TEMPLATE/user-feedback.md @@ -1,13 +1,12 @@ --- name: "\U0001F44D User feedback" about: Say thanks or why you don't like -title: '' -labels: '' -assignees: '' - +title: "" +labels: "" +assignees: "" --- > This is a place to leave any feedback on this package. -> If you like the work, feel free to say thanks here +> If you like the work, feel free to say thanks here > If you do not like something, please, share it with us and we can see how to improve > Thank you ! diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 6be1d1605d77..1aaf460f7c76 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,9 +1,9 @@ -Fixes #{issue number} +Fixes #{issue number} Description: - Check list: -* [ ] New tests are added (if a new feature is added) -* [ ] New doc strings: description and/or example code are in RST format -* [ ] Documentation is updated (if required) + +- [ ] New tests are added (if a new feature is added) +- [ ] New doc strings: description and/or example code are in RST format +- [ ] Documentation is updated (if required) diff --git a/.github/workflows/code-style.yml b/.github/workflows/code-style.yml index 51a9a6d4c6b0..bda9b314dca9 100644 --- a/.github/workflows/code-style.yml +++ b/.github/workflows/code-style.yml @@ -30,7 +30,7 @@ jobs: python -m pip install autopep8 "black==19.10b0" "isort==5.7.0" autopep8 --recursive --in-place --aggressive --aggressive . black . - isort . + isort --profile black . - name: Commit and push changes uses: stefanzweifel/git-auto-commit-action@v4 diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 155b476d7621..f69182261421 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -82,7 +82,7 @@ jobs: pip install flake8 "black==19.10b0" "isort==5.7.0" flake8 ignite/ tests/ examples/ black --check . - isort -c . + isort -c --profile black . 
- name: Run Mypy shell: bash -l {0} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8c1d06180a05..65b1c030d8e9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,28 +1,18 @@ exclude: "^conda.recipe" repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v2.3.0 + rev: v3.4.0 hooks: - id: check-toml - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - - repo: https://github.com/prettier/pre-commit - rev: main + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v2.2.1 hooks: - id: prettier - - - repo: https://github.com/asottile/seed-isort-config - rev: v1.9.4 - hooks: - - id: seed-isort-config - args: [--exclude=^((examples|docs)/.*)$] - - - repo: https://github.com/timothycrosley/isort - rev: 4.3.21-2 - hooks: - - id: isort + exclude_types: ["python", "jupyter", "shell", "gitignore"] - repo: https://github.com/python/black rev: 19.10b0 @@ -30,8 +20,14 @@ repos: - id: black language_version: python3.8 + - repo: https://github.com/timothycrosley/isort + rev: 5.7.0 + hooks: + - id: isort + args: ["--profile", "black"] + - repo: https://gitlab.com/pycqa/flake8 - rev: 3.7.7 + rev: 3.8.4 hooks: - id: flake8 args: [--append-config=tox.ini] diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 4bd525a54e78..d38bf51a16f8 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -14,22 +14,22 @@ appearance, race, religion, or sexual identity and orientation. Examples of behavior that contributes to creating a positive environment include: -* Using welcoming and inclusive language -* Being respectful of differing viewpoints and experiences -* Gracefully accepting constructive criticism -* Focusing on what is best for the community -* Showing empathy towards other community members +- Using welcoming and inclusive language +- Being respectful of differing viewpoints and experiences +- Gracefully accepting constructive criticism +- Focusing on what is best for the community +- Showing empathy towards other community members Examples of unacceptable behavior by participants include: -* The use of sexualized language or imagery and unwelcome sexual attention or -advances -* Trolling, insulting/derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or electronic -address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a -professional setting +- The use of sexualized language or imagery and unwelcome sexual attention or + advances +- Trolling, insulting/derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as a physical or electronic + address, without explicit permission +- Other conduct which could reasonably be considered inappropriate in a + professional setting ## Our Responsibilities diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ce56045a8e31..da69bfc43907 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -104,8 +104,8 @@ black manually to format files and commit them. ```bash # This should autoformat the files -isort -rc . black . +isort --profile black . 
# Run lint checking flake8 ignite/ tests/ examples/ # If everything is OK, then commit @@ -157,6 +157,7 @@ bash tests/run_cpu_tests.sh ``` On Windows, distributed tests should be skipped + ```bash SKIP_DISTRIB_TESTS=1 bash tests/run_cpu_tests.sh ``` diff --git a/README.md b/README.md index a1f514b56c22..046184d36d51 100644 --- a/README.md +++ b/README.md @@ -7,11 +7,11 @@ | ![image](https://img.shields.io/badge/-Tests:-black?style=flat-square) [![image](https://github.com/pytorch/ignite/workflows/Run%20unit%20tests/badge.svg)](https://github.com/pytorch/ignite/actions) [![image](https://img.shields.io/badge/-GPU%20tests-black?style=flat-square)](https://app.circleci.com/pipelines/github/pytorch/ignite?branch=master)[![image](https://circleci.com/gh/pytorch/ignite.svg?style=svg)](https://app.circleci.com/pipelines/github/pytorch/ignite?branch=master) [![image](https://codecov.io/gh/pytorch/ignite/branch/master/graph/badge.svg)](https://codecov.io/gh/pytorch/ignite) [![image](https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Fpytorch-ignite%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pytorch.org/ignite/index.html) | -| :--- | +| :--- | ![image](https://img.shields.io/badge/-Stable%20Releases:-black?style=flat-square) [![image](https://anaconda.org/pytorch/ignite/badges/version.svg)](https://anaconda.org/pytorch/ignite) [![image](https://anaconda.org/pytorch/ignite/badges/downloads.svg)](https://anaconda.org/pytorch/ignite) [![image](https://img.shields.io/badge/dynamic/json.svg?label=PyPI&url=https%3A%2F%2Fpypi.org%2Fpypi%2Fpytorch-ignite%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pypi.org/project/pytorch-ignite/) [![image](https://pepy.tech/badge/pytorch-ignite)](https://pepy.tech/project/pytorch-ignite) | -| ![image](https://img.shields.io/badge/-Nightly%20Releases:-black?style=flat-square) [![image](https://anaconda.org/pytorch-nightly/ignite/badges/version.svg)](https://anaconda.org/pytorch-nightly/ignite) [![image](https://img.shields.io/badge/PyPI-pre%20releases-brightgreen)](https://pypi.org/project/pytorch-ignite/#history) | -| ![image](https://img.shields.io/badge/-Features:-black?style=flat-square) [![image](https://img.shields.io/badge/docker-hub-blue)](https://hub.docker.com/u/pytorchignite) [![image](https://img.shields.io/badge/Optuna-integrated-blue)](https://optuna.org) [![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![Twitter](https://img.shields.io/badge/news-twitter-blue)](https://twitter.com/pytorch_ignite) | -| ![image](https://img.shields.io/badge/-Supported_PyTorch/Python_versions:-black?style=flat-square) [![link](https://img.shields.io/badge/-check_here-blue)](https://github.com/pytorch/ignite/actions?query=workflow%3A%22PyTorch+version+tests%22) | +| ![image](https://img.shields.io/badge/-Nightly%20Releases:-black?style=flat-square) [![image](https://anaconda.org/pytorch-nightly/ignite/badges/version.svg)](https://anaconda.org/pytorch-nightly/ignite) [![image](https://img.shields.io/badge/PyPI-pre%20releases-brightgreen)](https://pypi.org/project/pytorch-ignite/#history)| +| ![image](https://img.shields.io/badge/-Features:-black?style=flat-square) [![image](https://img.shields.io/badge/docker-hub-blue)](https://hub.docker.com/u/pytorchignite) [![image](https://img.shields.io/badge/Optuna-integrated-blue)](https://optuna.org) [![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 
[![Twitter](https://img.shields.io/badge/news-twitter-blue)](https://twitter.com/pytorch_ignite)| +| ![image](https://img.shields.io/badge/-Supported_PyTorch/Python_versions:-black?style=flat-square) [![link](https://img.shields.io/badge/-check_here-blue)](https://github.com/pytorch/ignite/actions?query=workflow%3A%22PyTorch+version+tests%22)| diff --git a/assets/logo/ignite_logo_guidelines.md b/assets/logo/ignite_logo_guidelines.md index 23fe6d34e268..5d304257496c 100644 --- a/assets/logo/ignite_logo_guidelines.md +++ b/assets/logo/ignite_logo_guidelines.md @@ -1,9 +1,11 @@ # PyTorch Ignite Logo Guidelines + These guidelines are meant to help keep the PyTorch Ignite logo (as developed in [#1221](https://github.com/pytorch/ignite/issues/1221)) consistent and recognizable across all its uses. They also provide a common language for referring to the logos and their components. The primary logo is the combination of the logomark and wordmark next to each other. The logomark is the flame alone (no text) and the wordmark is only the text. It's preferable to use the primary logo whenever possible, and the logomark when a smaller version is needed. ## Color + The full color options are a combonation of PyTorch's main orange (`#ee4c2c`) with yellow details (`#eaa700`). Light options are white (`#FFFFFF`) and dark options dark grey (`#2a2a2a`). The alternate "mixed" logo uses the full color logomark with a dark grey wordmark. Whenever possible, use the full color logos. One color logos (light or dark) are to be used when full color will not have enough contrast, usually when logos must be on colored backgrounds or are being reproduced somewhere that doesn't support color. @@ -11,13 +13,17 @@ Whenever possible, use the full color logos. One color logos (light or dark) are Please note: The orange (`#ee4c2c`) and yellow (`#eaa700`) do not meet WCAG 2.1 color contrast recommendations for text or UI when used with white or other light colors. Make sure to use these colors primarily as decorative elements or with a dark color for text and/or UI. Accessibility should not be overlooked. ## Type -The PyTorch Ignite wordmark is made from Oxygen (by Vernon Adams @vernnobile). + +The PyTorch Ignite wordmark is made from Oxygen (by Vernon Adams @vernnobile). ## Minimum Size + For consistent legibility, please do not display the primary logo at less than 60px wide or the logomark at less than 15px wide. -## Logo Integrity +## Logo Integrity + A few other notes to keep in mind when using the logo: + - Make sure to scale the logo proportionally. - Maintain a good amount of space around the logo. Don’t let it overlap with text, images, or other elements. -- Do not try and recreate or modify the logo. For example, do not use the logomark and then try to write PyTorch Ignite in another font. \ No newline at end of file +- Do not try and recreate or modify the logo. For example, do not use the logomark and then try to write PyTorch Ignite in another font. diff --git a/docs/source/faq.rst b/docs/source/faq.rst index 3ac486900446..b0ecfe190461 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -48,7 +48,7 @@ flexibility to the user to allow for this: .. code-block:: python from ignite.engine import EventEnum - + class BackpropEvents(EventEnum): """ Events based on back propagation diff --git a/docs/source/handlers.rst b/docs/source/handlers.rst index 870776032680..e7ee58dd33d4 100644 --- a/docs/source/handlers.rst +++ b/docs/source/handlers.rst @@ -30,4 +30,4 @@ Complete list of handlers .. 
autofunction:: global_step_from_engine .. autoclass:: TimeLimit - :members: \ No newline at end of file + :members: diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index 08d03f98d40a..2d6836a842a9 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -54,13 +54,13 @@ use the ``output_transform`` argument to transform it: .. warning:: Please, be careful when using ``lambda`` functions to setup multiple ``output_transform`` for multiple metrics - + .. code-block:: python # Wrong # metrics_group = [Accuracy(output_transform=lambda output: output[name]) for name in names] # As lambda can not store `name` and all `output_transform` will use the last `name` - + # A correct way. For example, using functools.partial from functools import partial @@ -68,7 +68,7 @@ use the ``output_transform`` argument to transform it: return output[name] metrics_group = [Accuracy(output_transform=partial(ot_func, name=name)) for name in names] - + For more details, see `here `_ .. Note :: @@ -126,7 +126,7 @@ This API gives a more fine-grained/custom usage on how to compute a metric. For # Compute the result print("Precision: ", precision.compute()) - # Reset metric + # Reset metric precision.reset() # Start new accumulation: @@ -362,4 +362,3 @@ Complete list of metrics .. autofunction:: reinit__is_reduced .. autofunction:: sync_all_reduce - diff --git a/examples/contrib/cifar10/.gitignore b/examples/contrib/cifar10/.gitignore index c93cd4493396..014791a3390f 100644 --- a/examples/contrib/cifar10/.gitignore +++ b/examples/contrib/cifar10/.gitignore @@ -1,3 +1,3 @@ output cifar10 -raw_pytorch \ No newline at end of file +raw_pytorch diff --git a/examples/contrib/cifar10_qat/.gitignore b/examples/contrib/cifar10_qat/.gitignore index c93cd4493396..014791a3390f 100644 --- a/examples/contrib/cifar10_qat/.gitignore +++ b/examples/contrib/cifar10_qat/.gitignore @@ -1,3 +1,3 @@ output cifar10 -raw_pytorch \ No newline at end of file +raw_pytorch diff --git a/examples/contrib/cifar10_qat/README.md b/examples/contrib/cifar10_qat/README.md index c663521e654a..eaf6d976110b 100644 --- a/examples/contrib/cifar10_qat/README.md +++ b/examples/contrib/cifar10_qat/README.md @@ -2,7 +2,8 @@ Model's implementation is based on https://discuss.pytorch.org/t/evaluator-returns-nan/107972/3 -In this example, we show how to use *Ignite* to train a neural network: +In this example, we show how to use _Ignite_ to train a neural network: + - on 1 or more GPUs - compute training/validation metrics - log learning rate, metrics etc @@ -10,8 +11,8 @@ In this example, we show how to use *Ignite* to train a neural network: Configurations: -* [x] single GPU -* [x] multi GPUs on a single node +- [x] single GPU +- [x] multi GPUs on a single node ## Requirements: @@ -27,34 +28,39 @@ Configurations: We can train, for example, ResNet-18 with 8 bit weights and activations. Run the example on a single GPU: + ```bash CUDA_VISIBLE_DEVICES=0 python main.py run --model="resnet18_QAT_8b" ``` + Note: torch DataParallel is not working (v1.7.1) with QAT. 
For details on accepted arguments: + ```bash python main.py run -- --help ``` If user would like to provide already downloaded dataset, the path can be setup in parameters as + ```bash --data_path="/path/to/cifar10/" ``` Other available models can be found [here](utils.py): + - resnet18_QAT_8b - ResNet-18 with 8 bit weights and activations - resnet18_QAT_6b - ResNet-18 with 6 bit weights and activations - resnet18_QAT_5b - ResNet-18 with 5 bit weights and activations - resnet18_QAT_4b - ResNet-18 with 4 bit weights and activations - torchvision models - ### Distributed training #### Single node, multiple GPUs Let's start training on a single node with 2 gpus: + ```bash # using torch.distributed.launch python -u -m torch.distributed.launch --nproc_per_node=2 --use_env main.py run --backend="nccl" --model="resnet18_QAT_8b" @@ -65,11 +71,14 @@ python -u -m torch.distributed.launch --nproc_per_node=2 --use_env main.py run - Please, make sure to have Horovod installed before running. Let's start training on a single node with 2 gpus: + ```bash # horovodrun horovodrun -np=2 python -u main.py run --backend="horovod" --model="resnet18_QAT_8b" ``` + or + ```bash # using function spawn inside the code python -u main.py run --backend="horovod" --nproc_per_node=2 --model="resnet18_QAT_8b" @@ -77,4 +86,4 @@ python -u main.py run --backend="horovod" --nproc_per_node=2 --model="resnet18_Q ### Online logs -On TensorBoard.dev: https://tensorboard.dev/experiment/Kp9Wod3XR36Sg2I1gAh1cA/ \ No newline at end of file +On TensorBoard.dev: https://tensorboard.dev/experiment/Kp9Wod3XR36Sg2I1gAh1cA/ diff --git a/examples/contrib/mnist/README.md b/examples/contrib/mnist/README.md index d4733c04193c..5d5955d27a57 100644 --- a/examples/contrib/mnist/README.md +++ b/examples/contrib/mnist/README.md @@ -3,6 +3,7 @@ ported from [pytorch-examples](https://github.com/pytorch/examples/tree/master/mnist) Basic neural network training with Ignite and various built-in loggers from `ignite.contrib`: + - TQDM progress bar - Tensorboard - Visdom @@ -17,6 +18,7 @@ Basic neural network training with Ignite and various built-in loggers from `ign #### Logging with TQDM progress bar Run the example: + ``` python mnist_with_tqdm_logger.py ``` @@ -36,11 +38,13 @@ Optionally, user can install `pynvml` package on Python 3 and log GPU informatio #### Usage: Run the example: + ```bash python mnist_with_tensorboard_logger.py --log_dir=/tmp/tensorboard_logs ``` Start tensorboard: + ```bash tensorboard --logdir=/tmp/tensorboard_logs/ ``` @@ -57,6 +61,7 @@ Example with training and validation monitoring using Visdom #### Usage: Run the example: + ```bash python mnist_with_visdom_logger.py ``` diff --git a/examples/fast_neural_style/README.md b/examples/fast_neural_style/README.md index 029bcd23aed8..41388ff7b469 100644 --- a/examples/fast_neural_style/README.md +++ b/examples/fast_neural_style/README.md @@ -1,6 +1,7 @@ # fast-neural-style ### Introduction + This example is ported over from [pytorch-examples](https://github.com/pytorch/examples/tree/master/fast_neural_style). It uses `ignite` to implement an algorithm for artistic style transfer as described in [Perceptual Losses for Real-Time Style Transfer and Super-Resolution](https://arxiv.org/abs/1603.08155). 
@@ -13,9 +14,9 @@ It uses `ignite` to implement an algorithm for artistic style transfer as descri ### Requirements -* `torch` -* `torchvision` -* `ignite` +- `torch` +- `torchvision` +- `ignite` Example for `virtualenv` setup: @@ -25,15 +26,15 @@ Example for `virtualenv` setup: `pip install torch torchvision pytorch-ignite` -The code runs on CPU, but GPU allows it to run much faster. If using GPU, please ensure proper libraries are installed. +The code runs on CPU, but GPU allows it to run much faster. If using GPU, please ensure proper libraries are installed. ### Documentation #### Training -Code can be used to train a style transfer model for any image. To run code correctly, ensure that [MSCOCO dataset](http://images.cocodataset.org/zips/train2014.zip) and a style image are downloaded. -Since the code using Pytorch's Dataset functions, ensure that directory with MSCOCO dataset is formatted as shown below. The directory should be setup such that the location of the dataset is MSCOCO, which contains a single folder 0, containing all the images. +Code can be used to train a style transfer model for any image. To run code correctly, ensure that [MSCOCO dataset](http://images.cocodataset.org/zips/train2014.zip) and a style image are downloaded. +Since the code using Pytorch's Dataset functions, ensure that directory with MSCOCO dataset is formatted as shown below. The directory should be setup such that the location of the dataset is MSCOCO, which contains a single folder 0, containing all the images. ```bash ├── MSCOCO @@ -44,35 +45,38 @@ Since the code using Pytorch's Dataset functions, ensure that directory with MSC ``` ##### Example + `python neural_style.py train --epochs 2 --cuda 1 --dataset mscoco --dataroot /path/to/mscoco --style_image ./images/style_images/mosaic.jpg` ##### Flags -* `--epochs`: number of training epochs, default is 2. -* `--batch_size`: batch size for training, default is 8. -* `--dataset`: type of dataset. -* `--dataroot`: path to training dataset, the path should point to a folder containing another folder with all the training images. -* `--style_image`: path to style-image. -* `--checkpoint_model_dir`: path to folder where checkpoints of trained models will be saved. -* `--checkpoint_interval`: number of batches after which a checkpoint of trained model will be created. -* `--image_size`: size of training images, default is 256 X 256. -* `--style_size`: size of style-image, default is the original size of style image. -* `--cuda`: set it to 1 for running on GPU, 0 for CPU. -* `--seed`: random seed for training. -* `--content_weight`: weight for content-loss, default is 1e5. -* `--style_weight`: weight for style-loss, default is 1e10. -* `--lr`: learning rate, default is 1e-3. +- `--epochs`: number of training epochs, default is 2. +- `--batch_size`: batch size for training, default is 8. +- `--dataset`: type of dataset. +- `--dataroot`: path to training dataset, the path should point to a folder containing another folder with all the training images. +- `--style_image`: path to style-image. +- `--checkpoint_model_dir`: path to folder where checkpoints of trained models will be saved. +- `--checkpoint_interval`: number of batches after which a checkpoint of trained model will be created. +- `--image_size`: size of training images, default is 256 X 256. +- `--style_size`: size of style-image, default is the original size of style image. +- `--cuda`: set it to 1 for running on GPU, 0 for CPU. +- `--seed`: random seed for training. 
+- `--content_weight`: weight for content-loss, default is 1e5. +- `--style_weight`: weight for style-loss, default is 1e10. +- `--lr`: learning rate, default is 1e-3. #### Evaluation -Code can be used to stylize an image using a trained style transfer model. +Code can be used to stylize an image using a trained style transfer model. ##### Example + `python neural_style.py eval --content_image ./images/content_images/amber.jpg --output_image test.png --cuda 1 --model /tmp/checkpoints/checkpoint_net_2.pth` #### Flags -* `--content_image`: path to content image you want to stylize. -* `--content_scale`: factor for scaling down the content image. -* `--output_image`: path for saving the output image. -* `--model`: saved model to be used for stylizing the image. -* `--cuda`: set it to 1 for running on GPU, 0 for CPU. + +- `--content_image`: path to content image you want to stylize. +- `--content_scale`: factor for scaling down the content image. +- `--output_image`: path for saving the output image. +- `--model`: saved model to be used for stylizing the image. +- `--cuda`: set it to 1 for running on GPU, 0 for CPU. diff --git a/examples/gan/README.md b/examples/gan/README.md index bf4b0a59035e..b7d892d44bf5 100644 --- a/examples/gan/README.md +++ b/examples/gan/README.md @@ -5,6 +5,7 @@ ported from [pytorch-examples](https://github.com/pytorch/examples/tree/master/d Usage: For example, run on CIFAR10 dataset: + ``` python dcgan.py --dataset cifar10 --dataroot /tmp/cifar10 --output-dir /tmp/outputs-dcgan ``` diff --git a/examples/mnist/README.md b/examples/mnist/README.md index f2408b22f566..cac50fc14195 100644 --- a/examples/mnist/README.md +++ b/examples/mnist/README.md @@ -10,13 +10,14 @@ ported from [pytorch-examples](https://github.com/pytorch/examples/tree/master/m #### Usage: Run the example: + ``` python mnist.py ``` ### Logging with Tensorboard -MNIST example with training and validation monitoring using Tensorboard. Notice +MNIST example with training and validation monitoring using Tensorboard. Notice that if PyTorch version is less than 1.2, the module TensorboardX is required. #### Requirements: @@ -28,11 +29,13 @@ that if PyTorch version is less than 1.2, the module TensorboardX is required. #### Usage: Run the example: + ```bash python mnist_with_tensorboard.py --log_dir=/tmp/tensorboard_logs ``` Start tensorboard: + ```bash tensorboard --logdir=/tmp/tensorboard_logs/ ``` @@ -49,24 +52,25 @@ MNIST example with training and validation monitoring using Visdom #### Usage: Start visdom: + ```bash python -m visdom.server ``` Run the example: + ```bash python mnist_with_visdom.py ``` - ### Training save & resume -Example shows how to save a checkpoint of the trainer, model, optimizer, lr scheduler. +Example shows how to save a checkpoint of the trainer, model, optimizer, lr scheduler. User can resume the training from stored latest checkpoint. In addition, training crash can be emulated. -We provided an option `--deterministic` which setups a deterministic trainer as +We provided an option `--deterministic` which setups a deterministic trainer as [`DeterministicEngine`](https://pytorch.org/ignite/engine.html#ignite.engine.deterministic.DeterministicEngine). -Trainer performs dataflow synchronization on epoch in order to ensure the same dataflow when training is resumed. +Trainer performs dataflow synchronization on epoch in order to ensure the same dataflow when training is resumed. Please, see the documentation for more details. 
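For readers unfamiliar with the checkpointing machinery mentioned above, here is a minimal, self-contained sketch of how `ignite.handlers.Checkpoint` saves and restores training objects; the tiny model and no-op update function are placeholders, not the actual setup of `mnist_save_resume_engine.py`.

```python
# Minimal save/resume sketch with Checkpoint/DiskSaver; the model and
# update function below are placeholders for the real training objects.
import torch
import torch.nn as nn
from ignite.engine import Engine, Events
from ignite.handlers import Checkpoint, DiskSaver

model = nn.Linear(784, 10)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
trainer = Engine(lambda engine, batch: None)  # stand-in update function

# Periodically snapshot everything needed to resume training.
to_save = {"trainer": trainer, "model": model, "optimizer": optimizer}
handler = Checkpoint(to_save, DiskSaver("logs/run_1", require_empty=False), n_saved=2)
trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), handler)

# To resume, restore the saved objects before calling trainer.run(...):
# ckpt = torch.load("logs/run_1/checkpoint_5628.pt", map_location="cpu")
# Checkpoint.load_objects(to_load=to_save, checkpoint=ckpt)
```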
#### Requirements: @@ -79,6 +83,7 @@ Please, see the documentation for more details. #### Usage: Training + ```bash python mnist_save_resume_engine.py --log_dir=logs/run_1 --epochs=10 # or same in deterministic mode @@ -86,6 +91,7 @@ python mnist_save_resume_engine.py --log_dir=logs-det/run_1 --deterministic --ep ``` Resume the training + ```bash python mnist_save_resume_engine.py --log_dir=logs/run_2 --resume_from=logs/run_1/checkpoint_5628.pt --epochs=10 # or same in deterministic mode @@ -93,25 +99,26 @@ python mnist_save_resume_engine.py --log_dir=logs-det/run_2 --resume_from=logs-d ``` Start tensorboard: + ```bash tensorboard --logdir=. ``` -The script logs batch stats (mean/std of images, median of targets), model weights' norms and computed gradients norms in -`run.log` and `resume_run.log` to compare training behaviour in both cases. +The script logs batch stats (mean/std of images, median of targets), model weights' norms and computed gradients norms in +`run.log` and `resume_run.log` to compare training behaviour in both cases. If set `--deterministic` option, we can observe the same values after resuming the training. -Non-deterministic| Deterministic ----|--- -![img11](assets/logs_run_1_2.png) | ![img12](assets/logs-det_run_1_2.png) +| Non-deterministic | Deterministic | +| --------------------------------- | ------------------------------------- | +| ![img11](assets/logs_run_1_2.png) | ![img12](assets/logs-det_run_1_2.png) | Deterministic `run.log` vs `resume_run.log` ![img13](assets/run_vs_resume_run_logs_1_2.png) - #### Usage with simulated crash Initial training with a crash + ```bash python mnist_save_resume_engine.py --crash_iteration 5700 --log_dir=logs/run_3_crash --epochs 10 # or same in deterministic mode @@ -119,16 +126,16 @@ python mnist_save_resume_engine.py --crash_iteration 5700 --log_dir=logs-det/run ``` Resume from the latest checkpoint + ```bash python mnist_save_resume_engine.py --resume_from logs/run_3_crash/checkpoint_6.pt --log_dir=logs/run_4 --epochs 10 # or same in deterministic mode python mnist_save_resume_engine.py --resume_from logs-det/run_3_crash/checkpoint_6.pt --log_dir=logs-det/run_4 --epochs 10 --deterministic ``` -Non-deterministic| Deterministic ----|--- -![img21](assets/logs_run_3_4.png) | ![img22](assets/logs-det_run_3_4.png) - +| Non-deterministic | Deterministic | +| --------------------------------- | ------------------------------------- | +| ![img21](assets/logs_run_3_4.png) | ![img22](assets/logs-det_run_3_4.png) | Deterministic `run.log` vs `resume_run.log` -![img23](assets/run_vs_resume_run_logs_3_4.png) \ No newline at end of file +![img23](assets/run_vs_resume_run_logs_3_4.png) diff --git a/examples/references/README.md b/examples/references/README.md index 1702d4ed8e36..bc75979aff16 100644 --- a/examples/references/README.md +++ b/examples/references/README.md @@ -1,10 +1,11 @@ # Reproducible trainings with Ignite -Inspired by [torchvision/references](https://github.com/pytorch/vision/tree/master/references), we provide several +Inspired by [torchvision/references](https://github.com/pytorch/vision/tree/master/references), we provide several reproducible baselines for vision tasks: -* [x] Classification - * [x] [ImageNet](classification/imagenet) +- [x] Classification -* [x] Segmentation - * [x] [Pascal VOC2012](segmentation/pascal_voc2012) + - [x] [ImageNet](classification/imagenet) + +- [x] Segmentation + - [x] [Pascal VOC2012](segmentation/pascal_voc2012) diff --git a/examples/references/classification/imagenet/.gitignore 
b/examples/references/classification/imagenet/.gitignore index 024d5f16ec64..5ffcda8666da 100644 --- a/examples/references/classification/imagenet/.gitignore +++ b/examples/references/classification/imagenet/.gitignore @@ -4,4 +4,4 @@ output experiments/plx/*.yml experiments/plx/*.yaml .polyaxon/ -.polyaxonignore \ No newline at end of file +.polyaxonignore diff --git a/examples/references/classification/imagenet/NOTES_MLflow.md b/examples/references/classification/imagenet/NOTES_MLflow.md index 8fc35cf3547c..b9074b3c8c50 100644 --- a/examples/references/classification/imagenet/NOTES_MLflow.md +++ b/examples/references/classification/imagenet/NOTES_MLflow.md @@ -4,8 +4,8 @@ User can run ImageNet training using MLflow experiments tracking system on the l ## Requirements -We use `conda` and [MLflow](https://github.com/mlflow/mlflow) to -handle experiments/runs and all python dependencies. +We use `conda` and [MLflow](https://github.com/mlflow/mlflow) to +handle experiments/runs and all python dependencies. Please, install these tools: - [MLflow](https://github.com/mlflow/mlflow): `pip install mlflow` @@ -14,9 +14,10 @@ Please, install these tools: We need to also install Nvidia/APEX and libraries for opencv. APEX is automatically installed on the first run. Manually, all can be installed with the following commands. **Important**, please, check the content of `experiments/setup_opencv.sh` before running. + ```bash sh experiments/setup_apex.sh - + sh experiments/setup_opencv.sh ``` @@ -25,29 +26,34 @@ sh experiments/setup_opencv.sh ### Download ImageNet-1k dataset Since 10/2019, we need to register an account in order to download the dataset. -To download the dataset, use the following form : http://www.image-net.org/download.php +To download the dataset, use the following form : http://www.image-net.org/download.php ### Setup dataset path To configure the path to already existing ImageNet dataset, please specify `DATASET_PATH` environment variable + ```bash export DATASET_PATH=/path/to/imagenet # export DATASET_PATH=$PWD/input/imagenet ``` ### MLflow setup - + Setup mlflow output path as a local storage (option with remote storage is not supported): + ```bash export MLFLOW_TRACKING_URI=/path/to/output/mlruns # e.g export MLFLOW_TRACKING_URI=$PWD/output/mlruns ``` Create once "Trainings" experiment + ```bash mlflow experiments create -n Trainings ``` + or check existing experiments: + ```bash mlflow experiments list ``` @@ -64,7 +70,7 @@ mlflow run experiments/mlflow --experiment-name=Trainings -P config_path=configs ### Training on single node with multiple GPUs -For optimal devices usage, please, make sure to adapt training data loader batch size to your infrastructure. +For optimal devices usage, please, make sure to adapt training data loader batch size to your infrastructure. By default, batch size is 64 per process. ```bash @@ -74,7 +80,7 @@ mlflow run experiments/mlflow --experiment-name=Trainings -P config_path=configs ``` ## Training tracking - + ### MLflow dashboard To visualize experiments and runs, user can start mlflow dashboard: @@ -92,17 +98,18 @@ To visualize experiments and runs, user can start tensorboard: tensorboard --logdir /path/to/output/mlruns/1 # e.g tensorboard --logdir $PWD/output/mlruns/1 ``` -where `/1` points to "Training" experiment. +where `/1` points to "Training" experiment. 
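To make the environment-variable setup above concrete, here is a hypothetical excerpt of a `py_config_runner`-style training config showing how `DATASET_PATH` is typically consumed; the attribute names are illustrative and not the exact contents of `configs/train/baseline_r50.py`.

```python
# Hypothetical config excerpt; data_path and batch_size are illustrative
# names, not the repository's exact configuration values.
import os
from pathlib import Path

assert "DATASET_PATH" in os.environ, "Please export DATASET_PATH first"
data_path = Path(os.environ["DATASET_PATH"])
assert data_path.exists(), f"Dataset folder '{data_path}' is not found"

batch_size = 64  # per process; effective batch size scales with num_gpus
```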
## Implementation details Files tree description: + ``` code -configs +configs experiments/mlflow : MLflow related files -notebooks +notebooks ``` ### Experiments @@ -111,8 +118,10 @@ notebooks - [MLproject](experiments/mlflow/MLproject): defines types of experiments we would like to perform by "entry points": - main : starts single-node multi-GPU training script -When we execute +When we execute + ```bash mlflow run experiments/mlflow --experiment-name=Trainings -P config_path=configs/train/baseline_r50.py -P num_gpus=2 ``` + it executes `main` entry point from [MLproject](experiments/mlflow/MLproject) and runs provided command. diff --git a/examples/references/classification/imagenet/NOTES_Polyaxon.md b/examples/references/classification/imagenet/NOTES_Polyaxon.md index 0bf623e7db4e..a761f33146ae 100644 --- a/examples/references/classification/imagenet/NOTES_Polyaxon.md +++ b/examples/references/classification/imagenet/NOTES_Polyaxon.md @@ -11,25 +11,28 @@ In this case we assume, user has [Polyaxon](https://polyaxon.com/) installed on ### Setup Polyaxon project Create project on the cluster + ```bash polyaxon project create --name=imagenet --description="Classification on ImageNet" ``` + Initialize local project + ```bash polyaxon init imagenet -``` +``` -Please rename and modify `experiments/plx/xp_training.yml.tmpl` to `experiments/plx/xp_training.yml` +Please rename and modify `experiments/plx/xp_training.yml.tmpl` to `experiments/plx/xp_training.yml` to adapt to your cluster configuration. #### Download ImageNet dataset Since 10/2019, we need to register an account in order to download the dataset. -To download the dataset, use the following form : http://www.image-net.org/download.php +To download the dataset, use the following form : http://www.image-net.org/download.php ### Training on single node with single or multiple GPU -For optimal devices usage, please, make sure to adapt training data loader batch size to your infrastructure. +For optimal devices usage, please, make sure to adapt training data loader batch size to your infrastructure. By default, batch size is 64 per process. Please, adapt `xp_training.yml` to your cluster configuration and run it, for example, as ```bash @@ -40,18 +43,18 @@ polyaxon run -u -f experiments/plx/xp_training.yml --name="baseline_resnet50" -- Please, see Polyaxon dashboard usage at https://docs.polyaxon.com/ - ## Implementation details Files tree description: + ``` code -configs +configs experiments/plx : Polyaxon related files -notebooks +notebooks ``` ### Experiments -File [xp_training.yml.tmpl](experiments/mlflow/xp_training.yml.tmpl) defines all configurations and dependencies -necessary for our experimentations. Part `run.cmd` starts single-node multi-GPU training script. +File [xp_training.yml.tmpl](experiments/mlflow/xp_training.yml.tmpl) defines all configurations and dependencies +necessary for our experimentations. Part `run.cmd` starts single-node multi-GPU training script. diff --git a/examples/references/classification/imagenet/README.md b/examples/references/classification/imagenet/README.md index b4b08f1616e0..9228d483a022 100644 --- a/examples/references/classification/imagenet/README.md +++ b/examples/references/classification/imagenet/README.md @@ -79,9 +79,9 @@ uses distributed sampling, scales batch size etc). 
### Results -Model | Training Top-1 Accuracy | Training Top-5 Accuracy | Test Top-1 Accuracy | Test Top-5 Accuracy ----|---|---|---|--- -ResNet-50 | 78% | 92% | 77% | 94% +| Model | Training Top-1 Accuracy | Training Top-5 Accuracy | Test Top-1 Accuracy | Test Top-5 Accuracy | +| --------- | ----------------------- | ----------------------- | ------------------- | ------------------- | +| ResNet-50 | 78% | 92% | 77% | 94% | ## Acknowledgements diff --git a/examples/references/classification/imagenet/experiments/mlflow/conda.yaml b/examples/references/classification/imagenet/experiments/mlflow/conda.yaml index ec42b74dce4c..212ec199d591 100644 --- a/examples/references/classification/imagenet/experiments/mlflow/conda.yaml +++ b/examples/references/classification/imagenet/experiments/mlflow/conda.yaml @@ -9,11 +9,11 @@ dependencies: - torchvision - pip - pip: - - mlflow - - albumentations - - tqdm - - tensorboardX - - py_config_runner - - pynvml - - pytorch-ignite - - git+https://github.com/vfdev-5/ImageDatasetViz.git + - mlflow + - albumentations + - tqdm + - tensorboardX + - py_config_runner + - pynvml + - pytorch-ignite + - git+https://github.com/vfdev-5/ImageDatasetViz.git diff --git a/examples/references/segmentation/pascal_voc2012/.gitignore b/examples/references/segmentation/pascal_voc2012/.gitignore index 024d5f16ec64..5ffcda8666da 100644 --- a/examples/references/segmentation/pascal_voc2012/.gitignore +++ b/examples/references/segmentation/pascal_voc2012/.gitignore @@ -4,4 +4,4 @@ output experiments/plx/*.yml experiments/plx/*.yaml .polyaxon/ -.polyaxonignore \ No newline at end of file +.polyaxonignore diff --git a/examples/references/segmentation/pascal_voc2012/NOTES_MLflow.md b/examples/references/segmentation/pascal_voc2012/NOTES_MLflow.md index b0589b895335..2ca231589148 100644 --- a/examples/references/segmentation/pascal_voc2012/NOTES_MLflow.md +++ b/examples/references/segmentation/pascal_voc2012/NOTES_MLflow.md @@ -4,8 +4,8 @@ User can run Pascal VOC training using MLflow experiments tracking system on the ## Requirements -We use `conda` and [MLflow](https://github.com/mlflow/mlflow) to -handle experiments/runs and all python dependencies. +We use `conda` and [MLflow](https://github.com/mlflow/mlflow) to +handle experiments/runs and all python dependencies. Please, install these tools: - [MLflow](https://github.com/mlflow/mlflow): `pip install mlflow` @@ -14,9 +14,10 @@ Please, install these tools: We need to also install Nvidia/APEX and libraries for opencv. APEX is automatically installed on the first run. Manually, all can be installed with the following commands. **Important**, please, check the content of `experiments/setup_opencv.sh` before running. + ```bash sh experiments/setup_apex.sh - + sh experiments/setup_opencv.sh ``` @@ -34,6 +35,7 @@ mlflow run experiments/mlflow -e download -P output_path=/path/where/download/ ### Setup dataset path To configure the path to already existing PASCAL VOC2012 dataset, please specify `DATASET_PATH` environment variable + ```bash export DATASET_PATH=/path/to/pascal_voc2012 # e.g. export DATASET_PATH=$PWD/input/ where VOCdevkit is located @@ -42,24 +44,29 @@ export DATASET_PATH=/path/to/pascal_voc2012 #### With SBD dataset Optionally, user can configure the path to already existing SBD dataset, please specify `SBD_DATASET_PATH` environment variable + ```bash export SBD_DATASET_PATH=/path/to/SBD/benchmark_RELEASE/dataset/ # e.g. 
export SBD_DATASET_PATH=/data/SBD/benchmark_RELEASE/dataset/ where "cls img inst train.txt train_noval.txt val.txt" are located ``` ### MLflow setup - + Setup mlflow output path as a local storage (option with remote storage is not supported): + ```bash export MLFLOW_TRACKING_URI=/path/to/output/mlruns # e.g export MLFLOW_TRACKING_URI=$PWD/output/mlruns ``` Create once "Trainings" experiment + ```bash mlflow experiments create -n Trainings ``` + or check existing experiments: + ```bash mlflow experiments list ``` @@ -76,7 +83,7 @@ mlflow run experiments/mlflow --experiment-name=Trainings -P config_path=configs ### Training on single node with multiple GPUs -For optimal devices usage, please, make sure to adapt training data loader batch size to your infrastructure. +For optimal devices usage, please, make sure to adapt training data loader batch size to your infrastructure. For example, a single GPU with 11GB can have a batch size of 8-9, thus, on N devices, we can set it as `N * 9`. ```bash @@ -86,7 +93,7 @@ mlflow run experiments/mlflow --experiment-name=Trainings -P config_path=configs ``` ## Training tracking - + ### MLflow dashboard To visualize experiments and runs, user can start mlflow dashboard: @@ -104,17 +111,18 @@ To visualize experiments and runs, user can start tensorboard: tensorboard --logdir /path/to/output/mlruns/1 # e.g tensorboard --logdir $PWD/output/mlruns/1 ``` -where `/1` points to "Training" experiment. +where `/1` points to "Training" experiment. ## Implementation details Files tree description: + ``` code -configs +configs experiments/mlflow : MLflow related files -notebooks +notebooks ``` ### Experiments @@ -123,8 +131,10 @@ notebooks - [MLproject](experiments/mlflow/MLproject): defines types of experiments we would like to perform by "entry points": - main : starts single-node multi-GPU training script -When we execute +When we execute + ```bash mlflow run experiments/mlflow --experiment-name=Trainings -P config_path=configs/train/baseline_resnet101.py -P num_gpus=2 ``` + it executes `main` entry point from [MLproject](experiments/mlflow/MLproject) and runs provided command. diff --git a/examples/references/segmentation/pascal_voc2012/NOTES_Polyaxon.md b/examples/references/segmentation/pascal_voc2012/NOTES_Polyaxon.md index 0e806c232458..25c19ec22cc8 100644 --- a/examples/references/segmentation/pascal_voc2012/NOTES_Polyaxon.md +++ b/examples/references/segmentation/pascal_voc2012/NOTES_Polyaxon.md @@ -11,28 +11,32 @@ In this case we assume, user has [Polyaxon](https://polyaxon.com/) installed on ### Setup Polyaxon project Create project on the cluster + ```bash polyaxon project create --name=pascal-voc2012 --description="Semantic segmentation on Pascal VOC2012" ``` + Initialize local project + ```bash polyaxon init pascal-voc2012 -``` +``` -Please rename and modify `experiments/plx/xp_training.yml.tmpl` to `experiments/plx/xp_training.yml` +Please rename and modify `experiments/plx/xp_training.yml.tmpl` to `experiments/plx/xp_training.yml` to adapt to your cluster configuration. #### Download Pascal VOC 2012 and SBD -Optionally, it is possible to download the datasets as a job. +Optionally, it is possible to download the datasets as a job. 
Please rename and modify `experiments/plx/job_download_datasets.yml.tmpl` to `experiments/plx/job_download_datasets.yml` + ```bash polyaxon run -u -f experiments/plx/job_download_datasets.yml ``` ### Training on single node with single or multiple GPU -For optimal devices usage, please, make sure to adapt training data loader batch size to your infrastructure. +For optimal devices usage, please, make sure to adapt training data loader batch size to your infrastructure. For example, a single GPU with 11GB can have a batch size of 8-9, thus, on N devices, we can set it as `N * 9`. Please, adapt `xp_training.yml` to your cluster configuration and run it, for example, as @@ -44,18 +48,18 @@ polyaxon run -u -f experiments/plx/xp_training.yml --name="baseline_resnet101_sb Please, see Polyaxon dashboard usage at https://docs.polyaxon.com/ - ## Implementation details Files tree description: + ``` code -configs +configs experiments/plx : Polyaxon related files -notebooks +notebooks ``` ### Experiments -File [xp_training.yml.tmpl](experiments/plx/xp_training.yml.tmpl) defines all configurations and dependencies -necessary for our experimentations. Part `run.cmd` starts single-node multi-GPU training script. +File [xp_training.yml.tmpl](experiments/plx/xp_training.yml.tmpl) defines all configurations and dependencies +necessary for our experimentations. Part `run.cmd` starts single-node multi-GPU training script. diff --git a/examples/references/segmentation/pascal_voc2012/README.md b/examples/references/segmentation/pascal_voc2012/README.md index 923f1efbe053..a291bfcfb302 100644 --- a/examples/references/segmentation/pascal_voc2012/README.md +++ b/examples/references/segmentation/pascal_voc2012/README.md @@ -8,9 +8,9 @@ Features: - Distributed training with mixed precision by [nvidia/apex](https://github.com/NVIDIA/apex/) - Experiments tracking with [MLflow](https://mlflow.org/) or [Polyaxon](https://polyaxon.com/) or [ClearML](https://github.com/allegroai/clearml) - Tensorboard | MLflow ----|--- - ![tb_dashboard](assets/tb_dashboard.png) | ![mlflow_dashboard](assets/mlflow_dashboard.png) +| Tensorboard | MLflow | +| ---------------------------------------- | ------------------------------------------------ | +| ![tb_dashboard](assets/tb_dashboard.png) | ![mlflow_dashboard](assets/mlflow_dashboard.png) | @@ -84,9 +84,9 @@ uses distributed sampling, scales batch size etc). 
### Results -Model | with SBD | Training mIoU+BG | Test mIoU+BG ----|---|---|--- -DeepLabV3 ResNet-101 | X | 86% | 68% +| Model | with SBD | Training mIoU+BG | Test mIoU+BG | +| -------------------- | -------- | ---------------- | ------------ | +| DeepLabV3 ResNet-101 | X | 86% | 68% | ## Acknowledgements diff --git a/examples/references/segmentation/pascal_voc2012/experiments/mlflow/conda.yaml b/examples/references/segmentation/pascal_voc2012/experiments/mlflow/conda.yaml index d9ea176ab94f..d2998f5c7ab2 100644 --- a/examples/references/segmentation/pascal_voc2012/experiments/mlflow/conda.yaml +++ b/examples/references/segmentation/pascal_voc2012/experiments/mlflow/conda.yaml @@ -9,11 +9,11 @@ dependencies: - torchvision - pip - pip: - - mlflow - - albumentations - - tqdm - - tensorboardX - - py_config_runner - - pynvml - - pytorch-ignite - - git+https://github.com/vfdev-5/ImageDatasetViz.git + - mlflow + - albumentations + - tqdm + - tensorboardX + - py_config_runner + - pynvml + - pytorch-ignite + - git+https://github.com/vfdev-5/ImageDatasetViz.git diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index fb2b977315e3..aab0d7abcf17 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -9,4 +9,3 @@ python reinforce.py # For actor critic: python actor_critic.py ``` - From 801a6a91797387ff922d9a632bc798bc495746c4 Mon Sep 17 00:00:00 2001 From: Debojyoti Chakraborty Date: Tue, 9 Feb 2021 14:12:27 +0530 Subject: [PATCH 11/19] added requirements.txt and updated readme.md (#1624) * added requirements.txt and updated readme.md * Update examples/contrib/cifar10/README.md Co-authored-by: vfdev * Update examples/contrib/cifar10/requirements.txt Co-authored-by: vfdev Co-authored-by: vfdev --- examples/contrib/cifar10/README.md | 2 ++ examples/contrib/cifar10/requirements.txt | 6 ++++++ 2 files changed, 8 insertions(+) create mode 100644 examples/contrib/cifar10/requirements.txt diff --git a/examples/contrib/cifar10/README.md b/examples/contrib/cifar10/README.md index 4ab0fb6ce52b..8f4747b76216 100644 --- a/examples/contrib/cifar10/README.md +++ b/examples/contrib/cifar10/README.md @@ -23,6 +23,8 @@ Configurations: - [python-fire](https://github.com/google/python-fire): `pip install fire` - Optional: [clearml](https://github.com/allegroai/clearml): `pip install clearml` +Alternatively, install the all requirements using `pip install -r requirements.txt`. 
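Since the requirements list includes python-fire, here is a minimal sketch of how the example exposes its CLI entry point, mirroring the `fire.Fire({"run": run})` pattern used in `main.py`; the parameters shown are placeholders and the real `run` accepts many more.

```python
# Minimal python-fire entry point; the parameters here are placeholders.
import fire

def run(batch_size=512, num_epochs=24, backend=None, **spawn_kwargs):
    """Invoked as: python main.py run -- --help"""
    print(f"batch_size={batch_size}, num_epochs={num_epochs}, backend={backend}")

if __name__ == "__main__":
    fire.Fire({"run": run})
```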
+ ## Usage: Run the example on a single GPU: diff --git a/examples/contrib/cifar10/requirements.txt b/examples/contrib/cifar10/requirements.txt new file mode 100644 index 000000000000..40f0771f5d94 --- /dev/null +++ b/examples/contrib/cifar10/requirements.txt @@ -0,0 +1,6 @@ +pytorch-ignite +torchvision +tqdm +tensorboardX +fire +clearml From bd4ab8c077908f61e278b6294567fb9255f48939 Mon Sep 17 00:00:00 2001 From: Devanshu Shah <56106207+Devanshu24@users.noreply.github.com> Date: Thu, 11 Feb 2021 22:31:19 +0530 Subject: [PATCH 12/19] Replace relative paths with raw.githubusercontent (#1629) --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c8475bb2d501..d2aa7f8e9f68 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,9 @@ def find_version(*file_paths): raise RuntimeError("Unable to find version string.") -readme = read("README.md") +readme = read("README.md").replace( + 'src="assets/', 'src="https://raw.githubusercontent.com/pytorch/ignite/master/assets/' +) VERSION = find_version("ignite", "__init__.py") From 944afab6cf22a09d25c4791f2fa960ef360e49ca Mon Sep 17 00:00:00 2001 From: vfdev Date: Thu, 11 Feb 2021 22:42:41 +0100 Subject: [PATCH 13/19] Updated cifar10 example (#1632) * Updates for cifar10 example * Updates for cifar10 example * More updates * Updated code * Fixed code-formatting --- examples/contrib/cifar10/main.py | 69 +++++++++-------- examples/contrib/cifar10_qat/main.py | 111 +++++++++++++++++++-------- 2 files changed, 116 insertions(+), 64 deletions(-) diff --git a/examples/contrib/cifar10/main.py b/examples/contrib/cifar10/main.py index 1a5773e71e7b..4838f2e096be 100644 --- a/examples/contrib/cifar10/main.py +++ b/examples/contrib/cifar10/main.py @@ -6,13 +6,14 @@ import torch.nn as nn import torch.optim as optim import utils +from torch.cuda.amp import GradScaler, autocast import ignite import ignite.distributed as idist from ignite.contrib.engines import common from ignite.contrib.handlers import PiecewiseLinear from ignite.engine import Engine, Events, create_supervised_evaluator -from ignite.handlers import Checkpoint, DiskSaver +from ignite.handlers import Checkpoint, DiskSaver, global_step_from_engine from ignite.metrics import Accuracy, Loss from ignite.utils import manual_seed, setup_logger @@ -76,8 +77,8 @@ def training(local_rank, config): # Let's now setup evaluator engine to perform model's validation and compute metrics metrics = { - "accuracy": Accuracy(), - "loss": Loss(criterion), + "Accuracy": Accuracy(), + "Loss": Loss(criterion), } # We define two evaluators as they wont have exactly similar roles: @@ -102,15 +103,18 @@ def run_validation(engine): evaluators = {"training": train_evaluator, "test": evaluator} tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators) - # Store 3 best models by validation accuracy: - common.gen_save_best_models_by_val_score( - save_handler=get_save_handler(config), - evaluator=evaluator, - models={"model": model}, - metric_name="accuracy", - n_saved=3, - trainer=trainer, - tag="test", + # Store 2 best models by validation accuracy starting from num_epochs / 2: + best_model_handler = Checkpoint( + {"model": model}, + get_save_handler(config), + filename_prefix="best", + n_saved=2, + global_step_transform=global_step_from_engine(trainer), + score_name="test_accuracy", + score_function=Checkpoint.get_default_score_fn("accuracy"), + ) + evaluator.add_event_handler( + Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 
2), best_model_handler ) # In order to check training resuming we can stop training on a given iteration @@ -124,9 +128,8 @@ def _(): try: trainer.run(train_loader, max_epochs=config["num_epochs"]) except Exception as e: - import traceback - - print(traceback.format_exc()) + logger.exception("") + raise e if rank == 0: tb_logger.close() @@ -145,13 +148,14 @@ def run( learning_rate=0.4, num_warmup_epochs=4, validate_every=3, - checkpoint_every=200, + checkpoint_every=1000, backend=None, resume_from=None, log_every_iters=15, nproc_per_node=None, stop_iteration=None, with_clearml=False, + with_amp=False, **spawn_kwargs, ): """Main entry to train an model on CIFAR10 dataset. @@ -179,6 +183,7 @@ def run( It can be 0 to disable it. Default, 15. stop_iteration (int, optional): iteration to stop the training. Can be used to check resume from checkpoint. with_clearml (bool): if True, experiment ClearML logger is setup. Default, False. + with_amp (bool): if True, enables native automatic mixed precision. Default, False. **spawn_kwargs: Other kwargs to spawn run in child processes: master_addr, master_port, node_rank, nnodes """ @@ -245,13 +250,17 @@ def initialize(config): def log_metrics(logger, epoch, elapsed, tag, metrics): metrics_output = "\n".join([f"\t{k}: {v}" for k, v in metrics.items()]) - logger.info(f"\nEpoch {epoch} - Evaluation time (seconds): {int(elapsed)} - {tag} metrics:\n {metrics_output}") + logger.info(f"\nEpoch {epoch} - Evaluation time (seconds): {elapsed:.2f} - {tag} metrics:\n {metrics_output}") def log_basic_info(logger, config): logger.info(f"Train {config['model']} on CIFAR10") logger.info(f"- PyTorch version: {torch.__version__}") logger.info(f"- Ignite version: {ignite.__version__}") + if torch.cuda.is_available(): + logger.info(f"- GPU Device: {torch.cuda.get_device_name(idist.get_local_rank())}") + logger.info(f"- CUDA version: {torch.version.cuda}") + logger.info(f"- CUDNN version: {torch.backends.cudnn.version()}") logger.info("\n") logger.info("Configuration:") @@ -279,6 +288,9 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con # - RunningAverage` on `train_step` output # - Two progress bars on epochs and optionally on iterations + with_amp = config["with_amp"] + scaler = GradScaler(enabled=with_amp) + def train_step(engine, batch): x, y = batch[0], batch[1] @@ -288,28 +300,21 @@ def train_step(engine, batch): y = y.to(device, non_blocking=True) model.train() - # Supervised part - y_pred = model(x) - loss = criterion(y_pred, y) - optimizer.zero_grad() - loss.backward() - optimizer.step() + with autocast(enabled=with_amp): + y_pred = model(x) + loss = criterion(y_pred, y) - # This can be helpful for XLA to avoid performance slow down if fetch loss.item() every iteration - if config["log_every_iters"] > 0 and (engine.state.iteration - 1) % config["log_every_iters"] == 0: - batch_loss = loss.item() - engine.state.saved_batch_loss = batch_loss - else: - batch_loss = engine.state.saved_batch_loss + optimizer.zero_grad() + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() return { - "batch loss": batch_loss, + "batch loss": loss.item(), } trainer = Engine(train_step) - trainer.state.saved_batch_loss = -1.0 - trainer.state_dict_user_keys.append("saved_batch_loss") trainer.logger = logger to_save = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler} diff --git a/examples/contrib/cifar10_qat/main.py b/examples/contrib/cifar10_qat/main.py index 5e612bfd1184..bac448f846e6 100644 --- 
a/examples/contrib/cifar10_qat/main.py +++ b/examples/contrib/cifar10_qat/main.py @@ -6,13 +6,14 @@ import torch.nn as nn import torch.optim as optim import utils +from torch.cuda.amp import GradScaler, autocast import ignite import ignite.distributed as idist from ignite.contrib.engines import common from ignite.contrib.handlers import PiecewiseLinear from ignite.engine import Engine, Events, create_supervised_evaluator -from ignite.handlers import Checkpoint, DiskSaver +from ignite.handlers import Checkpoint, DiskSaver, global_step_from_engine from ignite.metrics import Accuracy, Loss from ignite.utils import manual_seed, setup_logger @@ -31,16 +32,37 @@ def training(local_rank, config): if rank == 0: now = datetime.now().strftime("%Y%m%d-%H%M%S") - folder_name = "{}_backend-{}-{}_{}".format(config["model"], idist.backend(), idist.get_world_size(), now) + folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}" output_path = Path(output_path) / folder_name if not output_path.exists(): output_path.mkdir(parents=True) config["output_path"] = output_path.as_posix() - logger.info("Output path: {}".format(config["output_path"])) + logger.info(f"Output path: {config['output_path']}") if "cuda" in device.type: config["cuda device name"] = torch.cuda.get_device_name(local_rank) + if config["with_clearml"]: + try: + from clearml import Task + except ImportError: + # Backwards-compatibility for legacy Trains SDK + from trains import Task + + task = Task.init("CIFAR10-Training", task_name=output_path.stem) + task.connect_configuration(config) + # Log hyper parameters + hyper_params = [ + "model", + "batch_size", + "momentum", + "weight_decay", + "num_epochs", + "learning_rate", + "num_warmup_epochs", + ] + task.connect({k: config[k] for k in hyper_params}) + # Setup dataflow, model, optimizer, criterion train_loader, test_loader = get_dataflow(config) @@ -78,15 +100,18 @@ def run_validation(engine): evaluators = {"training": train_evaluator, "test": evaluator} tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators) - # Store 3 best models by validation accuracy: - common.save_best_model_by_val_score( - output_path=config["output_path"], - evaluator=evaluator, - model=model, - metric_name="Accuracy", - n_saved=1, - trainer=trainer, - tag="test", + # Store 2 best models by validation accuracy starting from num_epochs / 2: + best_model_handler = Checkpoint( + {"model": model}, + get_save_handler(config), + filename_prefix="best", + n_saved=2, + global_step_transform=global_step_from_engine(trainer), + score_name="test_accuracy", + score_function=Checkpoint.get_default_score_fn("accuracy"), + ) + evaluator.add_event_handler( + Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler ) trainer.run(train_loader, max_epochs=config["num_epochs"]) @@ -108,11 +133,13 @@ def run( learning_rate=0.4, num_warmup_epochs=4, validate_every=3, - checkpoint_every=200, + checkpoint_every=1000, backend=None, resume_from=None, log_every_iters=15, nproc_per_node=None, + with_clearml=False, + with_amp=False, **spawn_kwargs, ): """Main entry to train an model on CIFAR10 dataset. @@ -138,6 +165,8 @@ def run( resume_from (str, optional): path to checkpoint to use to resume the training from. Default, None. log_every_iters (int): argument to log batch loss every ``log_every_iters`` iterations. It can be 0 to disable it. Default, 15. + with_clearml (bool): if True, experiment ClearML logger is setup. Default, False. 
+ with_amp (bool): if True, enables native automatic mixed precision. Default, False. **spawn_kwargs: Other kwargs to spawn run in child processes: master_addr, master_port, node_rank, nnodes """ @@ -149,10 +178,8 @@ def run( spawn_kwargs["nproc_per_node"] = nproc_per_node with idist.Parallel(backend=backend, **spawn_kwargs) as parallel: - try: - parallel.run(training, config) - except Exception as e: - raise e + + parallel.run(training, config) def get_dataflow(config): @@ -167,7 +194,7 @@ def get_dataflow(config): # Ensure that only rank 0 download the dataset idist.barrier() - # Setup data loader also adapted to distributed config + # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu train_loader = idist.auto_dataloader( train_dataset, batch_size=config["batch_size"], num_workers=config["num_workers"], shuffle=True, drop_last=True, ) @@ -180,6 +207,7 @@ def get_dataflow(config): def initialize(config): model = utils.get_model(config["model"]) + # Adapt model for distributed settings if configured model = idist.auto_model(model, find_unused_parameters=True) optimizer = optim.SGD( @@ -205,24 +233,28 @@ def initialize(config): def log_metrics(logger, epoch, elapsed, tag, metrics): metrics_output = "\n".join([f"\t{k}: {v}" for k, v in metrics.items()]) - logger.info(f"\nEpoch {epoch} - Time taken (seconds) : {elapsed:.02f} - {tag} metrics:\n {metrics_output}") + logger.info(f"\nEpoch {epoch} - Evaluation time (seconds): {elapsed:.2f} - {tag} metrics:\n {metrics_output}") def log_basic_info(logger, config): - logger.info("Quantization Aware Training {} on CIFAR10".format(config["model"])) - logger.info("- PyTorch version: {}".format(torch.__version__)) - logger.info("- Ignite version: {}".format(ignite.__version__)) + logger.info(f"Quantization Aware Training {config['model']} on CIFAR10") + logger.info(f"- PyTorch version: {torch.__version__}") + logger.info(f"- Ignite version: {ignite.__version__}") + if torch.cuda.is_available(): + logger.info(f"- GPU Device: {torch.cuda.get_device_name(idist.get_local_rank())}") + logger.info(f"- CUDA version: {torch.version.cuda}") + logger.info(f"- CUDNN version: {torch.backends.cudnn.version()}") logger.info("\n") logger.info("Configuration:") for key, value in config.items(): - logger.info("\t{}: {}".format(key, value)) + logger.info(f"\t{key}: {value}") logger.info("\n") if idist.get_world_size() > 1: logger.info("\nDistributed setting:") - logger.info("\tbackend: {}".format(idist.backend())) - logger.info("\tworld size: {}".format(idist.get_world_size())) + logger.info(f"\tbackend: {idist.backend()}") + logger.info(f"\tworld size: {idist.get_world_size()}") logger.info("\n") @@ -239,6 +271,9 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con # - RunningAverage` on `train_step` output # - Two progress bars on epochs and optionally on iterations + with_amp = config["with_amp"] + scaler = GradScaler(enabled=with_amp) + def train_step(engine, batch): x, y = batch[0], batch[1] @@ -248,12 +283,15 @@ def train_step(engine, batch): y = y.to(device, non_blocking=True) model.train() - y_pred = model(x) - loss = criterion(y_pred, y) + + with autocast(enabled=with_amp): + y_pred = model(x) + loss = criterion(y_pred, y) optimizer.zero_grad() - loss.backward() - optimizer.step() + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() return { "batch loss": loss.item(), @@ -272,7 +310,7 @@ def train_step(engine, batch): train_sampler=train_sampler, to_save=to_save, 
save_every_iters=config["checkpoint_every"], - output_path=config["output_path"], + save_handler=get_save_handler(config), lr_scheduler=lr_scheduler, output_names=metric_names if config["log_every_iters"] > 0 else None, with_pbars=False, @@ -282,13 +320,22 @@ def train_step(engine, batch): resume_from = config["resume_from"] if resume_from is not None: checkpoint_fp = Path(resume_from) - assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(checkpoint_fp.as_posix()) - logger.info("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix())) + assert checkpoint_fp.exists(), f"Checkpoint '{checkpoint_fp.as_posix()}' is not found" + logger.info(f"Resume from a checkpoint: {checkpoint_fp.as_posix()}") checkpoint = torch.load(checkpoint_fp.as_posix(), map_location="cpu") Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint) return trainer +def get_save_handler(config): + if config["with_clearml"]: + from ignite.contrib.handlers.clearml_logger import ClearMLSaver + + return ClearMLSaver(dirname=config["output_path"]) + + return DiskSaver(config["output_path"], require_empty=False) + + if __name__ == "__main__": fire.Fire({"run": run}) From 02e767ed66918351391f7ce06d2bc6ac493181f5 Mon Sep 17 00:00:00 2001 From: vfdev Date: Fri, 12 Feb 2021 09:40:03 +0100 Subject: [PATCH 14/19] Fixed failling CI and typos for cifar10 examples (#1633) * Updates for cifar10 example * Updates for cifar10 example * More updates * Updated code * Fixed code-formatting * Fixed typo and failing CI * Fixed hvd spawn fail and better synced qat code --- .circleci/config.yml | 10 +++++----- examples/contrib/cifar10/main.py | 10 +++++++--- examples/contrib/cifar10_qat/main.py | 14 +++++++++++--- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b62b52c3dd78..6a9e1af11565 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -256,7 +256,7 @@ jobs: export example_path="examples/contrib/cifar10" # initial run export stop_cmd="--stop_iteration=500" - export test_cmd="CI=1 python ${example_path}/main.py run" + export test_cmd="CI=1 python ${example_path}/main.py run --checkpoint_every=200" docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}" # resume export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt" @@ -268,7 +268,7 @@ jobs: export example_path="examples/contrib/cifar10" # initial run export stop_cmd="--stop_iteration=500" - export test_cmd="CI=1 python -u -m torch.distributed.launch --nproc_per_node=2 --use_env ${example_path}/main.py run --backend=nccl" + export test_cmd="CI=1 python -u -m torch.distributed.launch --nproc_per_node=2 --use_env ${example_path}/main.py run --backend=nccl --checkpoint_every=200" docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}" # resume export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt" @@ -280,7 +280,7 @@ jobs: export example_path="examples/contrib/cifar10" # initial run export stop_cmd="--stop_iteration=500" - export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2" + export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200" docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}" # resume export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt" @@ -334,7 +334,7 @@ jobs: export 
example_path="examples/contrib/cifar10" # initial run export stop_cmd="--stop_iteration=500" - export test_cmd="cd ${example_path} && CI=1 horovodrun -np 2 python -u main.py run --backend=horovod" + export test_cmd="cd ${example_path} && CI=1 horovodrun -np 2 python -u main.py run --backend=horovod --checkpoint_every=200" docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}" # resume export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt" @@ -346,7 +346,7 @@ jobs: export example_path="examples/contrib/cifar10" # initial run export stop_cmd="--stop_iteration=500" - export test_cmd="cd ${example_path} && CI=1 python -u main.py run --backend=horovod --nproc_per_node=2" + export test_cmd="cd ${example_path} && CI=1 python -u main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200" docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}" # resume export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt" diff --git a/examples/contrib/cifar10/main.py b/examples/contrib/cifar10/main.py index 4838f2e096be..c27c80906444 100644 --- a/examples/contrib/cifar10/main.py +++ b/examples/contrib/cifar10/main.py @@ -111,7 +111,7 @@ def run_validation(engine): n_saved=2, global_step_transform=global_step_from_engine(trainer), score_name="test_accuracy", - score_function=Checkpoint.get_default_score_fn("accuracy"), + score_function=Checkpoint.get_default_score_fn("Accuracy"), ) evaluator.add_event_handler( Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler @@ -173,7 +173,7 @@ def run( learning_rate (float): peak of piecewise linear learning rate scheduler. Default, 0.4. num_warmup_epochs (int): number of warm-up epochs before learning rate decay. Default, 4. validate_every (int): run model's validation every ``validate_every`` epochs. Default, 3. - checkpoint_every (int): store training checkpoint every ``checkpoint_every`` iterations. Default, 200. + checkpoint_every (int): store training checkpoint every ``checkpoint_every`` iterations. Default, 1000. backend (str, optional): backend to use for distributed configuration. Possible values: None, "nccl", "xla-tpu", "gloo" etc. Default, None. nproc_per_node (int, optional): optional argument to setup number of processes per node. 
It is useful, @@ -258,9 +258,13 @@ def log_basic_info(logger, config): logger.info(f"- PyTorch version: {torch.__version__}") logger.info(f"- Ignite version: {ignite.__version__}") if torch.cuda.is_available(): + # explicitly import cudnn as + # torch.backends.cudnn can not be pickled with hvd spawning procs + from torch.backends import cudnn + logger.info(f"- GPU Device: {torch.cuda.get_device_name(idist.get_local_rank())}") logger.info(f"- CUDA version: {torch.version.cuda}") - logger.info(f"- CUDNN version: {torch.backends.cudnn.version()}") + logger.info(f"- CUDNN version: {cudnn.version()}") logger.info("\n") logger.info("Configuration:") diff --git a/examples/contrib/cifar10_qat/main.py b/examples/contrib/cifar10_qat/main.py index bac448f846e6..364e4d4cdbeb 100644 --- a/examples/contrib/cifar10_qat/main.py +++ b/examples/contrib/cifar10_qat/main.py @@ -108,13 +108,17 @@ def run_validation(engine): n_saved=2, global_step_transform=global_step_from_engine(trainer), score_name="test_accuracy", - score_function=Checkpoint.get_default_score_fn("accuracy"), + score_function=Checkpoint.get_default_score_fn("Accuracy"), ) evaluator.add_event_handler( Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler ) - trainer.run(train_loader, max_epochs=config["num_epochs"]) + try: + trainer.run(train_loader, max_epochs=config["num_epochs"]) + except Exception as e: + logger.exception("") + raise e if rank == 0: tb_logger.close() @@ -241,9 +245,13 @@ def log_basic_info(logger, config): logger.info(f"- PyTorch version: {torch.__version__}") logger.info(f"- Ignite version: {ignite.__version__}") if torch.cuda.is_available(): + # explicitly import cudnn as + # torch.backends.cudnn can not be pickled with hvd spawning procs + from torch.backends import cudnn + logger.info(f"- GPU Device: {torch.cuda.get_device_name(idist.get_local_rank())}") logger.info(f"- CUDA version: {torch.version.cuda}") - logger.info(f"- CUDNN version: {torch.backends.cudnn.version()}") + logger.info(f"- CUDNN version: {cudnn.version()}") logger.info("\n") logger.info("Configuration:") From 61d8c2fb135ec81636ab4544fed7490055837939 Mon Sep 17 00:00:00 2001 From: vfdev Date: Sun, 14 Feb 2021 01:50:31 +0100 Subject: [PATCH 15/19] Removed temporary hack to install pth 1.7.1 (#1638) - updated default pth image for gpu tests - updated TORCH_CUDA_ARCH_LIST - fixed /merge -> /head in trigger ci pipeline --- .circleci/config.yml | 8 ++++---- .github/workflows/trigger_circle_ci.py | 3 +++ docker/hvd/Dockerfile.hvd-apex | 8 +------- docker/hvd/Dockerfile.hvd-base | 6 ------ docker/main/Dockerfile.apex | 8 +------- docker/main/Dockerfile.base | 3 --- docker/msdp/Dockerfile.msdp-apex | 5 +---- 7 files changed, 10 insertions(+), 31 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 6a9e1af11565..84bc47732d38 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -4,11 +4,11 @@ parameters: pytorch_stable_image: type: string # https://hub.docker.com/r/pytorch/pytorch/tags - default: "pytorch/pytorch:1.7.0-cuda11.0-cudnn8-runtime" + default: "pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime" pytorch_stable_image_devel: type: string # https://hub.docker.com/r/pytorch/pytorch/tags - default: "pytorch/pytorch:1.7.0-cuda11.0-cudnn8-devel" + default: "pytorch/pytorch:1.7.1-cuda11.0-cudnn8-devel" workingdir: type: string default: "/tmp/ignite" @@ -20,7 +20,7 @@ parameters: default: false build_docker_image_pytorch_version: type: string - default: "1.7.0-cuda11.0-cudnn8" + default: 
"1.7.1-cuda11.0-cudnn8" build_docker_image_hvd_version: type: string default: "v0.21.0" @@ -195,7 +195,7 @@ jobs: name: Install dependencies command: | conda --version - conda install -y pytorch torchvision cudatoolkit=10.1 -c pytorch + conda install -y pytorch torchvision cudatoolkit=11.0 -c pytorch pip install -r requirements-dev.txt pip install . diff --git a/.github/workflows/trigger_circle_ci.py b/.github/workflows/trigger_circle_ci.py index 1ee35a493ae3..52514e2e79a7 100644 --- a/.github/workflows/trigger_circle_ci.py +++ b/.github/workflows/trigger_circle_ci.py @@ -101,6 +101,9 @@ def assert_workflows_successful(pipeline_id, headers): print(f"- should_publish_docker_images: {should_publish_docker_images}") print(f"- Branch: {branch}") + if branch.startswith("refs/pull") and branch.endswith("/merge"): + branch = branch.replace("/merge", "/head") + print(f"Replaced /merge -> /head : {branch}") headers = {"authorization": "Basic", "content-type": "application/json", "Circle-Token": os.environ["CIRCLE_TOKEN"]} diff --git a/docker/hvd/Dockerfile.hvd-apex b/docker/hvd/Dockerfile.hvd-apex index 1da1368a547c..2ef8b10f66ba 100644 --- a/docker/hvd/Dockerfile.hvd-apex +++ b/docker/hvd/Dockerfile.hvd-apex @@ -6,10 +6,7 @@ ARG PTH_VERSION # 1/Building apex with pytorch:*-devel FROM pytorch/pytorch:${PTH_VERSION}-devel AS apex-hvd-builder -# Temporary hack to install pth 1.7.1 -RUN conda install -y pytorch torchvision cudatoolkit=11.0 -c pytorch - -ARG ARG_TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5" +ARG ARG_TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" ENV TORCH_CUDA_ARCH_LIST=$ARG_TORCH_CUDA_ARCH_LIST # Install git @@ -37,9 +34,6 @@ RUN apt-get update && apt-get install -y git && \ # Build runtime image FROM pytorch/pytorch:${PTH_VERSION}-runtime -# Temporary hack to install pth 1.7.1 -RUN conda install -y pytorch torchvision cudatoolkit=11.0 -c pytorch - # Apex COPY --from=apex-hvd-builder /tmp/apex/apex-*.whl /tmp/apex/ RUN pip install --no-cache-dir /tmp/apex/apex-*.whl && \ diff --git a/docker/hvd/Dockerfile.hvd-base b/docker/hvd/Dockerfile.hvd-base index 7475df564b4c..8ed83a7fea7e 100644 --- a/docker/hvd/Dockerfile.hvd-base +++ b/docker/hvd/Dockerfile.hvd-base @@ -5,9 +5,6 @@ ARG PTH_VERSION FROM pytorch/pytorch:${PTH_VERSION}-devel as builder -# Temporary hack to install pth 1.7.1 -RUN conda install -y pytorch torchvision cudatoolkit=11.0 -c pytorch - ARG HVD_VERSION # Build Horovod @@ -21,9 +18,6 @@ RUN apt-get update && apt-get install -y git && \ # Build runtime image FROM pytorch/pytorch:${PTH_VERSION}-runtime -# Temporary hack to install pth 1.7.1 -RUN conda install -y pytorch torchvision cudatoolkit=11.0 -c pytorch - # Install tzdata / git RUN apt-get update && \ ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \ diff --git a/docker/main/Dockerfile.apex b/docker/main/Dockerfile.apex index 509419dfbd98..dc52abc5cd2d 100644 --- a/docker/main/Dockerfile.apex +++ b/docker/main/Dockerfile.apex @@ -6,10 +6,7 @@ ARG PTH_VERSION # 1/Building apex with pytorch:*-devel FROM pytorch/pytorch:${PTH_VERSION}-devel AS apex-builder -# Temporary hack to install pth 1.7.1 -RUN conda install -y pytorch torchvision cudatoolkit=11.0 -c pytorch - -ARG ARG_TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5" +ARG ARG_TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" ENV TORCH_CUDA_ARCH_LIST=$ARG_TORCH_CUDA_ARCH_LIST # Install git @@ -27,9 +24,6 @@ RUN echo "Setup NVIDIA Apex" && \ # 2/ Build the runtime image FROM pytorch/pytorch:${PTH_VERSION}-runtime -# Temporary hack to install pth 1.7.1 -RUN 
conda install -y pytorch torchvision cudatoolkit=11.0 -c pytorch - COPY --from=apex-builder /tmp/apex/apex-*.whl /tmp/apex/ RUN pip install --no-cache-dir /tmp/apex/apex-*.whl && \ rm -fr /tmp/apex diff --git a/docker/main/Dockerfile.base b/docker/main/Dockerfile.base index a904cc83814d..2c70a5b23b0b 100644 --- a/docker/main/Dockerfile.base +++ b/docker/main/Dockerfile.base @@ -3,9 +3,6 @@ ARG PTH_VERSION FROM pytorch/pytorch:${PTH_VERSION}-runtime -# Temporary hack to install pth 1.7.1 -RUN conda install -y pytorch torchvision cudatoolkit=11.0 -c pytorch - # Install tzdata / git RUN apt-get update && \ ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \ diff --git a/docker/msdp/Dockerfile.msdp-apex b/docker/msdp/Dockerfile.msdp-apex index 1265107b7db0..5d815272ac12 100644 --- a/docker/msdp/Dockerfile.msdp-apex +++ b/docker/msdp/Dockerfile.msdp-apex @@ -6,10 +6,7 @@ ARG PTH_VERSION # 1/Building apex with pytorch:*-devel FROM pytorch/pytorch:${PTH_VERSION}-devel AS apex-msdp-builder -# Temporary hack to install pth 1.7.1 -RUN conda install -y pytorch torchvision cudatoolkit=11.0 -c pytorch - -ARG ARG_TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5" +ARG ARG_TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" ENV TORCH_CUDA_ARCH_LIST=$ARG_TORCH_CUDA_ARCH_LIST # Install git From 27eca292803624ac06f61c771dcfde7210a0debb Mon Sep 17 00:00:00 2001 From: vfdev Date: Sun, 14 Feb 2021 16:47:21 +0100 Subject: [PATCH 16/19] [docker] Pillow -> Pillow-SIMD (#1509) (#1639) * [docker] Pillow -> Pillow-SIMD (#1509) * [docker] Pillow -> Pillow-SIMD * replace pillow with pillow-simd in base docker files * chore(docker): apt-get autoremove after pillow-simd installation * apt-get install at once, autoremove g++ * install g++ in pillow installation layer Co-authored-by: Sylvain Desroziers * Fix g++ install issue Co-authored-by: Jeff Yang Co-authored-by: Sylvain Desroziers --- docker/hvd/Dockerfile.hvd-apex | 13 +++++++++++-- docker/hvd/Dockerfile.hvd-apex-vision | 1 - docker/hvd/Dockerfile.hvd-base | 13 +++++++++++-- docker/hvd/Dockerfile.hvd-vision | 1 - docker/main/Dockerfile.apex | 13 +++++++++++-- docker/main/Dockerfile.apex-vision | 1 - docker/main/Dockerfile.base | 13 +++++++++++-- docker/main/Dockerfile.vision | 1 - docker/msdp/Dockerfile.msdp-apex | 13 +++++++++++-- docker/msdp/Dockerfile.msdp-apex-vision | 1 - 10 files changed, 55 insertions(+), 15 deletions(-) diff --git a/docker/hvd/Dockerfile.hvd-apex b/docker/hvd/Dockerfile.hvd-apex index 2ef8b10f66ba..3a6f09a3d23a 100644 --- a/docker/hvd/Dockerfile.hvd-apex +++ b/docker/hvd/Dockerfile.hvd-apex @@ -42,9 +42,10 @@ RUN pip install --no-cache-dir /tmp/apex/apex-*.whl && \ # Install tzdata / git RUN apt-get update && \ ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \ - apt-get install -y tzdata && \ + apt-get -y install --no-install-recommends tzdata git && \ dpkg-reconfigure --frontend noninteractive tzdata && \ - apt-get -y install --no-install-recommends git && \ + apt-get autoremove -y && \ + apt-get clean -y && \ rm -rf /var/lib/apt/lists/* # Ignite main dependencies @@ -52,6 +53,14 @@ RUN pip install --upgrade --no-cache-dir pytorch-ignite \ tensorboard \ tqdm +# replace pillow with pillow-simd +RUN apt-get update && apt-get -y install --no-install-recommends g++ && \ + pip uninstall -y pillow && \ + CC="cc -mavx2" pip install --upgrade --no-cache-dir --force-reinstall pillow-simd && \ + apt-get remove -y g++ && \ + apt-get autoremove -y && \ + rm -rf /var/lib/apt/lists/* + # Checkout Ignite examples only RUN mkdir -p 
pytorch-ignite-examples && \ cd pytorch-ignite-examples && \ diff --git a/docker/hvd/Dockerfile.hvd-apex-vision b/docker/hvd/Dockerfile.hvd-apex-vision index b64c09545f33..42bcfeb79780 100644 --- a/docker/hvd/Dockerfile.hvd-apex-vision +++ b/docker/hvd/Dockerfile.hvd-apex-vision @@ -15,5 +15,4 @@ RUN pip install --upgrade --no-cache-dir albumentations \ numpy \ opencv-python \ py_config_runner \ - pillow \ clearml diff --git a/docker/hvd/Dockerfile.hvd-base b/docker/hvd/Dockerfile.hvd-base index 8ed83a7fea7e..8cdda5404a3d 100644 --- a/docker/hvd/Dockerfile.hvd-base +++ b/docker/hvd/Dockerfile.hvd-base @@ -21,9 +21,10 @@ FROM pytorch/pytorch:${PTH_VERSION}-runtime # Install tzdata / git RUN apt-get update && \ ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \ - apt-get install -y tzdata && \ + apt-get -y install --no-install-recommends tzdata git && \ dpkg-reconfigure --frontend noninteractive tzdata && \ - apt-get -y install --no-install-recommends git && \ + apt-get autoremove -y && \ + apt-get clean -y && \ rm -rf /var/lib/apt/lists/* # Ignite main dependencies @@ -31,6 +32,14 @@ RUN pip install --upgrade --no-cache-dir pytorch-ignite \ tensorboard \ tqdm +# replace pillow with pillow-simd +RUN apt-get update && apt-get -y install --no-install-recommends g++ && \ + pip uninstall -y pillow && \ + CC="cc -mavx2" pip install --upgrade --no-cache-dir --force-reinstall pillow-simd && \ + apt-get remove -y g++ && \ + apt-get autoremove -y && \ + rm -rf /var/lib/apt/lists/* + # Checkout Ignite examples only RUN mkdir -p pytorch-ignite-examples && \ cd pytorch-ignite-examples && \ diff --git a/docker/hvd/Dockerfile.hvd-vision b/docker/hvd/Dockerfile.hvd-vision index ba15e3dbb49c..249c71622d63 100644 --- a/docker/hvd/Dockerfile.hvd-vision +++ b/docker/hvd/Dockerfile.hvd-vision @@ -15,5 +15,4 @@ RUN pip install --upgrade --no-cache-dir albumentations \ numpy \ opencv-python \ py_config_runner \ - pillow \ clearml diff --git a/docker/main/Dockerfile.apex b/docker/main/Dockerfile.apex index dc52abc5cd2d..eb6de03fb5f2 100644 --- a/docker/main/Dockerfile.apex +++ b/docker/main/Dockerfile.apex @@ -31,9 +31,10 @@ RUN pip install --no-cache-dir /tmp/apex/apex-*.whl && \ # Install tzdata / git RUN apt-get update && \ ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \ - apt-get install -y tzdata && \ + apt-get -y install --no-install-recommends tzdata git && \ dpkg-reconfigure --frontend noninteractive tzdata && \ - apt-get -y install --no-install-recommends git && \ + apt-get autoremove -y && \ + apt-get clean -y && \ rm -rf /var/lib/apt/lists/* # Ignite main dependencies @@ -41,6 +42,14 @@ RUN pip install --upgrade --no-cache-dir pytorch-ignite \ tensorboard \ tqdm +# replace pillow with pillow-simd +RUN apt-get update && apt-get -y install --no-install-recommends g++ && \ + pip uninstall -y pillow && \ + CC="cc -mavx2" pip install --upgrade --no-cache-dir --force-reinstall pillow-simd && \ + apt-get remove -y g++ && \ + apt-get autoremove -y && \ + rm -rf /var/lib/apt/lists/* + # Checkout Ignite examples only RUN mkdir -p pytorch-ignite-examples && \ cd pytorch-ignite-examples && \ diff --git a/docker/main/Dockerfile.apex-vision b/docker/main/Dockerfile.apex-vision index 63fe867a8048..626a9348cfa6 100644 --- a/docker/main/Dockerfile.apex-vision +++ b/docker/main/Dockerfile.apex-vision @@ -15,5 +15,4 @@ RUN pip install --upgrade --no-cache-dir albumentations \ numpy \ opencv-python \ py_config_runner \ - pillow \ clearml diff --git a/docker/main/Dockerfile.base 
diff --git a/docker/main/Dockerfile.base b/docker/main/Dockerfile.base
index 2c70a5b23b0b..be1b0f97b928 100644
--- a/docker/main/Dockerfile.base
+++ b/docker/main/Dockerfile.base
@@ -6,9 +6,10 @@ FROM pytorch/pytorch:${PTH_VERSION}-runtime
 # Install tzdata / git
 RUN apt-get update && \
     ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \
-    apt-get install -y tzdata && \
+    apt-get -y install --no-install-recommends tzdata git && \
     dpkg-reconfigure --frontend noninteractive tzdata && \
-    apt-get -y install --no-install-recommends git && \
+    apt-get autoremove -y && \
+    apt-get clean -y && \
     rm -rf /var/lib/apt/lists/*
 
 # Ignite main dependencies
@@ -16,6 +17,14 @@ RUN pip install --upgrade --no-cache-dir pytorch-ignite \
                 tensorboard \
                 tqdm
 
+# replace pillow with pillow-simd
+RUN apt-get update && apt-get -y install --no-install-recommends g++ && \
+    pip uninstall -y pillow && \
+    CC="cc -mavx2" pip install --upgrade --no-cache-dir --force-reinstall pillow-simd && \
+    apt-get remove -y g++ && \
+    apt-get autoremove -y && \
+    rm -rf /var/lib/apt/lists/*
+
 # Checkout Ignite examples only
 RUN mkdir -p pytorch-ignite-examples && \
     cd pytorch-ignite-examples && \
diff --git a/docker/main/Dockerfile.vision b/docker/main/Dockerfile.vision
index ad6b90051ff0..43e3ee5d796c 100644
--- a/docker/main/Dockerfile.vision
+++ b/docker/main/Dockerfile.vision
@@ -15,5 +15,4 @@ RUN pip install --upgrade --no-cache-dir albumentations \
                 numpy \
                 opencv-python \
                 py_config_runner \
-                pillow \
                 clearml
diff --git a/docker/msdp/Dockerfile.msdp-apex b/docker/msdp/Dockerfile.msdp-apex
index 5d815272ac12..c22a6a0730ec 100644
--- a/docker/msdp/Dockerfile.msdp-apex
+++ b/docker/msdp/Dockerfile.msdp-apex
@@ -53,9 +53,10 @@ RUN cd /msdp && export CUDA_HOME=/usr/local/cuda && \
 # Install tzdata / git
 RUN apt-get update && \
     ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime && \
-    apt-get install -y tzdata && \
+    apt-get -y install --no-install-recommends tzdata git && \
     dpkg-reconfigure --frontend noninteractive tzdata && \
-    apt-get -y install --no-install-recommends git && \
+    apt-get autoremove -y && \
+    apt-get clean -y && \
     rm -rf /var/lib/apt/lists/*
 
 # Ignite main dependencies
@@ -63,6 +64,14 @@ RUN pip install --upgrade --no-cache-dir pytorch-ignite \
                 tensorboard \
                 tqdm
 
+# replace pillow with pillow-simd
+RUN apt-get update && apt-get -y install --no-install-recommends g++ && \
+    pip uninstall -y pillow && \
+    CC="cc -mavx2" pip install --upgrade --no-cache-dir --force-reinstall pillow-simd && \
+    apt-get remove -y g++ && \
+    apt-get autoremove -y && \
+    rm -rf /var/lib/apt/lists/*
+
 # Checkout Ignite examples only
 RUN mkdir -p pytorch-ignite-examples && \
     cd pytorch-ignite-examples && \
diff --git a/docker/msdp/Dockerfile.msdp-apex-vision b/docker/msdp/Dockerfile.msdp-apex-vision
index cada97803e3d..1c630959dae9 100644
--- a/docker/msdp/Dockerfile.msdp-apex-vision
+++ b/docker/msdp/Dockerfile.msdp-apex-vision
@@ -15,5 +15,4 @@ RUN pip install --upgrade --no-cache-dir albumentations \
                 numpy \
                 opencv-python \
                 py_config_runner \
-                pillow \
                 clearml
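Note: the swap is easy to sanity-check because Pillow-SIMD reuses Pillow's version numbers and appends a ".postN" suffix (e.g. "7.0.0.post3"), while stock Pillow never carries ".post". A minimal check to run inside any of the rebuilt images (the assert is illustrative, not part of the patch):

    import PIL

    # Pillow-SIMD: a "7.0.0.post3"-style version; plain Pillow: "7.0.0"
    assert ".post" in PIL.__version__, PIL.__version__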
From 1f47f3fa521e888cf5204b5463aa3e90c0fe6d29 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20COKELAER?=
Date: Sun, 14 Feb 2021 20:42:22 +0100
Subject: [PATCH 17/19] Fix multinode tests script (#1631)

* fix run_multinode_tests_in_docker.sh: run tests with the docker image's python version

* add missing modules

* build an image with the test env and add 'nnodes', 'nproc_per_node' and 'gpu' as parameters

* #1615: change nproc_per_node default to 4

* #1615: fix for GPU-enabled tests / remove the containers at the end of the script

* add xfail decorator for tests/ignite/engine/test_deterministic.py::test_multinode_distrib_cpu

* fix script gpu_options

* add default tol=1e-6 for _test_distrib_compute_on_criterion

* fix for "RuntimeError: trying to initialize the default process group twice!"

* tolerance for test_multinode_distrib_cpu case only

* fix assert None error

* autopep8 fix

Co-authored-by: vfdev
Co-authored-by: Sylvain Desroziers
Co-authored-by: fco-dv
---
 tests/ignite/conftest.py                  |  2 +
 tests/ignite/engine/test_deterministic.py |  1 +
 tests/ignite/metrics/test_loss.py         |  9 ++--
 tests/run_multinode_tests_in_docker.sh    | 53 +++++++++++++++++++----
 4 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/tests/ignite/conftest.py b/tests/ignite/conftest.py
index 670fff29771a..4ee461445132 100644
--- a/tests/ignite/conftest.py
+++ b/tests/ignite/conftest.py
@@ -236,6 +236,8 @@ def distributed_context_multi_node_nccl(multi_node_conf):
     assert "MASTER_ADDR" in os.environ
     assert "MASTER_PORT" in os.environ
 
+    os.environ["MASTER_PORT"] = str(int(os.getenv("MASTER_PORT")) + 1)
+
     dist_info = {
         "backend": "nccl",
         "init_method": "env://",
diff --git a/tests/ignite/engine/test_deterministic.py b/tests/ignite/engine/test_deterministic.py
index 17ecdfb0059d..53e9ef9c998e 100644
--- a/tests/ignite/engine/test_deterministic.py
+++ b/tests/ignite/engine/test_deterministic.py
@@ -573,6 +573,7 @@ def test_distrib_cpu(distributed_context_single_node_gloo):
     _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed")
 
 
+@pytest.mark.xfail
 @pytest.mark.multinode_distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
diff --git a/tests/ignite/metrics/test_loss.py b/tests/ignite/metrics/test_loss.py
index 714b949d4aaa..592824b1b02e 100644
--- a/tests/ignite/metrics/test_loss.py
+++ b/tests/ignite/metrics/test_loss.py
@@ -75,7 +75,7 @@ def test_reset():
         loss.compute()
 
 
-def _test_distrib_compute_on_criterion(device):
+def _test_distrib_compute_on_criterion(device, tol=None):
     def _test(metric_device):
         criterion = nn.NLLLoss().to(device)
         loss = Loss(criterion, device=metric_device)
@@ -104,7 +104,10 @@ def _test(metric_device):
         y_pred = idist.all_gather(y_pred)
         y = idist.all_gather(y)
         true_loss_value = criterion(y_pred, y)
-        assert_almost_equal(res, true_loss_value.item())
+        if tol is None:
+            assert_almost_equal(res, true_loss_value.item())
+        else:
+            assert pytest.approx(res, rel=tol) == true_loss_value.item()
 
     _test("cpu")
     if device.type != "xla":
@@ -178,7 +181,7 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
     device = torch.device("cpu")
-    _test_distrib_compute_on_criterion(device)
+    _test_distrib_compute_on_criterion(device, tol=1e-6)
     _test_distrib_accumulator_device(device)
 
diff --git a/tests/run_multinode_tests_in_docker.sh b/tests/run_multinode_tests_in_docker.sh
index fa0cfe28f8c1..0dca1b603278 100644
--- a/tests/run_multinode_tests_in_docker.sh
+++ b/tests/run_multinode_tests_in_docker.sh
@@ -1,9 +1,26 @@
 #!/bin/bash
 # Tests configuration:
-export nnodes=2
-export nproc_per_node=4
-export gpu=0
+if [[ -z "$1" || "$1" -lt 2 ]]; then
+    echo "nnodes setting default to 2"
+    export nnodes=2
+else
+    export nnodes=$1
+fi
+
+if [[ -z "$2" || "$2" -lt 1 ]]; then
+    echo "nproc_per_node setting default to 4"
+    export nproc_per_node=4
+else
+    export nproc_per_node=$2
+fi
+
+if [ -z "$3" ]; then
+    echo "gpu setting default to 0 (False)"
+    export gpu=0
+else
+    export gpu=$3
+fi
 
 # Start script from ignite root folder
 if [ ! -d tests ]; then
@@ -11,10 +28,15 @@ if [ ! -d tests ]; then
     exit 1
 fi
 
-docker_image="pytorch/pytorch:latest"
-install_test_requirements="pip install mock pytest pytest-xdist scikit-learn"
-cmd="pytest --dist=each --tx $nproc_per_node*popen//python=python3.6 tests -m multinode_distributed -vvv $@"
+docker_image="pytorchignite/tests:latest"
+docker build -t $docker_image -<
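Note: with the parameter handling above, the script keeps its old one-command workflow while allowing overrides; a usage sketch (the values are only examples, and missing or out-of-range arguments fall back to the echoed defaults):

    # 2 nodes x 4 processes per node on CPU (all defaults)
    bash tests/run_multinode_tests_in_docker.sh

    # 3 nodes x 2 processes per node with GPU enabled
    bash tests/run_multinode_tests_in_docker.sh 3 2 1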
"nproc_per_node setting default to 4" + export nproc_per_node=4 +else + export nproc_per_node=$2 +fi + +if [ -z "$3" ]; then + echo "gpu setting default to 0 ( False )" + export gpu=0 +else + export gpu=$3 +fi # Start script from ignite root folder if [ ! -d tests ]; then @@ -11,10 +28,15 @@ if [ ! -d tests ]; then exit 1 fi -docker_image="pytorch/pytorch:latest" -install_test_requirements="pip install mock pytest pytest-xdist scikit-learn" -cmd="pytest --dist=each --tx $nproc_per_node*popen//python=python3.6 tests -m multinode_distributed -vvv $@" +docker_image="pytorchignite/tests:latest" +docker build -t $docker_image -< Date: Mon, 15 Feb 2021 19:55:34 +0100 Subject: [PATCH 18/19] remove warning for average=False and is_multilabel=True --- ignite/metrics/precision.py | 8 -------- tests/ignite/metrics/test_precision.py | 8 +------- tests/ignite/metrics/test_recall.py | 8 +------- 3 files changed, 2 insertions(+), 22 deletions(-) diff --git a/ignite/metrics/precision.py b/ignite/metrics/precision.py index 5ef35f3d7dca..be840a2ab1aa 100644 --- a/ignite/metrics/precision.py +++ b/ignite/metrics/precision.py @@ -20,14 +20,6 @@ def __init__( is_multilabel: bool = False, device: Union[str, torch.device] = torch.device("cpu"), ): - if idist.get_world_size() > 1: - if (not average) and is_multilabel: - warnings.warn( - "Precision/Recall metrics do not work in distributed setting when average=False " - "and is_multilabel=True. Results are not reduced across computing devices. Computed result " - "corresponds to the local rank's (single process) result.", - RuntimeWarning, - ) self._average = average self.eps = 1e-20 diff --git a/tests/ignite/metrics/test_precision.py b/tests/ignite/metrics/test_precision.py index 5db50442db38..3eb8013db089 100644 --- a/tests/ignite/metrics/test_precision.py +++ b/tests/ignite/metrics/test_precision.py @@ -825,13 +825,7 @@ def update(engine, i): _test(average=True, n_epochs=2, metric_device=metric_device) if idist.get_world_size() > 1: - with pytest.warns( - RuntimeWarning, - match="Precision/Recall metrics do not work in distributed setting when " - "average=False and is_multilabel=True", - ): - pr = Precision(average=False, is_multilabel=True) - + pr = Precision(average=False, is_multilabel=True) y_pred = torch.randint(0, 2, size=(4, 3, 6, 8)) y = torch.randint(0, 2, size=(4, 3, 6, 8)).long() pr.update((y_pred, y)) diff --git a/tests/ignite/metrics/test_recall.py b/tests/ignite/metrics/test_recall.py index ca6407e0c84d..78168403cb35 100644 --- a/tests/ignite/metrics/test_recall.py +++ b/tests/ignite/metrics/test_recall.py @@ -825,13 +825,7 @@ def update(engine, i): _test(average=True, n_epochs=2, metric_device=metric_device) if idist.get_world_size() > 1: - with pytest.warns( - RuntimeWarning, - match="Precision/Recall metrics do not work in distributed setting when " - "average=False and is_multilabel=True", - ): - re = Recall(average=False, is_multilabel=True) - + re = Recall(average=False, is_multilabel=True) y_pred = torch.randint(0, 2, size=(4, 3, 6, 8)) y = torch.randint(0, 2, size=(4, 3, 6, 8)).long() re.update((y_pred, y)) From f3998cb10f91ece2ef00f75de88ccc285bd27365 Mon Sep 17 00:00:00 2001 From: FrAnCOisCokELaER Date: Sun, 21 Feb 2021 20:56:37 +0100 Subject: [PATCH 19/19] update docstring and {precision, recall} tests according to test_multilabel_input_NCHW --- ignite/metrics/precision.py | 5 ----- ignite/metrics/recall.py | 5 ----- tests/ignite/metrics/test_precision.py | 30 +++++++++++++------------- tests/ignite/metrics/test_recall.py | 28 
From f3998cb10f91ece2ef00f75de88ccc285bd27365 Mon Sep 17 00:00:00 2001
From: FrAnCOisCokELaER
Date: Sun, 21 Feb 2021 20:56:37 +0100
Subject: [PATCH 19/19] update docstring and {precision, recall} tests
 according to test_multilabel_input_NCHW

---
 ignite/metrics/precision.py            |  5 -----
 ignite/metrics/recall.py               |  5 -----
 tests/ignite/metrics/test_precision.py | 30 +++++++++++++-------------
 tests/ignite/metrics/test_recall.py    | 28 ++++++++++++------------
 4 files changed, 29 insertions(+), 39 deletions(-)

diff --git a/ignite/metrics/precision.py b/ignite/metrics/precision.py
index be840a2ab1aa..50142b7f78a3 100644
--- a/ignite/metrics/precision.py
+++ b/ignite/metrics/precision.py
@@ -101,11 +101,6 @@ def thresholded_output_transform(output):
         as tensors before computing a metric. This can potentially lead to a memory error if the input data is
         larger than available RAM.
 
-    .. warning::
-
-        In multilabel cases, if average is False, current implementation does not work with distributed computations.
-        Results are not reduced across the GPUs. Computed result corresponds to the local rank's (single GPU) result.
-
     Args:
         output_transform (callable, optional): a callable that is used to transform the
diff --git a/ignite/metrics/recall.py b/ignite/metrics/recall.py
index 69e16155b6e6..a11cb7d583bf 100644
--- a/ignite/metrics/recall.py
+++ b/ignite/metrics/recall.py
@@ -48,11 +48,6 @@ def thresholded_output_transform(output):
         as tensors before computing a metric. This can potentially lead to a memory error if the input data is
         larger than available RAM.
 
-    .. warning::
-
-        In multilabel cases, if average is False, current implementation does not work with distributed computations.
-        Results are not reduced across the GPUs. Computed result corresponds to the local rank's (single GPU) result.
-
     Args:
         output_transform (callable, optional): a callable that is used to transform the
diff --git a/tests/ignite/metrics/test_precision.py b/tests/ignite/metrics/test_precision.py
index 3eb8013db089..a4bc56cdd92b 100644
--- a/tests/ignite/metrics/test_precision.py
+++ b/tests/ignite/metrics/test_precision.py
@@ -792,7 +792,7 @@ def update(engine, i):
 
     engine = Engine(update)
 
-    pr = Precision(average=average, is_multilabel=True)
+    pr = Precision(average=average, is_multilabel=True, device=metric_device)
     pr.attach(engine, "pr")
 
     data = list(range(n_iters))
@@ -808,13 +808,13 @@ def update(engine, i):
     else:
        assert res == res2
 
+    np_y_preds = to_numpy_multilabel(y_preds)
+    np_y_true = to_numpy_multilabel(y_true)
+    assert pr._type == "multilabel"
+    res = res if average else res.mean().item()
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", category=UndefinedMetricWarning)
-        true_res = precision_score(
-            to_numpy_multilabel(y_true), to_numpy_multilabel(y_preds), average="samples" if average else None
-        )
-
-    assert pytest.approx(res) == true_res
+        assert precision_score(np_y_true, np_y_preds, average="samples") == pytest.approx(res)
 
     metric_devices = ["cpu"]
     if device.type != "xla":
@@ -823,16 +823,16 @@ def update(engine, i):
     for metric_device in metric_devices:
         _test(average=True, n_epochs=1, metric_device=metric_device)
         _test(average=True, n_epochs=2, metric_device=metric_device)
+        _test(average=False, n_epochs=1, metric_device=metric_device)
+        _test(average=False, n_epochs=2, metric_device=metric_device)
 
-    if idist.get_world_size() > 1:
-        pr = Precision(average=False, is_multilabel=True)
-        y_pred = torch.randint(0, 2, size=(4, 3, 6, 8))
-        y = torch.randint(0, 2, size=(4, 3, 6, 8)).long()
-        pr.update((y_pred, y))
-        pr_compute1 = pr.compute()
-        pr_compute2 = pr.compute()
-        assert len(pr_compute1) == idist.get_world_size() * 4 * 6 * 8
-        assert (pr_compute1 == pr_compute2).all()
+    pr1 = Precision(is_multilabel=True, average=True)
+    pr2 = Precision(is_multilabel=True, average=False)
+    y_pred = torch.randint(0, 2, size=(10, 4, 20, 23))
+    y = torch.randint(0, 2, size=(10, 4, 20, 23)).long()
+    pr1.update((y_pred, y))
+    pr2.update((y_pred, y))
+    assert pr1.compute() == pytest.approx(pr2.compute().mean().item())
 
 
 def _test_distrib_accumulator_device(device):
diff --git a/tests/ignite/metrics/test_recall.py b/tests/ignite/metrics/test_recall.py
index 78168403cb35..70c1ba2dd23b 100644
--- a/tests/ignite/metrics/test_recall.py
+++ b/tests/ignite/metrics/test_recall.py
@@ -808,13 +808,13 @@ def update(engine, i):
     else:
        assert res == res2
 
+    np_y_preds = to_numpy_multilabel(y_preds)
+    np_y_true = to_numpy_multilabel(y_true)
+    assert re._type == "multilabel"
+    res = res if average else res.mean().item()
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", category=UndefinedMetricWarning)
-        true_res = recall_score(
-            to_numpy_multilabel(y_true), to_numpy_multilabel(y_preds), average="samples" if average else None
-        )
-
-    assert pytest.approx(res) == true_res
+        assert recall_score(np_y_true, np_y_preds, average="samples") == pytest.approx(res)
 
     metric_devices = ["cpu"]
     if device.type != "xla":
@@ -823,16 +823,16 @@ def update(engine, i):
     for metric_device in metric_devices:
         _test(average=True, n_epochs=1, metric_device=metric_device)
         _test(average=True, n_epochs=2, metric_device=metric_device)
+        _test(average=False, n_epochs=1, metric_device=metric_device)
+        _test(average=False, n_epochs=2, metric_device=metric_device)
 
-    if idist.get_world_size() > 1:
-        re = Recall(average=False, is_multilabel=True)
-        y_pred = torch.randint(0, 2, size=(4, 3, 6, 8))
-        y = torch.randint(0, 2, size=(4, 3, 6, 8)).long()
-        re.update((y_pred, y))
-        re_compute1 = re.compute()
-        re_compute2 = re.compute()
-        assert len(re_compute1) == idist.get_world_size() * 4 * 6 * 8
-        assert (re_compute1 == re_compute2).all()
+    re1 = Recall(is_multilabel=True, average=True)
+    re2 = Recall(is_multilabel=True, average=False)
+    y_pred = torch.randint(0, 2, size=(10, 4, 20, 23))
+    y = torch.randint(0, 2, size=(10, 4, 20, 23)).long()
+    re1.update((y_pred, y))
+    re2.update((y_pred, y))
+    assert re1.compute() == pytest.approx(re2.compute().mean().item())
 
 
 def _test_distrib_accumulator_device(device):
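Note: the shapes in these tests follow from how multilabel (N, C, H, W) input is flattened so that every spatial position of every batch element counts as one sample with C labels. A sketch of that reshaping, mirroring what the tests' to_numpy_multilabel helper does (reconstructed here as an assumption, not copied from the patch):

    import torch

    def to_samples_by_labels(y):
        # (N, C, H, W) -> (N*H*W, C): each pixel becomes one multilabel sample
        y = y.transpose(1, 0).cpu().numpy()             # (C, N, H, W)
        n_labels = y.shape[0]
        return y.reshape(n_labels, -1).transpose(1, 0)  # (N*H*W, C)

    y = torch.randint(0, 2, size=(4, 3, 6, 8))
    assert to_samples_by_labels(y).shape == (4 * 6 * 8, 3)  # the 4 * 6 * 8 seen above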