
Commit c875c84

Author: Vincent Moens

[CI] Fix CI issues (#2084)
(cherry picked from commit 730dd45)
1 parent: a7a0925

File tree: 6 files changed (+35, -10 lines)

.github/unittest/linux_libs/scripts_habitat/setup_env.sh

Lines changed: 5 additions & 3 deletions
@@ -39,9 +39,11 @@ if [ ! -d "${env_dir}" ]; then
     conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION"
 fi
 conda activate "${env_dir}"
-#pip3 uninstall cython -y
-#pip uninstall cython -y
-#conda uninstall cython -y
+
+# set debug variables
+conda env config vars set MAGNUM_LOG=debug HABITAT_SIM_LOG=debug
+conda deactivate && conda activate "${env_dir}"
+
 pip3 install "cython<3"
 conda install -c anaconda cython="<3.0.0" -y
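
For context, `conda env config vars set` makes MAGNUM_LOG and HABITAT_SIM_LOG persistent for this environment, which is why the script deactivates and re-activates it right afterwards. A minimal Python check, not part of this commit, to confirm the values are visible inside the re-activated environment (variable names taken from the diff above):

# check_debug_vars.py - illustrative only; run inside the re-activated conda env
import os

for name in ("MAGNUM_LOG", "HABITAT_SIM_LOG"):
    # conda env config vars are exported on activation, so they appear in os.environ
    print(f"{name}={os.environ.get(name, '<unset>')}")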

.github/workflows/test-linux-habitat.yml

Lines changed: 3 additions & 3 deletions
@@ -19,14 +19,14 @@ jobs:
   tests:
     strategy:
       matrix:
-        python_version: ["3.9"] # "3.8", "3.9", "3.10", "3.11"
-        cuda_arch_version: ["11.6"] # "11.6", "11.7"
+        python_version: ["3.9"]
+        cuda_arch_version: ["12.1"]
       fail-fast: false
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
       runner: linux.g5.4xlarge.nvidia.gpu
       repository: pytorch/rl
-      docker-image: "nvidia/cuda:12.2.0-devel-ubuntu22.04"
+      docker-image: "nvidia/cuda:12.1.1-devel-ubuntu22.04"
       gpu-arch-type: cuda
       gpu-arch-version: ${{ matrix.cuda_arch_version }}
       timeout: 90

.github/workflows/test-linux-libs.yml

Lines changed: 13 additions & 3 deletions
@@ -53,14 +53,16 @@ jobs:
   unittests-brax:
     strategy:
       matrix:
-        python_version: ["3.9"]
+        python_version: ["3.11"]
         cuda_arch_version: ["12.1"]
+    if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
       repository: pytorch/rl
       runner: "linux.g5.4xlarge.nvidia.gpu"
       gpu-arch-type: cuda
       gpu-arch-version: "11.7"
+      docker-image: "pytorch/manylinux-cuda124"
       timeout: 120
       script: |
         if [[ "${{ github.ref }}" =~ release/* ]]; then
@@ -73,7 +75,7 @@ jobs:
 
         set -euo pipefail
 
-        export PYTHON_VERSION="3.9"
+        export PYTHON_VERSION="3.11"
         export CU_VERSION="12.1"
         export TAR_OPTIONS="--no-same-owner"
         export UPLOAD_CHANNEL="nightly"
@@ -123,7 +125,7 @@ jobs:
       matrix:
         python_version: ["3.9"]
         cuda_arch_version: ["12.1"]
-    if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Data') }}
+    if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
       repository: pytorch/rl
@@ -224,12 +226,14 @@ jobs:
       matrix:
         python_version: ["3.9"]
         cuda_arch_version: ["12.1"]
+    if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
       repository: pytorch/rl
       runner: "linux.g5.4xlarge.nvidia.gpu"
       gpu-arch-type: cuda
       gpu-arch-version: "11.7"
+      docker-image: "pytorch/manylinux-cuda124"
       timeout: 120
       script: |
         if [[ "${{ github.ref }}" =~ release/* ]]; then
@@ -324,12 +328,14 @@ jobs:
         bash .github/unittest/linux_libs/scripts_openx/post_process.sh
 
   unittests-pettingzoo:
+    if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
       repository: pytorch/rl
       runner: "linux.g5.4xlarge.nvidia.gpu"
       gpu-arch-type: cuda
       gpu-arch-version: "11.7"
+      docker-image: "pytorch/manylinux-cuda124"
       timeout: 120
       script: |
         if [[ "${{ github.ref }}" =~ release/* ]]; then
@@ -360,6 +366,7 @@ jobs:
       matrix:
         python_version: ["3.9"]
         cuda_arch_version: ["12.1"]
+    if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
       repository: pytorch/rl
@@ -468,6 +475,7 @@ jobs:
       runner: "linux.g5.4xlarge.nvidia.gpu"
       gpu-arch-type: cuda
       gpu-arch-version: "11.7"
+      docker-image: "pytorch/manylinux-cuda124"
       timeout: 120
       script: |
         if [[ "${{ github.ref }}" =~ release/* ]]; then
@@ -532,12 +540,14 @@ jobs:
       matrix:
         python_version: ["3.9"]
         cuda_arch_version: ["12.1"]
+    if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }}
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
       repository: pytorch/rl
       runner: "linux.g5.4xlarge.nvidia.gpu"
       gpu-arch-type: cuda
       gpu-arch-version: "11.7"
+      docker-image: "pytorch/manylinux-cuda124"
       timeout: 120
       script: |
         if [[ "${{ github.ref }}" =~ release/* ]]; then

test/test_env.py

Lines changed: 4 additions & 0 deletions
@@ -109,6 +109,10 @@
 
 IS_OSX = platform == "darwin"
 IS_WIN = platform == "win32"
+if IS_WIN:
+    mp_ctx = "spawn"
+else:
+    mp_ctx = "fork"
 
 ## TO BE FIXED: DiscreteActionProjection queries a randint on each worker, which leads to divergent results between
 ## the serial and parallel batched envs
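
The added mp_ctx selection matters because the standard library only provides the "fork" start method on POSIX systems, while Windows must use "spawn". A standalone sketch, not taken from the test file, of how such a context is typically obtained and used:

# mp_ctx_demo.py - standalone illustration of the spawn/fork choice above
import multiprocessing as mp
from sys import platform

mp_ctx = "spawn" if platform == "win32" else "fork"

def _worker(queue):
    queue.put("ok")

if __name__ == "__main__":
    ctx = mp.get_context(mp_ctx)  # raises ValueError if the method is unsupported here
    queue = ctx.Queue()
    proc = ctx.Process(target=_worker, args=(queue,))
    proc.start()
    print(queue.get())  # -> "ok"
    proc.join()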

test/test_modules.py

Lines changed: 3 additions & 1 deletion
@@ -8,7 +8,8 @@
 import numpy as np
 import pytest
 import torch
-from _utils_internal import get_default_devices
+
+from _utils_internal import get_default_devices, retry
 from mocking_classes import MockBatchedUnLockedEnv
 from packaging import version
 from tensordict import TensorDict
@@ -889,6 +890,7 @@ def _get_mock_input_td(
         )
         return td
 
+    @retry(AssertionError, 3)
     @pytest.mark.parametrize("n_agents", [1, 3])
     @pytest.mark.parametrize("share_params", [True, False])
     @pytest.mark.parametrize("centralised", [True, False])
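
`retry` is imported from test/_utils_internal.py, which is not part of this diff. As a rough illustration only, an assumption about its shape rather than the actual implementation, a decorator with this call signature usually re-runs the wrapped test a fixed number of times and re-raises the final failure:

# retry_sketch.py - hypothetical stand-in for _utils_internal.retry
import functools

def retry(exc_type, tries):
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            for attempt in range(tries):
                try:
                    return fn(*args, **kwargs)
                except exc_type:
                    # re-raise on the last attempt, otherwise try again
                    if attempt == tries - 1:
                        raise
        return wrapper
    return decorator

@retry(AssertionError, 3)
def flaky_check():
    ...  # a test body that occasionally fails with AssertionError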

test/test_transforms.py

Lines changed: 7 additions & 0 deletions
@@ -13,6 +13,7 @@
 import sys
 from copy import copy
 from functools import partial
+from sys import platform
 
 import numpy as np
 import pytest
@@ -119,6 +120,12 @@
 from torchrl.envs.utils import check_env_specs, step_mdp
 from torchrl.modules import GRUModule, LSTMModule, MLP, ProbabilisticActor, TanhNormal
 
+IS_WIN = platform == "win32"
+if IS_WIN:
+    mp_ctx = "spawn"
+else:
+    mp_ctx = "fork"
+
 TIMEOUT = 100.0
 
 _has_gymnasium = importlib.util.find_spec("gymnasium") is not None
