Update on "[DataLoader] Add generate_state for NumPy seeding"
After adding a default seeding strategy for the NumPy random module within each worker of DataLoader in #56488, two concerns were raised:
- We dropped support for NumPy < 1.17 due to the use of `SeedSequence`
- In order to keep supporting seeding for NumPy < 1.17, how can we provide a seed for `numpy.random`?
  - The first option is to set the same seed as `random`. The problem is that `numpy.random` and `random` share the same underlying algorithm, so with the same seed they produce the exact same state sequence. Thanks to @rkern, we noticed this so-called [bad thing](Lightning-AI/pytorch-lightning#6960 (comment)).
  - Since most users are not aware of this problem, we can provide a better default seed for `numpy.random` using the same `SeedSequence` algorithm as NumPy. This is only a workaround: a hard-coded function that generates an array of four int32 values to use as the seed.
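The per-worker seeding described above can be sketched as follows. This is a minimal illustration using NumPy's own `SeedSequence` (not the hard-coded reimplementation this PR adds for older NumPy); `base_seed` and `worker_init_fn` are illustrative names, not the actual DataLoader API:

```python
import numpy as np

def worker_init_fn(worker_id, base_seed=12345):
    # Derive a distinct 4-word seed for this worker's legacy
    # numpy.random global state, decorrelated from `random`.
    ss = np.random.SeedSequence(base_seed + worker_id)
    seed = ss.generate_state(4)  # array of four uint32 words
    np.random.seed(seed)         # legacy global seeding accepts an int array
    return seed

s0 = worker_init_fn(0)
s1 = worker_init_fn(1)  # different worker -> different state sequence
```

Because the four words come from `SeedSequence` rather than from the raw `base_seed`, two workers (or `numpy.random` vs. `random`) never start from the same Mersenne Twister state.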

This problem goes beyond NumPy, since many third-party libraries ship their own random modules. To cope with it properly, we may eventually need to implement a `SeedSequence` within the `torch.random` module, so that users can `spawn` a new child `SeedSequence` for each library.
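If such a `SeedSequence` existed in `torch.random`, per-library seeding could look like this sketch (shown with NumPy's `SeedSequence` as a stand-in, since the `torch.random` version is hypothetical):

```python
import random

import numpy as np

# One root SeedSequence per process; each library that owns RNG state
# gets an independently spawned child sequence.
root = np.random.SeedSequence(2021)
numpy_ss, stdlib_ss = root.spawn(2)

np_rng = np.random.default_rng(numpy_ss)           # seeds NumPy's Generator
random.seed(int(stdlib_ss.generate_state(1)[0]))   # seeds stdlib `random`
```

Spawned children are guaranteed to produce distinct, decorrelated streams, which is exactly what the same-seed-everywhere approach fails to provide.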

[ghstack-poisoned]
ejguan committed Apr 26, 2021
2 parents e7e4bc6 + dde2bc4 commit 27e8e04
Showing 243 changed files with 10,202 additions and 3,179 deletions.
15 changes: 6 additions & 9 deletions .circleci/cimodel/data/windows_build_definitions.py
@@ -130,19 +130,16 @@ def TruePred(_):
WORKFLOW_DATA = [
# VS2019 CUDA-10.1
WindowsJob(None, _VC2019, CudaVersion(10, 1)),
# Disable windows tests until https://github.com/pytorch/pytorch/issues/56654 is resolved
# WindowsJob(1, _VC2019, CudaVersion(10, 1)),
# WindowsJob(2, _VC2019, CudaVersion(10, 1)),
WindowsJob(1, _VC2019, CudaVersion(10, 1)),
WindowsJob(2, _VC2019, CudaVersion(10, 1)),
# VS2019 CUDA-11.1
WindowsJob(None, _VC2019, CudaVersion(11, 1)),
# Disable windows tests until https://github.com/pytorch/pytorch/issues/56654 is resolved
# WindowsJob(1, _VC2019, CudaVersion(11, 1), master_only_pred=TruePred),
# WindowsJob(2, _VC2019, CudaVersion(11, 1), master_only_pred=TruePred),
WindowsJob(1, _VC2019, CudaVersion(11, 1), master_only_pred=TruePred),
WindowsJob(2, _VC2019, CudaVersion(11, 1), master_only_pred=TruePred),
# VS2019 CPU-only
WindowsJob(None, _VC2019, None),
# Remove master only predicate until GPU tests can be re-enabled again
WindowsJob(1, _VC2019, None),
WindowsJob(2, _VC2019, None),
WindowsJob(1, _VC2019, None, master_only_pred=TruePred),
WindowsJob(2, _VC2019, None, master_only_pred=TruePred),
WindowsJob(1, _VC2019, CudaVersion(10, 1), force_on_cpu=True, master_only_pred=TruePred),
]

78 changes: 77 additions & 1 deletion .circleci/config.yml
@@ -19,7 +19,7 @@ executors:
windows-with-nvidia-gpu:
machine:
resource_class: windows.gpu.nvidia.medium
image: windows-server-2019-nvidia:stable
image: windows-server-2019-nvidia:previous
shell: bash.exe

windows-xlarge-cpu-with-nvidia-cuda:
@@ -6790,6 +6790,32 @@ workflows:
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10.1"
executor: windows-with-nvidia-gpu
name: pytorch_windows_vs2019_py36_cuda10.1_test1
python_version: "3.6"
requires:
- pytorch_windows_vs2019_py36_cuda10.1_build
test_name: pytorch-windows-test1
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10.1"
executor: windows-with-nvidia-gpu
name: pytorch_windows_vs2019_py36_cuda10.1_test2
python_version: "3.6"
requires:
- pytorch_windows_vs2019_py36_cuda10.1_build
test_name: pytorch-windows-test2
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
@@ -6799,6 +6825,44 @@ workflows:
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
executor: windows-with-nvidia-gpu
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
name: pytorch_windows_vs2019_py36_cuda11.1_test1
python_version: "3.6"
requires:
- pytorch_windows_vs2019_py36_cuda11.1_build
test_name: pytorch-windows-test1
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
executor: windows-with-nvidia-gpu
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
name: pytorch_windows_vs2019_py36_cuda11.1_test2
python_version: "3.6"
requires:
- pytorch_windows_vs2019_py36_cuda11.1_build
test_name: pytorch-windows-test2
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cpu-py3
cuda_version: cpu
@@ -6811,6 +6875,12 @@ workflows:
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cpu-py3
cuda_version: cpu
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
name: pytorch_windows_vs2019_py36_cpu_test1
python_version: "3.6"
requires:
@@ -6823,6 +6893,12 @@ workflows:
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cpu-py3
cuda_version: cpu
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
name: pytorch_windows_vs2019_py36_cpu_test2
python_version: "3.6"
requires:
9 changes: 6 additions & 3 deletions .circleci/docker/common/install_rocm.sh
@@ -4,15 +4,18 @@ set -ex

install_magma() {
# "install" hipMAGMA into /opt/rocm/magma by copying after build
git clone https://bitbucket.org/icl/magma.git -b hipMAGMA
git clone https://bitbucket.org/icl/magma.git
pushd magma
cp make.inc-examples/make.inc.hip-mkl-gcc make.inc
git checkout 878b1ce02e9cfe4a829be22c8f911e9c0b6bd88f
cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib' >> make.inc
echo 'DEVCCFLAGS += --amdgpu-target=gfx803 --amdgpu-target=gfx900 --amdgpu-target=gfx906 --amdgpu-target=gfx908 --gpu-max-threads-per-block=256' >> make.inc
# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
export PATH="${PATH}:/opt/rocm/bin"
make -f make.gen.hipMAGMA -j $(nproc)
make lib/libmagma.so -j $(nproc) MKLROOT=/opt/conda
LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT=/opt/conda
make testing/testing_dgemm -j $(nproc) MKLROOT=/opt/conda
popd
mv magma /opt/rocm
2 changes: 1 addition & 1 deletion .circleci/verbatim-sources/header-section.yml
@@ -19,7 +19,7 @@ executors:
windows-with-nvidia-gpu:
machine:
resource_class: windows.gpu.nvidia.medium
image: windows-server-2019-nvidia:stable
image: windows-server-2019-nvidia:previous
shell: bash.exe

windows-xlarge-cpu-with-nvidia-cuda:
22 changes: 16 additions & 6 deletions .github/scripts/generate_linux_ci_workflows.py
@@ -13,10 +13,16 @@


class PyTorchLinuxWorkflow:
def __init__(self, build_environment: str, docker_image_base: str):
def __init__(
self,
build_environment: str,
docker_image_base: str,
on_pull_request: bool = False
):
self.build_environment = build_environment
self.docker_image_base = docker_image_base
self.test_runner_type = CPU_TEST_RUNNER
self.on_pull_request = on_pull_request
if "cuda" in build_environment:
self.test_runner_type = CUDA_TEST_RUNNER

@@ -31,7 +37,11 @@ def generate_workflow_file(
workflow_template.render(
build_environment=self.build_environment,
docker_image_base=self.docker_image_base,
test_runner_type=self.test_runner_type
test_runner_type=self.test_runner_type,
# two leading spaces is necessary to match yaml indent
on_pull_request=(
" pull_request:" if self.on_pull_request else ""
)
)
)
output_file.write('\n')
@@ -67,10 +77,10 @@ def generate_workflow_file(
# build_environment="pytorch-linux-xenial-py3-clang7-onnx",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-onnx",
# ),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
# ),
PyTorchLinuxWorkflow(
build_environment="pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
),
# PyTorchLinuxWorkflow(
# build_environment="pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
# docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
20 changes: 16 additions & 4 deletions .github/scripts/install_nvidia_utils_linux.sh
@@ -17,12 +17,16 @@ install_nvidia_docker2_amzn2() {
)
}

install_nvidia_driver() {
install_nvidia_driver_amzn2() {
(
set -x
sudo yum groupinstall -y "Development Tools"
curl -fsL -o nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
sudo /bin/bash nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
# ensure our kernel install is the same as our underlying kernel,
# groupinstall "Development Tools" has a habit of mismatching kernel headers
sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
sudo /bin/bash /tmp/nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
sudo rm -fv /tmp/nvidia_driver
nvidia-smi
)
}
@@ -40,4 +44,12 @@ case "${DISTRIBUTION}" in
esac

echo "== Installing nvidia driver ${DRIVER_FN} =="
install_nvidia_driver
case "${DISTRIBUTION}" in
amzn*)
install_nvidia_driver_amzn2
;;
*)
echo "ERROR: Unknown distribution ${DISTRIBUTION}"
exit 1
;;
esac
22 changes: 15 additions & 7 deletions .github/templates/linux_ci_workflow.yml.in
@@ -6,7 +6,7 @@ name: Linux CI (!{{ build_environment }})

on:
# TODO: Enable pull_request builds when we can verify capacity can be met by auto-scalers
# pull_request:
!{{ on_pull_request }}
push:
branches:
- master
@@ -61,6 +61,9 @@ jobs:
mkdir -pv ../custom-op-build
mkdir -pv ../custom-backend-build
mkdir -pv ../jit-hook-build
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Build PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
@@ -76,14 +79,15 @@
-e SCCACHE_BUCKET \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-v "${GITHUB_WORKSPACE}/../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}/../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}/../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && .jenkins/pytorch/build.sh'
@@ -156,6 +160,9 @@ jobs:
- name: Output disk space left
run: |
sudo df -H
- name: Preserve github env variables for use in docker
run: |
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test PyTorch
run: |
SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
@@ -172,15 +179,16 @@
-e BUILD_ENVIRONMENT \
-e IN_CI \
-e MAX_JOBS \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="${SHM_SIZE}" \
--tty \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-v "${GITHUB_WORKSPACE}../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-v "${GITHUB_WORKSPACE}/../custom-op-build:/var/lib/jenkins/custom-op-build" \
-v "${GITHUB_WORKSPACE}/../custom-backend-build:/var/lib/jenkins/custom-backend-build" \
-v "${GITHUB_WORKSPACE}/../jit-hook-build:/var/lib/jenkins/jit-hook-build" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}" \
sh -c 'sudo chown -R jenkins ../ && pip install dist/*.whl && .jenkins/pytorch/test.sh'
19 changes: 14 additions & 5 deletions .github/workflows/lint.yml
@@ -38,8 +38,11 @@ jobs:
pip install ruamel.yaml==0.17.4
.github/scripts/lint_native_functions.py
- name: Extract scripts from GitHub Actions workflows
run: tools/extract_scripts.py --out=.extracted_scripts
- name: ShellCheck
run: |
# For local lints, remove the .extracted_scripts folder if it was already there
rm -rf .extracted_scripts
tools/extract_scripts.py --out=.extracted_scripts
- name: Install ShellCheck
# https://github.com/koalaman/shellcheck/tree/v0.7.2#installing-a-pre-compiled-binary
run: |
set -x
@@ -48,6 +51,8 @@
sudo cp "shellcheck-${scversion}/shellcheck" /usr/bin/
rm -r "shellcheck-${scversion}"
shellcheck --version
- name: Run ShellCheck
run: |
tools/run_shellcheck.sh .jenkins/pytorch .extracted_scripts
- name: Ensure correct trailing newlines
run: |
@@ -63,7 +68,9 @@
- name: Ensure no non-breaking spaces
if: always()
run: |
(! git --no-pager grep -In $'\u00a0' -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false))
# NB: We use 'printf' below rather than '\u000a' since bash pre-4.2
# does not support the '\u000a' syntax (which is relevant for local linters)
(! git --no-pager grep -In "$(printf '\xC2\xA0')" -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false))
- name: Ensure canonical include
if: always()
run: |
@@ -202,8 +209,10 @@ jobs:
path: flake8-output/
- name: Fail if there were any warnings
run: |
cat flake8-output.txt
[ ! -s flake8-output.txt ]
set -eux
# Re-output flake8 status so GitHub logs show it on the step that actually failed
cat "${GITHUB_WORKSPACE}"/flake8-output.txt
[ ! -s "${GITHUB_WORKSPACE}"/flake8-output.txt ]
clang-tidy:
if: github.event_name == 'pull_request'
