Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into test_cuda_OptimInfo
Browse files Browse the repository at this point in the history
  • Loading branch information
jayanthd04 committed May 14, 2024
2 parents cd6249f + 1a28f73 commit 5f92373
Show file tree
Hide file tree
Showing 2,070 changed files with 25,897 additions and 269,169 deletions.
6 changes: 3 additions & 3 deletions .ci/docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
BASEKIT_VERSION=2024.0.0-49522
XPU_VERSION=0.5
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
Expand Down Expand Up @@ -366,7 +366,7 @@ if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
fi

# Build image
DOCKER_BUILDKIT=1 docker build \
docker build \
--no-cache \
--progress=plain \
--build-arg "BUILD_ENVIRONMENT=${image}" \
Expand Down Expand Up @@ -403,7 +403,7 @@ DOCKER_BUILDKIT=1 docker build \
--build-arg "DOCS=${DOCS}" \
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
--build-arg "EXECUTORCH=${EXECUTORCH}" \
--build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \
--build-arg "XPU_VERSION=${XPU_VERSION}" \
--build-arg "ACL=${ACL:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
Expand Down
2 changes: 1 addition & 1 deletion .ci/docker/centos-rocm/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}

# (optional) Install vision packages like OpenCV and ffmpeg
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
Expand Down
112 changes: 32 additions & 80 deletions .ci/docker/common/install_rocm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@ ver() {
printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
}

# Map ROCm version to AMDGPU version
declare -A AMDGPU_VERSIONS=( ["5.0"]="21.50" ["5.1.1"]="22.10.1" ["5.2"]="22.20" )

install_ubuntu() {
apt-get update
if [[ $UBUNTU_VERSION == 18.04 ]]; then
Expand All @@ -26,31 +23,14 @@ install_ubuntu() {
apt-get install -y libc++1
apt-get install -y libc++abi1

if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
# Add amdgpu repository
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
local amdgpu_baseurl
if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
else
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/ubuntu"
fi
echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
fi

ROCM_REPO="ubuntu"
if [[ $(ver $ROCM_VERSION) -lt $(ver 4.2) ]]; then
ROCM_REPO="xenial"
fi

if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
ROCM_REPO="${UBUNTU_VERSION_NAME}"
fi
# Add amdgpu repository
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list

# Add rocm repository
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
local rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
echo "deb [arch=amd64] ${rocm_baseurl} ${ROCM_REPO} main" > /etc/apt/sources.list.d/rocm.list
echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/rocm.list
apt-get update --allow-insecure-repositories

DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
Expand All @@ -68,29 +48,18 @@ install_ubuntu() {
# precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
# search for all unversioned packages
# if search fails it will abort this script; use true to avoid case where search fails
if [[ $(ver $ROCM_VERSION) -ge $(ver 5.5) ]]; then
MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
if [[ "x${MIOPENHIPGFX}" = x ]]; then
echo "miopen-hip-gfx package not available" && exit 1
else
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
fi
MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
if [[ "x${MIOPENHIPGFX}" = x ]]; then
echo "miopen-hip-gfx package not available" && exit 1
else
MIOPENKERNELS=$(apt-cache search --names-only miopenkernels | awk '{print $1}' | grep -F -v . || true)
if [[ "x${MIOPENKERNELS}" = x ]]; then
echo "miopenkernels package not available" && exit 1
else
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENKERNELS}
fi
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
fi

# ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then
for kdb in /opt/rocm/share/miopen/db/*.kdb
do
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done
fi
for kdb in /opt/rocm/share/miopen/db/*.kdb
do
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done

# Cleanup
apt-get autoclean && apt-get clean
Expand All @@ -107,25 +76,19 @@ install_centos() {
yum install -y epel-release
yum install -y dkms kernel-headers-`uname -r` kernel-devel-`uname -r`

if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
# Add amdgpu repository
local amdgpu_baseurl
if [[ $OS_VERSION == 9 ]]; then
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/9.0/main/x86_64"
else
if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64"
else
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/7.9/main/x86_64"
fi
fi
echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo
echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo
echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo
echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo
echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo
echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo
# Add amdgpu repository
local amdgpu_baseurl
if [[ $OS_VERSION == 9 ]]; then
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.0/main/x86_64"
else
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64"
fi
echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo
echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo
echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo
echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo
echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo
echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo

local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}"
echo "[ROCm]" > /etc/yum.repos.d/rocm.repo
Expand All @@ -147,29 +110,18 @@ install_centos() {

# precompiled miopen kernels; search for all unversioned packages
# if search fails it will abort this script; use true to avoid case where search fails
if [[ $(ver $ROCM_VERSION) -ge $(ver 5.5) ]]; then
MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true)
if [[ "x${MIOPENHIPGFX}" = x ]]; then
echo "miopen-hip-gfx package not available" && exit 1
else
yum install -y ${MIOPENHIPGFX}
fi
MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true)
if [[ "x${MIOPENHIPGFX}" = x ]]; then
echo "miopen-hip-gfx package not available" && exit 1
else
MIOPENKERNELS=$(yum -q search miopenkernels | grep miopenkernels- | awk '{print $1}'| grep -F kdb. || true)
if [[ "x${MIOPENKERNELS}" = x ]]; then
echo "miopenkernels package not available" && exit 1
else
yum install -y ${MIOPENKERNELS}
fi
yum install -y ${MIOPENHIPGFX}
fi

# ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then
for kdb in /opt/rocm/share/miopen/db/*.kdb
do
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done
fi
for kdb in /opt/rocm/share/miopen/db/*.kdb
do
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done

# Cleanup
yum clean all
Expand Down
2 changes: 1 addition & 1 deletion .ci/docker/common/install_triton.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ conda_reinstall() {
if [ -n "${ROCM_VERSION}" ]; then
TRITON_REPO="https://github.com/openai/triton"
TRITON_TEXT_FILE="triton-rocm"
elif [ -n "${BASEKIT_VERSION}" ]; then
elif [ -n "${XPU_VERSION}" ]; then
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
TRITON_TEXT_FILE="triton-xpu"
else
Expand Down
6 changes: 2 additions & 4 deletions .ci/docker/common/install_vision.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ set -ex
install_ubuntu() {
apt-get update
apt-get install -y --no-install-recommends \
libopencv-dev \
libavcodec-dev
libopencv-dev

# Cleanup
apt-get autoclean && apt-get clean
Expand All @@ -19,8 +18,7 @@ install_centos() {
yum --enablerepo=extras install -y epel-release

yum install -y \
opencv-devel \
ffmpeg-devel
opencv-devel

# Cleanup
yum clean all
Expand Down
25 changes: 12 additions & 13 deletions .ci/docker/common/install_xpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@ set -xe


# Intel® software for general purpose GPU capabilities.
# Refer to https://dgpu-docs.intel.com/releases/LTS_803.29_20240131.html

# Intel® oneAPI Base Toolkit (version 2024.0.0) has been updated to include functional and security updates.
# Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html
# Refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html

# Users should update to the latest version as it becomes available

Expand All @@ -17,14 +14,16 @@ function install_ubuntu() {
# Set up the repository. To do this, download the key to the system keyring
wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
| gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor --output /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg

# Add the signed entry to APT sources and configure the APT client to use the Intel repository
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] \
https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
| tee /etc/apt/sources.list.d/intel-gpu-jammy.list
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
| tee /etc/apt/sources.list.d/oneAPI.list
echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \
https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \
| tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list

# Update the packages list and repository index
apt-get update
Expand All @@ -40,11 +39,11 @@ function install_ubuntu() {
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
# Install Intel® oneAPI Base Toolkit
if [ -n "$BASEKIT_VERSION" ]; then
apt-get install intel-basekit=$BASEKIT_VERSION -y
# Install Intel Support Packages
if [ -n "$XPU_VERSION" ]; then
apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION}
else
apt-get install intel-basekit -y
apt-get install -y intel-for-pytorch-gpu-dev
fi

# Cleanup
Expand Down
6 changes: 4 additions & 2 deletions .ci/docker/requirements-ci.txt
Original file line number Diff line number Diff line change
Expand Up @@ -279,9 +279,9 @@ ghstack==0.8.0
#Pinned versions: 0.8.0
#test that import:

jinja2==3.1.3
jinja2==3.1.4
#Description: jinja2 template engine
#Pinned versions: 3.1.3
#Pinned versions: 3.1.4
#test that import:

pytest-cpp==2.3.0
Expand Down Expand Up @@ -310,3 +310,5 @@ lxml==5.0.0.
#Description: This is a requirement of unittest-xml-reporting

# Python-3.9 binaries

PyGithub==2.3.0
2 changes: 1 addition & 1 deletion .ci/docker/ubuntu-cuda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}

# (optional) Install vision packages like OpenCV and ffmpeg
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
Expand Down
2 changes: 1 addition & 1 deletion .ci/docker/ubuntu-rocm/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}

# (optional) Install vision packages like OpenCV and ffmpeg
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
Expand Down
4 changes: 2 additions & 2 deletions .ci/docker/ubuntu-xpu/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_d
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

# Install XPU Dependencies
ARG BASEKIT_VERSION
ARG XPU_VERSION
COPY ./common/install_xpu.sh install_xpu.sh
RUN bash ./install_xpu.sh && rm install_xpu.sh

Expand All @@ -83,7 +83,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}

# (optional) Install vision packages like OpenCV and ffmpeg
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
Expand Down
2 changes: 1 addition & 1 deletion .ci/docker/ubuntu/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}

# (optional) Install vision packages like OpenCV and ffmpeg
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
Expand Down
5 changes: 0 additions & 5 deletions .ci/pytorch/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
fi
fi

if [[ ${BUILD_ENVIRONMENT} == *"caffe2"* ]]; then
echo "Caffe2 build is ON"
export BUILD_CAFFE2=ON
fi

if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
export ATEN_THREADING=TBB
export USE_TBB=1
Expand Down
5 changes: 4 additions & 1 deletion .ci/pytorch/multigpu-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,10 @@ time python test/run_test.py --verbose -i distributed/test_device_mesh
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_ddp_2d_parallel
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_fsdp_2d_parallel
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_examples
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state.py
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state

# FSDP2 tests
time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh

# Other tests
time python test/run_test.py --verbose -i test_cuda_primary_ctx
Expand Down

0 comments on commit 5f92373

Please sign in to comment.