diff --git a/.github/workflows/flash_attention.yml b/.github/workflows/flash_attention.yml
index ffdff587..70f54f39 100644
--- a/.github/workflows/flash_attention.yml
+++ b/.github/workflows/flash_attention.yml
@@ -34,7 +34,7 @@ jobs:
 
       - name: Run Flash Attention benchmark in Docker
         env:
-          DOCKER_IMAGE: nvcr.io/nvidia/pytorch:25.06-py3
+          DOCKER_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3
         run: |
           set -eux
 
@@ -52,21 +52,19 @@ jobs:
             "${DOCKER_IMAGE}"
           )
 
-          # Install CuTe DSL
-          docker exec -t "${container_name}" bash -c "
-            set -x
-            echo 'Installing nvidia-cutlass-dsl'
-            pip install nvidia-cutlass-dsl==4.1.0
-          "
-
           # Build and run FlashAttention CuTe DSL
           docker exec -t "${container_name}" bash -c "
             set -x
             pushd fa4
             python setup.py install
-
-            echo '
-B200 1000W
-' >> /tmp/workspace/fa4_output.txt
+            pip install -e flash_attn/cute/
+            nvidia-smi
+
+            echo '
+B200' >> /tmp/workspace/fa4_output.txt
+            nvidia-smi -q -d POWER | grep 'Current Power Limit' | head -1 | cut -d : -f 2 >> /tmp/workspace/fa4_output.txt
+            echo '
+' >> /tmp/workspace/fa4_output.txt
+            export PYTHONPATH=\$(pwd)
             python benchmarks/benchmark_attn.py >> /tmp/workspace/fa4_output.txt
             popd
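
Note: a minimal local sketch of the new power-limit logging step, not part of the diff itself. It assumes nvidia-smi is on PATH and that the driver's '-q -d POWER' report contains a 'Current Power Limit' line (the label can vary across driver versions); 'head -1' keeps the first GPU's entry and 'cut -d : -f 2' keeps the value after the colon, e.g. ' 1000.00 W'.

    #!/usr/bin/env bash
    # Sketch only: mirrors the header-writing block added above, outside Docker.
    set -eu

    out=/tmp/workspace/fa4_output.txt    # path used by the workflow; an assumption locally
    mkdir -p "$(dirname "$out")"         # the directory may not exist outside CI

    # Blank line + GPU label, as written by the workflow's multi-line echo.
    printf '\nB200\n' >> "$out"
    # First GPU's reported power limit (the field after the colon).
    nvidia-smi -q -d POWER | grep 'Current Power Limit' | head -1 | cut -d : -f 2 >> "$out"
    printf '\n' >> "$out"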