diff --git a/.github/scripts/run_vllm_profiling.sh b/.github/scripts/run_vllm_profiling.sh index 6203e47..2b30e3f 100644 --- a/.github/scripts/run_vllm_profiling.sh +++ b/.github/scripts/run_vllm_profiling.sh @@ -59,6 +59,12 @@ cleanup_server() { run_profiling_tests() { # run profiling tests using JSON configuration local profiling_test_file="$1" + local base_profiler_dir="${VLLM_TORCH_PROFILER_DIR:-}" + + if [[ -z "${base_profiler_dir}" ]]; then + echo "Error: VLLM_TORCH_PROFILER_DIR is not set." + exit 1 + fi if [[ ! -f "$profiling_test_file" ]]; then echo "Error: Profiling test file $profiling_test_file not found!" @@ -92,18 +98,29 @@ run_profiling_tests() { # Clean up any existing processes first kill_gpu_processes + # Create a profiling sub-directory for each test case to isolate the + # generated traces (e.g. using the model name hierarchy) + local sanitized_test_name="${TEST_NAME// /_}" + local test_name_directory="${base_profiler_dir}/${sanitized_test_name}" + mkdir -p "${test_name_directory}" + chmod 755 "${test_name_directory}" + + # Override the profiler output directory for this test only + export VLLM_TORCH_PROFILER_DIR="${test_name_directory}" + # Run the profiling test if start_vllm_server "$server_args"; then run_profiling "$client_args" cleanup_server # Debug: Check if profiling files were created - echo "DEBUG: Checking profiling directory: ${VLLM_TORCH_PROFILER_DIR}" - if [ -d "${VLLM_TORCH_PROFILER_DIR}" ]; then + echo "DEBUG: Checking profiling directory: $test_name_directory" + if [ -d "$test_name_directory" ]; then echo "DEBUG: Profiling directory exists for test $TEST_NAME" - ls -la "${VLLM_TORCH_PROFILER_DIR}" || echo "DEBUG: Directory is empty or inaccessible" - find "${VLLM_TORCH_PROFILER_DIR}" -type f 2>/dev/null | head -10 | while read file; do + ls -la "$test_name_directory" || echo "DEBUG: Directory is empty or inaccessible" + find "$test_name_directory" -type f 2>/dev/null | head -10 | while read file; do echo "DEBUG: Found profiling 
#######################################
# Rename a profiler trace file to a standardized, prefix-based name.
#
# Only files whose name ends in ".pt.trace.json.gz" are considered. The file
# stays in its directory and receives one of two names:
#   - "<prefix>.async_llm.pt.trace.json.gz" if the original base name
#     contains ".async_llm."
#   - "<prefix>.pt.trace.json.gz" otherwise
#
# An existing file at the destination is never overwritten: several traces in
# one directory map to the same standardized name, and clobbering would
# silently discard all but the last one. In that case the source file is left
# untouched under its original name.
#
# Arguments:
#   $1 - file path to rename
#   $2 - prefix name (e.g., "vllm", "sglang")
# Outputs:
#   DEBUG progress lines on stdout; warnings and failures on stderr.
# Returns:
#   0 if the file was renamed, already had the target name, or was skipped;
#   1 if the prefix is empty or the rename itself failed.
#######################################
rename_profiling_file() {
  local file="${1:-}"
  local prefix_name="${2:-}"

  # Anything that is not a torch profiler trace is left alone.
  if [[ "$file" != *.pt.trace.json.gz ]]; then
    echo "DEBUG: Skipping non-profiling file: ${file}"
    return 0
  fi

  # An empty prefix would produce a hidden ".pt.trace.json.gz" file.
  if [[ -z "$prefix_name" ]]; then
    echo "DEBUG: Failed to rename ${file}: empty prefix name" >&2
    return 1
  fi

  # Declare first, assign second, so a failing command substitution is not
  # masked by the exit status of 'local'.
  local dir_path
  local basename_file
  dir_path=$(dirname -- "$file") || return 1
  basename_file=$(basename -- "$file") || return 1

  # Determine new filename based on the original name
  local new_filename
  if [[ "$basename_file" == *".async_llm."* ]]; then
    new_filename="${prefix_name}.async_llm.pt.trace.json.gz"
  else
    new_filename="${prefix_name}.pt.trace.json.gz"
  fi

  local new_filepath="${dir_path}/${new_filename}"

  # Compare base names rather than full paths: "x" and "./x" are the same
  # file but different strings, and moving a file onto itself fails.
  if [[ "$basename_file" == "$new_filename" ]]; then
    echo "DEBUG: File ${file} already has correct name"
    return 0
  fi

  # Refuse to clobber a trace that was already given the standardized name.
  if [[ -e "$new_filepath" ]]; then
    echo "DEBUG: Not renaming ${file}: ${new_filepath} already exists" >&2
    return 0
  fi

  echo "DEBUG: Renaming ${file} to ${new_filepath}"
  if mv -- "$file" "$new_filepath"; then
    echo "DEBUG: Successfully renamed to ${new_filepath}"
    return 0
  fi

  echo "DEBUG: Failed to rename ${file}" >&2
  return 1
}