28 changes: 24 additions & 4 deletions .github/scripts/run_vllm_profiling.sh
@@ -59,6 +59,12 @@ cleanup_server() {
run_profiling_tests() {
# run profiling tests using JSON configuration
local profiling_test_file="$1"
local base_profiler_dir="${VLLM_TORCH_PROFILER_DIR:-}"

if [[ -z "${base_profiler_dir}" ]]; then
echo "Error: VLLM_TORCH_PROFILER_DIR is not set."
exit 1
fi

if [[ ! -f "$profiling_test_file" ]]; then
echo "Error: Profiling test file $profiling_test_file not found!"
@@ -92,18 +98,29 @@ run_profiling_tests() {
# Clean up any existing processes first
kill_gpu_processes

# Create a profiling sub-directory for each test case to isolate the
# generated traces (e.g. using the model name hierarchy)
local sanitized_test_name="${TEST_NAME// /_}"
local test_name_directory="${base_profiler_dir}/${sanitized_test_name}"
mkdir -p "${test_name_directory}"
chmod 755 "${test_name_directory}"

# Override the profiler output directory for this test only
export VLLM_TORCH_PROFILER_DIR="${test_name_directory}"

# Run the profiling test
if start_vllm_server "$server_args"; then
run_profiling "$client_args"
cleanup_server

# Debug: Check if profiling files were created
echo "DEBUG: Checking profiling directory: ${VLLM_TORCH_PROFILER_DIR}"
if [ -d "${VLLM_TORCH_PROFILER_DIR}" ]; then
echo "DEBUG: Checking profiling directory: $test_name_directory"
if [ -d "$test_name_directory" ]; then
echo "DEBUG: Profiling directory exists for test $TEST_NAME"
ls -la "${VLLM_TORCH_PROFILER_DIR}" || echo "DEBUG: Directory is empty or inaccessible"
find "${VLLM_TORCH_PROFILER_DIR}" -type f 2>/dev/null | head -10 | while read file; do
ls -la "$test_name_directory" || echo "DEBUG: Directory is empty or inaccessible"
find "$test_name_directory" -type f 2>/dev/null | head -10 | while read file; do
echo "DEBUG: Found profiling file: ${file}"
rename_profiling_file "$file" "vllm"
done
else
echo "DEBUG: Profiling directory does not exist for test $TEST_NAME!"
@@ -115,6 +132,9 @@ run_profiling_tests() {
continue
fi
done

# Ensure the profiler directory is restored after processing all tests
export VLLM_TORCH_PROFILER_DIR="${base_profiler_dir}"
}

main() {
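For context, a minimal sketch of the per-test directory derivation introduced above (the base directory value here is hypothetical; the test name is taken from the test config in this PR):

# Hypothetical values for illustration
VLLM_TORCH_PROFILER_DIR="/tmp/profiling-results"
TEST_NAME="facebook_opt_125m_tp1_random"

base_profiler_dir="${VLLM_TORCH_PROFILER_DIR}"
sanitized_test_name="${TEST_NAME// /_}"   # spaces become underscores
test_name_directory="${base_profiler_dir}/${sanitized_test_name}"
# Traces for this test are then isolated under:
#   /tmp/profiling-results/facebook_opt_125m_tp1_random/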
43 changes: 43 additions & 0 deletions .github/scripts/utilities.sh
@@ -133,3 +133,46 @@ check_hf_token() {
echo "HF_TOKEN is set and valid."
fi
}

rename_profiling_file() {
# Rename profiling files to standardized format
# $1: file path to rename
# $2: prefix name (e.g., "vllm", "sglang")
local file="$1"
local prefix_name="$2"

# Process .pt.trace.json.gz files
if [[ "$file" == *.pt.trace.json.gz ]]; then
local dir_path=$(dirname "$file")
local basename_file=$(basename "$file")

# Determine new filename based on content
local new_filename
if [[ "$basename_file" == *".async_llm."* ]]; then
new_filename="${prefix_name}.async_llm.pt.trace.json.gz"
else
new_filename="${prefix_name}.pt.trace.json.gz"
fi

local new_filepath="${dir_path}/${new_filename}"

# Only rename if the new filename is different
if [[ "$file" != "$new_filepath" ]]; then
echo "DEBUG: Renaming ${file} to ${new_filepath}"
mv "$file" "$new_filepath"
if [[ $? -eq 0 ]]; then
echo "DEBUG: Successfully renamed to ${new_filepath}"
return 0
else
echo "DEBUG: Failed to rename ${file}"
return 1
fi
else
echo "DEBUG: File ${file} already has correct name"
return 0
fi
else
echo "DEBUG: Skipping non-profiling file: ${file}"
return 0
fi
}
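As a usage sketch (the raw trace file name below is taken from the review thread; the directory is hypothetical):

# Hypothetical invocation on a discovered trace
rename_profiling_file "/tmp/profiling-results/facebook_opt_125m_tp1_random/3746728f887_953.1758048457999122720.pt.trace.json.gz" "vllm"
# Result: the trace is renamed in place to vllm.pt.trace.json.gz
# A trace whose name contains ".async_llm." would instead become vllm.async_llm.pt.trace.json.gz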
19 changes: 19 additions & 0 deletions .github/workflows/vllm-profiling.yml
@@ -214,6 +214,25 @@ jobs:
)
docker exec -t "${container_name}" bash -c "cd vllm-profiling && bash ../.github/scripts/run_vllm_profiling.sh"

- name: Prepare S3 upload metadata
id: prepare_s3_upload
env:
REPOSITORY: vllm-project/vllm
run: |
set -eux

UPLOAD_DATE=$(date -u +"%Y-%m-%d")
echo "upload-date=${UPLOAD_DATE}" >> "${GITHUB_OUTPUT}"
echo "s3-prefix=${UPLOAD_DATE}/${REPOSITORY}/${HEAD_SHA}/${GITHUB_RUN_ID}/${GITHUB_JOB}" >> "${GITHUB_OUTPUT}"

- name: Upload profiling results to S3
@huydhn (Contributor) commented on Sep 17, 2025:
This works; I can see some profiles on S3.

Maybe you want to rename these files, like 3746728f887_953.1758048457999122720.pt.trace.json.gz, to make them easier to discover later on, e.g. sglang.pt.trace.json.gz. IMO, it would be easier to do this in the workflow before the upload.

Contributor Author replied:

Agreed, easier naming would be helpful in the future.
I think this is an older commit's URL, as I had also added "model_name" to the path to make filtering the data easier.
This is how it would look with the latest changes: https://gha-artifacts.s3.us-east-1.amazonaws.com/2025-09-17/vllm-project/vllm/ca2d1925ef5ad309061c2d5dd9a1e409c5ca28ee/17788403923/profiling/facebook_opt_125m_tp1_random/vllm.async_llm.pt.trace.json.gz

uses: seemethere/upload-artifact-s3@v5
@huydhn (Contributor) commented on Sep 17, 2025:
Another note here is that the upload step, as it stands, only works on the AWS runners (linux.aws.a100 or linux.aws.h100) and will not work on linux.dgx.b200. You can ignore this for now if you don't plan to run anything on B200, but if you do, you will need this snippet https://github.com/pytorch/pytorch/blob/main/.github/workflows/_rocm-test.yml#L105-L111 to configure the credentials before the upload, i.e.

- name: Configure aws credentials
  if: contains(env.DEVICE_TYPE, 'B200')
  uses: aws-actions/configure-aws-credentials@v4.1.0
  with:
    role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
    aws-region: us-east-1
    role-duration-seconds: 18000

Contributor Author replied:
Sure, noted. Right now I think we are good with using only the a100/h100 runners, but if that changes in the future, I will add this step. Thanks for sharing.

with:
s3-prefix: ${{ steps.prepare_s3_upload.outputs.s3-prefix }}
retention-days: 180
path: vllm-profiling/profiling-results
if-no-files-found: warn

- uses: actions/upload-artifact@v4
with:
name: profiling-results--${{ env.DEVICE_TYPE }}
2 changes: 1 addition & 1 deletion vllm-profiling/cuda/profiling-tests.json
@@ -1,6 +1,6 @@
[
{
"test_name": "profiling_opt_125m_tp1_random",
"test_name": "facebook_opt_125m_tp1_random",
"server_parameters": {
"model": "facebook/opt-125m",
"swap_space": 16,