.ci/scripts/export_model_cuda_artifact.sh → .ci/scripts/export_model_artifact.sh
@@ -5,15 +5,17 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Export model to CUDA format with optional quantization
# Export model to CUDA/Metal format with optional quantization

show_help() {
cat << EOF
Usage: export_model_cuda_artifact.sh <hf_model> [quant_name] [output_dir]
Usage: export_model_artifact.sh <device> <hf_model> [quant_name] [output_dir]

Export a HuggingFace model to CUDA format with optional quantization.
Export a HuggingFace model to CUDA/Metal format with optional quantization.

Arguments:
device cuda or metal (required)

hf_model HuggingFace model ID (required)
Supported models:
- mistralai/Voxtral-Mini-3B-2507
@@ -29,9 +31,9 @@ Arguments:
output_dir Output directory for artifacts (optional, default: current directory)

Examples:
export_model_cuda_artifact.sh "openai/whisper-small"
export_model_cuda_artifact.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
export_model_cuda_artifact.sh "google/gemma-3-4b-it" "non-quantized" "./output"
export_model_artifact.sh metal "openai/whisper-small"
export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
EOF
}

@@ -48,9 +50,22 @@ fi

set -eux

HF_MODEL="$1"
QUANT_NAME="${2:-non-quantized}"
OUTPUT_DIR="${3:-.}"
DEVICE="$1"
HF_MODEL="$2"
QUANT_NAME="${3:-non-quantized}"
OUTPUT_DIR="${4:-.}"

case "$DEVICE" in
cuda)
;;
metal)
;;
*)
echo "Error: Unsupported device '$DEVICE'"
echo "Supported devices: cuda, metal"
exit 1
;;
esac

# Determine model configuration based on HF model ID
case "$HF_MODEL" in
@@ -75,6 +90,10 @@ case "$HF_MODEL" in
fi
;;
google/gemma-3-4b-it)
if [ "$DEVICE" = "metal" ]; then
echo "Error: Export for device 'metal' is not yet tested for model '$HF_MODEL'"
exit 1
fi
MODEL_NAME="gemma3"
TASK="multimodal-text-to-text"
MAX_SEQ_LEN="64"
@@ -95,9 +114,17 @@ case "$QUANT_NAME" in
EXTRA_ARGS=""
;;
quantized-int4-tile-packed)
if [ "$DEVICE" = "metal" ]; then
echo "Error: Metal backend does not yet support quantization '$QUANT_NAME'"
exit 1
fi
EXTRA_ARGS="--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
;;
quantized-int4-weight-only)
if [ "$DEVICE" = "metal" ]; then
echo "Error: Metal backend does not yet support quantization '$QUANT_NAME'"
exit 1
fi
EXTRA_ARGS="--qlinear_encoder 4w"
;;
*)
@@ -118,12 +145,18 @@ MAX_SEQ_LEN_ARG=""
if [ -n "$MAX_SEQ_LEN" ]; then
MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN"
fi

DEVICE_ARG=""
if [ "$DEVICE" = "cuda" ]; then
DEVICE_ARG="--device cuda"
fi

optimum-cli export executorch \
--model "$HF_MODEL" \
--task "$TASK" \
--recipe "cuda" \
--recipe "$DEVICE" \
--dtype bfloat16 \
--device cuda \
${DEVICE_ARG} \
${MAX_SEQ_LEN_ARG} \
${EXTRA_ARGS} \
--output_dir ./
@@ -137,18 +170,18 @@ if [ -n "$PREPROCESSOR_OUTPUT" ]; then
fi

test -f model.pte
test -f aoti_cuda_blob.ptd
test -f aoti_${DEVICE}_blob.ptd
if [ -n "$PREPROCESSOR_OUTPUT" ]; then
test -f $PREPROCESSOR_OUTPUT
fi
echo "::endgroup::"

echo "::group::Store $MODEL_NAME Artifacts"
mkdir -p "${OUTPUT_DIR}"
cp model.pte "${OUTPUT_DIR}/"
cp aoti_cuda_blob.ptd "${OUTPUT_DIR}/"
mv model.pte "${OUTPUT_DIR}/"
mv aoti_${DEVICE}_blob.ptd "${OUTPUT_DIR}/"
if [ -n "$PREPROCESSOR_OUTPUT" ]; then
cp $PREPROCESSOR_OUTPUT "${OUTPUT_DIR}/"
mv $PREPROCESSOR_OUTPUT "${OUTPUT_DIR}/"
fi
ls -al "${OUTPUT_DIR}"
echo "::endgroup::"
.ci/scripts/test_model_cuda_e2e.sh → .ci/scripts/test_model_e2e.sh
@@ -5,15 +5,17 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Test CUDA model end-to-end, need to run .ci/scripts/export_model_cuda_artifact.sh first
# Test CUDA/Metal model end-to-end, need to run .ci/scripts/export_model_artifact.sh first

show_help() {
cat << EOF
Usage: test_model_cuda_e2e.sh <hf_model> <quant_name> [model_dir]
Usage: test_model_e2e.sh <device> <hf_model> <quant_name> [model_dir]

Build and run end-to-end tests for CUDA models.
Build and run end-to-end tests for CUDA/Metal models.

Arguments:
device cuda or metal (required)

hf_model HuggingFace model ID (required)
Supported models:
- mistralai/Voxtral-Mini-3B-2507
@@ -27,12 +29,12 @@ Arguments:
- quantized-int4-weight-only

model_dir Directory containing model artifacts (optional, default: current directory)
Expected files: model.pte, aoti_cuda_blob.ptd
Expected files: model.pte, aoti_cuda_blob.ptd/aoti_metal_blob.ptd
Tokenizers and test files will be downloaded to this directory

Examples:
test_model_cuda_e2e.sh "openai/whisper-small" "non-quantized"
test_model_cuda_e2e.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
test_model_e2e.sh metal "openai/whisper-small" "non-quantized"
test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
EOF
}

@@ -55,20 +57,21 @@ fi

set -eux

HF_MODEL="$1"
QUANT_NAME="$2"
DEVICE="$1"
HF_MODEL="$2"
QUANT_NAME="$3"
# Download tokenizers, audio, and image files to this directory
MODEL_DIR="${3:-.}"
MODEL_DIR="${4:-.}"

echo "Testing model: $HF_MODEL (quantization: $QUANT_NAME)"

# Make sure model.pte and aoti_cuda_blob.ptd exist
# Make sure model.pte and aoti_${DEVICE}_blob.ptd exist
if [ ! -f "$MODEL_DIR/model.pte" ]; then
echo "Error: model.pte not found in $MODEL_DIR"
exit 1
fi
if [ ! -f "$MODEL_DIR/aoti_cuda_blob.ptd" ]; then
echo "Error: aoti_cuda_blob.ptd not found in $MODEL_DIR"
if [ ! -f "$MODEL_DIR/aoti_${DEVICE}_blob.ptd" ]; then
echo "Error: aoti_${DEVICE}_blob.ptd not found in $MODEL_DIR"
exit 1
fi
# Locate EXECUTORCH_ROOT from the directory of this script
@@ -152,14 +155,24 @@ ls -al
echo "::endgroup::"

echo "::group::Build $MODEL_NAME Runner"

if [ "$DEVICE" = "cuda" ]; then
BUILD_BACKEND="EXECUTORCH_BUILD_CUDA"
elif [ "$DEVICE" = "metal" ]; then
BUILD_BACKEND="EXECUTORCH_BUILD_METAL"
else
echo "Error: Unsupported device '$DEVICE'. Must be 'cuda' or 'metal'."
exit 1
fi

cmake --preset llm \
-DEXECUTORCH_BUILD_CUDA=ON \
-D${BUILD_BACKEND}=ON \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Release \
-Bcmake-out -S.
cmake --build cmake-out -j$(nproc) --target install --config Release

cmake -DEXECUTORCH_BUILD_CUDA=ON \
cmake -D${BUILD_BACKEND}=ON \
-DCMAKE_BUILD_TYPE=Release \
-Sexamples/models/$RUNNER_PATH \
-Bcmake-out/examples/models/$RUNNER_PATH/
@@ -168,11 +181,13 @@ echo "::endgroup::"

echo "::group::Run $MODEL_NAME Runner"
set +e
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
if [ "$DEVICE" = "cuda" ]; then
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
fi

# Build runner command with common arguments
RUNNER_BIN="cmake-out/examples/models/$RUNNER_PATH/$RUNNER_TARGET"
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd --temperature 0"
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --data_path ${MODEL_DIR}/aoti_${DEVICE}_blob.ptd --temperature 0"

# Add model-specific arguments
case "$MODEL_NAME" in
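The test script is intended to run against the artifacts produced by the export script, as the CI workflow below does. A hedged local equivalent for CUDA, reusing an example from the help text (the ./model_output directory is illustrative):

  ./.ci/scripts/export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" ./model_output
  ./.ci/scripts/test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" ./model_output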
4 changes: 2 additions & 2 deletions .github/workflows/cuda.yml
@@ -142,7 +142,7 @@ jobs:
pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
echo "::endgroup::"

source .ci/scripts/export_model_cuda_artifact.sh "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
source .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"

benchmark-model-cuda:
name: benchmark-model-cuda
@@ -249,4 +249,4 @@ jobs:
download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }}
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
source .ci/scripts/test_model_cuda_e2e.sh "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
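The CUDA workflow keeps passing cuda explicitly as the new first argument. A Metal CI job is not part of this change; if one were added, it would presumably call the same entry points with metal. Hypothetical sketch, using a non-quantized Whisper export (a combination the scripts do not gate on Metal) and the same RUNNER_ARTIFACT_DIR convention as above:

  source .ci/scripts/export_model_artifact.sh metal "openai/whisper-small" "non-quantized" "${RUNNER_ARTIFACT_DIR}"
  source .ci/scripts/test_model_e2e.sh metal "openai/whisper-small" "non-quantized" "${RUNNER_ARTIFACT_DIR}"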