Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions .ci/scripts/setup-webgpu-linux-deps.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -ex

# SwiftShader: software Vulkan adapter for GPU-less CI (LunarG SDK not needed).
install_swiftshader() {
_https_amazon_aws=https://ossci-android.s3.amazonaws.com
_swiftshader_archive=swiftshader-abe07b943-prebuilt.tar.gz
_swiftshader_dir=/tmp/swiftshader
mkdir -p $_swiftshader_dir

_tmp_archive="/tmp/${_swiftshader_archive}"

curl --silent --show-error --location --fail --retry 3 --retry-all-errors \
--output "${_tmp_archive}" "$_https_amazon_aws/${_swiftshader_archive}"

tar -C "${_swiftshader_dir}" -xzf "${_tmp_archive}"

export VK_ICD_FILENAMES="${_swiftshader_dir}/swiftshader/build/Linux/vk_swiftshader_icd.json"
export LD_LIBRARY_PATH="${_swiftshader_dir}/swiftshader/build/Linux/:${LD_LIBRARY_PATH}"
export ETVK_USING_SWIFTSHADER=1
}

install_swiftshader
bash backends/webgpu/scripts/setup-wgpu-native.sh
8 changes: 8 additions & 0 deletions .ci/scripts/test_backend.sh
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ if [[ "$FLOW" == *vulkan* ]]; then
EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_VULKAN=ON"
fi

if [[ "$FLOW" == *webgpu* ]]; then
# Setup swiftshader (software Vulkan adapter for GPU-less runners) and wgpu-native,
# which are required to build and run the WebGPU delegate.
source .ci/scripts/setup-webgpu-linux-deps.sh

EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_WEBGPU=ON"
fi

if [[ "$FLOW" == *arm* ]]; then
if [[ "$SUITE" == "operators" ]]; then
PYTEST_RETRY_ARGS=(--reruns 2 --reruns-delay 1)
Expand Down
27 changes: 27 additions & 0 deletions .github/workflows/test-backend-webgpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
name: Test WebGPU Backend

on:
schedule:
- cron: 0 2 * * *
push:
branches:
- main
- release/*
tags:
- ciflow/nightly/*
pull_request:
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true

jobs:
test-webgpu:
uses: ./.github/workflows/_test_backend.yml
with:
backend: webgpu
flows: '["webgpu"]'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
timeout: 120
run-linux: true
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1056,6 +1056,10 @@ if(EXECUTORCH_BUILD_PYBIND)
list(APPEND _dep_libs vulkan_backend)
endif()

if(EXECUTORCH_BUILD_WEBGPU)
list(APPEND _dep_libs webgpu_backend)
endif()

# compile options for pybind
set(_pybind_compile_options
$<$<CXX_COMPILER_ID:MSVC>:/EHsc
Expand Down
7 changes: 7 additions & 0 deletions backends/test/suite/flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,12 @@ def _load_vulkan() -> list[TestFlow]:
return [VULKAN_TEST_FLOW, VULKAN_STATIC_INT8_PER_CHANNEL_TEST_FLOW]


def _load_webgpu() -> list[TestFlow]:
from executorch.backends.test.suite.flows.webgpu import WEBGPU_TEST_FLOW

return [WEBGPU_TEST_FLOW]


def _load_openvino() -> list[TestFlow]:
from executorch.backends.test.suite.flows.openvino import (
OPENVINO_INT8_TEST_FLOW,
Expand Down Expand Up @@ -178,6 +184,7 @@ def all_flows() -> dict[str, TestFlow]:
+ _register_flow(_load_xnnpack, "XNNPACK")
+ _register_flow(_load_coreml, "Core ML")
+ _register_flow(_load_vulkan, "Vulkan")
+ _register_flow(_load_webgpu, "WebGPU")
+ _register_flow(_load_openvino, "OpenVINO")
+ _register_flow(_load_qnn, "QNN")
+ _register_flow(_load_arm, "ARM")
Expand Down
20 changes: 20 additions & 0 deletions backends/test/suite/flows/webgpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from executorch.backends.test.suite.flow import TestFlow
from executorch.backends.webgpu.test.tester import WebGPUTester


def _create_webgpu_flow() -> TestFlow:
return TestFlow(
"webgpu",
backend="webgpu",
tester_factory=WebGPUTester,
skip_patterns=["float16", "float64"], # Not supported in swiftshader
)


WEBGPU_TEST_FLOW = _create_webgpu_flow()
2 changes: 1 addition & 1 deletion backends/webgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ if(NOT TARGET vulkan_schema)
# target), but vulkan_schema is unconditionally defined.
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../vulkan
${CMAKE_CURRENT_BINARY_DIR}/_vulkan_schema
${CMAKE_CURRENT_BINARY_DIR}/_vulkan_schema EXCLUDE_FROM_ALL
)
endif()

Expand Down
5 changes: 5 additions & 0 deletions backends/webgpu/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
6 changes: 6 additions & 0 deletions backends/webgpu/runtime/WebGPUDevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,13 @@ WebGPUContext create_webgpu_context() {
device_cb.callback = on_device_request;
device_cb.userdata1 = &device_result;

// Request the adapter's full limits; software adapters default many to 0.
WGPULimits supported_limits = {};
WGPUDeviceDescriptor device_desc = {};
if (wgpuAdapterGetLimits(ctx.adapter, &supported_limits) ==
WGPUStatus_Success) {
device_desc.requiredLimits = &supported_limits;
}
device_desc.uncapturedErrorCallbackInfo.callback = on_device_error;

wgpuAdapterRequestDevice(ctx.adapter, &device_desc, device_cb);
Expand Down
51 changes: 51 additions & 0 deletions backends/webgpu/runtime/WebGPUUtils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <webgpu/webgpu.h>

#include <algorithm>
#include <cstdint>
#include <stdexcept>
#include <string>

namespace executorch::backends::webgpu::utils {

// Clamp workgroup size to device limit (SwiftShader caps at 128).
inline uint32_t clamp_workgroup_size(WGPUDevice device, uint32_t desired) {
WGPULimits limits = {};
if (wgpuDeviceGetLimits(device, &limits) == WGPUStatus_Success &&
limits.maxComputeInvocationsPerWorkgroup > 0) {
return std::min(desired, limits.maxComputeInvocationsPerWorkgroup);
}
return desired;
}

// 1D dispatch count (mirrors Vulkan div_up); throws if > device limit.
inline uint32_t compute_1d_workgroup_count(
WGPUDevice device,
uint32_t num_threads,
uint32_t workgroup_size,
const char* op_name) {
uint32_t count = (num_threads + workgroup_size - 1) / workgroup_size;
WGPULimits limits = {};
uint32_t max_count =
wgpuDeviceGetLimits(device, &limits) == WGPUStatus_Success &&
limits.maxComputeWorkgroupsPerDimension > 0
? limits.maxComputeWorkgroupsPerDimension
: 65535u; // WebGPU spec-default floor
if (count > max_count) {
throw std::runtime_error(
std::string("WebGPU ") + op_name +
": workgroup count exceeds the 1D dispatch limit");
}
return count;
}

} // namespace executorch::backends::webgpu::utils
18 changes: 14 additions & 4 deletions backends/webgpu/runtime/ops/add/BinaryOp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
*/

#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
#include <executorch/backends/webgpu/runtime/WebGPUUtils.h>
#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
#include <executorch/backends/webgpu/runtime/ops/add/binary_add_wgsl.h>

Expand Down Expand Up @@ -50,6 +51,15 @@ void add_impl(WebGPUGraph& graph, const std::vector<int>& args) {
uint32_t num_elements =
static_cast<uint32_t>(out_tensor.nbytes / sizeof(float));

uint32_t wg_size =
utils::clamp_workgroup_size(device, kBinaryAddWorkgroupSize);
uint32_t workgroup_count =
utils::compute_1d_workgroup_count(device, num_elements, wg_size, "add");

WGPUConstantEntry wg_size_constant = {};
wg_size_constant.key = {"wg_size", WGPU_STRLEN};
wg_size_constant.value = static_cast<double>(wg_size);

// Create uniform buffer for params
AddParams params = {};
params.num_elements = num_elements;
Expand Down Expand Up @@ -115,6 +125,8 @@ void add_impl(WebGPUGraph& graph, const std::vector<int>& args) {
pipeline_desc.layout = pipeline_layout;
pipeline_desc.compute.module = shader;
pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN};
pipeline_desc.compute.constantCount = 1;
pipeline_desc.compute.constants = &wg_size_constant;
WGPUComputePipeline pipeline =
wgpuDeviceCreateComputePipeline(device, &pipeline_desc);

Expand Down Expand Up @@ -146,16 +158,14 @@ void add_impl(WebGPUGraph& graph, const std::vector<int>& args) {
bg_desc.entries = bg_entries;
WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);

uint32_t workgroup_count =
(num_elements + kBinaryAddWorkgroupSize - 1) / kBinaryAddWorkgroupSize;

graph.add_dispatch({pipeline, bind_group, workgroup_count});

// Release intermediate objects (pipeline and bind_group are kept by dispatch)
wgpuShaderModuleRelease(shader);
wgpuBindGroupLayoutRelease(bgl);
wgpuPipelineLayoutRelease(pipeline_layout);
// uniform_buffer is kept alive by the bind group
// Drop our ref; the bind group keeps the uniform buffer alive until release.
wgpuBufferRelease(uniform_buffer);
}

} // namespace
Expand Down
4 changes: 3 additions & 1 deletion backends/webgpu/runtime/ops/add/binary_add.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ struct Params {
}
@group(0) @binding(3) var<uniform> params: Params;

@compute @workgroup_size(256)
override wg_size: u32 = 256;

@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
let idx = gid.x;
if (idx >= params.num_elements) {
Expand Down
4 changes: 3 additions & 1 deletion backends/webgpu/runtime/ops/add/binary_add_wgsl.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ struct Params {
}
@group(0) @binding(3) var<uniform> params: Params;
@compute @workgroup_size(256)
override wg_size: u32 = 256;
@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
let idx = gid.x;
if (idx >= params.num_elements) {
Expand Down
27 changes: 27 additions & 0 deletions backends/webgpu/test/TARGETS
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

oncall("executorch")

# AOT export coverage only (lowers via VulkanPartitioner, asserts a VulkanBackend delegate); no GPU runtime.
python_unittest(
name = "test_add",
srcs = [
"ops/add/test_add.py",
],
deps = [
"//caffe2:torch",
"//executorch/backends/vulkan/partitioner:vulkan_partitioner",
"//executorch/backends/vulkan:vulkan_preprocess",
"//executorch/exir:lib",
],
)

runtime.python_library(
name = "tester",
srcs = ["tester.py"],
deps = [
"//executorch/backends/vulkan/partitioner:vulkan_partitioner",
"//executorch/backends/vulkan:vulkan_preprocess",
],
)
2 changes: 1 addition & 1 deletion backends/webgpu/test/ops/add/test_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import unittest

import torch
from executorch.backends.vulkan import VulkanPartitioner
from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
from executorch.exir import to_edge_transform_and_lower


Expand Down
65 changes: 65 additions & 0 deletions backends/webgpu/test/tester.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import Any, List, Optional, Tuple

import executorch
import executorch.backends.test.harness.stages as BaseStages

import torch
from executorch.backends.test.harness import Tester as TesterBase
from executorch.backends.test.harness.stages import StageType
from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
from executorch.exir import EdgeCompileConfig
from executorch.exir.backend.partitioner import Partitioner


# Lowers via VulkanPartitioner (WebGPU consumes the Vulkan VK00 serialization).
class Partition(BaseStages.Partition):
def __init__(self, partitioner: Optional[Partitioner] = None):
super().__init__(
partitioner=partitioner or VulkanPartitioner({"skip_bool_tensors": True}),
)


class ToEdgeTransformAndLower(BaseStages.ToEdgeTransformAndLower):
def __init__(
self,
partitioners: Optional[List[Partitioner]] = None,
edge_compile_config: Optional[EdgeCompileConfig] = None,
):
if partitioners is None:
partitioners = [VulkanPartitioner({"skip_bool_tensors": True})]

super().__init__(
default_partitioner_cls=VulkanPartitioner,
partitioners=partitioners,
edge_compile_config=edge_compile_config
or EdgeCompileConfig(_check_ir_validity=False),
)


class WebGPUTester(TesterBase):
def __init__(
self,
module: torch.nn.Module,
example_inputs: Tuple[torch.Tensor],
dynamic_shapes: Optional[Tuple[Any]] = None,
):
stage_classes = (
executorch.backends.test.harness.Tester.default_stage_classes()
| {
StageType.PARTITION: Partition,
StageType.TO_EDGE_TRANSFORM_AND_LOWER: ToEdgeTransformAndLower,
}
)

super().__init__(
module=module,
stage_classes=stage_classes,
example_inputs=example_inputs,
dynamic_shapes=dynamic_shapes,
)
Loading