diff --git a/CMakeLists.txt b/CMakeLists.txt index 95e33fb109e..e83d8ea11b5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -817,6 +817,10 @@ endif() if(EXECUTORCH_BUILD_VULKAN) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan) endif() +if(EXECUTORCH_BUILD_VGF) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) +endif() + if(EXECUTORCH_BUILD_ANDROID_JNI) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/android) diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index b5e76e778a5..11f61c0dfee 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -12,13 +12,17 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) endif() -add_compile_options("-Wall" "-Werror") - include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10) add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS) + +# bare metal backend builds +if(EXECUTORCH_BUILD_ARM_BAREMETAL) + +add_compile_options("-Wall" "-Werror") + # Third-party folder and Ethos-U driver inclued set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party") set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include") @@ -36,3 +40,47 @@ target_include_directories( target_include_directories( executorch_delegate_ethos_u PUBLIC ${DRIVER_ETHOSU_INCLUDE_DIR} ) + +# end config for bare metal builds +endif() + + +# VGF backend builds +if(EXECUTORCH_BUILD_VGF) + +# include libvgf +set(LIBVGF_PATH "${EXECUTORCH_ROOT}/examples/arm/ethos-u-scratch/ml-sdk-for-vulkan-manifest/sw/vgf-lib/") + +set(VULKAN_THIRD_PARTY_PATH ${EXECUTORCH_ROOT}/backends/vulkan/third-party) +set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers/include) +set(VOLK_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/volk) + +set(LIBVGF_STATIC "${LIBVGF_PATH}/build/src/libvgf.a") +set(LIBVGF_INCLUDE "${LIBVGF_PATH}/include/") + +add_library(vgf STATIC IMPORTED) +set_property( TARGET vgf PROPERTY IMPORTED_LOCATION "${LIBVGF_STATIC}" ) +target_include_directories(vgf INTERFACE "${LIBVGF_INCLUDE}") + +# Add backend delegate for VGF +set(_vgf_backend_sources backends/arm/runtime/VGFBackend.cpp + backends/arm/runtime/VGFSetup.cpp ) + +# vgf backend +list(TRANSFORM _vgf_backend_sources PREPEND "${EXECUTORCH_ROOT}/") +add_library(vgf_backend ${_vgf_backend_sources}) +target_include_directories( + vgf_backend PUBLIC + ${_common_include_directories} + ${VULKAN_HEADERS_PATH} + ${VOLK_HEADERS_PATH} +) +target_compile_options(vgf_backend PRIVATE -DUSE_VULKAN_WRAPPER -DUSE_VULKAN_VOLK) + + +target_link_libraries(vgf_backend PRIVATE executorch_core) +target_link_libraries(vgf_backend PRIVATE vgf) +executorch_target_link_options_shared_lib(vgf_backend) + +# end config for VGF builds +endif() diff --git a/backends/arm/runtime/VGFBackend.cpp b/backends/arm/runtime/VGFBackend.cpp new file mode 100644 index 00000000000..ea4f4286eb9 --- /dev/null +++ b/backends/arm/runtime/VGFBackend.cpp @@ -0,0 +1,361 @@ +/* + * Copyright 2025 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +using namespace std; + +#include +#include +#include + +using executorch::aten::Tensor; +using executorch::runtime::ArrayRef; +using executorch::runtime::Backend; +using executorch::runtime::BackendExecutionContext; +using executorch::runtime::BackendInitContext; +using executorch::runtime::CompileSpec; +using executorch::runtime::DelegateHandle; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::Result; + +// We use the platform and runtime environment provided by the Vulkan delegate +#include + +// Dependencies for processing VGF files into Vulkan calls +#include +#include + +#include + +namespace executorch { +namespace backends { +namespace vgf { + +/* + * Simple function to populate function pointers for the relevant Tensor + * and DataGraph extension APIs. + */ +VkResult vkml_load_extensions(VkDevice const* device) { + // Note: + // We no longer load entry points manually, e.g. via + // (PFN_vkCreateTensorARM)vkGetDeviceProcAddr(*device, "vkCreateTensorARM"); + // we just verify that the function pointers have been populated by the loader + if (vkCreateTensorARM && vkDestroyTensorARM && vkCreateTensorViewARM && + vkDestroyTensorViewARM && vkGetTensorMemoryRequirementsARM && + vkBindTensorMemoryARM && vkCreateDataGraphPipelinesARM && + vkCmdDispatchDataGraphARM && vkCreateDataGraphPipelineSessionARM) { + ET_LOG(Info, "VKML Extensions loaded"); + return VK_SUCCESS; + } + ET_LOG(Error, "Failed to load VKML extensions"); + return VK_ERROR_UNKNOWN; +} + +/* + * Fetch vulkan basic objects - intended to be replaced with a shared + * device setup with the Vulkan backend. + */ +VkResult vkml_allocate_basics( + VkInstance* instance, + VkPhysicalDevice* physical_device, + VkDevice* device, + VkQueue* queue, + VkCommandPool* command_pool); + +void vkml_free_basics( + VkInstance* instance, + VkDevice* device, + VkCommandPool* command_pool) { + vkDestroyCommandPool(*device, *command_pool, nullptr); + // Note: These primitives are used by the emulation layer for vulkan + // object allocation; the vulkan objects are freed in library + // shutdown, so we can't yet destroy these here without causing + // a crash there.
// vkDestroyDevice(*device, nullptr); + // vkDestroyInstance(*instance, nullptr); +} + +class VGFBackend final : public ::executorch::runtime::BackendInterface { + public: + VGFBackend() { + VkResult result; + + // Fetch basic vulkan objects once + result = vkml_allocate_basics( + &vk_instance, + &vk_physical_device, + &vk_device, + &vk_queue, + &vk_command_pool); + if (result != VK_SUCCESS) { + ET_LOG( + Error, "Failed to initialize the Vulkan device, error 0x%08X", result); + return; + } + + // Query the device to ensure it has needed extensions + result = vkml_load_extensions(&vk_device); + if (result != VK_SUCCESS) { + ET_LOG( + Error, + "Failed to verify VKML extensions needed, error 0x%08X", + result); + return; + } + } + ~VGFBackend() { + vkml_free_basics(&vk_instance, &vk_device, &vk_command_pool); + } + + bool is_available() const override { + VkResult result; + + ET_LOG(Info, "Checking VGFBackend is available"); + // Query the device prepared in constructor for needed extensions + result = vkml_load_extensions(&vk_device); + if (result != VK_SUCCESS) + return false; + + return true; + } + + Result<DelegateHandle*> init( + BackendInitContext& context, + FreeableBuffer* processed, + ArrayRef<CompileSpec> compile_specs) const override { + ET_LOG(Info, "Entered VGF init"); + + const char* vgf_data = reinterpret_cast<const char*>(processed->data()); + + MemoryAllocator* allocator = context.get_runtime_allocator(); + VgfRepr* repr = allocator->allocateInstance<VgfRepr>(); + new (repr) VgfRepr( + vk_instance, vk_physical_device, vk_device, vk_queue, vk_command_pool); + + auto valid_vgf = repr->process_vgf(vgf_data, compile_specs); + if (!valid_vgf) { + ET_LOG(Error, "Failed to process VGF blob."); + return Error::Internal; + } + + return repr; + } + + Error execute( + ET_UNUSED BackendExecutionContext& context, + DelegateHandle* handle, + EValue** args) const override { + VgfRepr* repr = static_cast<VgfRepr*>(handle); + + // Copy all inputs from EValue to VkDeviceMemory + for (int i = 0; i < repr->IOs.size(); i++) { + if (!args[i]->isTensor()) { + ET_LOG( + Error, + "Expected EValue %d to be tensor, got %d", + i, + static_cast<int>(args[i]->tag)); + return Error::InvalidArgument; + } + + Tensor* tensor = &args[i]->toTensor(); + IO* io = &repr->IOs[i]; + + // skip non-inputs + if (!io->is_input) + continue; + + size_t io_size = accumulate( + io->size.begin(), io->size.end(), io->elt_size, std::multiplies<>()); + + void* data; + if (!repr->map_io(io, &data)) { + ET_LOG(Error, "Failed to map Vulkan IO memory"); + return Error::Internal; + } + memcpy(data, tensor->mutable_data_ptr(), io_size); + repr->unmap_io(io); + } + + // Execute the workload + if (!repr->execute_vgf()) { + ET_LOG(Error, "Failed to execute the VGF representation"); + return Error::Internal; + } + + // Copy all outputs from VkDeviceMemory to EValue + for (int i = 0; i < repr->IOs.size(); i++) { + if (!args[i]->isTensor()) { + ET_LOG( + Error, + "Expected EValue %d to be tensor, got %d", + i, + static_cast<int>(args[i]->tag)); + return Error::InvalidArgument; + } + Tensor* tensor = &args[i]->toTensor(); + IO* io = &repr->IOs[i]; + + // skip non-outputs + if (io->is_input) + continue; + + size_t io_size = accumulate( + io->size.begin(), io->size.end(), io->elt_size, std::multiplies<>()); + + void* data; + if (!repr->map_io(io, &data)) { + ET_LOG(Error, "Failed to map Vulkan IO memory"); + return Error::Internal; + } + memcpy(tensor->mutable_data_ptr(), data, io_size); + repr->unmap_io(io); + } + + return Error::Ok; + } + + void destroy(DelegateHandle* handle) const override { + VgfRepr* repr
= static_cast(handle); + repr->~VgfRepr(); + } + + private: + VkInstance vk_instance; + VkPhysicalDevice vk_physical_device; + VkDevice vk_device; + VkQueue vk_queue; + VkCommandPool vk_command_pool; +}; + +namespace { +auto cls = VGFBackend(); +Backend backend{"VgfBackend", &cls}; +static auto success_with_compiler = register_backend(backend); +} // namespace + +VkResult vkml_allocate_basics( + VkInstance* instance, + VkPhysicalDevice* physical_device, + VkDevice* device, + VkQueue* queue, + VkCommandPool* command_pool) { + const char* dev_exts[] = {"VK_ARM_tensors", "VK_ARM_data_graph"}; + VkResult result; + + if (VK_SUCCESS != volkInitialize()) { + ET_LOG(Error, "Volk failed to initialize"); + } + + VkApplicationInfo app_info{ + .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO, + .pNext = nullptr, + .pApplicationName = "VGF", + .applicationVersion = 0, + .pEngineName = "executorch", + .engineVersion = 0, + .apiVersion = VK_API_VERSION_1_3, + }; + VkInstanceCreateInfo instance_info{ + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .pApplicationInfo = &app_info, + 0, + nullptr, + 0, + nullptr}; + result = vkCreateInstance(&instance_info, nullptr, instance); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to create VkInstance"); + return result; + } + volkLoadInstance(*instance); + + // Pick first GPU + uint32_t gpu_count = 0; + vkEnumeratePhysicalDevices(*instance, &gpu_count, nullptr); + if (gpu_count == 0) { + ET_LOG(Error, "Found no suitable devices"); + return VK_ERROR_UNKNOWN; + } + vector gpus(gpu_count); + result = vkEnumeratePhysicalDevices(*instance, &gpu_count, gpus.data()); + *physical_device = gpus[0]; + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to select physical device"); + return result; + } + + // Find suitable queue family + uint32_t qf_count; + vkGetPhysicalDeviceQueueFamilyProperties( + *physical_device, &qf_count, nullptr); + vector qps(qf_count); + vkGetPhysicalDeviceQueueFamilyProperties( + *physical_device, &qf_count, qps.data()); + uint32_t qf = UINT32_MAX; + for (uint32_t i = 0; i < qf_count; ++i) { + if (qps[i].queueFlags & + (VK_QUEUE_COMPUTE_BIT | VK_QUEUE_DATA_GRAPH_BIT_ARM)) { + qf = i; + break; + } + } + if (qf == UINT32_MAX) { + ET_LOG(Error, "Failed to find suitable queue"); + return VK_ERROR_UNKNOWN; + } + + // Device with ML tensor extension + float qp = 1.0f; + VkDeviceQueueCreateInfo queue_info{ + .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .queueFamilyIndex = qf, + .queueCount = 1, + .pQueuePriorities = &qp, + }; + + VkDeviceCreateInfo dci{VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, nullptr}; + dci.queueCreateInfoCount = 1; + dci.pQueueCreateInfos = &queue_info; + dci.enabledExtensionCount = 2; + dci.ppEnabledExtensionNames = dev_exts; + result = vkCreateDevice(*physical_device, &dci, nullptr, device); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to create VkDevice"); + return result; + } + // Load the device with volk and populate function pointers + volkLoadDevice(*device); + + vkGetDeviceQueue(*device, qf, 0, queue); + + VkCommandPoolCreateInfo poolInfo{ + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .queueFamilyIndex = qf, + }; + result = vkCreateCommandPool(*device, &poolInfo, nullptr, command_pool); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to create VkCommandPool"); + return result; + } + + return result; +} + +} // namespace vgf +} // namespace backends +} // namespace executorch diff --git 
a/backends/arm/runtime/VGFSetup.cpp b/backends/arm/runtime/VGFSetup.cpp new file mode 100644 index 00000000000..18c9dbc9727 --- /dev/null +++ b/backends/arm/runtime/VGFSetup.cpp @@ -0,0 +1,780 @@ +/* + * Copyright 2025 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * VGF functions which prepare a graph for execution by allocating the + * appropriate vulkan structures. + */ + +#include + +#include +#include + +using namespace mlsdk; + +namespace executorch { +namespace backends { +namespace vgf { + +/* static function to map format to byte count */ +static uint32_t get_format_size(VkFormat format); + +// Debug function to inspect memory properties +static string memory_flags_to_string(VkMemoryPropertyFlags flags) { + if (flags == 0) + return "0"; + + vector parts; +#define TRY_FLAG(f) \ + if (flags & (f)) { \ + parts.emplace_back(#f); \ + flags &= ~(f); \ + } + + TRY_FLAG(VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) + TRY_FLAG(VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) + TRY_FLAG(VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) + TRY_FLAG(VK_MEMORY_PROPERTY_HOST_CACHED_BIT) + TRY_FLAG(VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT) +#ifdef VK_MEMORY_PROPERTY_PROTECTED_BIT + TRY_FLAG(VK_MEMORY_PROPERTY_PROTECTED_BIT) +#endif +#undef TRY_FLAG + + if (flags) { + // any leftover bits we didn’t name + ostringstream hex; + hex << "0x" << std::hex << flags; + parts.emplace_back(hex.str()); + } + + ostringstream joined; + for (size_t i = 0; i < parts.size(); ++i) { + if (i) + joined << " | "; + joined << parts[i]; + } + return joined.str(); +} + +/** + * Tensor free helper function + */ +void free_tensor( + VkDevice device, + VkTensorViewARM tensor_view, + VkTensorARM tensor, + VkDeviceMemory memory) { + vkDestroyTensorViewARM(device, tensor_view, nullptr); + vkDestroyTensorARM(device, tensor, nullptr); + vkFreeMemory(device, memory, nullptr); +} + +/** + * Tensor allocation helper function + */ +VkResult allocate_tensor( + VkPhysicalDevice physical, + VkDevice device, + VkFormat format, + uint32_t shape_size, + const int64_t* shape, + uint32_t stride_size, + const int64_t* stride, + VkTensorDescriptionARM* description, + VkTensorViewARM* tensor_view, + VkTensorARM* tensor, + VkDeviceMemory* memory) { + VkResult result; + + *description = VkTensorDescriptionARM{ + .sType = VK_STRUCTURE_TYPE_TENSOR_DESCRIPTION_ARM, + .pNext = nullptr, + .tiling = VK_TENSOR_TILING_LINEAR_ARM, + .format = format, + .dimensionCount = shape_size, + .pDimensions = shape, + // Note: stride_data of 0's causes size==0, null means stride==size + .pStrides = (0 == stride_size ? 
nullptr : stride), + .usage = VK_TENSOR_USAGE_SHADER_BIT_ARM | + VK_TENSOR_USAGE_TRANSFER_SRC_BIT_ARM | + VK_TENSOR_USAGE_TRANSFER_DST_BIT_ARM | + VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM, + }; + const VkTensorCreateInfoARM create_info = { + .sType = VK_STRUCTURE_TYPE_TENSOR_CREATE_INFO_ARM, + .pNext = nullptr, + .flags = 0, + .pDescription = description, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + + result = vkCreateTensorARM(device, &create_info, nullptr, tensor); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to CreateTensor, error %d", result); + return result; + } + + // Get backing memory requirements + const VkTensorMemoryRequirementsInfoARM memory_requirements_info = { + .sType = VK_STRUCTURE_TYPE_TENSOR_MEMORY_REQUIREMENTS_INFO_ARM, + .pNext = nullptr, + .tensor = *tensor, + }; + VkMemoryRequirements2 memory_requirements = { + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, + .pNext = nullptr, + }; + vkGetTensorMemoryRequirementsARM( + device, &memory_requirements_info, &memory_requirements); + + VkPhysicalDeviceMemoryProperties memProps; + vkGetPhysicalDeviceMemoryProperties(physical, &memProps); + + // Allocate memory + uint32_t memory_type = 0; + for (size_t j = 0; j < 31; ++j) { + if (memory_requirements.memoryRequirements.memoryTypeBits & (0x1 << j)) { + memory_type = j; + uint32_t aims = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + if ((memProps.memoryTypes[j].propertyFlags & aims) == aims) + break; + } + } + const VkMemoryAllocateInfo allocate_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .pNext = nullptr, + .allocationSize = memory_requirements.memoryRequirements.size, + .memoryTypeIndex = memory_type}; + + vkAllocateMemory(device, &allocate_info, nullptr, memory); + + // Bind tensor to memory + const VkBindTensorMemoryInfoARM bind_info = { + .sType = VK_STRUCTURE_TYPE_BIND_TENSOR_MEMORY_INFO_ARM, + .pNext = nullptr, + .tensor = *tensor, + .memory = *memory, + .memoryOffset = 0, + }; + vkBindTensorMemoryARM(device, 1, &bind_info); + + VkTensorViewCreateInfoARM tensor_view_info = { + .sType = VK_STRUCTURE_TYPE_TENSOR_VIEW_CREATE_INFO_ARM, + .pNext = nullptr, + .flags = 0, + .tensor = *tensor, + .format = format, + }; + VkResult res_tv = + vkCreateTensorViewARM(device, &tensor_view_info, nullptr, tensor_view); + ET_LOG(Info, " tensor view (success %d)", res_tv == VK_SUCCESS); + + return res_tv; +} + +static void debug_print_sequence( + unique_ptr& sequence_decoder) { + ET_LOG(Info, "VGF Sequences:"); + for (int i = 0; i < sequence_decoder->modelSequenceTableSize(); i++) { + ET_LOG( + Info, + " Sequence(%d) '%s':", + i, + string(sequence_decoder->getSegmentName(i)).c_str()); + auto dispatch_shape = sequence_decoder->getSegmentDispatchShape(i); + ET_LOG( + Info, + " dispatch shape %d %d %d", + dispatch_shape[0], + dispatch_shape[1], + dispatch_shape[2]); + ET_LOG( + Info, + " is graph? 
%d", + vgflib::ModuleType::GRAPH == sequence_decoder->getSegmentType(i)); + ET_LOG( + Info, + " module index %d", + sequence_decoder->getSegmentModuleIndex(i)); + auto input_names = sequence_decoder->getModelSequenceInputNamesHandle(); + ET_LOG( + Info, " names (%ld):", sequence_decoder->getNamesSize(input_names)); + for (int j = 0; j < sequence_decoder->getNamesSize(input_names); j++) { + ET_LOG( + Info, + " %d: %s", + i, + string(sequence_decoder->getName(input_names, i)).c_str()); + } + } +} + +static void debug_print_resources( + unique_ptr& resource_decoder) { + ET_LOG(Info, "Resources:"); + for (int i = 0; i < resource_decoder->size(); i++) { + ET_LOG(Info, " MRT entry %d", i); + if (!resource_decoder->getDescriptorType(i).has_value()) { + ET_LOG(Info, " DescriptorType NONE"); + } else { + ET_LOG( + Info, + " DescriptorType %u, is tensor? %d", + resource_decoder->getDescriptorType(i).value(), + resource_decoder->getDescriptorType(i).value() == + VK_DESCRIPTOR_TYPE_TENSOR_ARM); + } + ET_LOG( + Info, + " VkFormat %u from vgf format %u", + vgflib::ToVkFormat(resource_decoder->getVkFormat(i)), + resource_decoder->getVkFormat(i)); + switch (resource_decoder->getCategory(i)) { + case vgflib::ResourceCategory::INPUT: + case vgflib::ResourceCategory::OUTPUT: { + ET_LOG(Info, " Category INPUT/OUTPUT"); + // Get tensor shape and strides + auto shape = resource_decoder->getTensorShape(i); + const vector the_shape(shape.begin(), shape.end()); + auto stride = resource_decoder->getTensorStride(i); + const vector the_stride(stride.begin(), stride.end()); + ET_LOG( + Info, + " rank %ld, stride rank %ld", + the_shape.size(), + the_stride.size()); + for (int j = 0; j < the_shape.size(); j++) { + ET_LOG(Info, " %d: dim %ld", j, the_shape[j]); + } + // Allocate a tensor with bound memory + break; + } + case vgflib::ResourceCategory::INTERMEDIATE: + ET_LOG(Info, " Category INTERMEDIATE"); + break; + case vgflib::ResourceCategory::CONSTANT: + ET_LOG(Info, " Category CONSTANT"); + break; + default: + ET_LOG(Info, " Category UNKNOWN"); + break; + } + } +} + +static void debug_print_modules( + unique_ptr& module_decoder) { + ET_LOG(Info, "VGF Modules:"); + for (int i = 0; i < module_decoder->size(); i++) { + auto name = string(module_decoder->getModuleName(i)); + auto entrypoint = string(module_decoder->getModuleEntryPoint(i)); + auto type = module_decoder->getModuleType(i); + auto spirv = module_decoder->getModuleCode(i); + ET_LOG(Info, " Module(%d) '%s':", i, name.c_str()); + ET_LOG( + Info, + " is graph? 
%d", + vgflib::ModuleType::GRAPH == module_decoder->getModuleType(i)); + ET_LOG(Info, " entrypoint '%s'", entrypoint.c_str()); + ET_LOG(Info, " has spirv %d", module_decoder->hasSPIRV(i)); + ET_LOG( + Info, " code size %lu", spirv.size()); // read the .begin() to .end() + } +} + +bool VgfRepr::process_vgf(const char* vgf_data, ArrayRef specs) { + ET_LOG(Info, "Preparing VGF as Vulkan objects"); + + VkResult result; + + // Prepare temporary decoders + unique_ptr header_decoder = + vgflib::CreateHeaderDecoder(vgf_data); + unique_ptr sequence_decoder = + vgflib::CreateModelSequenceTableDecoder( + vgf_data + header_decoder->GetModelSequenceTableOffset()); + unique_ptr module_decoder = + vgflib::CreateModuleTableDecoder( + vgf_data + header_decoder->GetModuleTableOffset()); + unique_ptr resource_decoder = + vgflib::CreateModelResourceTableDecoder( + vgf_data + header_decoder->GetModelResourceTableOffset()); + unique_ptr constant_decoder = + vgflib::CreateConstantDecoder( + vgf_data + header_decoder->GetConstantsOffset()); + // Check the VGF decoders + if (not(header_decoder && module_decoder && sequence_decoder && + resource_decoder && constant_decoder && header_decoder->IsValid() && + header_decoder->CheckVersion())) { + ET_LOG(Error, "Failed to process VGF file internalsr"); + return false; + } + + // Parse the sequences in the VGF (while there can be multiple sequences of + // COMPUTE and GRAPH segments in the sequence, we currently expect a single + // GRAPH segment to be present. + debug_print_sequence(sequence_decoder); + if (sequence_decoder->modelSequenceTableSize() != 1) { + ET_LOG(Error, "Expected sequence length 1"); + return false; + } + if (sequence_decoder->getSegmentType(0) != vgflib::ModuleType::GRAPH) { + ET_LOG(Error, "Expected segment to be of type GRAPH"); + return false; + } + + // Extract first segment and it's associated module + debug_print_modules(module_decoder); + auto segment_name = string(sequence_decoder->getSegmentName(0)); + auto segment_module = sequence_decoder->getSegmentModuleIndex(0); + + auto segment_m_name = string(module_decoder->getModuleName(segment_module)); + auto segment_m_entrypoint = + string(module_decoder->getModuleEntryPoint(segment_module)); + auto segment_m_spirv = module_decoder->getModuleCode(segment_module); + + // Build a shader from the module + VkShaderModuleCreateInfo smci{ + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .codeSize = segment_m_spirv.size() * sizeof(uint32_t), + .pCode = segment_m_spirv.begin(), + }; + result = vkCreateShaderModule(vk_device, &smci, nullptr, &vk_shader); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to load shader from segment %d", segment_module); + return false; + } + + // Record our shader and entrypoint string + vector> shader_modules; + shader_modules.push_back({vk_shader, segment_m_entrypoint}); + + // Load our resource (tensors, constants) into their appropriate Vk objects + vector descriptors; + vector> resources; + vector constants; + + int IO_count = resource_decoder->size(); + for (int i = 0; i < IO_count; i++) { + auto resource_type = resource_decoder->getDescriptorType(i).value_or(0); + auto resource_format = vgflib::ToVkFormat(resource_decoder->getVkFormat(i)); + + // Get tensor shape and strides + auto shape = resource_decoder->getTensorShape(i); + auto stride = resource_decoder->getTensorStride(i); + + switch (resource_decoder->getCategory(i)) { + case vgflib::ResourceCategory::INPUT: + case vgflib::ResourceCategory::OUTPUT: { + // Expect 
IO to be a tensor type + if (resource_type != VK_DESCRIPTOR_TYPE_TENSOR_ARM) { + ET_LOG( + Error, + "Expected tensor type descriptor %u got %u", + VK_DESCRIPTOR_TYPE_TENSOR_ARM, + resource_type); + return false; + } + + // Allocate a tensor with backing memory + VkTensorARM tensor; + VkTensorViewARM tensor_view; + VkDeviceMemory tensor_memory; + VkTensorDescriptionARM tensor_description; + result = allocate_tensor( + vk_physical, + vk_device, + vgflib::ToVkFormat(resource_decoder->getVkFormat(i)), + static_cast(shape.size()), + shape.begin(), + static_cast(stride.size()), + stride.begin(), + &tensor_description, + &tensor_view, + &tensor, + &tensor_memory); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to allocate tensor for VGF resource %d", i); + return false; + } + size_t e_size = get_format_size( + vgflib::ToVkFormat(resource_decoder->getVkFormat(i))); + if (0 == e_size) { + ET_LOG(Error, "failed to get element size of VkFormat"); + return false; + } + + bool is_in = + resource_decoder->getCategory(i) == vgflib::ResourceCategory::INPUT; + IOs.push_back( + IO{vector(shape.begin(), shape.end()), + vector(stride.begin(), stride.end()), + e_size, + tensor, + tensor_view, + tensor_memory, + is_in}); + resources.push_back({tensor, tensor_view}); + descriptors.push_back(tensor_description); + break; + } + case vgflib::ResourceCategory::CONSTANT: + // Constants just need a descriptor + descriptors.push_back(VkTensorDescriptionARM{ + .sType = VK_STRUCTURE_TYPE_TENSOR_DESCRIPTION_ARM, + .pNext = nullptr, + .tiling = VK_TENSOR_TILING_LINEAR_ARM, + .format = vgflib::ToVkFormat(resource_decoder->getVkFormat(i)), + .dimensionCount = static_cast(shape.size()), + .pDimensions = shape.begin(), + // Note: stride_data of 0's causes size==0, null means stride==size + .pStrides = (0 == stride.size() ? 
nullptr : stride.begin()), + .usage = VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM, + }); + break; + case vgflib::ResourceCategory::INTERMEDIATE: + ET_LOG(Error, "Unsupported resource category INTERMEDIATE"); + return false; + default: + ET_LOG(Info, "Unsupported resource category UNKNOWN"); + return false; + } + } + + // Constants table - mapping of shader bindings to MRT's and their descriptors + for (int i = 0; i < constant_decoder->size(); i++) { + auto mrt_i = constant_decoder->getConstantMrtIndex(i); + auto constant_data = constant_decoder->getConstant(i); + constants.push_back(VkDataGraphPipelineConstantARM{ + .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CONSTANT_ARM, + .pNext = &descriptors[mrt_i], + .id = mrt_i, + .pConstantData = constant_data.begin(), + }); + } + + // Prepare our layout bindings from the segment's information + vector layout_bindings; + vector data_graph_resources; + + auto set_count = sequence_decoder->getSegmentDescriptorSetInfosSize(0); + for (uint32_t d_idx = 0; d_idx < set_count; d_idx++) { + auto handle = sequence_decoder->getDescriptorBindingSlotsHandle(0, d_idx); + auto binding_count = sequence_decoder->getBindingsSize(handle); + for (int binding = 0; binding < binding_count; binding++) { + auto binding_index = + sequence_decoder->getBindingSlotBinding(handle, binding); + auto MRT_index = + sequence_decoder->getBindingSlotMrtIndex(handle, binding); + auto MRT_type = resource_decoder->getDescriptorType(MRT_index).value(); + + const VkDescriptorSetLayoutBinding layout_binding{ + .binding = binding_index, + .descriptorType = vgflib::ToVkDescriptorType(MRT_type), + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_ALL, + .pImmutableSamplers = nullptr, + }; + layout_bindings.push_back(layout_binding); + + const VkDataGraphPipelineResourceInfoARM resource{ + .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_RESOURCE_INFO_ARM, + // Note: we populate the resource_descriptors 1:1 with the MRT table, + // so can directly use that index into the resource_descriptors + .pNext = &descriptors[MRT_index], + .descriptorSet = d_idx, + .binding = binding_index, + .arrayElement = 0, + }; + data_graph_resources.push_back(resource); + } + } + + // create fixed layout for this module + const VkDescriptorSetLayoutCreateInfo layout_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .bindingCount = static_cast(layout_bindings.size()), + layout_bindings.data(), + }; + result = + vkCreateDescriptorSetLayout(vk_device, &layout_info, nullptr, &vk_layout); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to create descriptor layout"); + return false; + } + + // Create descriptor pool and descriptors for pipeline + const VkDescriptorPoolCreateInfo descriptor_pool_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .maxSets = static_cast(set_count), + .poolSizeCount = 0, + .pPoolSizes = nullptr, + }; + result = vkCreateDescriptorPool( + vk_device, &descriptor_pool_info, nullptr, &vk_descriptor_pool); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to create descriptor pool"); + return false; + } + + const VkDescriptorSetAllocateInfo descriptor_set_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .pNext = nullptr, + .descriptorPool = vk_descriptor_pool, + .descriptorSetCount = static_cast(set_count), + .pSetLayouts = &vk_layout, + }; + + // Alloc descriptor sets + // currently, as we require modelSequenceTableSize to == 1 + // we can only get one 
descriptor set. + vector descriptor_sets; + descriptor_sets.resize(1); + result = vkAllocateDescriptorSets( + vk_device, &descriptor_set_info, descriptor_sets.data()); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to allocate descriptor sets"); + return false; + } + + // write descriptor updates for every input + auto input_slots = sequence_decoder->getSegmentInputBindingSlotsHandle(0); + auto input_size = sequence_decoder->getBindingsSize(input_slots); + for (uint32_t i = 0; i < input_size; i++) { + auto binding = sequence_decoder->getBindingSlotBinding(input_slots, i); + auto mrt_i = sequence_decoder->getBindingSlotMrtIndex(input_slots, i); + + VkWriteDescriptorSetTensorARM write_desc = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM, + .pNext = nullptr, + .tensorViewCount = 1, + .pTensorViews = &get<1>(resources[i]), + }; + VkWriteDescriptorSet desc_set = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .pNext = &write_desc, + .dstSet = descriptor_sets[0], + .dstBinding = binding, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM, + .pImageInfo = nullptr, + .pBufferInfo = nullptr, + .pTexelBufferView = nullptr, + }; + vkUpdateDescriptorSets(vk_device, 1, &desc_set, 0, nullptr); + } + + // write descriptor updates for every output + auto output_slots = sequence_decoder->getSegmentOutputBindingSlotsHandle(0); + auto output_size = sequence_decoder->getBindingsSize(output_slots); + for (uint32_t i = 0; i < output_size; i++) { + auto binding = sequence_decoder->getBindingSlotBinding(output_slots, i); + auto mrt_i = sequence_decoder->getBindingSlotMrtIndex(output_slots, i); + + VkWriteDescriptorSetTensorARM write_desc = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM, + .pNext = nullptr, + .tensorViewCount = 1, + .pTensorViews = &get<1>(resources[i + input_size]), + }; + VkWriteDescriptorSet desc_set = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .pNext = &write_desc, + .dstSet = descriptor_sets[0], + .dstBinding = binding, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM, + .pImageInfo = nullptr, + .pBufferInfo = nullptr, + .pTexelBufferView = nullptr, + }; + vkUpdateDescriptorSets(vk_device, 1, &desc_set, 0, nullptr); + } + + // create our pipeline + VkPipelineLayoutCreateInfo pipeline_layout_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .setLayoutCount = 1, + .pSetLayouts = &vk_layout, + .pushConstantRangeCount = 0, + .pPushConstantRanges = nullptr, + }; + result = vkCreatePipelineLayout( + vk_device, &pipeline_layout_info, nullptr, &vk_pipeline_layout); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to create pipeline layout"); + return false; + } + + // Shader Module Create + VkDataGraphPipelineShaderModuleCreateInfoARM shader_info{ + .sType = + VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SHADER_MODULE_CREATE_INFO_ARM, + .pNext = nullptr, + .module = get<0>(shader_modules[0]), + .pName = get<1>(shader_modules[0]).c_str(), + .pSpecializationInfo = nullptr, + .constantCount = static_cast(constants.size()), + .pConstants = constants.data(), + }; + + // Prepare Graph Pipeline + VkDataGraphPipelineCreateInfoARM graph_pipeline_info{ + .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CREATE_INFO_ARM, + .pNext = &shader_info, + .flags = VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT | + VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR, + .layout = vk_pipeline_layout, + 
.resourceInfoCount = static_cast<uint32_t>(data_graph_resources.size()), + .pResourceInfos = data_graph_resources.data(), + }; + + result = vkCreateDataGraphPipelinesARM( + vk_device, // device + VK_NULL_HANDLE, // deferredOperation + VK_NULL_HANDLE, // VkPipelineCache + 1, // createInfoCount + &graph_pipeline_info, // pCreateInfos + nullptr, // pAllocator + &vk_pipeline // pPipelines (VkPipeline*) + ); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to create DataGraphPipeline"); + return false; + } + + // prepare the graph pipeline session + VkDataGraphPipelineSessionCreateInfoARM pipeline_session_info{ + .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_CREATE_INFO_ARM, + .pNext = nullptr, + .flags = 0, + .dataGraphPipeline = vk_pipeline, + }; + result = vkCreateDataGraphPipelineSessionARM( + vk_device, &pipeline_session_info, nullptr, &vk_session); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to create DataGraphPipelineSession"); + return false; + } + + // Allocate command buffer + VkCommandBufferAllocateInfo allocate_info{ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .pNext = nullptr, + .commandPool = vk_command_pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = 1}; + result = vkAllocateCommandBuffers(vk_device, &allocate_info, &vk_execute_cmd); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to allocate command buffers"); + return false; + } + + // Populate command once with our dispatch information + VkCommandBufferBeginInfo beginInfo{ + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO}; + vkBeginCommandBuffer(vk_execute_cmd, &beginInfo); + + // bind pipeline + descriptor set + vkCmdBindPipeline( + vk_execute_cmd, VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, vk_pipeline); + + vkCmdBindDescriptorSets( + vk_execute_cmd, + VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, + vk_pipeline_layout, + 0, // first set + 1, + descriptor_sets.data(), // descriptor set count + pointer + 0, + nullptr // no dynamic offsets + ); + + // Dispatch the graph command + vkCmdDispatchDataGraphARM(vk_execute_cmd, vk_session, nullptr); + + // end the command buffer + vkEndCommandBuffer(vk_execute_cmd); + + return true; +} + +bool VgfRepr::execute_vgf() { + ET_LOG(Info, "Executing vgf"); + + // Submit & wait for idle + VkSubmitInfo submit{VK_STRUCTURE_TYPE_SUBMIT_INFO}; + submit.commandBufferCount = 1; + submit.pCommandBuffers = &vk_execute_cmd; + VkResult result = vkQueueSubmit(vk_queue, 1, &submit, VK_NULL_HANDLE); + if (result != VK_SUCCESS) { + ET_LOG(Error, "VGF/VkCommandBuffer command submission failed"); + return false; + } + vkQueueWaitIdle(vk_queue); + + return true; +} + +void VgfRepr::free_vgf() { + vkFreeCommandBuffers(vk_device, vk_command_pool, 1, &vk_execute_cmd); + vkDestroyDataGraphPipelineSessionARM(vk_device, vk_session, nullptr); + vkDestroyPipeline(vk_device, vk_pipeline, nullptr); + vkDestroyPipelineLayout(vk_device, vk_pipeline_layout, nullptr); + vkDestroyDescriptorPool(vk_device, vk_descriptor_pool, nullptr); + vkDestroyDescriptorSetLayout(vk_device, vk_layout, nullptr); + vkDestroyShaderModule(vk_device, vk_shader, nullptr); + for (int i = 0; i < IOs.size(); i++) { + free_tensor( + vk_device, IOs[i].tensor_view, IOs[i].tensor, IOs[i].tensor_memory); + } +} + +static uint32_t get_format_size(VkFormat format) { + // Note: While this is a small subset of VkFormat, this supports all base + // types for tensors coming from the compiler flow. Tensor formats only + // specify a single element type.
+ switch (format) { + case VK_FORMAT_R8_BOOL_ARM: + case VK_FORMAT_R8_UINT: + case VK_FORMAT_R8_SINT: + return 1; + case VK_FORMAT_R16_UINT: + case VK_FORMAT_R16_SINT: + case VK_FORMAT_R16_SFLOAT: + return 2; + case VK_FORMAT_R32_UINT: + case VK_FORMAT_R32_SINT: + case VK_FORMAT_R32_SFLOAT: + return 4; + case VK_FORMAT_R64_SINT: + return 8; + default: + ET_LOG(Error, "Unknown tensor format"); + return 0; + } +} + +} // namespace vgf +} // namespace backends +} // namespace executorch diff --git a/backends/arm/runtime/VGFSetup.h b/backends/arm/runtime/VGFSetup.h new file mode 100644 index 00000000000..29fc287865e --- /dev/null +++ b/backends/arm/runtime/VGFSetup.h @@ -0,0 +1,119 @@ +/* + * Copyright 2025 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +using namespace std; + +#include + +using executorch::runtime::ArrayRef; +using executorch::runtime::CompileSpec; + +// We use the platform and runtime environment provided by the Vulkan delegate +#include + +namespace executorch { +namespace backends { +namespace vgf { + +class VgfRepr; + +/* + * Info about IOs used during execution + */ +typedef struct IO { + vector size; + vector stride; + size_t elt_size; + VkTensorARM tensor; + VkTensorViewARM tensor_view; + VkDeviceMemory tensor_memory; + bool is_input; +} IO; + +/* + * In memory, and in-vulkan-object representation of the loaded + * VGF graph - ready to be dispatched based on provided inputs. + */ +class VgfRepr { + public: + VgfRepr( + VkInstance inst, + VkPhysicalDevice phys, + VkDevice dev, + VkQueue queue, + VkCommandPool pool) + : vk_instance(inst), + vk_physical(phys), + vk_device(dev), + vk_queue(queue), + vk_command_pool(pool) {} + + /* + * Process a VGF ready for execution, allocate necessary Vulkan objects. + */ + bool process_vgf(const char* vgf_data, ArrayRef specs); + + /* + * Execute the VGF we've previously processed. + */ + bool execute_vgf(); + + /* + * Free any allocations made in process_vgf. 
*/ + void free_vgf(); + + /* + * inputs and outputs from the VGF - these are memory mapped and populated + * with the EValues coming from the backend execute call + */ + vector<IO> IOs; + + bool map_io(IO* io, void** handle) { + VkResult result = + vkMapMemory(vk_device, io->tensor_memory, 0, VK_WHOLE_SIZE, 0, handle); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to map Vulkan IO memory"); + return false; + } + return true; + } + + void unmap_io(IO* io) { + vkUnmapMemory(vk_device, io->tensor_memory); + } + + ~VgfRepr() { + free_vgf(); + } + + private: + // Basic Vulkan objects passed to us and re-used + VkInstance vk_instance; + VkPhysicalDevice vk_physical; + VkDevice vk_device; + VkQueue vk_queue; + VkCommandPool vk_command_pool; + + // per-VgfRepr-instance objects allocated in process_vgf, used (can be more + // than once) in execute_vgf + VkCommandBuffer vk_execute_cmd = VK_NULL_HANDLE; + VkDataGraphPipelineSessionARM vk_session = VK_NULL_HANDLE; + VkPipeline vk_pipeline = VK_NULL_HANDLE; + VkPipelineLayout vk_pipeline_layout = VK_NULL_HANDLE; + VkDescriptorPool vk_descriptor_pool; + VkDescriptorSetLayout vk_layout; + VkShaderModule vk_shader; + // Note: the vector of tensor memory is stored in IOs above +}; + +} // namespace vgf +} // namespace backends +} // namespace executorch diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt index cb1b8a06afd..0b805aef5f4 100644 --- a/backends/vulkan/CMakeLists.txt +++ b/backends/vulkan/CMakeLists.txt @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -127,11 +128,18 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$") set(VULKAN_RUNNER_SRCS ${_executor_runner__srcs}) list(TRANSFORM VULKAN_RUNNER_SRCS PREPEND "${EXECUTORCH_ROOT}/") + set(VGF_BACKEND ) + if(EXECUTORCH_BUILD_VGF) + set(VGF_BACKEND vgf_backend) + endif() + add_executable(vulkan_executor_runner ${VULKAN_RUNNER_SRCS}) target_link_libraries( vulkan_executor_runner ${_executor_runner_libs} vulkan_schema vulkan_backend + ${VGF_BACKEND} ) + target_compile_options(vulkan_executor_runner PUBLIC ${VULKAN_CXX_FLAGS}) endif() diff --git a/backends/vulkan/runtime/graph/containers/Types.h b/backends/vulkan/runtime/graph/containers/Types.h index 5840d1695ee..48232179e06 100644 --- a/backends/vulkan/runtime/graph/containers/Types.h +++ b/backends/vulkan/runtime/graph/containers/Types.h @@ -1,6 +1,7 @@ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. + * Copyright 2025 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree.
@@ -8,6 +9,7 @@ #pragma once +#include #include namespace vkcompute { diff --git a/backends/vulkan/third-party/Vulkan-Headers b/backends/vulkan/third-party/Vulkan-Headers index 0c5928795a6..10739e8e00a 160000 --- a/backends/vulkan/third-party/Vulkan-Headers +++ b/backends/vulkan/third-party/Vulkan-Headers @@ -1 +1 @@ -Subproject commit 0c5928795a66e93f65e5e68a36d8daa79a209dc2 +Subproject commit 10739e8e00a7b6f74d22dd0a547f1406ff1f5eb9 diff --git a/backends/vulkan/third-party/volk b/backends/vulkan/third-party/volk index b3bc21e584f..49ba6858c13 160000 --- a/backends/vulkan/third-party/volk +++ b/backends/vulkan/third-party/volk @@ -1 +1 @@ -Subproject commit b3bc21e584f97400b6884cb2a541a56c6a5ddba3 +Subproject commit 49ba6858c13516019d699d94c31d5814025dd005 diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake index 551c69bc93e..06558b85460 100644 --- a/tools/cmake/preset/default.cmake +++ b/tools/cmake/preset/default.cmake @@ -145,6 +145,9 @@ define_overridable_option( define_overridable_option( EXECUTORCH_BUILD_CORTEX_M "Build the Cortex-M backend" BOOL OFF ) +define_overridable_option( + EXECUTORCH_BUILD_VGF "Build the Arm VGF backend" BOOL OFF +) define_overridable_option( EXECUTORCH_COREML_BUILD_EXECUTOR_RUNNER "Build CoreML executor runner." BOOL OFF
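
Build note: a minimal configure sketch for exercising the new EXECUTORCH_BUILD_VGF option together with the existing Vulkan backend. The flags and the vulkan_executor_runner target come from this patch; the build directory name, running from the repository root, and having libvgf already built at the LIBVGF_PATH location referenced in backends/arm/CMakeLists.txt are assumptions, not part of the change.

  # Sketch only: enable the Arm VGF delegate alongside the Vulkan backend
  cmake -B cmake-out -DEXECUTORCH_BUILD_VULKAN=ON -DEXECUTORCH_BUILD_VGF=ON .
  # Build the runner, which links vgf_backend when EXECUTORCH_BUILD_VGF is ON
  cmake --build cmake-out --target vulkan_executor_runner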