diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index f07211de5..9a2ffd573 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -73,4 +73,10 @@ timesteps Timesteps denoises denoise -denoising \ No newline at end of file +denoising +threadpool +chrono +setpriority +errno +ifdef +elif diff --git a/apps/llm/app.json b/apps/llm/app.json index 5eef49366..db61ee89d 100644 --- a/apps/llm/app.json +++ b/apps/llm/app.json @@ -58,7 +58,11 @@ "foregroundImage": "./assets/icons/adaptive-icon.png", "backgroundColor": "#ffffff" }, - "package": "com.anonymous.llm" + "package": "com.anonymous.llm", + "permissions": [ + "android.permission.READ_CALENDAR", + "android.permission.WRITE_CALENDAR" + ] }, "web": { "favicon": "./assets/icons/favicon.png" diff --git a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp index ed0d37f92..6d5e48902 100644 --- a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp @@ -13,12 +13,8 @@ #include #include #include - -#if defined(__ANDROID__) && defined(__aarch64__) -#include -#include -#include -#endif +#include +#include namespace rnexecutorch { @@ -97,21 +93,8 @@ void RnExecutorchInstaller::injectJSIBindings( RnExecutorchInstaller::loadModel( jsiRuntime, jsCallInvoker, "loadSpeechToText")); -#if defined(__ANDROID__) && defined(__aarch64__) - auto num_of_perf_cores = - ::executorch::extension::cpuinfo::get_num_performant_cores(); - log(LOG_LEVEL::Info, "Detected ", num_of_perf_cores, " performant cores"); - // setting num_of_cores to floor(num_of_perf_cores / 2) + 1) because depending - // on cpu arch as when possible we want to leave at least 2 performant cores - // for other tasks (setting more actually results in drop of performance). For - // older devices (i.e. samsung s22) resolves to 3 cores, and for newer ones - // (like OnePlus 12) resolves to 4, which when benchamrked gives highest - // throughput. - auto num_of_cores = static_cast(num_of_perf_cores / 2) + 1; - ::executorch::extension::threadpool::get_threadpool() - ->_unsafe_reset_threadpool(num_of_cores); - log(LOG_LEVEL::Info, "Configuring xnnpack for ", num_of_cores, " threads"); -#endif + threads::utils::unsafeSetupThreadPool(); + threads::GlobalThreadPool::initialize(); } } // namespace rnexecutorch diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index f512dce9d..97f769e3a 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -21,6 +21,7 @@ #include #include #include +#include namespace rnexecutorch { @@ -201,58 +202,60 @@ template class ModelHostObject : public JsiHostObject { // We need to dispatch a thread if we want the function to be // asynchronous. In this thread all accesses to jsi::Runtime need to // be done via the callInvoker. - std::thread([this, promise, - argsConverted = std::move(argsConverted)]() { - try { - if constexpr (std::is_void_v) { - // For void functions, just call the function and resolve with - // undefined - std::apply(std::bind_front(FnPtr, model), - std::move(argsConverted)); - callInvoker->invokeAsync([promise](jsi::Runtime &runtime) { - promise->resolve(jsi::Value::undefined()); - }); - } else { - // For non-void functions, capture the result and convert it - auto result = std::apply(std::bind_front(FnPtr, model), - std::move(argsConverted)); - // The result is copied. It should either be quickly copiable, - // or passed with a shared_ptr. - callInvoker->invokeAsync( - [promise, result](jsi::Runtime &runtime) { - promise->resolve(jsi_conversion::getJsiValue( - std::move(result), runtime)); - }); - } - } catch (const std::runtime_error &e) { - // This catch should be merged with the next two - // (std::runtime_error and jsi::JSError inherits from - // std::exception) HOWEVER react native has broken RTTI which - // breaks proper exception type checking. Remove when the - // following change is present in our version: - // https://github.com/facebook/react-native/commit/3132cc88dd46f95898a756456bebeeb6c248f20e - callInvoker->invokeAsync([e = std::move(e), promise]() { - promise->reject(e.what()); + threads::GlobalThreadPool::detach( + [this, promise, argsConverted = std::move(argsConverted)]() { + try { + if constexpr (std::is_void_v) { + // For void functions, just call the function and resolve + // with undefined + std::apply(std::bind_front(FnPtr, model), + std::move(argsConverted)); + callInvoker->invokeAsync( + [promise](jsi::Runtime &runtime) { + promise->resolve(jsi::Value::undefined()); + }); + } else { + // For non-void functions, capture the result and convert + // it + auto result = std::apply(std::bind_front(FnPtr, model), + std::move(argsConverted)); + // The result is copied. It should either be quickly + // copiable, or passed with a shared_ptr. + callInvoker->invokeAsync( + [promise, result](jsi::Runtime &runtime) { + promise->resolve(jsi_conversion::getJsiValue( + std::move(result), runtime)); + }); + } + } catch (const std::runtime_error &e) { + // This catch should be merged with the next two + // (std::runtime_error and jsi::JSError inherits from + // std::exception) HOWEVER react native has broken RTTI + // which breaks proper exception type checking. Remove when + // the following change is present in our version: + // https://github.com/facebook/react-native/commit/3132cc88dd46f95898a756456bebeeb6c248f20e + callInvoker->invokeAsync([e = std::move(e), promise]() { + promise->reject(e.what()); + }); + return; + } catch (const jsi::JSError &e) { + callInvoker->invokeAsync([e = std::move(e), promise]() { + promise->reject(e.what()); + }); + return; + } catch (const std::exception &e) { + callInvoker->invokeAsync([e = std::move(e), promise]() { + promise->reject(e.what()); + }); + return; + } catch (...) { + callInvoker->invokeAsync( + [promise]() { promise->reject("Unknown error"); }); + return; + } }); - return; - } catch (const jsi::JSError &e) { - callInvoker->invokeAsync([e = std::move(e), promise]() { - promise->reject(e.what()); - }); - return; - } catch (const std::exception &e) { - callInvoker->invokeAsync([e = std::move(e), promise]() { - promise->reject(e.what()); - }); - return; - } catch (...) { - callInvoker->invokeAsync( - [promise]() { promise->reject("Unknown error"); }); - return; - } - }).detach(); } catch (...) { promise->reject("Couldn't parse JS arguments in a native function"); } diff --git a/packages/react-native-executorch/common/rnexecutorch/threads/GlobalThreadPool.h b/packages/react-native-executorch/common/rnexecutorch/threads/GlobalThreadPool.h new file mode 100644 index 000000000..ec5ddeaa0 --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/threads/GlobalThreadPool.h @@ -0,0 +1,79 @@ +// GlobalThreadPool.h +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace rnexecutorch::threads { + +class GlobalThreadPool { +public: + GlobalThreadPool() = delete; + GlobalThreadPool(const GlobalThreadPool &) = delete; + GlobalThreadPool &operator=(const GlobalThreadPool &) = delete; + GlobalThreadPool(GlobalThreadPool &&) = delete; + GlobalThreadPool &operator=(GlobalThreadPool &&) = delete; + + static HighPerformanceThreadPool &get() { + if (!instance) { + initialize(); + } + return *instance; + } + + static void initialize(std::optional numThreads = std::nullopt, + ThreadConfig config = {}) { + std::call_once(initFlag, [&numThreads, config]() { + if (!numThreads) { + numThreads = + ::executorch::extension::cpuinfo::get_num_performant_cores(); + } + + log(rnexecutorch::LOG_LEVEL::Info, "Initializing global thread pool with", + numThreads, "threads"); + instance = std::make_unique(numThreads.value(), + config); + }); + } + + // Convenience methods that mirror std::thread interface + template + static auto async(Func &&func, Args &&...args) { + return get().submit(std::forward(func), std::forward(args)...); + } + + template + static auto async_high_priority(Func &&func, Args &&...args) { + return get().submitWithPriority(Priority::HIGH, std::forward(func), + std::forward(args)...); + } + + // Fire and forget (like std::thread{}.detach()) + template + static void detach(Func &&func, Args &&...args) { + get().submitDetached(std::forward(func), std::forward(args)...); + } + + // Execute and wait (like std::thread{}.join()) + template + static auto execute(Func &&func, Args &&...args) { + return get().execute(std::forward(func), std::forward(args)...); + } + + static void shutdown() { + if (instance) { + instance->shutdown(); + instance.reset(); + } + } + +private: + inline static std::unique_ptr instance; + inline static std::once_flag initFlag; +}; + +} // namespace rnexecutorch::threads diff --git a/packages/react-native-executorch/common/rnexecutorch/threads/HighPerformanceThreadPool.h b/packages/react-native-executorch/common/rnexecutorch/threads/HighPerformanceThreadPool.h new file mode 100644 index 000000000..fd856f990 --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/threads/HighPerformanceThreadPool.h @@ -0,0 +1,364 @@ +// HighPerformanceThreadPool.h +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef __APPLE__ +#include +#endif + +#ifdef __ANDROID__ +#include +#endif + +namespace rnexecutorch::threads { + +enum class Priority { LOW, NORMAL, HIGH, REALTIME }; + +struct ThreadConfig { + bool pinToPerformanceCores{true}; + std::string namePrefix{"RN_ET_Worker"}; +}; + +class HighPerformanceThreadPool { +public: + explicit HighPerformanceThreadPool(size_t numThreads = 1, + ThreadConfig cfg = ThreadConfig()) + : config(std::move(cfg)) { + +#ifdef __ANDROID__ + detectCPUTopology(); + numThreads = std::min(numThreads, performanceCores.size()); +#endif + + for (size_t i = 0; i < numThreads; i++) { + workers.emplace_back(&HighPerformanceThreadPool::workerThread, this, i); + } + + log(LOG_LEVEL::Debug, "Thread pool initialized with", numThreads, + "workers."); + } + + ~HighPerformanceThreadPool() { shutdown(); } + + // Submit a task and get a future for the result + template + auto submit(Func &&func, Args &&...args) + -> std::future { + return submitWithPriority(Priority::NORMAL, std::forward(func), + std::forward(args)...); + } + + // Submit a task with specific priority + template + auto submitWithPriority(Priority priority, Func &&func, Args &&...args) + -> std::future { + + using ReturnType = decltype(func(args...)); + + // Create a packaged task + auto boundFunc = + std::bind(std::forward(func), std::forward(args)...); + auto task = std::make_unique>( + std::move(boundFunc)); + auto future = task->getFuture(); + + // Add to queue + { + std::scoped_lock lock(queueMutex); + + if (!running) { + throw std::runtime_error("Thread pool is shutting down"); + } + + WorkItem item(std::move(task), priority, + std::chrono::steady_clock::now()); + + taskQueue.push(std::move(item)); + } + + condition.notify_one(); + return future; + } + + // Execute a task and wait for result + template + auto execute(Func &&func, Args &&...args) -> decltype(func(args...)) { + auto future = submit(std::forward(func), std::forward(args)...); + return future.get(); + } + + // Fire and forget task + template + void submitDetached(Func &&func, Args &&...args) { + submit(std::forward(func), std::forward(args)...); + // Future is destroyed, task still runs + } + + void shutdown() { + if (!running.exchange(false)) { + return; + } + + condition.notify_all(); + + for (auto &worker : workers) { + if (worker.joinable()) { + worker.join(); + } + } + } + +private: + // Task wrapper that can hold any callable + class ITask { + public: + virtual ~ITask() = default; + virtual void execute() = 0; + }; + + template class Task : public ITask { + public: + Task(Func &&f) : func(std::forward(f)) {} + + void execute() override { + try { + if constexpr (std::is_void_v) { + func(); + promise.set_value(); + } else { + promise.set_value(func()); + } + } catch (...) { + promise.set_exception(std::current_exception()); + } + } + + std::future getFuture() { return promise.get_future(); } + + private: + Func func; + std::promise promise; + }; + + class WorkItem { + public: + WorkItem() = default; + WorkItem(std::unique_ptr task, Priority priority, + std::chrono::steady_clock::time_point enqueueTime) + : task(std::move(task)), priority(priority), enqueueTime(enqueueTime) {} + + std::unique_ptr task; + + bool operator<(const WorkItem &other) const { + return priority != other.priority ? priority < other.priority + : enqueueTime > other.enqueueTime; + } + + private: + Priority priority; + std::chrono::steady_clock::time_point enqueueTime; + }; + + // Thread pool state + std::vector workers; + std::priority_queue taskQueue; + std::mutex queueMutex; + std::condition_variable condition; + std::atomic running{true}; + std::atomic activeWorkers{0}; + std::atomic totalTasksProcessed{0}; + +#ifdef __ANDROID__ + // Performance cores + std::vector performanceCores; + std::vector efficiencyCores; +#endif + + // Configuration + ThreadConfig config; + + void detectCPUTopology() { +#ifdef __ANDROID__ + struct CoreInfo { + int32_t id; + int64_t maxFreq; + }; + + std::vector cores; + const auto numOfCores = std::thread::hardware_concurrency(); + + for (int32_t i = 0; std::cmp_less(i, numOfCores); ++i) { + std::string path = "/sys/devices/system/cpu/cpu" + std::to_string(i) + + "/cpufreq/cpuinfo_max_freq"; + std::ifstream file(path); + if (!file.good()) { + break; + } + + CoreInfo info; + info.id = i; + file >> info.maxFreq; + cores.push_back(info); + } + + if (cores.empty()) { + log(LOG_LEVEL::Debug, "Could not detect CPU topology"); + return; + } + + // Sort by frequency + std::ranges::sort(cores, [](const CoreInfo &a, const CoreInfo &b) { + return a.maxFreq > b.maxFreq; + }); + + // Classify cores + const auto numOfPerfCores = + ::executorch::extension::cpuinfo::get_num_performant_cores(); + + constexpr float kKiloToGigaRatio = 1e6; + for (int32_t i = 0; i < cores.size(); ++i) { + if (i < numOfPerfCores) { + performanceCores.push_back(cores[i].id); + log(LOG_LEVEL::Debug, "Performance core:", cores[i].id, "(", + cores[i].maxFreq / kKiloToGigaRatio, "GHz)"); + } else { + efficiencyCores.push_back(cores[i].id); + log(LOG_LEVEL::Debug, "Efficiency core:", cores[i].id, "(", + cores[i].maxFreq / kKiloToGigaRatio, "GHz)"); + } + } +#endif + } + +#ifdef __ANDROID__ + inline uint64_t getCurrentThreadId() { return gettid(); } +#endif + + inline void setCurrentThreadName(const std::string &name) { +#ifdef __ANDROID__ + pthread_setname_np(pthread_self(), name.c_str()); +#elif defined(__APPLE__) + pthread_setname_np(name.c_str()); +#endif + } + + void configureThread(uint32_t workerIndex) { + std::string threadName = config.namePrefix + std::to_string(workerIndex); + setCurrentThreadName(threadName.c_str()); + +#ifdef __ANDROID__ + if (config.pinToPerformanceCores && !performanceCores.empty()) { + setCPUAffinity(); + } +#endif + + setThreadPriority(); + + log(LOG_LEVEL::Debug, "Worker", workerIndex, + "configured:", threadName.c_str()); + } + + void setCPUAffinity() { + // AFAIK it is not possible on iOS +#ifdef __ANDROID__ + if (performanceCores.empty()) { + log(LOG_LEVEL::Error, "No cores specified for affinity setting"); + return; + } + + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + + for (int32_t core : performanceCores) { + CPU_SET(core, &cpuset); + } + + pid_t tid = getCurrentThreadId(); + log(LOG_LEVEL::Debug, "Thread id", tid); + if (sched_setaffinity(tid, sizeof(cpuset), &cpuset) == 0) { + log(LOG_LEVEL::Debug, "Thread pinned to cores:", performanceCores); + } else { + log(LOG_LEVEL::Debug, "Failed to set CPU affinity (error:", errno, + "). Continuing without affinity."); + } +#endif + } + + void setThreadPriority() { + // pthread_setschedparam doesn't work on android because permissions reasons + // and in general does not provide visible improvements on iOS + + // Set nice value as fallback or additional priority boost + constexpr int nice_value = 0; + if (setpriority(PRIO_PROCESS, 0, nice_value) != 0) { + log(LOG_LEVEL::Debug, "Failed to set nice value"); + } else { + log(LOG_LEVEL::Debug, "Set nice value", nice_value); + } + } + + void processTask(const WorkItem &item) { + activeWorkers++; + + try { + item.task->execute(); + } catch (const std::exception &e) { + log(LOG_LEVEL::Error, "Task failed:", e.what()); + activeWorkers--; + throw; + } + + activeWorkers--; + totalTasksProcessed++; + } + + void workerThread(int workerIndex) { + configureThread(workerIndex); + + while (running) { + WorkItem item; + + { + std::unique_lock lock(queueMutex); + condition.wait(lock, [this] { return !taskQueue.empty() || !running; }); + + if (!running && taskQueue.empty()) { + break; + } + + if (!taskQueue.empty()) { + item = std::move(const_cast(taskQueue.top())); + taskQueue.pop(); + } else { + continue; + } + } + + processTask(item); + } + + log(LOG_LEVEL::Debug, "Worker", workerIndex, "shutting down"); + } +}; + +} // namespace rnexecutorch::threads diff --git a/packages/react-native-executorch/common/rnexecutorch/threads/utils/ThreadUtils.h b/packages/react-native-executorch/common/rnexecutorch/threads/utils/ThreadUtils.h new file mode 100644 index 000000000..664480d38 --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/threads/utils/ThreadUtils.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include +#include + +namespace rnexecutorch::threads::utils { + +void unsafeSetupThreadPool(uint32_t num_of_cores = 0) { + auto num_of_perf_cores = + ::executorch::extension::cpuinfo::get_num_performant_cores(); + log(LOG_LEVEL::Info, "Detected ", num_of_perf_cores, " performant cores"); + // setting num_of_cores to floor(num_of_perf_cores / 2) + 1) because + // depending on cpu arch as when possible we want to leave at least 2 + // performant cores for other tasks (setting more actually results in drop + // of performance). For older devices (i.e. samsung s22) resolves to 3 + // cores, and for newer ones (like OnePlus 12) resolves to 4, which when + // benchmarked gives highest throughput. For iPhones they usually have 2 + // performance cores + auto _num_of_cores = num_of_cores + ? num_of_cores + : static_cast(num_of_perf_cores / 2) + 1; + const auto threadpool = ::executorch::extension::threadpool::get_threadpool(); + threadpool->_unsafe_reset_threadpool(_num_of_cores); + log(LOG_LEVEL::Info, "Configuring xnnpack for", + threadpool->get_thread_count(), "threads"); +} + +} // namespace rnexecutorch::threads::utils diff --git a/packages/react-native-executorch/ios/libs/cpuinfo/libcpuinfo.a b/packages/react-native-executorch/ios/libs/cpuinfo/libcpuinfo.a new file mode 100644 index 000000000..b5d389569 Binary files /dev/null and b/packages/react-native-executorch/ios/libs/cpuinfo/libcpuinfo.a differ diff --git a/packages/react-native-executorch/ios/libs/pthreadpool/physical-arm64-release/libpthreadpool.a b/packages/react-native-executorch/ios/libs/pthreadpool/physical-arm64-release/libpthreadpool.a new file mode 100644 index 000000000..652afff7b Binary files /dev/null and b/packages/react-native-executorch/ios/libs/pthreadpool/physical-arm64-release/libpthreadpool.a differ diff --git a/packages/react-native-executorch/ios/libs/pthreadpool/simulator-arm64-debug/libpthreadpool.a b/packages/react-native-executorch/ios/libs/pthreadpool/simulator-arm64-debug/libpthreadpool.a new file mode 100644 index 000000000..e3c3053a1 Binary files /dev/null and b/packages/react-native-executorch/ios/libs/pthreadpool/simulator-arm64-debug/libpthreadpool.a differ diff --git a/packages/react-native-executorch/react-native-executorch.podspec b/packages/react-native-executorch/react-native-executorch.podspec index 381d3a99f..2c63c0923 100644 --- a/packages/react-native-executorch/react-native-executorch.podspec +++ b/packages/react-native-executorch/react-native-executorch.podspec @@ -24,6 +24,9 @@ Pod::Spec.new do |s| 'MetalPerformanceShadersGraph' ] + pthreadpool_binaries_path = File.expand_path('$(PODS_TARGET_SRCROOT)/ios/libs/pthreadpool', __dir__) + cpuinfo_binaries_path = File.expand_path('$(PODS_TARGET_SRCROOT)/ios/libs/cpuinfo', __dir__) + s.user_target_xcconfig = { "HEADER_SEARCH_PATHS" => "$(PODS_TARGET_SRCROOT)/third-party/include", @@ -39,7 +42,9 @@ Pod::Spec.new do |s| "-force_load \"#{et_binaries_path}\"/libthreadpool_ios.a", "\"#{tokenizers_binaries_path}/physical-arm64-release/libtokenizers_cpp.a\"", "\"#{tokenizers_binaries_path}/physical-arm64-release/libsentencepiece.a\"", - "\"#{tokenizers_binaries_path}/physical-arm64-release/libtokenizers_c.a\"" + "\"#{tokenizers_binaries_path}/physical-arm64-release/libtokenizers_c.a\"", + "\"#{pthreadpool_binaries_path}/physical-arm64-release/libpthreadpool.a\"", + "\"#{cpuinfo_binaries_path}/libcpuinfo.a\"" ].join(' '), "OTHER_LDFLAGS[sdk=iphonesimulator*]" => [ @@ -54,7 +59,9 @@ Pod::Spec.new do |s| "-force_load \"#{et_binaries_path}\"/libthreadpool_simulator.a", "\"#{tokenizers_binaries_path}/simulator-arm64-debug/libtokenizers_cpp.a\"", "\"#{tokenizers_binaries_path}/simulator-arm64-debug/libsentencepiece.a\"", - "\"#{tokenizers_binaries_path}/simulator-arm64-debug/libtokenizers_c.a\"" + "\"#{tokenizers_binaries_path}/simulator-arm64-debug/libtokenizers_c.a\"", + "\"#{pthreadpool_binaries_path}/simulator-arm64-debug/libpthreadpool.a\"", + "\"#{cpuinfo_binaries_path}/libcpuinfo.a\"" ].join(' '), 'EXCLUDED_ARCHS[sdk=iphonesimulator*]' => 'x86_64', diff --git a/packages/react-native-executorch/third-party/include/executorch/extension/threadpool/cpuinfo_utils.h b/packages/react-native-executorch/third-party/include/executorch/extension/threadpool/cpuinfo_utils.h index d559738b7..8f54889f7 100644 --- a/packages/react-native-executorch/third-party/include/executorch/extension/threadpool/cpuinfo_utils.h +++ b/packages/react-native-executorch/third-party/include/executorch/extension/threadpool/cpuinfo_utils.h @@ -8,7 +8,7 @@ #pragma once -#include +#include namespace executorch::extension::cpuinfo { diff --git a/packages/react-native-executorch/third-party/include/executorch/extension/threadpool/threadpool.h b/packages/react-native-executorch/third-party/include/executorch/extension/threadpool/threadpool.h index 95678d2c6..edd3be02a 100644 --- a/packages/react-native-executorch/third-party/include/executorch/extension/threadpool/threadpool.h +++ b/packages/react-native-executorch/third-party/include/executorch/extension/threadpool/threadpool.h @@ -12,7 +12,7 @@ #include #include -#include +#include namespace executorch::extension::threadpool {