From 19d2230e07d1dfe5d1212a16f2db575658fba186 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin
Date: Mon, 6 Oct 2025 16:42:10 -0700
Subject: [PATCH 1/2] Add Gemma 3 test.

Differential Revision: D84001548

Pull Request resolved: https://github.com/pytorch/executorch/pull/14825

(cherry picked from commit d8a21260d35a4acf2073266820950a819aafb8ae)
---
 .../Exported/ExecuTorchLLMMultimodalRunner.h  |  16 ++
 .../Exported/ExecuTorchLLMMultimodalRunner.mm |  84 +++++++-
 .../__tests__/MultimodalRunnerTest.swift      | 179 ++++++++++++++----
 .../__tests__/TextRunnerTest.swift            |   4 +-
 4 files changed, 233 insertions(+), 50 deletions(-)

diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
index 8523581da8a..250241b9c9d 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
@@ -44,6 +44,12 @@ __attribute__((objc_subclassing_restricted))
                     channels:(NSInteger)channels
     NS_DESIGNATED_INITIALIZER;
 
+- (instancetype)initWithFloatData:(NSData *)data
+                            width:(NSInteger)width
+                           height:(NSInteger)height
+                         channels:(NSInteger)channels
+    NS_DESIGNATED_INITIALIZER;
+
 @property(nonatomic, readonly) NSData *data;
 
 @property(nonatomic, readonly) NSInteger width;
@@ -52,6 +58,8 @@ __attribute__((objc_subclassing_restricted))
 
 @property(nonatomic, readonly) NSInteger channels;
 
+@property(nonatomic, readonly) BOOL isFloat;
+
 + (instancetype)new NS_UNAVAILABLE;
 - (instancetype)init NS_UNAVAILABLE;
 
@@ -80,6 +88,12 @@ __attribute__((objc_subclassing_restricted))
                       frames:(NSInteger)frames
     NS_DESIGNATED_INITIALIZER;
 
+- (instancetype)initWithFloatData:(NSData *)data
+                        batchSize:(NSInteger)batchSize
+                             bins:(NSInteger)bins
+                           frames:(NSInteger)frames
+    NS_DESIGNATED_INITIALIZER;
+
 @property(nonatomic, readonly) NSData *data;
 
 @property(nonatomic, readonly) NSInteger batchSize;
@@ -88,6 +102,8 @@ __attribute__((objc_subclassing_restricted))
 
 @property(nonatomic, readonly) NSInteger frames;
 
+@property(nonatomic, readonly) BOOL isFloat;
+
 + (instancetype)new NS_UNAVAILABLE;
 - (instancetype)init NS_UNAVAILABLE;
 
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
index a3dc3e6afd1..964805053e2 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
@@ -32,6 +32,22 @@ - (instancetype)initWithData:(NSData *)data
     _width = width;
     _height = height;
     _channels = channels;
+    _isFloat = NO;
+  }
+  return self;
+}
+
+- (instancetype)initWithFloatData:(NSData *)data
+                            width:(NSInteger)width
+                           height:(NSInteger)height
+                         channels:(NSInteger)channels {
+  self = [super init];
+  if (self) {
+    _data = [data copy];
+    _width = width;
+    _height = height;
+    _channels = channels;
+    _isFloat = YES;
   }
   return self;
 }
@@ -53,6 +69,22 @@ - (instancetype)initWithData:(NSData *)data
     _batchSize = batchSize;
     _bins = bins;
     _frames = frames;
+    _isFloat = NO;
+  }
+  return self;
+}
+
+- (instancetype)initWithFloatData:(NSData *)data
+                        batchSize:(NSInteger)batchSize
+                             bins:(NSInteger)bins
+                           frames:(NSInteger)frames {
+  self = [super init];
+  if (self) {
+    _data = [data copy];
+    _batchSize = batchSize;
+    _bins = bins;
+    _frames = frames;
+    _isFloat = YES;
   }
   return self;
 }
@@ -170,6 +202,7 @@ - (BOOL)generateWithInputs:(NSArray *)inputs
     return NO;
   }
   std::vector<llm::MultimodalInput> nativeInputs;
+  nativeInputs.reserve((size_t)inputs.count);
   for (ExecuTorchLLMMultimodalInput *input in inputs) {
     switch (input.type) {
       case ExecuTorchLLMMultimodalInputTypeText:
@@ -177,13 +210,50 @@ - (BOOL)generateWithInputs:(NSArray *)inputs
         break;
       case ExecuTorchLLMMultimodalInputTypeImage: {
         ExecuTorchLLMImage *image = input.image;
-        std::vector<uint8_t> data((uint8_t *)image.data.bytes, (uint8_t *)image.data.bytes + image.data.length);
-        nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
-          std::move(data),
-          (int32_t)image.width,
-          (int32_t)image.height,
-          (int32_t)image.channels
-        )));
+        if (image.isFloat) {
+          const float *buffer = (const float *)image.data.bytes;
+          size_t elementCount = (size_t)image.data.length / sizeof(float);
+          std::vector<float> data(buffer, buffer + elementCount);
+          nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
+            std::move(data),
+            (int32_t)image.width,
+            (int32_t)image.height,
+            (int32_t)image.channels
+          )));
+        } else {
+          const uint8_t *buffer = (const uint8_t *)image.data.bytes;
+          std::vector<uint8_t> data(buffer, buffer + image.data.length);
+          nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
+            std::move(data),
+            (int32_t)image.width,
+            (int32_t)image.height,
+            (int32_t)image.channels
+          )));
+        }
+        break;
+      }
+      case ExecuTorchLLMMultimodalInputTypeAudio: {
+        ExecuTorchLLMAudio *audio = input.audio;
+        if (audio.isFloat) {
+          const float *buffer = (const float *)audio.data.bytes;
+          size_t elementCount = (size_t)audio.data.length / sizeof(float);
+          std::vector<float> data(buffer, buffer + elementCount);
+          nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
+            std::move(data),
+            (int32_t)audio.batchSize,
+            (int32_t)audio.bins,
+            (int32_t)audio.frames
+          )));
+        } else {
+          const uint8_t *buffer = (const uint8_t *)audio.data.bytes;
+          std::vector<uint8_t> data(buffer, buffer + audio.data.length);
+          nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
+            std::move(data),
+            (int32_t)audio.batchSize,
+            (int32_t)audio.bins,
+            (int32_t)audio.frames
+          )));
+        }
         break;
       }
       default: {
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
index 7ae9da4969b..7281740c3af 100644
--- a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
+++ b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
@@ -10,60 +10,157 @@
 import ExecuTorchLLM
 import XCTest
 
 extension UIImage {
-  func asImage() -> Image {
-    let targetSide = CGFloat(336)
-    let scale = max(targetSide / size.width, targetSide / size.height)
-    let scaledSize = CGSize(width: size.width * scale, height: size.height * scale)
+  func centerCropped(to sideSize: CGFloat) -> UIImage {
+    precondition(sideSize > 0)
     let format = UIGraphicsImageRendererFormat.default()
     format.scale = 1
-    let scaledImage = UIGraphicsImageRenderer(size: scaledSize, format: format).image { _ in
-      draw(in: CGRect(origin: .zero, size: scaledSize))
-    }
-    guard let scaledCGImage = scaledImage.cgImage else {
-      return Image(data: Data(), width: 336, height: 336, channels: 3)
-    }
-    let cropRect = CGRect(
-      x: ((scaledSize.width - targetSide) * 0.5).rounded(.down),
-      y: ((scaledSize.height - targetSide) * 0.5).rounded(.down),
-      width: targetSide.rounded(.down),
-      height: targetSide.rounded(.down)
-    )
-    let croppedCGImage = scaledCGImage.cropping(to: cropRect) ?? scaledCGImage
-    let imageWidth = croppedCGImage.width
-    let imageHeight = croppedCGImage.height
-    let pixelCount = imageWidth * imageHeight
-    var rgbaBuffer = [UInt8](repeating: 0, count: pixelCount * 4)
-    let context = CGContext(
+    format.opaque = false
+    return UIGraphicsImageRenderer(size: CGSize(width: sideSize, height: sideSize), format: format)
+      .image { _ in
+        let scaleFactor = max(sideSize / size.width, sideSize / size.height)
+        let scaledWidth = size.width * scaleFactor
+        let scaledHeight = size.height * scaleFactor
+        let originX = (sideSize - scaledWidth) / 2
+        let originY = (sideSize - scaledHeight) / 2
+        draw(in: CGRect(x: originX, y: originY, width: scaledWidth, height: scaledHeight))
+      }
+  }
+
+  func rgbBytes() -> [UInt8]? {
+    guard let cgImage = cgImage else { return nil }
+    let pixelWidth = Int(cgImage.width)
+    let pixelHeight = Int(cgImage.height)
+    let pixelCount = pixelWidth * pixelHeight
+    let bytesPerPixel = 4
+    let bytesPerRow = pixelWidth * bytesPerPixel
+    var rgbaBuffer = [UInt8](repeating: 0, count: pixelCount * bytesPerPixel)
+    guard let context = CGContext(
       data: &rgbaBuffer,
-      width: imageWidth,
-      height: imageHeight,
+      width: pixelWidth,
+      height: pixelHeight,
       bitsPerComponent: 8,
-      bytesPerRow: imageWidth * 4,
+      bytesPerRow: bytesPerRow,
       space: CGColorSpaceCreateDeviceRGB(),
       bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue | CGBitmapInfo.byteOrder32Big.rawValue
-    )!
-    context.draw(croppedCGImage, in: CGRect(x: 0, y: 0, width: imageWidth, height: imageHeight))
-    var planarRGB = [UInt8](repeating: 0, count: pixelCount * 3)
+    ) else { return nil }
+
+    context.draw(cgImage, in: CGRect(x: 0, y: 0, width: pixelWidth, height: pixelHeight))
+
+    var rgbBytes = [UInt8](repeating: 0, count: pixelCount * 3)
     for pixelIndex in 0..<pixelCount {
-      planarRGB[pixelIndex] = rgbaBuffer[pixelIndex * 4]
-      planarRGB[pixelIndex + pixelCount] = rgbaBuffer[pixelIndex * 4 + 1]
-      planarRGB[pixelIndex + pixelCount * 2] = rgbaBuffer[pixelIndex * 4 + 2]
+      rgbBytes[pixelIndex] = rgbaBuffer[pixelIndex * 4]
+      rgbBytes[pixelIndex + pixelCount] = rgbaBuffer[pixelIndex * 4 + 1]
+      rgbBytes[pixelIndex + pixelCount * 2] = rgbaBuffer[pixelIndex * 4 + 2]
     }
-    return Image(data: Data(planarRGB), width: imageWidth, height: imageHeight, channels: 3)
-  }
+    return rgbBytes
+  }
+
+  func rgbBytesNormalized(mean: [Float], std: [Float]) -> [Float]? {
+    precondition(mean.count == 3 && std.count == 3)
+    precondition(std[0] != 0 && std[1] != 0 && std[2] != 0)
+    guard let rgbBytes = rgbBytes() else { return nil }
+    let pixelCount = rgbBytes.count / 3
+    var rgbBytesNormalized = [Float](repeating: 0, count: pixelCount * 3)
+    for pixelIndex in 0..<pixelCount {
+      rgbBytesNormalized[pixelIndex] = (Float(rgbBytes[pixelIndex]) / 255.0 - mean[0]) / std[0]
+      rgbBytesNormalized[pixelIndex + pixelCount] = (Float(rgbBytes[pixelIndex + pixelCount]) / 255.0 - mean[1]) / std[1]
+      rgbBytesNormalized[pixelIndex + pixelCount * 2] = (Float(rgbBytes[pixelIndex + pixelCount * 2]) / 255.0 - mean[2]) / std[2]
+    }
+    return rgbBytesNormalized
+  }
+
+  func asImage(_ sideSize: CGFloat) -> Image {
+    return Image(
+      data: Data(centerCropped(to: sideSize).rgbBytes() ?? []),
+      width: Int(sideSize),
+      height: Int(sideSize),
+      channels: 3
+    )
+  }
+
+  func asNormalizedImage(
+    _ sideSize: CGFloat,
+    mean: [Float] = [0.485, 0.456, 0.406],
+    std: [Float] = [0.229, 0.224, 0.225]
+  ) -> Image {
+    return Image(
+      float: (centerCropped(to: sideSize).rgbBytesNormalized(mean: mean, std: std) ?? []).withUnsafeBufferPointer { Data(buffer: $0) },
+      width: Int(sideSize),
+      height: Int(sideSize),
+      channels: 3
+    )
+  }
 }
 
 class MultimodalRunnerTest: XCTestCase {
-  let systemPrompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: "
-  let assistantPrompt = "ASSISTANT: "
+  let systemPrompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
   let userPrompt = "What's on the picture?"
-  let sequenceLength = 768
+
+  func testGemma() {
+    let chatTemplate = "<start_of_turn>user\n%@<end_of_turn>\n<start_of_turn>model"
+    let sideSize: CGFloat = 896
+    let sequenceLength = 768
+    let bundle = Bundle(for: type(of: self))
+    guard let modelPath = bundle.path(forResource: "gemma3", ofType: "pte"),
+          let tokenizerPath = bundle.path(forResource: "gemma3_tokenizer", ofType: "model"),
+          let imagePath = bundle.path(forResource: "IMG_0005", ofType: "jpg"),
+          let uiImage = UIImage(contentsOfFile: imagePath) else {
+      XCTFail("Couldn't find model or tokenizer files")
+      return
+    }
+    let runner = MultimodalRunner(modelPath: modelPath, tokenizerPath: tokenizerPath)
+    var text = ""
+
+    do {
+      try runner.generate([
+        MultimodalInput(systemPrompt),
+        MultimodalInput(uiImage.asNormalizedImage(sideSize)),
+        MultimodalInput(String(format: chatTemplate, userPrompt)),
+      ], Config {
+        $0.sequenceLength = sequenceLength
+      }) { token in
+        text += token
+        if token == "<end_of_turn>" {
+          runner.stop()
+        }
+      }
+    } catch {
+      XCTFail("Failed to generate text with error \(error)")
+    }
+    XCTAssertTrue(text.lowercased().contains("waterfall"))
+
+    text = ""
+    runner.reset()
+    do {
+      try runner.generate([
+        MultimodalInput(systemPrompt),
+        MultimodalInput(uiImage.asNormalizedImage(sideSize)),
+        MultimodalInput(String(format: chatTemplate, userPrompt)),
+      ], Config {
+        $0.sequenceLength = sequenceLength
+      }) { token in
+        text += token
+        if token == "<end_of_turn>" {
+          runner.stop()
+        }
+      }
+    } catch {
+      XCTFail("Failed to generate text with error \(error)")
+    }
+    XCTAssertTrue(text.lowercased().contains("waterfall"))
+  }
 
   func testLLaVA() {
+    let chatTemplate = "USER: %@ ASSISTANT: "
+    let sideSize: CGFloat = 336
+    let sequenceLength = 768
     let bundle = Bundle(for: type(of: self))
     guard let modelPath = bundle.path(forResource: "llava", ofType: "pte"),
-          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin"),
+          let tokenizerPath = bundle.path(forResource: "llava_tokenizer", ofType: "bin"),
           let imagePath = bundle.path(forResource: "IMG_0005", ofType: "jpg"),
           let uiImage = UIImage(contentsOfFile: imagePath) else {
       XCTFail("Couldn't find model or tokenizer files")
@@ -75,8 +172,8 @@ class MultimodalRunnerTest: XCTestCase {
     do {
       try runner.generate([
         MultimodalInput(systemPrompt),
-        MultimodalInput(uiImage.asImage()),
-        MultimodalInput("\(userPrompt) \(assistantPrompt)"),
+        MultimodalInput(uiImage.asImage(sideSize)),
+        MultimodalInput(String(format: chatTemplate, userPrompt)),
       ], Config {
         $0.sequenceLength = sequenceLength
       }) { token in
@@ -92,8 +189,8 @@ class MultimodalRunnerTest: XCTestCase {
     do {
       try runner.generate([
         MultimodalInput(systemPrompt),
-        MultimodalInput(uiImage.asImage()),
-        MultimodalInput("\(userPrompt) \(assistantPrompt)"),
+        MultimodalInput(uiImage.asImage(sideSize)),
+        MultimodalInput(String(format: chatTemplate, userPrompt)),
       ], Config {
         $0.sequenceLength = sequenceLength
       }) { token in
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
index f7124fec640..0fa2b59d05d 100644
--- a/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
+++ b/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
@@ -42,7 +42,7 @@ class TextRunnerTest: XCTestCase {
   func testLLaMA() {
     let bundle = Bundle(for: type(of: self))
     guard let modelPath = bundle.path(forResource: "llama3_2-1B", ofType: "pte"),
-          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "model") else {
+          let tokenizerPath = bundle.path(forResource: "llama_tokenizer", ofType: "model") else {
       XCTFail("Couldn't find model or tokenizer files")
       return
     }
@@ -77,7 +77,7 @@ class TextRunnerTest: XCTestCase {
   func testPhi4() {
     let bundle = Bundle(for: type(of: self))
     guard let modelPath = bundle.path(forResource: "phi4-mini", ofType: "pte"),
-          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "json") else {
+          let tokenizerPath = bundle.path(forResource: "phi_tokenizer", ofType: "json") else {
       XCTFail("Couldn't find model or tokenizer files")
       return
     }

From 8f2944964e8b8d6808c3193014bc4f8a87a5e927 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin
Date: Mon, 13 Oct 2025 13:13:33 -0700
Subject: [PATCH 2/2] Update MultimodalRunnerTest.swift