diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
index ce943be0dd0..b2e36e0a1f2 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
@@ -44,6 +44,12 @@ __attribute__((objc_subclassing_restricted))
                         channels:(NSInteger)channels
     NS_DESIGNATED_INITIALIZER;
 
+- (instancetype)initWithFloatData:(NSData *)data
+                            width:(NSInteger)width
+                           height:(NSInteger)height
+                         channels:(NSInteger)channels
+    NS_DESIGNATED_INITIALIZER;
+
 @property(nonatomic, readonly) NSData *data;
 
 @property(nonatomic, readonly) NSInteger width;
@@ -52,6 +58,8 @@ __attribute__((objc_subclassing_restricted))
 
 @property(nonatomic, readonly) NSInteger channels;
 
+@property(nonatomic, readonly) BOOL isFloat;
+
 + (instancetype)new NS_UNAVAILABLE;
 - (instancetype)init NS_UNAVAILABLE;
 
@@ -80,6 +88,12 @@ __attribute__((objc_subclassing_restricted))
                           frames:(NSInteger)frames
     NS_DESIGNATED_INITIALIZER;
 
+- (instancetype)initWithFloatData:(NSData *)data
+                        batchSize:(NSInteger)batchSize
+                             bins:(NSInteger)bins
+                           frames:(NSInteger)frames
+    NS_DESIGNATED_INITIALIZER;
+
 @property(nonatomic, readonly) NSData *data;
 
 @property(nonatomic, readonly) NSInteger batchSize;
@@ -88,6 +102,8 @@ __attribute__((objc_subclassing_restricted))
 
 @property(nonatomic, readonly) NSInteger frames;
 
+@property(nonatomic, readonly) BOOL isFloat;
+
 + (instancetype)new NS_UNAVAILABLE;
 - (instancetype)init NS_UNAVAILABLE;
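Note on the header additions above: the two `initWithFloatData:` overloads reuse the same `NSData` container as the existing byte initializers and tag the payload as float32 via `isFloat`, so the element type is fixed at construction time. A minimal sketch of building both image flavors from Swift, assuming the float initializer is imported as `Image(float:width:height:channels:)` (the spelling the updated tests below use); sizes and values are illustrative only:

import ExecuTorchLLM

let side = 336

// Classic path: one byte per channel value, isFloat == false.
let uint8Image = Image(
  data: Data([UInt8](repeating: 0, count: side * side * 3)),
  width: side,
  height: side,
  channels: 3
)

// New path: four bytes per channel value, isFloat == true.
let floatValues = [Float](repeating: 0, count: side * side * 3)
let floatImage = Image(
  float: floatValues.withUnsafeBufferPointer { Data(buffer: $0) },
  width: side,
  height: side,
  channels: 3
)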
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
index a3dc3e6afd1..964805053e2 100644
--- a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
+++ b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
@@ -32,6 +32,22 @@ - (instancetype)initWithData:(NSData *)data
     _width = width;
     _height = height;
     _channels = channels;
+    _isFloat = NO;
+  }
+  return self;
+}
+
+- (instancetype)initWithFloatData:(NSData *)data
+                            width:(NSInteger)width
+                           height:(NSInteger)height
+                         channels:(NSInteger)channels {
+  self = [super init];
+  if (self) {
+    _data = [data copy];
+    _width = width;
+    _height = height;
+    _channels = channels;
+    _isFloat = YES;
   }
   return self;
 }
@@ -53,6 +69,22 @@ - (instancetype)initWithData:(NSData *)data
     _batchSize = batchSize;
     _bins = bins;
     _frames = frames;
+    _isFloat = NO;
+  }
+  return self;
+}
+
+- (instancetype)initWithFloatData:(NSData *)data
+                        batchSize:(NSInteger)batchSize
+                             bins:(NSInteger)bins
+                           frames:(NSInteger)frames {
+  self = [super init];
+  if (self) {
+    _data = [data copy];
+    _batchSize = batchSize;
+    _bins = bins;
+    _frames = frames;
+    _isFloat = YES;
   }
   return self;
 }
@@ -170,6 +202,7 @@ - (BOOL)generateWithInputs:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
     return NO;
   }
   std::vector<llm::MultimodalInput> nativeInputs;
+  nativeInputs.reserve((size_t)inputs.count);
   for (ExecuTorchLLMMultimodalInput *input in inputs) {
     switch (input.type) {
       case ExecuTorchLLMMultimodalInputTypeText:
@@ -177,13 +210,50 @@ - (BOOL)generateWithInputs:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
         break;
       case ExecuTorchLLMMultimodalInputTypeImage: {
         ExecuTorchLLMImage *image = input.image;
-        std::vector<uint8_t> data((uint8_t *)image.data.bytes, (uint8_t *)image.data.bytes + image.data.length);
-        nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
-          std::move(data),
-          (int32_t)image.width,
-          (int32_t)image.height,
-          (int32_t)image.channels
-        )));
+        if (image.isFloat) {
+          const float *buffer = (const float *)image.data.bytes;
+          size_t elementCount = (size_t)image.data.length / sizeof(float);
+          std::vector<float> data(buffer, buffer + elementCount);
+          nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
+            std::move(data),
+            (int32_t)image.width,
+            (int32_t)image.height,
+            (int32_t)image.channels
+          )));
+        } else {
+          const uint8_t *buffer = (const uint8_t *)image.data.bytes;
+          std::vector<uint8_t> data(buffer, buffer + image.data.length);
+          nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
+            std::move(data),
+            (int32_t)image.width,
+            (int32_t)image.height,
+            (int32_t)image.channels
+          )));
+        }
+        break;
+      }
+      case ExecuTorchLLMMultimodalInputTypeAudio: {
+        ExecuTorchLLMAudio *audio = input.audio;
+        if (audio.isFloat) {
+          const float *buffer = (const float *)audio.data.bytes;
+          size_t elementCount = (size_t)audio.data.length / sizeof(float);
+          std::vector<float> data(buffer, buffer + elementCount);
+          nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
+            std::move(data),
+            (int32_t)audio.batchSize,
+            (int32_t)audio.bins,
+            (int32_t)audio.frames
+          )));
+        } else {
+          const uint8_t *buffer = (const uint8_t *)audio.data.bytes;
+          std::vector<uint8_t> data(buffer, buffer + audio.data.length);
+          nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
+            std::move(data),
+            (int32_t)audio.batchSize,
+            (int32_t)audio.bins,
+            (int32_t)audio.frames
+          )));
+        }
         break;
       }
       default: {
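On the native side, the element count for float payloads is derived from the byte length (`data.length / sizeof(float)`), so a mis-sized `NSData` is silently truncated rather than rejected. A hypothetical caller-side check, not part of this patch, that mirrors the bridge's size math before an input is constructed:

// Illustrative only: verifies that a payload matches the
// width * height * channels element count the bridge will assume.
func expectedByteCount(width: Int, height: Int, channels: Int, isFloat: Bool) -> Int {
  let bytesPerElement = isFloat ? MemoryLayout<Float>.stride : MemoryLayout<UInt8>.stride
  return width * height * channels * bytesPerElement
}

assert(expectedByteCount(width: 336, height: 336, channels: 3, isFloat: true) == 336 * 336 * 3 * 4)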
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
index f64b81908b8..3617245b8f8 100644
--- a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
+++ b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
@@ -11,47 +11,89 @@ import ExecuTorchLLM
 import XCTest
 
 extension UIImage {
-  func asImage() -> Image {
-    let targetSide = CGFloat(336)
-    let scale = max(targetSide / size.width, targetSide / size.height)
-    let scaledSize = CGSize(width: size.width * scale, height: size.height * scale)
+  func centerCropped(to sideSize: CGFloat) -> UIImage {
+    precondition(sideSize > 0)
     let format = UIGraphicsImageRendererFormat.default()
     format.scale = 1
-    let scaledImage = UIGraphicsImageRenderer(size: scaledSize, format: format).image { _ in
-      draw(in: CGRect(origin: .zero, size: scaledSize))
-    }
-    guard let scaledCGImage = scaledImage.cgImage else {
-      return Image(data: Data(), width: 336, height: 336, channels: 3)
-    }
-    let cropRect = CGRect(
-      x: ((scaledSize.width - targetSide) * 0.5).rounded(.down),
-      y: ((scaledSize.height - targetSide) * 0.5).rounded(.down),
-      width: targetSide.rounded(.down),
-      height: targetSide.rounded(.down)
-    )
-    let croppedCGImage = scaledCGImage.cropping(to: cropRect) ?? scaledCGImage
-    let imageWidth = croppedCGImage.width
-    let imageHeight = croppedCGImage.height
-    let pixelCount = imageWidth * imageHeight
-    var rgbaBuffer = [UInt8](repeating: 0, count: pixelCount * 4)
-    let context = CGContext(
+    format.opaque = false
+    return UIGraphicsImageRenderer(size: CGSize(width: sideSize, height: sideSize), format: format)
+      .image { _ in
+        let scaleFactor = max(sideSize / size.width, sideSize / size.height)
+        let scaledWidth = size.width * scaleFactor
+        let scaledHeight = size.height * scaleFactor
+        let originX = (sideSize - scaledWidth) / 2
+        let originY = (sideSize - scaledHeight) / 2
+        draw(in: CGRect(x: originX, y: originY, width: scaledWidth, height: scaledHeight))
+      }
+  }
+
+  func rgbBytes() -> [UInt8]? {
+    guard let cgImage = cgImage else { return nil }
+    let pixelWidth = Int(cgImage.width)
+    let pixelHeight = Int(cgImage.height)
+    let pixelCount = pixelWidth * pixelHeight
+    let bytesPerPixel = 4
+    let bytesPerRow = pixelWidth * bytesPerPixel
+    var rgbaBuffer = [UInt8](repeating: 0, count: pixelCount * bytesPerPixel)
+    guard let context = CGContext(
       data: &rgbaBuffer,
-      width: imageWidth,
-      height: imageHeight,
+      width: pixelWidth,
+      height: pixelHeight,
       bitsPerComponent: 8,
-      bytesPerRow: imageWidth * 4,
+      bytesPerRow: bytesPerRow,
       space: CGColorSpaceCreateDeviceRGB(),
       bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue | CGBitmapInfo.byteOrder32Big.rawValue
-    )!
-    context.draw(croppedCGImage, in: CGRect(x: 0, y: 0, width: imageWidth, height: imageHeight))
-    var planarRGB = [UInt8](repeating: 0, count: pixelCount * 3)
-    for pixelIndex in 0..<pixelCount {
-      planarRGB[pixelIndex] = rgbaBuffer[pixelIndex * 4]
-      planarRGB[pixelCount + pixelIndex] = rgbaBuffer[pixelIndex * 4 + 1]
-      planarRGB[pixelCount * 2 + pixelIndex] = rgbaBuffer[pixelIndex * 4 + 2]
-    }
-    return Image(data: Data(planarRGB), width: imageWidth, height: imageHeight, channels: 3)
+    ) else { return nil }
+
+    context.draw(cgImage, in: CGRect(x: 0, y: 0, width: pixelWidth, height: pixelHeight))
+
+    var rgbBytes = [UInt8](repeating: 0, count: pixelCount * 3)
+    for pixelIndex in 0..<pixelCount {
+      rgbBytes[pixelIndex * 3 + 0] = rgbaBuffer[pixelIndex * 4 + 0]
+      rgbBytes[pixelIndex * 3 + 1] = rgbaBuffer[pixelIndex * 4 + 1]
+      rgbBytes[pixelIndex * 3 + 2] = rgbaBuffer[pixelIndex * 4 + 2]
+    }
+    return rgbBytes
+  }
+
+  func rgbBytesNormalized(mean: [Float], std: [Float]) -> [Float]? {
+    precondition(mean.count == 3 && std.count == 3)
+    precondition(std[0] != 0 && std[1] != 0 && std[2] != 0)
+    guard let rgbBytes = rgbBytes() else { return nil }
+    let pixelCount = rgbBytes.count / 3
+    var rgbBytesNormalized = [Float](repeating: 0, count: pixelCount * 3)
+    for pixelIndex in 0..<pixelCount {
+      rgbBytesNormalized[pixelIndex * 3 + 0] = (Float(rgbBytes[pixelIndex * 3 + 0]) / 255 - mean[0]) / std[0]
+      rgbBytesNormalized[pixelIndex * 3 + 1] = (Float(rgbBytes[pixelIndex * 3 + 1]) / 255 - mean[1]) / std[1]
+      rgbBytesNormalized[pixelIndex * 3 + 2] = (Float(rgbBytes[pixelIndex * 3 + 2]) / 255 - mean[2]) / std[2]
+    }
+    return rgbBytesNormalized
+  }
+
+  func asImage(_ sideSize: CGFloat) -> Image {
+    return Image(
+      data: Data(centerCropped(to: sideSize).rgbBytes() ?? []),
+      width: Int(sideSize),
+      height: Int(sideSize),
+      channels: 3
+    )
+  }
+
+  func asNormalizedImage(
+    _ sideSize: CGFloat,
+    mean: [Float] = [0.485, 0.456, 0.406],
+    std: [Float] = [0.229, 0.224, 0.225]
+  ) -> Image {
+    return Image(
+      float: (centerCropped(to: sideSize).rgbBytesNormalized(mean: mean, std: std) ?? []).withUnsafeBufferPointer { Data(buffer: $0) },
+      width: Int(sideSize),
+      height: Int(sideSize),
+      channels: 3
+    )
   }
 }
@@ -120,7 +162,7 @@ class MultimodalRunnerTest: XCTestCase {
     let sequenceLength = 768
     let bundle = Bundle(for: type(of: self))
     guard let modelPath = bundle.path(forResource: "llava", ofType: "pte"),
-          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin"),
+          let tokenizerPath = bundle.path(forResource: "llava_tokenizer", ofType: "bin"),
           let imagePath = bundle.path(forResource: "IMG_0005", ofType: "jpg"),
           let uiImage = UIImage(contentsOfFile: imagePath) else {
       XCTFail("Couldn't find model or tokenizer files")
@@ -132,8 +174,8 @@ class MultimodalRunnerTest: XCTestCase {
     do {
       try runner.generate([
         MultimodalInput(systemPrompt),
-        MultimodalInput(uiImage.asImage()),
-        MultimodalInput("\(userPrompt) \(assistantPrompt)"),
+        MultimodalInput(uiImage.asImage(sideSize)),
+        MultimodalInput(String(format: chatTemplate, userPrompt)),
       ], Config {
        $0.sequenceLength = sequenceLength
      }) { token in
@@ -149,8 +191,8 @@ class MultimodalRunnerTest: XCTestCase {
     do {
       try runner.generate([
         MultimodalInput(systemPrompt),
-        MultimodalInput(uiImage.asImage()),
-        MultimodalInput("\(userPrompt) \(assistantPrompt)"),
+        MultimodalInput(uiImage.asImage(sideSize)),
+        MultimodalInput(String(format: chatTemplate, userPrompt)),
       ], Config {
        $0.sequenceLength = sequenceLength
      }) { token in
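The reworked helpers split the old monolithic `asImage()` into composable steps: `centerCropped(to:)` scales and crops, `rgbBytes()` extracts interleaved RGB bytes in place of the previous planar buffer, and `rgbBytesNormalized(mean:std:)` applies the ImageNet mean/std defaults for the float path. A sketch of driving the runner with the normalized float input, assuming a `MultimodalRunner(modelPath:tokenizerPath:)` initializer and the prompt and template constants defined elsewhere in this test class:

let runner = MultimodalRunner(modelPath: modelPath, tokenizerPath: tokenizerPath)
try runner.generate([
  MultimodalInput(systemPrompt),
  MultimodalInput(uiImage.asNormalizedImage(336)),  // float32 pixels, isFloat == true
  MultimodalInput(String(format: chatTemplate, userPrompt)),
], Config {
  $0.sequenceLength = 768
}) { token in
  print(token, terminator: "")
}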
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
index f7124fec640..0fa2b59d05d 100644
--- a/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
+++ b/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift
@@ -42,7 +42,7 @@ class TextRunnerTest: XCTestCase {
   func testLLaMA() {
     let bundle = Bundle(for: type(of: self))
     guard let modelPath = bundle.path(forResource: "llama3_2-1B", ofType: "pte"),
-          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "model") else {
+          let tokenizerPath = bundle.path(forResource: "llama_tokenizer", ofType: "model") else {
       XCTFail("Couldn't find model or tokenizer files")
       return
     }
@@ -77,7 +77,7 @@ class TextRunnerTest: XCTestCase {
   func testPhi4() {
     let bundle = Bundle(for: type(of: self))
     guard let modelPath = bundle.path(forResource: "phi4-mini", ofType: "pte"),
-          let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "json") else {
+          let tokenizerPath = bundle.path(forResource: "phi_tokenizer", ofType: "json") else {
       XCTFail("Couldn't find model or tokenizer files")
       return
     }