@@ -44,6 +44,12 @@ __attribute__((objc_subclassing_restricted))
channels:(NSInteger)channels
NS_DESIGNATED_INITIALIZER;

- (instancetype)initWithFloatData:(NSData *)data
width:(NSInteger)width
height:(NSInteger)height
channels:(NSInteger)channels
NS_DESIGNATED_INITIALIZER;

@property(nonatomic, readonly) NSData *data;

@property(nonatomic, readonly) NSInteger width;
@@ -52,6 +58,8 @@ __attribute__((objc_subclassing_restricted))

@property(nonatomic, readonly) NSInteger channels;

@property(nonatomic, readonly) BOOL isFloat;

+ (instancetype)new NS_UNAVAILABLE;
- (instancetype)init NS_UNAVAILABLE;

@@ -80,6 +88,12 @@ __attribute__((objc_subclassing_restricted))
frames:(NSInteger)frames
NS_DESIGNATED_INITIALIZER;

- (instancetype)initWithFloatData:(NSData *)data
batchSize:(NSInteger)batchSize
bins:(NSInteger)bins
frames:(NSInteger)frames
NS_DESIGNATED_INITIALIZER;

@property(nonatomic, readonly) NSData *data;

@property(nonatomic, readonly) NSInteger batchSize;
@@ -88,6 +102,8 @@ __attribute__((objc_subclassing_restricted))

@property(nonatomic, readonly) NSInteger frames;

@property(nonatomic, readonly) BOOL isFloat;

+ (instancetype)new NS_UNAVAILABLE;
- (instancetype)init NS_UNAVAILABLE;

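For context, a minimal Swift sketch of how the new float-backed initializers could be used. The `Image(float:width:height:channels:)` spelling is taken from the tests further down in this diff; the `Audio(float:batchSize:bins:frames:)` spelling, the `MultimodalInput(_:)` wrapper for audio, and the concrete shapes are assumptions for illustration only.

// Hypothetical usage sketch, not part of the diff.
// The tests below pack pixels as planar (channel-planar) RGB float32; the
// initializer stores a copy of the provided data and marks the input as float.
import ExecuTorchLLM
import Foundation

let pixels = [Float](repeating: 0, count: 3 * 336 * 336)            // illustrative values
let imageData = pixels.withUnsafeBufferPointer { Data(buffer: $0) }
let image = Image(float: imageData, width: 336, height: 336, channels: 3)

// Assumed analogous spelling for float32 audio features (e.g. mel spectrogram bins).
let features = [Float](repeating: 0, count: 1 * 128 * 3000)         // illustrative shape
let audioData = features.withUnsafeBufferPointer { Data(buffer: $0) }
let audio = Audio(float: audioData, batchSize: 1, bins: 128, frames: 3000)

let inputs = [MultimodalInput(image), MultimodalInput(audio)]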
@@ -32,6 +32,22 @@ - (instancetype)initWithData:(NSData *)data
_width = width;
_height = height;
_channels = channels;
_isFloat = NO;
}
return self;
}

- (instancetype)initWithFloatData:(NSData *)data
width:(NSInteger)width
height:(NSInteger)height
channels:(NSInteger)channels {
self = [super init];
if (self) {
_data = [data copy];
_width = width;
_height = height;
_channels = channels;
_isFloat = YES;
}
return self;
}
@@ -53,6 +69,22 @@ - (instancetype)initWithData:(NSData *)data
_batchSize = batchSize;
_bins = bins;
_frames = frames;
_isFloat = NO;
}
return self;
}

- (instancetype)initWithFloatData:(NSData *)data
batchSize:(NSInteger)batchSize
bins:(NSInteger)bins
frames:(NSInteger)frames {
self = [super init];
if (self) {
_data = [data copy];
_batchSize = batchSize;
_bins = bins;
_frames = frames;
_isFloat = YES;
}
return self;
}
@@ -170,20 +202,58 @@ - (BOOL)generateWithInputs:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
return NO;
}
std::vector<llm::MultimodalInput> nativeInputs;
nativeInputs.reserve((size_t)inputs.count);
for (ExecuTorchLLMMultimodalInput *input in inputs) {
switch (input.type) {
case ExecuTorchLLMMultimodalInputTypeText:
nativeInputs.emplace_back(llm::MultimodalInput(input.text.UTF8String));
break;
case ExecuTorchLLMMultimodalInputTypeImage: {
ExecuTorchLLMImage *image = input.image;
std::vector<uint8_t> data((uint8_t *)image.data.bytes, (uint8_t *)image.data.bytes + image.data.length);
nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
std::move(data),
(int32_t)image.width,
(int32_t)image.height,
(int32_t)image.channels
)));
if (image.isFloat) {
const float *buffer = (const float *)image.data.bytes;
size_t elementCount = (size_t)image.data.length / sizeof(float);
std::vector<float> data(buffer, buffer + elementCount);
nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
std::move(data),
(int32_t)image.width,
(int32_t)image.height,
(int32_t)image.channels
)));
} else {
const uint8_t *buffer = (const uint8_t *)image.data.bytes;
std::vector<uint8_t> data(buffer, buffer + image.data.length);
nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
std::move(data),
(int32_t)image.width,
(int32_t)image.height,
(int32_t)image.channels
)));
}
break;
}
case ExecuTorchLLMMultimodalInputTypeAudio: {
ExecuTorchLLMAudio *audio = input.audio;
if (audio.isFloat) {
const float *buffer = (const float *)audio.data.bytes;
size_t elementCount = (size_t)audio.data.length / sizeof(float);
std::vector<float> data(buffer, buffer + elementCount);
nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
std::move(data),
(int32_t)audio.batchSize,
(int32_t)audio.bins,
(int32_t)audio.frames
)));
} else {
const uint8_t *buffer = (const uint8_t *)audio.data.bytes;
std::vector<uint8_t> data(buffer, buffer + audio.data.length);
nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
std::move(data),
(int32_t)audio.batchSize,
(int32_t)audio.bins,
(int32_t)audio.frames
)));
}
break;
}
default: {
@@ -10,60 +10,157 @@ import ExecuTorchLLM
import XCTest

extension UIImage {
func asImage() -> Image {
let targetSide = CGFloat(336)
let scale = max(targetSide / size.width, targetSide / size.height)
let scaledSize = CGSize(width: size.width * scale, height: size.height * scale)
func centerCropped(to sideSize: CGFloat) -> UIImage {
precondition(sideSize > 0)
let format = UIGraphicsImageRendererFormat.default()
format.scale = 1
let scaledImage = UIGraphicsImageRenderer(size: scaledSize, format: format).image { _ in
draw(in: CGRect(origin: .zero, size: scaledSize))
}
guard let scaledCGImage = scaledImage.cgImage else {
return Image(data: Data(), width: 336, height: 336, channels: 3)
}
let cropRect = CGRect(
x: ((scaledSize.width - targetSide) * 0.5).rounded(.down),
y: ((scaledSize.height - targetSide) * 0.5).rounded(.down),
width: targetSide.rounded(.down),
height: targetSide.rounded(.down)
)
let croppedCGImage = scaledCGImage.cropping(to: cropRect) ?? scaledCGImage
let imageWidth = croppedCGImage.width
let imageHeight = croppedCGImage.height
let pixelCount = imageWidth * imageHeight
var rgbaBuffer = [UInt8](repeating: 0, count: pixelCount * 4)
let context = CGContext(
format.opaque = false
return UIGraphicsImageRenderer(size: CGSize(width: sideSize, height: sideSize), format: format)
.image { _ in
let scaleFactor = max(sideSize / size.width, sideSize / size.height)
let scaledWidth = size.width * scaleFactor
let scaledHeight = size.height * scaleFactor
let originX = (sideSize - scaledWidth) / 2
let originY = (sideSize - scaledHeight) / 2
draw(in: CGRect(x: originX, y: originY, width: scaledWidth, height: scaledHeight))
}
}

func rgbBytes() -> [UInt8]? {
guard let cgImage = cgImage else { return nil }
let pixelWidth = Int(cgImage.width)
let pixelHeight = Int(cgImage.height)
let pixelCount = pixelWidth * pixelHeight
let bytesPerPixel = 4
let bytesPerRow = pixelWidth * bytesPerPixel
var rgbaBuffer = [UInt8](repeating: 0, count: pixelCount * bytesPerPixel)
guard let context = CGContext(
data: &rgbaBuffer,
width: imageWidth,
height: imageHeight,
width: pixelWidth,
height: pixelHeight,
bitsPerComponent: 8,
bytesPerRow: imageWidth * 4,
bytesPerRow: bytesPerRow,
space: CGColorSpaceCreateDeviceRGB(),
bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue | CGBitmapInfo.byteOrder32Big.rawValue
)!
context.draw(croppedCGImage, in: CGRect(x: 0, y: 0, width: imageWidth, height: imageHeight))
var planarRGB = [UInt8](repeating: 0, count: pixelCount * 3)
) else { return nil }

context.draw(cgImage, in: CGRect(x: 0, y: 0, width: pixelWidth, height: pixelHeight))

var rgbBytes = [UInt8](repeating: 0, count: pixelCount * 3)
for pixelIndex in 0..<pixelCount {
let sourceOffset = pixelIndex * 4
planarRGB[pixelIndex] = rgbaBuffer[sourceOffset]
planarRGB[pixelIndex + pixelCount] = rgbaBuffer[sourceOffset + 1]
planarRGB[pixelIndex + pixelCount * 2] = rgbaBuffer[sourceOffset + 2]
let sourceIndex = pixelIndex * bytesPerPixel
rgbBytes[pixelIndex] = rgbaBuffer[sourceIndex + 0]
rgbBytes[pixelIndex + pixelCount] = rgbaBuffer[sourceIndex + 1]
rgbBytes[pixelIndex + 2 * pixelCount] = rgbaBuffer[sourceIndex + 2]
}
return Image(data: Data(planarRGB), width: 336, height: 336, channels: 3)
return rgbBytes
}

func rgbBytesNormalized(mean: [Float] = [0, 0, 0], std: [Float] = [1, 1, 1]) -> [Float]? {
precondition(mean.count == 3 && std.count == 3)
precondition(std[0] != 0 && std[1] != 0 && std[2] != 0)
guard let rgbBytes = rgbBytes() else { return nil }
let pixelCount = rgbBytes.count / 3
var rgbBytesNormalized = [Float](repeating: 0, count: pixelCount * 3)
for pixelIndex in 0..<pixelCount {
rgbBytesNormalized[pixelIndex] =
(Float(rgbBytes[pixelIndex]) / 255.0 - mean[0]) / std[0]
rgbBytesNormalized[pixelIndex + pixelCount] =
(Float(rgbBytes[pixelIndex + pixelCount]) / 255.0 - mean[1]) / std[1]
rgbBytesNormalized[pixelIndex + 2 * pixelCount] =
(Float(rgbBytes[pixelIndex + 2 * pixelCount]) / 255.0 - mean[2]) / std[2]
}
return rgbBytesNormalized
}

func asImage(_ sideSize: CGFloat) -> Image {
return Image(
data: Data(centerCropped(to: sideSize).rgbBytes() ?? []),
width: Int(sideSize),
height: Int(sideSize),
channels: 3
)
}

func asNormalizedImage(
_ sideSize: CGFloat,
mean: [Float] = [0.485, 0.456, 0.406],
std: [Float] = [0.229, 0.224, 0.225]
) -> Image {
return Image(
float: (centerCropped(to: sideSize).rgbBytesNormalized(mean: mean, std: std) ?? []).withUnsafeBufferPointer { Data(buffer: $0) },
width: Int(sideSize),
height: Int(sideSize),
channels: 3
)
}
}
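For reference, a short hypothetical usage of the helpers above (assuming `uiImage` is already loaded); the default `mean`/`std` in `asNormalizedImage` are the standard ImageNet normalization constants.

// Hypothetical usage of the UIImage helpers defined above.
let raw = uiImage.asImage(336)                    // planar RGB, UInt8, 336 x 336 x 3
let normalized = uiImage.asNormalizedImage(336)   // planar RGB, Float32, ImageNet mean/std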

class MultimodalRunnerTest: XCTestCase {
let systemPrompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: "
let assistantPrompt = "ASSISTANT: "
let systemPrompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
let userPrompt = "What's on the picture?"
let sequenceLength = 768

func testGemma() {
let chatTemplate = "<start_of_turn>user\n%@<end_of_turn>\n<start_of_turn>model"
let sideSize: CGFloat = 896
let sequenceLength = 768
let bundle = Bundle(for: type(of: self))
guard let modelPath = bundle.path(forResource: "gemma3", ofType: "pte"),
let tokenizerPath = bundle.path(forResource: "gemma3_tokenizer", ofType: "model"),
let imagePath = bundle.path(forResource: "IMG_0005", ofType: "jpg"),
let uiImage = UIImage(contentsOfFile: imagePath) else {
XCTFail("Couldn't find model or tokenizer files")
return
}
let runner = MultimodalRunner(modelPath: modelPath, tokenizerPath: tokenizerPath)
var text = ""

do {
try runner.generate([
MultimodalInput(systemPrompt),
MultimodalInput(uiImage.asNormalizedImage(sideSize)),
MultimodalInput(String(format: chatTemplate, userPrompt)),
], Config {
$0.sequenceLength = sequenceLength
}) { token in
text += token
if token == "<end_of_turn>" {
runner.stop()
}
}
} catch {
XCTFail("Failed to generate text with error \(error)")
}
XCTAssertTrue(text.lowercased().contains("waterfall"))

text = ""
runner.reset()
do {
try runner.generate([
MultimodalInput(systemPrompt),
MultimodalInput(uiImage.asNormalizedImage(sideSize)),
MultimodalInput(String(format: chatTemplate, userPrompt)),
], Config {
$0.sequenceLength = sequenceLength
}) { token in
text += token
if token == "<end_of_turn>" {
runner.stop()
}
}
} catch {
XCTFail("Failed to generate text with error \(error)")
}
XCTAssertTrue(text.lowercased().contains("waterfall"))
}

func testLLaVA() {
let chatTemplate = "USER: %@ ASSISTANT: "
let sideSize: CGFloat = 336
let sequenceLength = 768
let bundle = Bundle(for: type(of: self))
guard let modelPath = bundle.path(forResource: "llava", ofType: "pte"),
let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "bin"),
let tokenizerPath = bundle.path(forResource: "llava_tokenizer", ofType: "bin"),
let imagePath = bundle.path(forResource: "IMG_0005", ofType: "jpg"),
let uiImage = UIImage(contentsOfFile: imagePath) else {
XCTFail("Couldn't find model or tokenizer files")
@@ -75,8 +172,8 @@ class MultimodalRunnerTest: XCTestCase {
do {
try runner.generate([
MultimodalInput(systemPrompt),
MultimodalInput(uiImage.asImage()),
MultimodalInput("\(userPrompt) \(assistantPrompt)"),
MultimodalInput(uiImage.asImage(sideSize)),
MultimodalInput(String(format: chatTemplate, userPrompt)),
], Config {
$0.sequenceLength = sequenceLength
}) { token in
@@ -92,8 +189,8 @@ class MultimodalRunnerTest: XCTestCase {
do {
try runner.generate([
MultimodalInput(systemPrompt),
MultimodalInput(uiImage.asImage()),
MultimodalInput("\(userPrompt) \(assistantPrompt)"),
MultimodalInput(uiImage.asImage(sideSize)),
MultimodalInput(String(format: chatTemplate, userPrompt)),
], Config {
$0.sequenceLength = sequenceLength
}) { token in
@@ -42,7 +42,7 @@ class TextRunnerTest: XCTestCase {
func testLLaMA() {
let bundle = Bundle(for: type(of: self))
guard let modelPath = bundle.path(forResource: "llama3_2-1B", ofType: "pte"),
let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "model") else {
let tokenizerPath = bundle.path(forResource: "llama_tokenizer", ofType: "model") else {
XCTFail("Couldn't find model or tokenizer files")
return
}
@@ -77,7 +77,7 @@ class TextRunnerTest: XCTestCase {
func testPhi4() {
let bundle = Bundle(for: type(of: self))
guard let modelPath = bundle.path(forResource: "phi4-mini", ofType: "pte"),
let tokenizerPath = bundle.path(forResource: "tokenizer", ofType: "json") else {
let tokenizerPath = bundle.path(forResource: "phi_tokenizer", ofType: "json") else {
XCTFail("Couldn't find model or tokenizer files")
return
}