diff --git a/QEfficient/cloud/execute.py b/QEfficient/cloud/execute.py index 27ea529cd..09e989ea0 100644 --- a/QEfficient/cloud/execute.py +++ b/QEfficient/cloud/execute.py @@ -115,7 +115,7 @@ def main( "--prompts_txt_file_path", "--prompts-txt-file-path", type=str, - help="File path for taking input prompts from txt file, sample prompts.txt file present in examples folder", + help="File path for taking input prompts from txt file, sample prompts.txt file present in examples/sample_prompts folder", ) parser.add_argument("--generation_len", "--generation-len", type=int, help="Number of tokens to generate") parser.add_argument( diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index fbff5b18b..ef05d29ab 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -390,7 +390,7 @@ def main( "--prompts_txt_file_path", "--prompts-txt-file-path", type=str, - help="File path for taking input prompts from txt file, sample prompts.txt file present in examples folder", + help="File path for taking input prompts from txt file, sample prompts.txt file present in examples/sample_prompts folder", ) parser.add_argument("--generation_len", "--generation-len", type=int, help="Number of tokens to generate") parser.add_argument( diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index 98ec72b7c..9358f9c4a 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -125,10 +125,10 @@ You can pass input prompts in single string but separate with pipe (|) symbol". python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompt "My name is|The flat earth theory is the belief that|The sun rises from" --mxfp6 --mos 1 --aic_enable_depth_first ``` -You can also pass path of txt file with input prompts when you want to run inference on lot of prompts, Example below, sample txt file(prompts.txt) is present in examples folder. +You can also pass path of txt file with input prompts when you want to run inference on lot of prompts, Example below, sample txt file(prompts.txt) is present in examples/sample_prompts folder. ```bash -python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 3 --prompt_len 32 --ctx_len 128 --num_cores 16 --device_group [0] --prompts_txt_file_path examples/sample_prompts/prompts.txt --mxfp6 --mos 1 --aic_enable_depth_first ``` **QNN CLI Inference Command** diff --git a/docs/source/release_docs.md b/docs/source/release_docs.md index 79e4bd181..97389e571 100644 --- a/docs/source/release_docs.md +++ b/docs/source/release_docs.md @@ -13,7 +13,7 @@ Welcome to the official release of **Efficient Transformer Library v1.20.0**! Th - Text & Image+Text support - Chunk attention, Single/Dual QPC support - Multi-image prompts enabled via VLLM interface - - [Llama4 Example Script](https://github.com/quic/efficient-transformers/blob/main/examples/llama4_example.py) + - [Llama4 Example Script](https://github.com/quic/efficient-transformers/blob/main/examples/image_text_to_text/models/llama_vision/single_image.py) - **Grok-1** - Executable via [`QEffAutoModelForCausalLM`](#QEffAutoModelForCausalLM) @@ -22,7 +22,7 @@ Welcome to the official release of **Efficient Transformer Library v1.20.0**! 
Th - Executable via [`QEFFAutoModelForImageTextToText`](#QEFFAutoModelForImageTextToText) - Text & Image+Text support - Sliding window support - - [Gemma3 Example Script](https://github.com/quic/efficient-transformers/blob/main/examples/gemma3_example/gemma3_mm.py) + - [Gemma3 Example Script](https://github.com/quic/efficient-transformers/blob/main/examples/image_text_to_text/models/gemma_vision/inference.py) - **SwiftKV (Llama-3.1-SwiftKV-8B-Instruct)** @@ -32,7 +32,7 @@ Welcome to the official release of **Efficient Transformer Library v1.20.0**! Th - **GGUF Models** - Executable via [`QEffAutoModelForCausalLM`](#QEffAutoModelForCausalLM) - Execution support (non-quantized) - - [Example Script](https://github.com/quic/efficient-transformers/blob/main/examples/basic_gguf_models.py) + - [Example Script](https://github.com/quic/efficient-transformers/blob/main/examples/text_generation/gguf_models.py) - **FP8 Compressed Quantization** - Support for [`Llama-3.3-70B-Instruct-FP8-Dynamic`](https://huggingface.co/Infermatic/Llama-3.3-70B-Instruct-FP8-Dynamic) diff --git a/docs/source/supported_features.rst b/docs/source/supported_features.rst index 9715da982..8260342f2 100644 --- a/docs/source/supported_features.rst +++ b/docs/source/supported_features.rst @@ -6,16 +6,18 @@ Supported Features * - Feature - Impact + * - `Compute Context Length (CCL) `_ + - Optimizes inference by using different context lengths during prefill and decode phases, reducing memory footprint and computation for shorter sequences while maintaining support for longer contexts. Supports both text-only and vision-language models. Refer `sample script `_ for more **details**. * - Sentence embedding, Flexible Pooling configuration and compilation with multiple sequence lengths - - Supports standard/custom pooling with AI 100 acceleration and sentence embedding. Enables efficient sentence embeddings via Efficient-Transformers. Compile with one or multiple seq_len; optimal graph auto-selected at runtime. Refer `sample script `_ for more **details**. + - Supports standard/custom pooling with AI 100 acceleration and sentence embedding. Enables efficient sentence embeddings via Efficient-Transformers. Compile with one or multiple seq_len; optimal graph auto-selected at runtime. Refer `sample script `_ for more **details**. * - `SpD, multiprojection heads `_ - - Implemented post-attention hidden size projections to speculate tokens ahead of the base model. Refer `sample script `_ for more **details**. + - Implemented post-attention hidden size projections to speculate tokens ahead of the base model. Refer `sample script `_ for more **details**. * - `QNN Compilation support `_ - Enabled for AutoModel classes QNN compilation capabilities for multi-models, embedding models and causal models. * - `Disaggregated serving `_ - It support for separate prefill and decode compilation for encoder (vision) and language models. * - `GGUF model execution `_ - - Supported GGUF model execution (without quantized weights). Refer `sample script `_ for more **details**. + - Supported GGUF model execution (without quantized weights). Refer `sample script `_ for more **details**. * - Replication of KV - Enabled FP8 model support on `replicate_kv_heads script `_. * - `gradient checkpointing `_ @@ -23,9 +25,9 @@ Supported Features * - Swift KV `Snowflake/Llama-3.1-SwiftKV-8B-Instruct `_ - Reduces computational overhead during inference by optimizing key-value pair processing, leading to improved throughput. 
Support for both `continuous and non-continuous batching execution `_ in SwiftKV * - :ref:`Vision Language Model ` - - Provides support for the AutoModelForImageTextToText class from the transformers library, enabling advanced vision-language tasks. Refer `sample script `_ for more **details**. + - Provides support for the AutoModelForImageTextToText class from the transformers library, enabling advanced vision-language tasks. Refer `sample script `_ for more **details**. * - :ref:`Speech Sequence to Sequence Model ` - - Provides support for the QEFFAutoModelForSpeechSeq2Seq Facilitates speech-to-text sequence models. Refer `sample script `_ for more **details**. + - Provides support for the QEFFAutoModelForSpeechSeq2Seq Facilitates speech-to-text sequence models. Refer `sample script `_ for more **details**. * - Support for FP8 Execution - Enables execution with FP8 precision, significantly improving performance and reducing memory usage for computational tasks. * - Prefill caching @@ -33,19 +35,19 @@ Supported Features * - On Device Sampling - Enables sampling operations to be executed directly on the QAIC device rather than the host CPU for QEffForCausalLM models. This enhancement significantly reduces host-device communication overhead and improves inference throughput and scalability. Refer `sample script `_ for more **details**. * - Prompt-Lookup Decoding - - Speeds up text generation by using overlapping parts of the input prompt and the generated text, making the process faster without losing quality. Refer `sample script `_ for more **details**. + - Speeds up text generation by using overlapping parts of the input prompt and the generated text, making the process faster without losing quality. Refer `sample script `_ for more **details**. * - :ref:`PEFT LoRA support ` - - Enables parameter-efficient fine-tuning using low-rank adaptation techniques, reducing the computational and memory requirements for fine-tuning large models. Refer `sample script `_ for more **details**. + - Enables parameter-efficient fine-tuning using low-rank adaptation techniques, reducing the computational and memory requirements for fine-tuning large models. Refer `sample script `_ for more **details**. * - :ref:`QNN support ` - Enables compilation using QNN SDK, making Qeff adaptable for various backends in the future. * - :ref:`Embedding model support ` - Facilitates the generation of vector embeddings for retrieval tasks. * - :ref:`Speculative Decoding ` - - Accelerates text generation by using a draft model to generate preliminary predictions, which are then verified by the target model, reducing latency and improving efficiency. Refer `sample script `_ for more **details**. + - Accelerates text generation by using a draft model to generate preliminary predictions, which are then verified by the target model, reducing latency and improving efficiency. Refer `sample script `_ for more **details**. * - :ref:`Finite lorax ` - - Users can activate multiple LoRA adapters and compile them with the base model. At runtime, they can specify which prompt should use which adapter, enabling mixed adapter usage within the same batch. Refer `sample script `_ for more **details**. + - Users can activate multiple LoRA adapters and compile them with the base model. At runtime, they can specify which prompt should use which adapter, enabling mixed adapter usage within the same batch. Refer `sample script `_ for more **details**. 
* - Python and CPP Inferencing API support - - Provides flexibility while running inference with Qeff and enabling integration with various applications and improving accessibility for developers. Refer `sample script `_ for more **details**. + - Provides flexibility while running inference with Qeff and enabling integration with various applications and improving accessibility for developers. Refer `sample script `_ for more **details**. * - :ref:`Continuous batching ` - Optimizes throughput and latency by dynamically batching requests, ensuring efficient use of computational resources. * - AWQ and GPTQ support @@ -56,7 +58,5 @@ Supported Features - A script for computing the perplexity of a model, allowing for the evaluation of model performance and comparison across different models and datasets. Refer `sample script `_ for more **details**. * - KV Heads Replication Script - A sample script for replicating key-value (KV) heads for the Llama-3-8B-Instruct model, running inference with the original model, replicating KV heads, validating changes, and exporting the modified model to ONNX format. Refer `sample script `_ for more **details**. - * - Context Length Specializations (upcoming) - - Increases the maximum context length that models can handle, allowing for better performance on tasks requiring long sequences of text. * - Block Attention (in progress) - - Reduces inference latency and computational cost by dividing context into blocks and reusing key-value states, particularly useful in RAG. \ No newline at end of file + - Reduces inference latency and computational cost by dividing context into blocks and reusing key-value states, particularly useful in RAG. diff --git a/examples/CONTRIBUTING.md b/examples/CONTRIBUTING.md new file mode 100644 index 000000000..d7766fa92 --- /dev/null +++ b/examples/CONTRIBUTING.md @@ -0,0 +1,260 @@ +# Contributing Examples + +This guide explains how to add new examples to the QEfficient repository. + +## When to Add an Example + +Add a new example if: +- The model requires special configuration not covered by existing examples +- You're demonstrating a new feature or optimization technique +- The model has unique requirements (dependencies, image sizes, etc.) + +Don't add an example if: +- The model works with existing generic examples (just use those) +- The only difference is the model name, you can include the model name in validated model list and model class readme file. + +## Directory Structure + +Place your example in the appropriate domain: +- `text_generation/` - Text-only language models +- `image_text_to_text/` - Vision-language models +- `embeddings/` - Embedding models +- `audio/` - Speech and audio models +- `peft/` - Fine-tuning and adapter examples +- `performance/` - Optimization techniques + + + +## File Requirements + +### 1. Python Script + +Your example script should: +- Include the copyright header +- Use argparse for command-line arguments +- Provide clear error messages +- Print results in a readable format + +Basic template: +```python +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import argparse +from transformers import AutoTokenizer +from QEfficient import QEFFAutoModelForCausalLM + +def main(): + parser = argparse.ArgumentParser(description="Description of what this example does") + parser.add_argument("--model-name", type=str, required=True, help="HuggingFace model ID") + parser.add_argument("--prompt", type=str, default="Hello", help="Input prompt") + parser.add_argument("--prefill-seq-len", type=int, default=32) + parser.add_argument("--ctx-len", type=int, default=128) + parser.add_argument("--num-cores", type=int, default=16) + parser.add_argument("--num-devices", type=int, default=1) + args = parser.parse_args() + + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + model = QEFFAutoModelForCausalLM.from_pretrained(args.model_name) + + qpc_path = model.compile( + prefill_seq_len=args.prefill_seq_len, + ctx_len=args.ctx_len, + num_cores=args.num_cores, + num_devices=args.num_devices, + ) + + exec_info = model.generate( + tokenizer=tokenizer, + prompts=[args.prompt], + ) + + print(f"Generated: {exec_info.generated_texts[0]}") + +if __name__ == "__main__": + main() +``` + +### 2. README.md + +Each model-specific example needs a README explaining: +- What the model does +- Any special requirements +- How to run it +- Expected output + +Template: +```markdown +# [Model Name] + +## Overview +Brief description of the model and what makes it special. + +## Requirements +```bash +# For single package +pip install package-name==1.2.3 + +# For multiple packages +pip install package-name==1.2.3 another-package==4.5.6 + +# Or use a requirements.txt file +pip install -r requirements.txt +``` + +**Note:** Always specify exact versions to ensure reproducibility. Use `pip show package-name` to check installed versions. + +## Usage +```bash +python inference.py --model-name [model-id] --prompt "Your prompt" +``` + +## Special Notes +Any model-specific considerations, limitations, or configuration details. + +## References +- Model card: [link] +- Paper: [link] (optional) + +## Code Guidelines + +- Use clear variable names +- Add comments for non-obvious code +- Handle errors gracefully +- Follow existing code style in the repository +- Test your example before submitting + +## Testing Your Example + +Before submitting: +1. Run the example with default parameters +2. Test with different model sizes if applicable +3. Verify the README instructions work +4. Check that all dependencies are documented + +## Submitting Your Contribution + +Follow these steps to submit your example to the QEfficient repository: + +### 1. Fork and Clone the Repository + +First, fork the repository to your GitHub account, then clone your fork: + +```bash +# Fork the repository on GitHub (click the "Fork" button) +# Then clone your fork +git clone git@github.com:YOUR_USERNAME/efficient-transformers.git +cd efficient-transformers + +# Add upstream remote to keep your fork in sync +git remote add upstream git@github.com:quic/efficient-transformers.git +``` + +### 2. Create a Feature Branch + +Create a descriptive branch for your changes: + +```bash +# Update your main branch +git checkout main +git pull upstream main + +# Create a new branch +git checkout -b add-[model-name]-example +``` + +### 3. 
Make Your Changes + +Add your example files following the guidelines above: +- Python script with proper copyright header +- README.md with clear documentation +- requirements.txt (if needed) + +### 4. Run Pre-commit Checks + +Before committing, ensure your code passes all quality checks: + +```bash +# Install pre-commit if not already installed +pip install pre-commit + +# Run pre-commit on your changed files +pre-commit run --files path/to/your/file1.py path/to/your/file2.md +``` + +**Important:** If pre-commit reports any failures: +- Some issues will be auto-fixed (formatting, trailing whitespace, etc.) +- For issues that aren't auto-fixed, manually correct them +- Re-run `pre-commit run --files ` until all checks pass + +### 5. Commit with Sign-off (DCO) + +All commits must be signed off to comply with the Developer Certificate of Origin (DCO): + +```bash +# Stage your changes +git add examples/your_domain/your_example.py +git add examples/your_domain/README.md + +# Commit with sign-off +git commit -s --author "Your Name " -m "Add [model-name] example + +- Implements inference for [model-name] +- Includes documentation and usage examples +- Tested with [specific configurations]" +``` + +**Commit Message Guidelines:** +- Use a clear, descriptive title +- Add a blank line, then detailed description if needed +- Always include the `-s` flag for DCO sign-off + +### 6. Push to Your Fork + +Push your branch to your forked repository: + +```bash +git push origin add-[model-name]-example +``` + +### 7. Create a Pull Request + +1. Go to your fork on GitHub +2. Click "Compare & pull request" for your branch +3. Fill out the PR template with: + - **Title:** Clear, descriptive title (e.g., "Add Llama-3.2-Vision example") + - **Description:** + - What the example demonstrates + - Why it's needed (what makes it different from existing examples) + - Any special testing considerations + - Link to model card or documentation + - **Testing:** Describe how you tested the example + +### 8. Ensure CI Checks Pass + +After creating the PR, verify that all automated checks pass: + +- ✅ **DCO Check:** Ensures all commits are signed off +- ✅ **Lint Check:** Code style and formatting validation +- ✅ **Tests:** Automated test suite (if applicable) + +If any checks fail: +1. Review the error messages in the PR +2. Make necessary fixes in your local branch +3. Commit and push the fixes (with sign-off) +4. The PR will automatically update and re-run checks + +### 9. Address Review Feedback + +Maintainers will review your PR and may request changes: +- Make requested changes in your local branch +- Commit with sign-off and push to update the PR +- Respond to comments to facilitate discussion + +## Questions + +For questions or issues, open a GitHub issue or discussion. diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 000000000..3913b25ce --- /dev/null +++ b/examples/README.md @@ -0,0 +1,97 @@ +# QEfficient Examples + +Examples for running models on Qualcomm Cloud AI 100. + +For detailed documentation, see https://quic.github.io/efficient-transformers/ + +## Quick Navigation + +### Text Generation +Language model inference. 
+ +| Example | Description | Script | +|---------|-------------|--------| +| Basic Inference | Simple text generation | [text_generation/basic_inference.py](text_generation/basic_inference.py) | +| GGUF Models | GGUF format support | [text_generation/gguf_models.py](text_generation/gguf_models.py) | +| MoE Models | Mixture of Experts | [text_generation/moe_inference.py](text_generation/moe_inference.py) | +| Continuous Batching | Dynamic batching | [text_generation/continuous_batching.py](text_generation/continuous_batching.py) | + +[See all text generation examples →](text_generation/) + +### Image-Text-to-Text +Vision-language models. + +| Example | Model | Script | +|---------|---------------|---------------| +| Basic VLM | Most VLMs | [image_text_to_text/basic_vlm_inference.py](image_text_to_text/basic_vlm_inference.py) | + +[See all vision-language examples →](image_text_to_text/) + +### Embeddings +Sentence and document embeddings. + +| Example | Model | Script | +|---------|-------|--------| +| Text Embeddings | all-MiniLM-L6-v2 | [embeddings/text_embeddings.py](embeddings/text_embeddings.py) | + +[See all embedding examples →](embeddings/) + +### Audio +Speech processing models. + +| Example | Model | Task | Script | +|---------|-------|------|--------| +| Speech-to-Text | Whisper | Transcription | [audio/speech_to_text.py](audio/speech_to_text.py) | +| CTC Speech Recognition | Wav2Vec2 | Recognition | [audio/wav2vec2_inference.py](audio/wav2vec2_inference.py) | + +[See all audio examples →](audio/) + +### PEFT +Parameter-efficient fine-tuning. + +| Example | Description | Script | +|---------|-------------|--------| +| Single Adapter | Load and use one adapter | [peft/single_adapter.py](peft/single_adapter.py) | +| Multi-Adapter | Multiple adapters with CB | [peft/multi_adapter.py](peft/multi_adapter.py) | + +**Note:** PEFT examples use hardcoded configurations to demonstrate specific adapter workflows. Modify the scripts directly to test different adapters or configurations. + +[See all PEFT examples →](peft/) + +### Performance +Optimization techniques. + +| Example | Technique | Script | +|---------|-----------|--------| +| Draft-based SpD | Speculative decoding | [performance/speculative_decoding/draft_based.py](performance/speculative_decoding/draft_based.py) | +| Prompt Lookup | N-gram speculation | [performance/speculative_decoding/prompt_lookup.py](performance/speculative_decoding/prompt_lookup.py) | +| Multi-Projection | Turbo models | [performance/speculative_decoding/multi_projection.py](performance/speculative_decoding/multi_projection.py) | +| On-Device Sampling | Sampling parameters | [performance/on_device_sampling.py](performance/on_device_sampling.py) | +| Compute Context Length | Dynamic context optimization | [performance/compute_context_length/basic_inference.py](performance/compute_context_length/basic_inference.py) | +| C++ Execution | Native C++ API | [performance/cpp_execution/](performance/cpp_execution/) | + +[See all performance examples →](performance/) + +## Installation + +For installation instructions, see the [Quick Installation guide](../README.md#quick-installation) in the main README. + + +## Running Examples + +### Python Scripts + +Basic usage: +```bash +python text_generation/basic_inference.py \ + --model-name gpt2 \ + --prompt "Hello, how are you?" +``` + +## Contributing + +See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on adding new examples. 
+ +## Documentation + +Full documentation: https://quic.github.io/efficient-transformers/ diff --git a/examples/audio/README.md b/examples/audio/README.md new file mode 100644 index 000000000..df0204d87 --- /dev/null +++ b/examples/audio/README.md @@ -0,0 +1,87 @@ +# Audio Examples + +Examples for running audio processing models on Qualcomm Cloud AI 100. + +## Dependencies + +Install required packages: +```bash +pip install librosa==0.10.2 soundfile==0.13.1 +``` + +## Authentication + +For private/gated models, export your HuggingFace token: +```bash +export HF_TOKEN= +``` + +## Supported Models + +**QEff Auto Classes:** +- `QEFFAutoModelForSpeechSeq2Seq` (for Whisper models) +- `QEFFAutoModelForCTC` (for Wav2Vec2 models) + +For the complete list of supported audio models, see the [Validated Models - Audio Section](../../docs/source/validate.md#audio-models). + +Popular models include: +- Whisper (tiny, base, small, medium, large, large-v3-turbo) +- Wav2Vec2 (base-960h) + +## Available Examples + +### speech_to_text.py +Speech-to-text transcription using Whisper models. + +**Usage:** +```bash +# With default parameters +python speech_to_text.py \ + +# With custom parameters +python speech_to_text.py \ + --model-name openai/whisper-tiny \ + --ctx-len 25 \ + --num-cores 16 +``` + +**Parameters:** +- `--model-name`: HuggingFace Whisper model ID (default: `openai/whisper-tiny`) +- `--ctx-len`: Context length for generation (default: `25`) +- `--num-cores`: Number of cores (default: `16`) + +This example: +- Loads a sample audio from the librispeech dataset +- Uses Whisper-tiny model by default +- Compiles and runs inference on Cloud AI 100 +- Outputs the transcribed text + +### wav2vec2_inference.py +Speech recognition using Wav2Vec2 models with CTC (Connectionist Temporal Classification). + +**Usage:** +```bash +# With default parameters +python wav2vec2_inference.py + +# With custom parameters +python wav2vec2_inference.py \ + --model-name facebook/wav2vec2-base-960h \ + --num-cores 16 +``` + +**Parameters:** +- `--model-name`: HuggingFace CTC model ID (default: `facebook/wav2vec2-base-960h`) +- `--num-cores`: Number of cores (default: `16`) + +This example: +- Loads a sample audio from the librispeech dataset +- Uses Wav2Vec2-base-960h model by default +- Compiles and runs inference on Cloud AI 100 +- Outputs the recognized text + +## Documentation + +- [QEff Auto Classes](https://quic.github.io/efficient-transformers/source/qeff_autoclasses.html) +- [Validated Audio Models](https://quic.github.io/efficient-transformers/source/validate.html#audio-models) +- [Quick Start Guide](https://quic.github.io/efficient-transformers/source/quick_start.html) diff --git a/examples/audio/speech_to_text.py b/examples/audio/speech_to_text.py new file mode 100644 index 000000000..9f1df19aa --- /dev/null +++ b/examples/audio/speech_to_text.py @@ -0,0 +1,66 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import argparse + +from datasets import load_dataset +from transformers import AutoProcessor + +from QEfficient import QEFFAutoModelForSpeechSeq2Seq + + +def main(): + parser = argparse.ArgumentParser(description="Speech-to-text inference with Whisper") + parser.add_argument( + "--model-name", + type=str, + default="openai/whisper-tiny", + help="HuggingFace Whisper model ID", + ) + parser.add_argument( + "--ctx-len", + type=int, + default=25, + help="Context length for generation", + ) + parser.add_argument("--num-cores", type=int, default=16, help="Number of cores") + args = parser.parse_args() + + print(f"Loading Whisper model: {args.model_name}") + + ## STEP 1 -- load audio sample + + # Using a standard english dataset + print("Loading audio sample from dataset...") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + sample_rate = ds[0]["audio"]["sampling_rate"] + data = ds[0]["audio"]["array"] + + # Reshape so shape corresponds to data with batch size 1 + data = data.reshape(-1) + + # Load processor + processor = AutoProcessor.from_pretrained(args.model_name) + + ## STEP 2 -- init base model + qeff_model = QEFFAutoModelForSpeechSeq2Seq.from_pretrained(args.model_name) + + ## STEP 3 -- export and compile model + qeff_model.compile(num_cores=args.num_cores) + + ## STEP 4 -- generate output for loaded input and processor + exec_info = qeff_model.generate( + inputs=processor(data, sampling_rate=sample_rate, return_tensors="pt"), generation_len=args.ctx_len + ) + + ## STEP 5 -- use processor to decode output + transcription = processor.batch_decode(exec_info.generated_ids)[0] + print(f"\nTranscription: {transcription}") + + +if __name__ == "__main__": + main() diff --git a/examples/audio/wav2vec2_inference.py b/examples/audio/wav2vec2_inference.py new file mode 100644 index 000000000..9d310b1c2 --- /dev/null +++ b/examples/audio/wav2vec2_inference.py @@ -0,0 +1,54 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import argparse + +from datasets import load_dataset +from transformers import AutoProcessor + +from QEfficient import QEFFAutoModelForCTC + + +def main(): + parser = argparse.ArgumentParser(description="CTC speech recognition inference with Wav2Vec2") + parser.add_argument( + "--model-name", + type=str, + default="facebook/wav2vec2-base-960h", + help="HuggingFace CTC model ID (e.g., Wav2Vec2)", + ) + + parser.add_argument("--num-cores", type=int, default=16, help="Number of cores") + args = parser.parse_args() + + print(f"Loading CTC model: {args.model_name}") + + ## STEP 1 -- load audio sample + # Using a standard english dataset + print("Loading audio sample from dataset...") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + data = ds[0]["audio"]["array"] + + # Reshape so shape corresponds to data with batch size 1 + data = data.reshape(-1) + + # Load processor + processor = AutoProcessor.from_pretrained(args.model_name) + + ## STEP 2 -- Load the model + model = QEFFAutoModelForCTC.from_pretrained(args.model_name) + + ## STEP 3 -- Compile the model + model.compile(num_cores=args.num_cores) + + ## STEP 4 -- Run the model and generate the output + model_output = model.generate(processor, inputs=data) + print(f"\nTranscription: {model_output}") + + +if __name__ == "__main__": + main() diff --git a/examples/basic_gguf_models.py b/examples/basic_gguf_models.py deleted file mode 100644 index 84fc73059..000000000 --- a/examples/basic_gguf_models.py +++ /dev/null @@ -1,23 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -# This is the work example of the GGUF models with the AI 100 - -from transformers import AutoTokenizer - -from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM - -# Load the model and tokenizer -model_name = "MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF" -gguf_file = "Mistral-7B-Instruct-v0.3.fp16.gguf" -# org_model_name = "mistralai/Mistral-7B-Instruct-v0.3" - -tokenizer = AutoTokenizer.from_pretrained(model_name, gguf_file=gguf_file) -model = AutoModelForCausalLM.from_pretrained(model_name, gguf_file=gguf_file) - -generated_qpc_path = model.compile(prefill_seq_len=32, ctx_len=128, num_cores=16, num_devices=1) -model.generate(prompts=["How are you?"], tokenizer=tokenizer) diff --git a/examples/ccl_image_text_to_text_inference.py b/examples/ccl_image_text_to_text_inference.py deleted file mode 100644 index be472f433..000000000 --- a/examples/ccl_image_text_to_text_inference.py +++ /dev/null @@ -1,137 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import requests -from PIL import Image -from transformers import AutoProcessor, TextStreamer - -from QEfficient import QEFFAutoModelForImageTextToText - -# Add HuggingFace Token to access the model -HF_TOKEN = "" - - -def run_model( - model_name, - token, - query, - image_url, - kv_offload=False, - prefill_seq_len=32, - ctx_len=512, - comp_ctx_lengths_prefill=None, - comp_ctx_lengths_decode=None, - generation_len=128, - img_size=560, - num_cores=16, - num_devices=1, -): - ## STEP - 1 Load the Processor and Model - - processor = AutoProcessor.from_pretrained(model_name, token=token) - - # `kv_offload` is used to compile the model in a Single QPC or 2 QPCs. - # The Dual QPC approach splits the model to perform Image Encoding and Output generation in 2 different QPCs. - # The outputs of the Vision Encoder are then passed to the Language model via host in this case. - - model = QEFFAutoModelForImageTextToText.from_pretrained( - model_name, - token=token, - attn_implementation="eager", - kv_offload=kv_offload, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, - }, - ) - - ## STEP - 2 Export & Compile the Model - - model.compile( - prefill_seq_len=prefill_seq_len, - ctx_len=ctx_len, - img_size=img_size, - num_cores=num_cores, - num_devices=num_devices, - mxfp6_matmul=False, - ) - - ## STEP - 3 Load and process the inputs for Inference - - image = Image.open(requests.get(image_url, stream=True).raw) - messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": query}, - ], - } - ] - input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)] - - inputs = processor( - text=input_text, - images=image, - return_tensors="pt", - add_special_tokens=False, - padding="max_length", - max_length=prefill_seq_len, - ) - - ## STEP - 4 Run Inference on the compiled model - - streamer = TextStreamer(processor.tokenizer) - output_statistics = model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len) - print(output_statistics) - - -if __name__ == "__main__": - # Model name and Input parameters - # model_name = "llava-hf/llava-1.5-7b-hf" - model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" - query = "Describe this image." - image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" - - # Compilation parameters for the model - kv_offload = True - prefill_seq_len = 32 - ctx_len = 8192 - generation_len = 128 - # img_size = 336 - img_size = 560 - num_cores = 16 - num_devices = 4 - comp_ctx_lengths_prefill = [4096] - comp_ctx_lengths_decode = [6144, ctx_len] - - run_model( - model_name=model_name, - token=HF_TOKEN, - query=query, - kv_offload=kv_offload, - image_url=image_url, - prefill_seq_len=prefill_seq_len, - ctx_len=ctx_len, - comp_ctx_lengths_prefill=comp_ctx_lengths_prefill, - comp_ctx_lengths_decode=comp_ctx_lengths_decode, - generation_len=generation_len, - img_size=img_size, - num_cores=num_cores, - num_devices=num_devices, - ) - - -""" -Expected Response: - -This image depicts a charming anthropomorphic rabbit standing on a dirt path in front of a picturesque stone cottage, surrounded by a serene landscape. 
- -The rabbit, with its light brown fur and distinctive long ears, is attired in a stylish blue coat, brown vest, and tan pants, exuding a sense of sophistication. The dirt path, flanked by vibrant flowers and lush greenery, leads to the cottage, which features a thatched roof and a chimney, adding to the rustic charm of the scene. In the background, rolling hills and trees create a breathtaking panorama, while the sky above is a brilliant blue with white clouds, completing the - -""" diff --git a/examples/compute_context_length.py b/examples/compute_context_length.py deleted file mode 100644 index 163261e04..000000000 --- a/examples/compute_context_length.py +++ /dev/null @@ -1,70 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -## In this example, you can run a model for static and continuous batching with different Compute-Context-Length (CCL) inputs. ## - -from transformers import AutoTokenizer - -from QEfficient import QEFFAutoModelForCausalLM - -## Using optional variable comp_ctx_lengths variable you can pass a list of context lengths for both prefilling and decoding processes. It will run the model with default context length if comp_ctx_lengths=None. ## -## - The first comp_ctx_lengths_prefill list shows the compute-ctx-length list for prefilling process. It will start the prefilling process with the first element in the list and gradually will increase the comp_ctx_lengths based on the position_id of the current prompt chunk. ## -## - The second comp_ctx_lengths_decode list will be used for decoding. During the decoding process, based on the position_id or cache index it will work with the specific compute-context-length in the list. It will start from a proper compute-context-length in the list based on input prompt length and will gradually increase the compute-context-length if the cache index passes the current compute-context-length. ## - -ctx_len = 1024 -comp_ctx_lengths_prefill = [256, 500] # None -comp_ctx_lengths_decode = [512, ctx_len] # None - -model_name = "meta-llama/Llama-3.2-1B" -# model_name = "google/gemma-7b" -# model_name = "tiiuae/falcon-7b-instruct" -# model_name = "google/gemma-2-2b" -# model_name = "ibm-granite/granite-3.1-8b-instruct" -# model_name = "Snowflake/Llama-3.1-SwiftKV-8B-Instruct" -# model_name = "mistralai/Mistral-7B-v0.1" -# model_name = "microsoft/phi-1_5" -# model_name = "microsoft/Phi-3-mini-4k-instruct" -# model_name = "Qwen/Qwen2.5-7B-Instruct" -# model_name = "Qwen/Qwen3-1.7B" -# model_name = "allenai/OLMo-2-0425-1B" -# model_name = "ibm-granite/granite-3.3-2b-base" -# model_name = "ibm-granite/granite-3.2-8b-instruct" -# model_name = "meta-llama/Llama-3.3-70B-Instruct" -# model_name = "Salesforce/codegen-350M-mono" -# model_name = "openai-community/gpt2" -# model_name = "EleutherAI/gpt-j-6b" - -model = QEFFAutoModelForCausalLM.from_pretrained( - model_name, - continuous_batching=True, - qaic_config={ - "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, - "comp_ctx_lengths_decode": comp_ctx_lengths_decode, - "ctx_len": ctx_len, # Is required for CCL checkings - }, -) - -# model compilation for either continuous or static batching. For continuous batching full_batch_size is needed. 
-model.compile( - prefill_seq_len=128, - ctx_len=ctx_len, - num_cores=16, - num_devices=1, - mxint8_kv_cache=True, - mxfp6_matmul=True, - full_batch_size=1, -) - -# Create tokenizer and run model.generate and passes the input prompts to it. -tokenizer = AutoTokenizer.from_pretrained(model_name) -model.generate( - prompts=[ - "My name is ", - ], - tokenizer=tokenizer, - generation_len=128, -) diff --git a/examples/embedding_model.py b/examples/embedding_model.py deleted file mode 100644 index 7e6973e2e..000000000 --- a/examples/embedding_model.py +++ /dev/null @@ -1,46 +0,0 @@ -# ----------------------------------------------------------------------------- - -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause - -# ----------------------------------------------------------------------------- - -# This is the work example of the Embedding model with the AI 100 -# For more information, visit: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 - -import torch -from transformers import AutoTokenizer - -from QEfficient import QEFFAutoModel as AutoModel - - -def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: - input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float() - last_hidden_states[input_mask_expanded == 0] = -1e9 - return torch.max(last_hidden_states, 1)[0] - - -# Sentences we want sentence embeddings for -sentences = "This is an example sentence" - -# Load model from HuggingFace Hub -tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") - - -# You can specify the pooling strategy either as a string (e.g., "max") or by passing a custom pooling function. -# If no pooling is specified, the model will return its default output (typically token embeddings). -qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling=max_pooling) -# qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling="max") -# qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") - -# Here seq_len can be list of seq_len or single int -qeff_model.compile(num_cores=16, seq_len=[32, 64]) -# qeff_model.compile(num_cores=16, seq_len=32) - - -# Tokenize sentences -encoded_input = tokenizer(sentences, return_tensors="pt") - -sentence_embeddings = qeff_model.generate(encoded_input) - -print("Sentence embeddings:", sentence_embeddings) diff --git a/examples/embeddings/README.md b/examples/embeddings/README.md new file mode 100644 index 000000000..baf80919c --- /dev/null +++ b/examples/embeddings/README.md @@ -0,0 +1,71 @@ +# Embedding Examples + +Examples for running text embedding models on Qualcomm Cloud AI 100. + +## Authentication + +For private/gated models, export your HuggingFace token: +```bash +export HF_TOKEN= +``` + +## Supported Models + +**QEff Auto Class:** `QEFFAutoModel` + +For the complete list of supported embedding models, see the [Validated Models - Embedding Section](../../docs/source/validate.md#embedding-models). + +Popular model families include: +- BERT-based (BGE, E5) +- MPNet +- Mistral-based +- NomicBERT +- Qwen2 +- RoBERTa (Granite) +- XLM-RoBERTa (multilingual) + +## Available Examples + +### text_embeddings.py +Generate text embeddings using transformer models. 
+ +**Usage:** +```bash +# With default parameters +python text_embeddings.py + +# With custom parameters +python text_embeddings.py \ + --model-name sentence-transformers/all-MiniLM-L6-v2 \ + --sentences "This is an example sentence" \ + --pooling max \ + --num-cores 16 \ + --seq-len "32,64" +``` + +**Parameters:** +- `--model-name`: HuggingFace embedding model ID (default: `sentence-transformers/all-MiniLM-L6-v2`) +- `--sentences`: Input text to generate embeddings for (default: `"This is an example sentence"`) +- `--pooling`: Pooling strategy - `max`, `mean`, or `none` (default: `max`) +- `--num-cores`: Number of cores (default: `16`) +- `--seq-len`: Sequence length(s) - single int or comma-separated list (default: `"32,64"`) + +This example: +- Uses `sentence-transformers/all-MiniLM-L6-v2` by default +- Demonstrates custom pooling strategies (max pooling) +- Compiles for multiple sequence lengths [32, 64] +- Outputs text embeddings +- Works with various embedding model families (BERT, MPNet, Mistral-based, etc.) + +## Pooling Strategies + +The example supports different pooling strategies: +- **max**: Max pooling over token embeddings +- **mean**: Mean pooling over token embeddings +- **custom**: Pass your own pooling function + +## Documentation + +- [QEff Auto Classes](https://quic.github.io/efficient-transformers/source/qeff_autoclasses.html) +- [Validated Embedding Models](https://quic.github.io/efficient-transformers/source/validate.html#embedding-models) +- [Quick Start Guide](https://quic.github.io/efficient-transformers/source/quick_start.html) diff --git a/examples/embeddings/text_embeddings.py b/examples/embeddings/text_embeddings.py new file mode 100644 index 000000000..e69e6f1af --- /dev/null +++ b/examples/embeddings/text_embeddings.py @@ -0,0 +1,92 @@ +# ----------------------------------------------------------------------------- + +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause + +# ----------------------------------------------------------------------------- + +import argparse + +import torch +from transformers import AutoTokenizer + +from QEfficient import QEFFAutoModel as AutoModel + + +def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: + """Apply max pooling to the last hidden states.""" + input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float() + last_hidden_states[input_mask_expanded == 0] = -1e9 + return torch.max(last_hidden_states, 1)[0] + + +def main(): + parser = argparse.ArgumentParser(description="Text embeddings inference") + parser.add_argument( + "--model-name", + type=str, + default="sentence-transformers/all-MiniLM-L6-v2", + help="HuggingFace embedding model ID", + ) + parser.add_argument( + "--sentences", + type=str, + default="This is an example sentence", + help="Input sentence(s) to generate embeddings for", + ) + parser.add_argument( + "--pooling", + type=str, + default="max", + choices=["max", "mean", "none"], + help="Pooling strategy: 'max' for max pooling, 'mean' for mean pooling, 'none' for no pooling", + ) + parser.add_argument("--num-cores", type=int, default=16, help="Number of cores") + parser.add_argument( + "--seq-len", + type=str, + default="32,64", + help="Sequence length(s) - single int (e.g., '32') or comma-separated list (e.g., '32,64')", + ) + args = parser.parse_args() + + # Parse seq_len argument + if "," in args.seq_len: + seq_len = [int(x.strip()) for x in args.seq_len.split(",")] + else: + seq_len = int(args.seq_len) + + print(f"Loading embedding model: {args.model_name}") + print(f"Pooling strategy: {args.pooling}") + print(f"Sequence length(s): {seq_len}") + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + + # Load model with pooling strategy + # You can specify the pooling strategy either as a string (e.g., "max") or by passing a custom pooling function. + # If no pooling is specified, the model will return its default output (typically token embeddings). + if args.pooling == "max": + qeff_model = AutoModel.from_pretrained(args.model_name, pooling=max_pooling) + elif args.pooling == "mean": + qeff_model = AutoModel.from_pretrained(args.model_name, pooling="mean") + else: + qeff_model = AutoModel.from_pretrained(args.model_name) + + # Compile the model + # seq_len can be a list of seq_len or single int + qeff_model.compile(num_cores=args.num_cores, seq_len=seq_len) + + # Tokenize sentences + encoded_input = tokenizer(args.sentences, return_tensors="pt") + + # Run the generation + sentence_embeddings = qeff_model.generate(encoded_input) + + print(f"\nInput: {args.sentences}") + print(f"Sentence embeddings shape: {sentence_embeddings['output'].shape}") + print(f"Sentence embeddings: {sentence_embeddings}") + + +if __name__ == "__main__": + main() diff --git a/examples/gpt_oss.py b/examples/gpt_oss.py deleted file mode 100644 index 24d050e97..000000000 --- a/examples/gpt_oss.py +++ /dev/null @@ -1,35 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -from transformers import AutoTokenizer, TextStreamer - -from QEfficient import QEFFAutoModelForCausalLM - -model_id = "openai/gpt-oss-20b" # weights are not required to convert to fp32 - -qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id) -tokenizer = AutoTokenizer.from_pretrained(model_id) - -onnx_model_path = qeff_model.export() -qpc_path = qeff_model.compile( - prefill_seq_len=1, # Currently we can get best perf using PL=1 i.e. decode-only model, prefill optimizations are being worked on. - ctx_len=256, - num_cores=16, - mxfp6_matmul=True, - mxint8_kv_cache=True, - num_devices=8, - mos=1, - aic_enable_depth_first=True, - num_speculative_tokens=None, -) -print(f"qpc path is {qpc_path}") -streamer = TextStreamer(tokenizer) -exec_info = qeff_model.generate( - tokenizer, - prompts="Who is your creator? and What all you are allowed to do?", - device_id=[0, 1, 2, 3], -) diff --git a/examples/image_text_to_text/README.md b/examples/image_text_to_text/README.md new file mode 100644 index 000000000..a6f1608b4 --- /dev/null +++ b/examples/image_text_to_text/README.md @@ -0,0 +1,112 @@ +# Image-Text-to-Text (Vision-Language Models) + +Multi-modal models that process both images and text. + + +## Authentication + +For private/gated models, export your HuggingFace token: +```bash +export HF_TOKEN= +``` +## Quick Start +### Generic VLM Inference +Generic script for vision-language models: + +```bash +# With default parameters +python basic_vlm_inference.py + +# With custom parameters +python basic_vlm_inference.py \ + --model-name llava-hf/llava-1.5-7b-hf \ + --image-url "https://example.com/image.jpg" \ + --query "Describe this image" \ + --prefill-seq-len 128 \ + --ctx-len 3000 \ + --generation-len 128 \ + --num-cores 16 +``` + +### Single QPC Mode +Run the entire model (vision encoder + language model) in a single QPC: + +```bash +python basic_vlm_inference.py \ + --model-name llava-hf/llava-1.5-7b-hf \ + --image-url "https://example.com/image.jpg" \ + --query "Describe this image" \ + --num-cores 16 \ + --num-devices 1 +``` + +### Dual QPC Mode +Split the model into two QPCs (vision encoder + language model separately): + +```bash +python basic_vlm_inference.py \ + --model-name llava-hf/llava-1.5-7b-hf \ + --image-url "https://example.com/image.jpg" \ + --query "Describe this image" \ + --kv-offload \ + --num-cores 16 \ + --num-devices 1 +``` + +**Note:** In Dual QPC mode (`kv_offload=True`), the vision encoder runs in one QPC and the language model in another, with outputs transferred via host. This provides flexibility for independent execution of vision and language components. + +### Text-Only Execution (Skip Vision) +Run text-only inference without image processing: + +```bash +python basic_vlm_inference.py \ + --model-name llava-hf/llava-1.5-7b-hf \ + --prompt "Tell me about yourself" \ + --skip-vision True +``` + +**Note:** Use `skip_vision=True` when you want to run the language model without processing any images. This is useful for text-only tasks on vision-language models. 
+ +### Continuous Batching +Dynamic batching for VLMs: + +```bash +python continuous_batching_vlm.py \ + --model-name meta-llama/Llama-4-Scout-17B-16E-Instruct \ + --full-batch-size 4 \ +``` + +## Supported Models + +**QEff Auto Class:** `QEFFAutoModelForImageTextToText` + +For the complete list of supported vision-language models, see the [Validated Models - Vision-Language Models Section](../../docs/source/validate.md#vision-language-models-text--image-generation). + +Popular model families include: +- Llama Vision (3.2, 4-Scout) +- Qwen VL (2.5) +- Mistral Vision (Small-3.1) +- Gemma-3 +- Granite Vision (3.2) +- InternVL +- Molmo +- LLaVA + +### Model-Specific Examples + +Some models have specialized examples demonstrating advanced features: + +| Model | Location | +|-------|----------| +| **Llama-4** | [models/llama4/](models/llama4/) | +| **Qwen** | [models/qwen_vl/](models/qwen_vl/) | +| **Mistral** | [models/mistral_vision/](models/mistral_vision/) | +| **Gemma** | [models/gemma_vision/](models/gemma_vision/) | +| **Granite** | [models/granite_vision/](models/granite_vision/) | +| **InternVL** | [models/internvl/](models/internvl/) | +| **Molmo** | [models/molmo/](models/molmo/) | + + +## Documentation +- **Full Guide**: [VLM Documentation](../../docs/source/quick_start.md#vision-language-models) +- **API Reference**: [QEFFAutoModelForImageTextToText](../../docs/source/qeff_autoclasses.md#QEFFAutoModelForImageTextToText) diff --git a/examples/image_text_to_text/basic_vlm_inference.py b/examples/image_text_to_text/basic_vlm_inference.py new file mode 100644 index 000000000..45d5454cb --- /dev/null +++ b/examples/image_text_to_text/basic_vlm_inference.py @@ -0,0 +1,134 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import argparse + +import requests +from PIL import Image +from transformers import AutoProcessor, TextStreamer + +from QEfficient import QEFFAutoModelForImageTextToText + + +def run_model( + model_name, + query, + image_url, + kv_offload=True, + prefill_seq_len=32, + ctx_len=512, + generation_len=128, + img_size=336, + num_cores=16, + num_devices=1, +): + ## STEP 1: Load the Processor and Model + + processor = AutoProcessor.from_pretrained(model_name) + + # `kv_offload` determines Single QPC vs Dual QPC mode: + # - Single QPC (kv_offload=False): Entire model runs in one QPC + # - Dual QPC (kv_offload=True): Vision encoder and language model run in separate QPCs + # with outputs transferred via host for flexibility + + model = QEFFAutoModelForImageTextToText.from_pretrained( + model_name, attn_implementation="eager", kv_offload=kv_offload + ) + + ## STEP 2: Export & Compile the Model + + model.compile( + prefill_seq_len=prefill_seq_len, + ctx_len=ctx_len, + img_size=img_size, + num_cores=num_cores, + num_devices=num_devices, + mxfp6_matmul=False, + ) + + ## STEP 3: Load and Process the Inputs for Inference + # Note: the message format would change for different model + image = Image.open(requests.get(image_url, stream=True).raw) + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": query}, + ], + } + ] + input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)] + + inputs = processor( + text=input_text, + images=image, + return_tensors="pt", + add_special_tokens=False, + padding="max_length", + max_length=prefill_seq_len, + ) + + ## STEP 4: Run Inference on the Compiled Model + + streamer = TextStreamer(processor.tokenizer) + model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len) + + +def main(): + parser = argparse.ArgumentParser(description="Vision-Language Model (VLM) inference") + parser.add_argument( + "--model-name", + type=str, + default="llava-hf/llava-1.5-7b-hf", + help="HuggingFace VLM model ID", + ) + parser.add_argument( + "--query", + type=str, + default="Describe this image.", + help="Text query/question about the image", + ) + parser.add_argument( + "--image-url", + type=str, + default="https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + help="URL of the image to process", + ) + parser.add_argument( + "--kv-offload", + action="store_true", + default=True, + help="Enable Dual QPC mode (vision encoder and LM in separate QPCs)", + ) + parser.add_argument("--prefill-seq-len", type=int, default=128, help="Prefill sequence length") + parser.add_argument("--ctx-len", type=int, default=3000, help="Context length") + parser.add_argument("--generation-len", type=int, default=128, help="Number of tokens to generate") + parser.add_argument("--img-size", type=int, default=336, help="Image size for processing") + parser.add_argument("--num-cores", type=int, default=16, help="Number of cores") + parser.add_argument("--num-devices", type=int, default=1, help="Number of devices") + args = parser.parse_args() + + print(f"Running VLM inference with model: {args.model_name}") + print(f"KV offload (Dual QPC mode): {args.kv_offload}") + + run_model( + model_name=args.model_name, + query=args.query, + image_url=args.image_url, + kv_offload=args.kv_offload, + 
prefill_seq_len=args.prefill_seq_len, + ctx_len=args.ctx_len, + generation_len=args.generation_len, + img_size=args.img_size, + num_cores=args.num_cores, + num_devices=args.num_devices, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/gemma3_example/fp32_nodes_gemma3_27b.yaml b/examples/image_text_to_text/models/gemma_vision/configs/fp32_nodes_gemma3_27b.yaml similarity index 100% rename from examples/gemma3_example/fp32_nodes_gemma3_27b.yaml rename to examples/image_text_to_text/models/gemma_vision/configs/fp32_nodes_gemma3_27b.yaml diff --git a/examples/gemma3_example/fp32_nodes_gemma3_4b.yaml b/examples/image_text_to_text/models/gemma_vision/configs/fp32_nodes_gemma3_4b.yaml similarity index 100% rename from examples/gemma3_example/fp32_nodes_gemma3_4b.yaml rename to examples/image_text_to_text/models/gemma_vision/configs/fp32_nodes_gemma3_4b.yaml diff --git a/examples/gemma3_example/gemma3_mm.py b/examples/image_text_to_text/models/gemma_vision/gemma3_example.py similarity index 95% rename from examples/gemma3_example/gemma3_mm.py rename to examples/image_text_to_text/models/gemma_vision/gemma3_example.py index ca82b2120..5c1f141d4 100644 --- a/examples/gemma3_example/gemma3_mm.py +++ b/examples/image_text_to_text/models/gemma_vision/gemma3_example.py @@ -13,15 +13,17 @@ # Change model_id to "google/gemma-3-27b-it" for 27B model model_id = "google/gemma-3-4b-it" + config = AutoConfig.from_pretrained(model_id) + # For Testing Purpose Only config.text_config.num_hidden_layers = 1 config.vision_config.num_hidden_layers = 2 + tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) processor = AutoProcessor.from_pretrained(model_id) -# pass HF_TOKEN if gated model -# For running the model in single QPC approach use kv_offload=False. For Dual QPC approach use kv_offload=True ### +# For single QPC: kv_offload=False, For dual QPC: kv_offload=True qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( model_id, config=config, attn_implementation="eager", kv_offload=True ) diff --git a/examples/granite_example/readme.md b/examples/image_text_to_text/models/granite_vision/README.md similarity index 100% rename from examples/granite_example/readme.md rename to examples/image_text_to_text/models/granite_vision/README.md diff --git a/examples/granite_example/granite_vision_inference.py b/examples/image_text_to_text/models/granite_vision/granite_example.py similarity index 96% rename from examples/granite_example/granite_vision_inference.py rename to examples/image_text_to_text/models/granite_vision/granite_example.py index 230e10a40..08b01b1ef 100644 --- a/examples/granite_example/granite_vision_inference.py +++ b/examples/image_text_to_text/models/granite_vision/granite_example.py @@ -5,15 +5,14 @@ # # ----------------------------------------------------------------------------- +import os + import requests from PIL import Image from transformers import AutoProcessor, TextStreamer from QEfficient import QEFFAutoModelForImageTextToText -# Add HuggingFace Token to access the model -HF_TOKEN = "" - def run_model( model_name, @@ -29,7 +28,6 @@ def run_model( num_devices=1, ): ## STEP - 1 Load the Processor and Model - processor = AutoProcessor.from_pretrained(model_name, token=token) # `kv_offload` is used to compile the model in a 2 QPCs.Currently we are not supporting 1 qpc so the flag false is not allowed. 
@@ -40,7 +38,6 @@ def run_model( model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, token=token, kv_offload=kv_offload) ## STEP - 2 Export & Compile the Model - model.compile( prefill_seq_len=prefill_seq_len, ctx_len=ctx_len, @@ -88,9 +85,12 @@ def run_model( num_cores = 16 num_devices = 4 + # Get HF token from environment variable (None if not set) + hf_token = os.getenv("HF_TOKEN") + run_model( model_name=model_name, - token=HF_TOKEN, + token=hf_token, query=query, kv_offload=kv_offload, image_url=image_url, diff --git a/examples/intern_example/readme.md b/examples/image_text_to_text/models/internvl/README.md similarity index 95% rename from examples/intern_example/readme.md rename to examples/image_text_to_text/models/internvl/README.md index 6b0b674c9..8371ffc50 100644 --- a/examples/intern_example/readme.md +++ b/examples/image_text_to_text/models/internvl/README.md @@ -2,7 +2,6 @@ This directory contains an example script of how to run inference on InternVL-1B model via QEFFAutoModelForCausalLM class. ## Required packages: -- `torch==2.7.0+cpu` - `torchvision==0.22.0+cpu` - `timm==1.0.14` - `einops==0.8.1` @@ -14,7 +13,7 @@ pip install torch==2.7.0+cpu --extra-index-url https://download.pytorch.org/whl/ To run example script after package installations: ```sh -python internvl_inference.py +python internvl_example.py ``` Expected output for given sample inputs in the script: diff --git a/examples/intern_example/internvl_inference.py b/examples/image_text_to_text/models/internvl/internvl_example.py similarity index 100% rename from examples/intern_example/internvl_inference.py rename to examples/image_text_to_text/models/internvl/internvl_example.py diff --git a/examples/image_text_to_text/models/llama4/continuous_batching.py b/examples/image_text_to_text/models/llama4/continuous_batching.py new file mode 100644 index 000000000..515e7c01b --- /dev/null +++ b/examples/image_text_to_text/models/llama4/continuous_batching.py @@ -0,0 +1,91 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +""" +Continuous Batching Example for Llama-4-Scout Vision Model + +This example demonstrates how to use continuous batching with vision-language models +to process multiple image-text pairs simultaneously in a single batch. 
+""" + +import transformers +from transformers import AutoConfig, AutoProcessor + +from QEfficient import QEFFAutoModelForImageTextToText + +# Model configuration +model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" + +## STEP 1: Load Model Configuration and Processor +config = AutoConfig.from_pretrained(model_id) +# For Testing Purpose Only - reduce layers for faster testing +config.text_config.num_hidden_layers = 4 +config.vision_config.num_hidden_layers = 2 + +tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) +processor = AutoProcessor.from_pretrained(model_id) + +## STEP 2: Initialize Model with Continuous Batching +# Enable continuous batching to process multiple prompts in parallel +# Set kv_offload=True for Dual QPC mode (vision encoder + language model separately) +qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_id, + attn_implementation="eager", + kv_offload=True, # Dual QPC mode + config=config, + continuous_batching=True, # Enable continuous batching +) + +## STEP 3: Compile the Model for Cloud AI 100 +# Configure compilation parameters for continuous batching +qeff_model.compile( + prefill_seq_len=128, + ctx_len=3072, + img_size=336, + num_cores=16, + num_devices=4, + max_num_tiles=17, + batch_size=1, # Batch size per request + full_batch_size=4, # Total batch size for continuous batching + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + mos=1, +) + +## STEP 4: Prepare Input Images and Prompts +# Define multiple images to process in the batch +image_urls = [ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", +] + +# Define corresponding prompts for each image +prompts = [ + "Can you describe the image in detail?", + "What are the objects in the image?", + "What is the main subject of the image?", + "What colors are predominant in the image?", +] + +## STEP 5: Run Inference with Continuous Batching +# Process all image-prompt pairs in a single batch +exec_info = qeff_model.generate( + tokenizer=tokenizer, + prompts=prompts, + processor=processor, + images=image_urls, # Images are processed with their corresponding prompts + device_ids=[0, 1, 2, 3], + generation_len=100, +) + +## STEP 6: Display Results +print("Generated IDs:", exec_info.generated_ids) +print("\nFull execution info:") +print(exec_info) diff --git a/examples/llama4_multi_image_example.py b/examples/image_text_to_text/models/llama4/multi_image.py similarity index 100% rename from examples/llama4_multi_image_example.py rename to examples/image_text_to_text/models/llama4/multi_image.py diff --git a/examples/llama4_example.py b/examples/image_text_to_text/models/llama4/single_image.py similarity index 65% rename from examples/llama4_example.py rename to examples/image_text_to_text/models/llama4/single_image.py index 981bac203..ca1017d58 100644 --- a/examples/llama4_example.py +++ b/examples/image_text_to_text/models/llama4/single_image.py @@ -5,29 +5,47 @@ # # ----------------------------------------------------------------------------- +""" +Single Image Inference Example for Llama-4-Scout Vision Model + 
+This example demonstrates two modes: +1. Text-only mode (skip_vision=True): Run language model without image processing +2. Vision+Text mode (skip_vision=False): Process image and text together +""" + import torch import transformers from transformers import AutoConfig, AutoProcessor, TextStreamer from QEfficient import QEFFAutoModelForImageTextToText +# Model configuration model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" + +## STEP 1: Load Model Configuration and Processor config = AutoConfig.from_pretrained(model_id) -# For Testing Purpose Only +# For Testing Purpose Only - reduce layers for faster testing config.text_config.num_hidden_layers = 4 config.vision_config.num_hidden_layers = 2 +## STEP 2: Initialize the Model +# Set kv_offload=True for Dual QPC mode (vision encoder + language model separately) qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( model_id, attn_implementation="eager", kv_offload=True, config=config ) tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) -### use skip_vision=Ture, if want to run only text, ow false ### +# Toggle between text-only and vision+text modes +# Set skip_vision=True for text-only execution (no image processing) +# Set skip_vision=False for vision+text execution (process images with text) skip_vision = True if skip_vision: - ## Only Text ## + ## TEXT-ONLY MODE ## + + ## STEP 3: Compile Model for Text-Only Execution + # Set skip_vision=True to bypass image processing qeff_model.compile( prefill_seq_len=128, ctx_len=3072, @@ -38,10 +56,12 @@ mxfp6_matmul=True, mxint8_kv_cache=True, aic_enable_depth_first=True, - skip_vision=True, + skip_vision=True, # Skip vision encoder for text-only inference mos=1, ) + ## STEP 4: Prepare Text-Only Input + # Create a text-only message without any image messages = [ { "role": "user", @@ -51,6 +71,7 @@ }, ] + ## STEP 5: Process Input with Chat Template inputs = processor.apply_chat_template( messages, add_generation_prompt=True, @@ -59,14 +80,20 @@ return_tensors="pt", ) + ## STEP 6: Run Text-Only Inference streamer = TextStreamer(tokenizer) output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3, 4, 5, 6, 7], generation_len=100) + + ## STEP 7: Display Results print(output.generated_ids) print(tokenizer.batch_decode(output.generated_ids)) print(output) else: - ## Vision + Text ## + ## VISION + TEXT MODE ## + + ## STEP 3: Compile Model for Vision+Text Execution + # Do not set skip_vision (defaults to False) to enable image processing qeff_model.compile( prefill_seq_len=128, ctx_len=3072, @@ -80,11 +107,13 @@ mos=1, ) - ### IMAGE + TEXT ### + ## STEP 4: Prepare Image and Text Input + # Define the image URL to process image_url = ( "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png" ) + # Create a message with both image and text messages = [ { "role": "user", @@ -95,6 +124,7 @@ }, ] + ## STEP 5: Process Input with Chat Template inputs = processor.apply_chat_template( messages, add_generation_prompt=True, @@ -102,10 +132,14 @@ return_dict=True, return_tensors="pt", ) + # Convert pixel values to float32 for processing inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + + ## STEP 6: Run Vision+Text Inference streamer = TextStreamer(tokenizer) output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3, 4, 5, 6, 7], generation_len=100) + + ## STEP 7: Display Results print(output.generated_ids) 
print(tokenizer.batch_decode(output.generated_ids)) print(output) - print() diff --git a/examples/mistral3_example.py b/examples/image_text_to_text/models/mistral_vision/mistral3_example.py similarity index 100% rename from examples/mistral3_example.py rename to examples/image_text_to_text/models/mistral_vision/mistral3_example.py diff --git a/examples/molmo_example.py b/examples/image_text_to_text/models/molmo/molmo_example.py similarity index 96% rename from examples/molmo_example.py rename to examples/image_text_to_text/models/molmo/molmo_example.py index 09658ce41..04bba5248 100644 --- a/examples/molmo_example.py +++ b/examples/image_text_to_text/models/molmo/molmo_example.py @@ -16,7 +16,8 @@ model_id = "allenai/Molmo-7B-D-0924" config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) -config.num_hidden_layers = 2 +# For faster execution user can run on 2 layers, This is only for testing purpose +# config.num_hidden_layers = 2 # load the model qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, kv_offload=True, trust_remote_code=True, config=config) diff --git a/examples/qwen2_5_vl_example.py b/examples/image_text_to_text/models/qwen_vl/basic_inference.py similarity index 95% rename from examples/qwen2_5_vl_example.py rename to examples/image_text_to_text/models/qwen_vl/basic_inference.py index d5d943c9c..374f70ad2 100644 --- a/examples/qwen2_5_vl_example.py +++ b/examples/image_text_to_text/models/qwen_vl/basic_inference.py @@ -5,9 +5,6 @@ # # ----------------------------------------------------------------------------- -# If we want to enable QBlocking Run below command:, default is without blocking -# ATTENTION_BLOCKING_MODE=q num_q_blocks=2 python -W ignore qwen2_5_vl_example.py - import requests import transformers from PIL import Image diff --git a/examples/qwen2_5_vl_CB.py b/examples/image_text_to_text/models/qwen_vl/continuous_batching.py similarity index 91% rename from examples/qwen2_5_vl_CB.py rename to examples/image_text_to_text/models/qwen_vl/continuous_batching.py index 96ef4898a..03094dc92 100644 --- a/examples/qwen2_5_vl_CB.py +++ b/examples/image_text_to_text/models/qwen_vl/continuous_batching.py @@ -5,9 +5,6 @@ # # ----------------------------------------------------------------------------- -# If we want to enable QBlocking Run below command:, default is without blocking -# ATTENTION_BLOCKING_MODE=q num_q_blocks=2 python -W ignore qwen2_5_vl_example.py - import transformers from transformers import AutoConfig, AutoProcessor, TextStreamer diff --git a/examples/image_text_to_text_inference.py b/examples/image_text_to_text_inference.py deleted file mode 100644 index e722284ba..000000000 --- a/examples/image_text_to_text_inference.py +++ /dev/null @@ -1,120 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import requests -from PIL import Image -from transformers import AutoProcessor, TextStreamer - -from QEfficient import QEFFAutoModelForImageTextToText - -# Add HuggingFace Token to access the model -HF_TOKEN = "" - - -def run_model( - model_name, - token, - query, - image_url, - kv_offload=False, - prefill_seq_len=32, - ctx_len=512, - generation_len=128, - img_size=560, - num_cores=16, - num_devices=1, -): - ## STEP - 1 Load the Processor and Model - - processor = AutoProcessor.from_pretrained(model_name, token=token) - - # `kv_offload` is used to compile the model in a Single QPC or 2 QPCs. - # The Dual QPC approach splits the model to perform Image Encoding and Output generation in 2 different QPCs. - # The outputs of the Vision Encoder are then passed to the Language model via host in this case. - - model = QEFFAutoModelForImageTextToText.from_pretrained( - model_name, token=token, attn_implementation="eager", kv_offload=kv_offload - ) - - ## STEP - 2 Export & Compile the Model - - model.compile( - prefill_seq_len=prefill_seq_len, - ctx_len=ctx_len, - img_size=img_size, - num_cores=num_cores, - num_devices=num_devices, - mxfp6_matmul=False, - ) - - ## STEP - 3 Load and process the inputs for Inference - - image = Image.open(requests.get(image_url, stream=True).raw) - messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": query}, - ], - } - ] - input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)] - - inputs = processor( - text=input_text, - images=image, - return_tensors="pt", - add_special_tokens=False, - padding="max_length", - max_length=prefill_seq_len, - ) - - ## STEP - 4 Run Inference on the compiled model - - streamer = TextStreamer(processor.tokenizer) - model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len) - - -if __name__ == "__main__": - # Model name and Input parameters - model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" - query = "Describe this image." - image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" - - # Compilation parameters for the model - kv_offload = False - prefill_seq_len = 32 - ctx_len = 512 - generation_len = 128 - img_size = 560 - num_cores = 16 - num_devices = 1 - - run_model( - model_name=model_name, - token=HF_TOKEN, - query=query, - kv_offload=kv_offload, - image_url=image_url, - prefill_seq_len=prefill_seq_len, - ctx_len=ctx_len, - generation_len=generation_len, - img_size=img_size, - num_cores=num_cores, - num_devices=num_devices, - ) - - -""" -Expected Response: - -This image depicts a charming anthropomorphic rabbit standing on a dirt path in front of a picturesque stone cottage, surrounded by a serene landscape. - -The rabbit, with its light brown fur and distinctive long ears, is attired in a stylish blue coat, brown vest, and tan pants, exuding a sense of sophistication. The dirt path, flanked by vibrant flowers and lush greenery, leads to the cottage, which features a thatched roof and a chimney, adding to the rustic charm of the scene. 
In the background, rolling hills and trees create a breathtaking panorama, while the sky above is a brilliant blue with white clouds, completing the - -""" diff --git a/examples/llama4_CB_example_vision_lang.py b/examples/llama4_CB_example_vision_lang.py deleted file mode 100644 index f285ea278..000000000 --- a/examples/llama4_CB_example_vision_lang.py +++ /dev/null @@ -1,93 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ---------------------------------------------------------------------------- - -import transformers -from transformers import AutoConfig, AutoProcessor - -from QEfficient import QEFFAutoModelForImageTextToText - -model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" -config = AutoConfig.from_pretrained(model_id) -# For Testing Purpose Only -config.text_config.num_hidden_layers = 4 -config.vision_config.num_hidden_layers = 2 - -tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) -processor = AutoProcessor.from_pretrained(model_id) - -continious_batching = False -if continious_batching: - qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( - model_id, - attn_implementation="eager", - kv_offload=True, - config=config, - continuous_batching=True, - ) - - qeff_model.compile( - prefill_seq_len=128, - ctx_len=3072, - img_size=336, - num_cores=16, - num_devices=4, - max_num_tiles=17, - batch_size=1, - full_batch_size=4, - mxfp6_matmul=True, - mxint8_kv_cache=True, - aic_enable_depth_first=True, - mos=1, - ) -else: - qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( - model_id, - attn_implementation="eager", - kv_offload=True, - config=config, - ) - - qeff_model.compile( - prefill_seq_len=128, - ctx_len=3072, - img_size=336, - num_cores=16, - num_devices=4, - max_num_tiles=17, - batch_size=1, - mxfp6_matmul=True, - mxint8_kv_cache=True, - aic_enable_depth_first=True, - mos=1, - ) - -image_urls = [ - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", -] - -prompts = [ - "Can you describe the image in detail?", - "What are the objects in the image?", - "What is the main subject of the image?", - "What colors are predominant in the image?", -] - -exec_info = qeff_model.generate( - tokenizer=tokenizer, - prompts=prompts, - processor=processor, - images=image_urls, - device_ids=[0, 1, 2, 3], - generation_len=100, -) - -# print("Generated texts:", exec_info.generated_texts) -print("Generated IDs:", exec_info.generated_ids) -print(exec_info) diff --git a/examples/peft/README.md b/examples/peft/README.md new file mode 100644 index 000000000..fbc8c99b7 --- /dev/null +++ b/examples/peft/README.md @@ -0,0 +1,83 @@ +# PEFT Examples + +Examples for running Parameter-Efficient Fine-Tuning (PEFT) models with LoRA adapters on Qualcomm Cloud AI 100. 
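+
+At a glance, the single-adapter flow used by these examples looks roughly like the condensed sketch below; see `single_adapter.py` for the complete, runnable version (the model and adapter names are simply the ones used in that script):
+
+```python
+from transformers import AutoTokenizer, TextStreamer
+
+from QEfficient import QEffAutoPeftModelForCausalLM
+
+# Load the base model together with a LoRA adapter (adapter repo, adapter name)
+qeff_model = QEffAutoPeftModelForCausalLM.from_pretrained("predibase/magicoder", "magicoder")
+
+# Export & compile for Cloud AI 100
+qeff_model.compile(prefill_seq_len=32, ctx_len=1024)
+
+# Activate the adapter and generate
+qeff_model.set_adapter("magicoder")
+tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
+inputs = tokenizer("def fibonacci", return_tensors="pt")
+qeff_model.generate(**inputs, streamer=TextStreamer(tokenizer), max_new_tokens=1024)
+```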
+ + +## Authentication + +For private/gated models, export your HuggingFace token: +```bash +export HF_TOKEN= +``` + +## Supported Models + +**QEff Auto Class:** `QEffAutoPeftModelForCausalLM` + +PEFT/LoRA adapters work with any supported base model architecture. + +Popular base models include: +- Llama +- Mistral, Mixtral + + +## Available Examples + +### single_adapter.py +Load and use a single LoRA adapter with a base model. + +**Usage:** +```python +python single_adapter.py +``` + +This example: +- Loads Mistral-7B base model with a LoRA adapter +- Demonstrates adapter switching +- Shows inference with different adapters (magicoder, tldr, gsm8k, agnews) + +### multi_adapter.py +Use multiple LoRA adapters with continuous batching. + +**Usage:** +```python +python multi_adapter.py +``` + +This example: +- Runs multiple adapters simultaneously in one batch +- Demonstrates continuous batching with `full_batch_size=4` +- Shows different prompts using different adapters in the same batch + +## Key Features + +### Single Adapter Mode +- Load one LoRA adapter at a time +- Switch between adapters dynamically +- Suitable for single-task inference + +### Multi-Adapter Mode (Continuous Batching) +- Run multiple adapters simultaneously +- Different prompts can use different adapters in the same batch +- Efficient for multi-task scenarios +- Requires `continuous_batching=True` and `finite_adapters=True` + +## Adapter Management + +```python +# Load adapter +qeff_model.load_adapter("predibase/adapter_name", "adapter_name") + +# Set active adapter +qeff_model.set_adapter("adapter_name") + +# Unload adapter +qeff_model.unload_adapter("adapter_name") +``` + +## Documentation + +- [QEff Auto Classes](https://quic.github.io/efficient-transformers/source/qeff_autoclasses.html) +- [Validated Base Models](https://quic.github.io/efficient-transformers/source/validate.html#text-only-language-models) +- [PEFT Documentation](https://huggingface.co/docs/peft) +- [Quick Start Guide](https://quic.github.io/efficient-transformers/source/quick_start.html) diff --git a/examples/lora_models.py b/examples/peft/multi_adapter.py similarity index 100% rename from examples/lora_models.py rename to examples/peft/multi_adapter.py diff --git a/examples/peft_models.py b/examples/peft/single_adapter.py similarity index 60% rename from examples/peft_models.py rename to examples/peft/single_adapter.py index 63c196a22..4f84bd13c 100644 --- a/examples/peft_models.py +++ b/examples/peft/single_adapter.py @@ -5,6 +5,8 @@ # # ----------------------------------------------------------------------------- +## This example demonstrates single adapter usage with sequential adapter switching ## + from transformers import AutoTokenizer, TextStreamer from QEfficient import QEffAutoPeftModelForCausalLM @@ -12,19 +14,27 @@ base_model_name = "mistralai/Mistral-7B-v0.1" tokenizer = AutoTokenizer.from_pretrained(base_model_name) streamer = TextStreamer(tokenizer) +prefill_seq_len = 32 +ctx_len = 1024 +generation_len = 1024 + + +## STEP 1 -- init base model +qeff_model = QEffAutoPeftModelForCausalLM.from_pretrained("predibase/magicoder", "magicoder") + +## STEP 2 -- export & compile qeff model +qeff_model.compile(prefill_seq_len=prefill_seq_len, ctx_len=ctx_len) -m = QEffAutoPeftModelForCausalLM.from_pretrained("predibase/magicoder", "magicoder") -m.export() -m.compile(prefill_seq_len=32, ctx_len=1024) +## STEP 3 -- run inference with different adapters -# Magicoder adapter -m.set_adapter("magicoder") +# Magicoder adapter - code generation 
+qeff_model.set_adapter("magicoder")
 inputs = tokenizer("def fibonacci", return_tensors="pt")
-m.generate(**inputs, streamer=streamer, max_new_tokens=1024)
+qeff_model.generate(**inputs, streamer=streamer, max_new_tokens=generation_len)
 
-# TLDR, summary generator
-m.load_adapter("predibase/tldr_headline_gen", "tldr_headline_gen")
-m.set_adapter("tldr_headline_gen")
+## STEP 3.1 -- load and use TLDR headline generator adapter
+qeff_model.load_adapter("predibase/tldr_headline_gen", "tldr_headline_gen")
+qeff_model.set_adapter("tldr_headline_gen")
 inputs = tokenizer(
     """Summarize this passage in one sentence or less: Jeffrey Berns, CEO of Blockchains LLC, wants the Nevada government to allow companies like \
 his to form local governments on land they own, granting them power over everything from \
@@ -36,21 +46,21 @@
 Summary: """,
     return_tensors="pt",
 )
-m.generate(**inputs, streamer=streamer, max_new_tokens=1024)
+qeff_model.generate(**inputs, streamer=streamer, max_new_tokens=1024)
 
-# Math problems
-m.load_adapter("predibase/gsm8k", "gsm8k")
-m.set_adapter("gsm8k")
+## STEP 3.2 -- load and use GSM8K adapter for math problems
+qeff_model.load_adapter("predibase/gsm8k", "gsm8k")
+qeff_model.set_adapter("gsm8k")
 inputs = tokenizer(
     "James decides to run 3 sprints 3 times a week. He runs 60 meters each sprint. \
 How many total meters does he run a week?",
     return_tensors="pt",
 )
-m.generate(**inputs, streamer=streamer, max_new_tokens=1024)
+qeff_model.generate(**inputs, streamer=streamer, max_new_tokens=1024)
 
-# News explanation
-m.load_adapter("predibase/agnews_explained", "agnews_explained")
-m.set_adapter("agnews_explained")
+## STEP 3.3 -- load and use AGNews adapter for news classification
+qeff_model.load_adapter("predibase/agnews_explained", "agnews_explained")
+qeff_model.set_adapter("agnews_explained")
 inputs = tokenizer(
     """Below is a news article. Please classify it under one of the following \
 classes (World, Business, Sports, Sci/Tech) and provide a reasonable coherent explanation for \
@@ -65,4 +75,4 @@
 """,
     return_tensors="pt",
 )
-m.generate(**inputs, streamer=streamer, max_new_tokens=1024)
+qeff_model.generate(**inputs, streamer=streamer, max_new_tokens=1024)
diff --git a/examples/performance/README.md b/examples/performance/README.md
new file mode 100644
index 000000000..48d34d972
--- /dev/null
+++ b/examples/performance/README.md
@@ -0,0 +1,110 @@
+# Performance Optimization Examples
+
+Examples demonstrating performance optimization techniques for Qualcomm Cloud AI 100.
+
+## Authentication
+
+For private/gated models, export your HuggingFace token:
+```bash
+export HF_TOKEN=
+```
+
+## Available Examples
+
+### Speculative Decoding
+
+Accelerate text generation using speculative decoding techniques.
+
+#### draft_based.py
+Draft-based speculative decoding with separate draft and target models.
+
+**Basic Usage:**
+```bash
+python speculative_decoding/draft_based.py \
+    --target-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+    --draft-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+    --num-speculative-tokens 4
+```
+
+**Advanced Usage:**
+```bash
+python speculative_decoding/draft_based.py \
+    --target-model-name meta-llama/Llama-3.1-8B \
+    --draft-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+    --num-speculative-tokens 4 \
+    --prefill-seq-len 32 \
+    --ctx-len 128 \
+    --target-device-group 0,1 \
+    --draft-device-group 2
+```
+
+#### prompt_lookup.py
+Prompt Lookup Decoding (PLD) - N-gram based speculation without a draft model.
+
+**Basic Usage:**
+```bash
+python speculative_decoding/prompt_lookup.py \
+    --target-model-name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+    --num-speculative-tokens 3 \
+    --max-ngram-size 3
+```
+
+#### multi_projection.py
+Multi-projection speculative decoding (Turbo models).
+
+**Basic Usage:**
+```bash
+python speculative_decoding/multi_projection.py \
+    --pretrained-model-name-or-path TinyLlama/TinyLlama-1.1B-Chat-v1.0
+```
+
+### On-Device Sampling
+
+Control sampling parameters directly on the AI 100 hardware.
+
+#### on_device_sampling.py
+Configure sampling parameters (temperature, top-k, top-p, etc.) on-device.
+
+**Basic Usage:**
+```bash
+python on_device_sampling.py \
+    --model-name meta-llama/Llama-3.1-8B \
+    --num-cores 16 \
+    --prompt-len 128 \
+    --ctx-len 256
+```
+
+**Advanced Usage with Sampling Parameters:**
+```bash
+python on_device_sampling.py \
+    --model-name meta-llama/Llama-3.1-8B \
+    --prompt-len 128 \
+    --ctx-len 256 \
+    --full-batch-size 2 \
+    --device-group 0,1,2,3 \
+    --num-cores 16 \
+    --mxint8-kv-cache \
+    --mxfp6-matmul \
+    --override-qaic-config "aic_include_sampler:true aic_return_pdfs:false max_top_k_ids:512" \
+    --repetition-penalty 1.9 \
+    --temperature 0.67 \
+    --top-k 54720 \
+    --top-p 0.89
+```
+
+## Performance Tips
+
+1. **Speculative Decoding**: Best for long-form generation where draft model is much faster than target
+2. **Prompt Lookup**: No draft model needed, works well for repetitive patterns
+3. **Multi-Projection**: Optimal for models with built-in speculation support
+4. **On-Device Sampling**: Reduces host-device communication overhead
+5. **C++ Execution**: Maximum performance for production deployments
+
+## Documentation
+
+- [QEff Auto Classes](https://quic.github.io/efficient-transformers/source/qeff_autoclasses.html)
+- [Performance Features](https://quic.github.io/efficient-transformers/source/features_enablement.html)
+- [Quick Start Guide](https://quic.github.io/efficient-transformers/source/quick_start.html)
diff --git a/examples/performance/compute_context_length/README.md b/examples/performance/compute_context_length/README.md
new file mode 100644
index 000000000..bbc240645
--- /dev/null
+++ b/examples/performance/compute_context_length/README.md
@@ -0,0 +1,323 @@
+# Compute Context Length (CCL) Examples
+
+Examples demonstrating Compute Context Length (CCL) optimization for efficient inference on Qualcomm Cloud AI 100.
+
+## What is CCL?
+
+Compute Context Length (CCL) is a performance optimization feature that allows models to use different context lengths during different phases of inference:
+
+- **Prefill Phase**: Processing the initial prompt with optimized context lengths
+- **Decode Phase**: Generating new tokens with dynamically adjusted context lengths
+
+This optimization provides:
+- **Memory Efficiency**: Uses smaller context lengths when possible
+- **Performance Optimization**: Reduces computation for shorter sequences
+- **Flexible Scaling**: Adapts context length based on actual sequence position
+- **Hardware Optimization**: Optimized for Qualcomm Cloud AI 100 accelerators
+
+## Authentication
+
+For private/gated models, export your HuggingFace token:
+```bash
+export HF_TOKEN=
+```
+
+## Quick Start
+
+### Text-Only Models
+
+Run basic CCL inference with default settings:
+```bash
+python basic_inference.py
+```
+
+Customize with command-line arguments:
+```bash
+python basic_inference.py \
+    --model-name meta-llama/Llama-3.2-1B \
+    --prompt "Hello, how are you?"
\ + --ctx-len 1024 \ + --comp-ctx-lengths-prefill "256,500" \ + --comp-ctx-lengths-decode "512,1024" \ + --generation-len 100 +``` + +### Vision-Language Models + +Run VLM inference with CCL: +```bash +python vlm_inference.py +``` + +Customize with command-line arguments: +```bash +python vlm_inference.py \ + --model-name meta-llama/Llama-3.2-11B-Vision-Instruct \ + --query "Describe this image" \ + --image-url "https://..." \ + --comp-ctx-lengths-prefill "4096" \ + --comp-ctx-lengths-decode "6144,8192" \ + --ctx-len 8192 +``` + +## Available Examples + +### Text-Only Models + +#### basic_inference.py +Basic CCL usage with text-only language models. + +**Supported Models:** +- Llama (3.2, 3.3) +- Gemma/Gemma-2 +- Mistral +- Phi/Phi-3 +- Qwen +- Granite +- GPT-2, GPT-J +- CodeGen +- OLMo-2 + +**Command-Line Arguments:** +- `--model-name`: HuggingFace model ID (default: meta-llama/Llama-3.2-1B) +- `--prompt`: Input prompt (default: "My name is ") +- `--ctx-len`: Maximum context length (default: 1024) +- `--comp-ctx-lengths-prefill`: Comma-separated prefill context lengths (default: 256,500) +- `--comp-ctx-lengths-decode`: Comma-separated decode context lengths (default: 512,1024) +- `--generation-len`: Number of tokens to generate (default: 128) +- `--continuous-batching`: Enable continuous batching mode +- `--num-cores`: Number of cores (default: 16) +- `--num-devices`: Number of devices (default: 1) + +**Usage Examples:** +```bash +# Basic usage with defaults +python basic_inference.py + +# Custom model and prompt +python basic_inference.py \ + --model-name Qwen/Qwen2.5-7B-Instruct \ + --prompt "Explain quantum computing" + +# With continuous batching +python basic_inference.py \ + --continuous-batching \ + --full-batch-size 4 + +# Larger context with progressive CCL +python basic_inference.py \ + --ctx-len 4096 \ + --comp-ctx-lengths-prefill "1024,2048" \ + --comp-ctx-lengths-decode "2048,3072,4096" +``` + +**Python API:** +```python +from transformers import AutoTokenizer +from QEfficient import QEFFAutoModelForCausalLM + +model = QEFFAutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-3.2-1B", + qaic_config={ + "comp_ctx_lengths_prefill": [256, 500], + "comp_ctx_lengths_decode": [512, 1024], + "ctx_len": 1024, # Required for CCL validation + }, +) +``` + +#### gpt_oss.py +CCL for GPT-OSS MoE models with prefill_seq_len=1 optimization. + +**Usage:** +```bash +python gpt_oss.py +``` + +**Note:** For MoE models, both prefill and decode CCL lists can be similar when using prefill_seq_len=1. + +### Vision-Language Models + +#### vlm_inference.py +General VLM inference with CCL optimization. + +**Usage:** +```bash +python vlm_inference.py +``` + +#### gemma3.py +CCL for Gemma-3 multimodal models (4B/27B). + +**Usage:** +```bash +python gemma3.py +``` + +#### granite_vision.py +CCL for IBM Granite Vision models. + +**Usage:** +```bash +python granite_vision.py +``` + +#### internvl.py +CCL for InternVL2.5 models with custom processor. + +**Usage:** +```bash +python internvl.py +``` + +#### llama4.py +CCL for Llama-4 Scout vision-language models. + +**Usage:** +```bash +python llama4.py +``` + +#### llama4_cb.py +CCL for Llama-4 with continuous batching. + +**Usage:** +```bash +python llama4_cb.py +``` + +#### llama4_multi_image.py +CCL for Llama-4 with multiple images. + +**Usage:** +```bash +python llama4_multi_image.py +``` + +#### mistral3.py +CCL for Mistral-Small-3.1 vision models. + +**Usage:** +```bash +python mistral3.py +``` + +#### molmo.py +CCL for Molmo-7B multimodal models. 
+ +**Usage:** +```bash +python molmo.py +``` + +#### qwen2_5_vl.py +CCL for Qwen2.5-VL models (32B). + +**Usage:** +```bash +python qwen2_5_vl.py +``` + +#### qwen2_5_vl_cb.py +CCL for Qwen2.5-VL with continuous batching. + +**Usage:** +```bash +python qwen2_5_vl_cb.py +``` + +## Configuration Guidelines + +### Choosing CCL Values + +1. **Prefill Context Lengths** (`comp_ctx_lengths_prefill`): + - Start with smaller values (e.g., [256, 512, 1024]) + - Should be less than or equal to your prefill_seq_len + - Gradually increase based on prompt chunk position + +2. **Decode Context Lengths** (`comp_ctx_lengths_decode`): + - Start from a value based on expected prompt length + - Include intermediate steps (e.g., [512, 1024, 2048, ctx_len]) + - Final value should match ctx_len + +3. **Context Length** (`ctx_len`): + - Maximum context length for the model + - Required parameter for CCL validation + - Should match your model's maximum supported length + +### Example Configurations + +**Small Context (1K-2K):** +```python +ctx_len = 2048 +comp_ctx_lengths_prefill = [256, 512] +comp_ctx_lengths_decode = [1024, ctx_len] +``` + +**Medium Context (4K-8K):** +```python +ctx_len = 8192 +comp_ctx_lengths_prefill = [3072, 4096] +comp_ctx_lengths_decode = [4096, 6144, ctx_len] +``` + +**Large Context (16K+):** +```python +ctx_len = 16384 +comp_ctx_lengths_prefill = [4096, 8192] +comp_ctx_lengths_decode = [8192, 12288, ctx_len] +``` + +## Performance Tips + +1. **Memory Optimization**: Use smaller CCL values for prefill to reduce memory footprint +2. **Progressive Scaling**: Include intermediate CCL values in decode list for smooth transitions +3. **Vision Models**: Larger prefill contexts needed for image embeddings +4. **Continuous Batching**: CCL works seamlessly with CB for dynamic workloads +5. **MoE Models**: Consider prefill_seq_len=1 for optimal performance + +## Common Patterns + +### Text-Only Model +```python +model = QEFFAutoModelForCausalLM.from_pretrained( + model_name, + qaic_config={ + "comp_ctx_lengths_prefill": [256, 500], + "comp_ctx_lengths_decode": [512, 1024], + "ctx_len": 1024, + }, +) +``` + +### Vision-Language Model +```python +model = QEFFAutoModelForImageTextToText.from_pretrained( + model_name, + kv_offload=True, + qaic_config={ + "comp_ctx_lengths_prefill": [3072], + "comp_ctx_lengths_decode": [4096, 8192], + "ctx_len": 8192, + }, +) +``` + +### Continuous Batching +```python +model = QEFFAutoModelForCausalLM.from_pretrained( + model_name, + continuous_batching=True, + qaic_config={ + "comp_ctx_lengths_prefill": [256, 500], + "comp_ctx_lengths_decode": [512, 1024], + "ctx_len": 1024, + }, +) +``` + +## Documentation + +- [QEff Auto Classes](https://quic.github.io/efficient-transformers/source/qeff_autoclasses.html) +- [Performance Features](https://quic.github.io/efficient-transformers/source/features_enablement.html) +- [Quick Start Guide](https://quic.github.io/efficient-transformers/source/quick_start.html) diff --git a/examples/performance/compute_context_length/basic_inference.py b/examples/performance/compute_context_length/basic_inference.py new file mode 100644 index 000000000..a4407b05a --- /dev/null +++ b/examples/performance/compute_context_length/basic_inference.py @@ -0,0 +1,154 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +""" +Basic Compute Context Length (CCL) inference example. + +This example demonstrates how to use CCL optimization for text generation models. +CCL allows using different context lengths during prefill and decode phases, +reducing memory footprint and computation for shorter sequences. +""" + +import argparse + +from transformers import AutoTokenizer + +from QEfficient import QEFFAutoModelForCausalLM + + +def main(): + parser = argparse.ArgumentParser(description="Text generation with Compute Context Length (CCL) optimization") + parser.add_argument( + "--model-name", + type=str, + default="meta-llama/Llama-3.2-1B", + help="HuggingFace model ID", + ) + parser.add_argument( + "--prompt", + type=str, + default="My name is ", + help="Input prompt for text generation", + ) + parser.add_argument( + "--prefill-seq-len", + type=int, + default=128, + help="Prefill sequence length", + ) + parser.add_argument( + "--ctx-len", + type=int, + default=1024, + help="Maximum context length", + ) + parser.add_argument( + "--comp-ctx-lengths-prefill", + type=lambda x: [int(i) for i in x.split(",")], + default="256,500", + help="Comma-separated list of context lengths for prefill phase (e.g., '256,500')", + ) + parser.add_argument( + "--comp-ctx-lengths-decode", + type=lambda x: [int(i) for i in x.split(",")], + default="512,1024", + help="Comma-separated list of context lengths for decode phase (e.g., '512,1024')", + ) + parser.add_argument( + "--generation-len", + type=int, + default=128, + help="Number of tokens to generate", + ) + parser.add_argument( + "--num-cores", + type=int, + default=16, + help="Number of cores for compilation", + ) + parser.add_argument( + "--num-devices", + type=int, + default=1, + help="Number of devices to use", + ) + parser.add_argument( + "--continuous-batching", + action="store_true", + help="Enable continuous batching mode", + ) + parser.add_argument( + "--full-batch-size", + type=int, + default=1, + help="Full batch size for continuous batching", + ) + parser.add_argument( + "--mxint8-kv-cache", + action="store_true", + default=True, + help="Enable MX INT8 KV cache", + ) + parser.add_argument( + "--mxfp6-matmul", + action="store_true", + default=True, + help="Enable MX FP6 matrix multiplication", + ) + args = parser.parse_args() + + print(f"Loading model: {args.model_name}") + print("CCL Configuration:") + print(f" - Prefill context lengths: {args.comp_ctx_lengths_prefill}") + print(f" - Decode context lengths: {args.comp_ctx_lengths_decode}") + print(f" - Max context length: {args.ctx_len}") + print(f" - Continuous batching: {args.continuous_batching}") + + # Load model with CCL configuration + model = QEFFAutoModelForCausalLM.from_pretrained( + args.model_name, + continuous_batching=args.continuous_batching, + qaic_config={ + "comp_ctx_lengths_prefill": args.comp_ctx_lengths_prefill, + "comp_ctx_lengths_decode": args.comp_ctx_lengths_decode, + "ctx_len": args.ctx_len, # Required for CCL validation + }, + ) + + # Compile the model + print("\nCompiling model...") + compile_kwargs = { + "prefill_seq_len": args.prefill_seq_len, + "ctx_len": args.ctx_len, + "num_cores": args.num_cores, + "num_devices": args.num_devices, + "mxint8_kv_cache": args.mxint8_kv_cache, + "mxfp6_matmul": args.mxfp6_matmul, + } + + if args.continuous_batching: + compile_kwargs["full_batch_size"] = args.full_batch_size + + qpc_path = model.compile(**compile_kwargs) + 
print(f"Model compiled successfully to: {qpc_path}") + + # Load tokenizer and generate + print("\nGenerating text...") + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + + exec_info = model.generate( + prompts=[args.prompt], + tokenizer=tokenizer, + generation_len=args.generation_len, + ) + + print(f"\nPrompt: {args.prompt}") + print(f"Generated: {exec_info.generated_texts[0]}") + + +if __name__ == "__main__": + main() diff --git a/examples/gemma3_example/ccl_gemma3_mm.py b/examples/performance/compute_context_length/gemma3.py similarity index 98% rename from examples/gemma3_example/ccl_gemma3_mm.py rename to examples/performance/compute_context_length/gemma3.py index 9bf6e9c5a..c31b1748a 100644 --- a/examples/gemma3_example/ccl_gemma3_mm.py +++ b/examples/performance/compute_context_length/gemma3.py @@ -38,7 +38,7 @@ }, ) -### use skip_vision=Ture, if want to run only text, or false ### +### use skip_vision=True, if want to run only text, or false ### skip_vision = False if skip_vision: diff --git a/examples/ccl_gpt_oss.py b/examples/performance/compute_context_length/gpt_oss.py similarity index 100% rename from examples/ccl_gpt_oss.py rename to examples/performance/compute_context_length/gpt_oss.py diff --git a/examples/granite_example/ccl_granite_vision_inference.py b/examples/performance/compute_context_length/granite_vision.py similarity index 98% rename from examples/granite_example/ccl_granite_vision_inference.py rename to examples/performance/compute_context_length/granite_vision.py index 64ecaf948..39b139bad 100644 --- a/examples/granite_example/ccl_granite_vision_inference.py +++ b/examples/performance/compute_context_length/granite_vision.py @@ -11,9 +11,6 @@ from QEfficient import QEFFAutoModelForImageTextToText -# Add HuggingFace Token to access the model -HF_TOKEN = "" - def run_model( model_name, @@ -104,7 +101,6 @@ def run_model( run_model( model_name=model_name, - token=HF_TOKEN, query=query, kv_offload=kv_offload, image_url=image_url, diff --git a/examples/intern_example/ccl_internvl_inference.py b/examples/performance/compute_context_length/internvl.py similarity index 100% rename from examples/intern_example/ccl_internvl_inference.py rename to examples/performance/compute_context_length/internvl.py diff --git a/examples/ccl_llama4_example.py b/examples/performance/compute_context_length/llama4.py similarity index 98% rename from examples/ccl_llama4_example.py rename to examples/performance/compute_context_length/llama4.py index 5da29960f..534be8f96 100644 --- a/examples/ccl_llama4_example.py +++ b/examples/performance/compute_context_length/llama4.py @@ -37,7 +37,7 @@ tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) -### use skip_vision=Ture, if want to run only text, ow false ### +### use skip_vision=True, if want to run only text, or false ### skip_vision = False if skip_vision: diff --git a/examples/ccl_llama4_CB_example_vision_lang.py b/examples/performance/compute_context_length/llama4_cb.py similarity index 95% rename from examples/ccl_llama4_CB_example_vision_lang.py rename to examples/performance/compute_context_length/llama4_cb.py index 6423ee765..ea7c09d69 100644 --- a/examples/ccl_llama4_CB_example_vision_lang.py +++ b/examples/performance/compute_context_length/llama4_cb.py @@ -100,7 +100,6 @@ prompts=prompts, processor=processor, images=image_urls, - device_ids=[32, 33, 34, 35], generation_len=100, ) diff --git a/examples/ccl_llama4_multi_image_example.py 
b/examples/performance/compute_context_length/llama4_multi_image.py similarity index 96% rename from examples/ccl_llama4_multi_image_example.py rename to examples/performance/compute_context_length/llama4_multi_image.py index 33bf07df0..d7c403e5f 100644 --- a/examples/ccl_llama4_multi_image_example.py +++ b/examples/performance/compute_context_length/llama4_multi_image.py @@ -83,7 +83,7 @@ inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) streamer = TextStreamer(tokenizer) -output = qeff_model.generate(inputs=inputs, device_ids=[32, 33, 34, 35], generation_len=100) +output = qeff_model.generate(inputs=inputs, generation_len=100) print(output.generated_ids) print(tokenizer.batch_decode(output.generated_ids)) print(output) diff --git a/examples/ccl_mistral3_example.py b/examples/performance/compute_context_length/mistral3.py similarity index 100% rename from examples/ccl_mistral3_example.py rename to examples/performance/compute_context_length/mistral3.py diff --git a/examples/ccl_molmo_example.py b/examples/performance/compute_context_length/molmo.py similarity index 97% rename from examples/ccl_molmo_example.py rename to examples/performance/compute_context_length/molmo.py index dd09fa020..f68481631 100644 --- a/examples/ccl_molmo_example.py +++ b/examples/performance/compute_context_length/molmo.py @@ -37,7 +37,7 @@ tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) -### use skip_vision=Ture, if want to run only text, ow false ### +### use skip_vision=True, if want to run only text, or false ### skip_vision = False if skip_vision: diff --git a/examples/ccl_qwen2_5_vl_example.py b/examples/performance/compute_context_length/qwen2_5_vl.py similarity index 98% rename from examples/ccl_qwen2_5_vl_example.py rename to examples/performance/compute_context_length/qwen2_5_vl.py index 273a18361..00f43a73f 100644 --- a/examples/ccl_qwen2_5_vl_example.py +++ b/examples/performance/compute_context_length/qwen2_5_vl.py @@ -39,7 +39,7 @@ tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) -### use skip_vision=Ture, if want to run only text, ow false ### +### use skip_vision=True, if want to run only text, or false ### skip_vision = False if skip_vision: diff --git a/examples/ccl_qwen2_5_vl_CB.py b/examples/performance/compute_context_length/qwen2_5_vl_cb.py similarity index 100% rename from examples/ccl_qwen2_5_vl_CB.py rename to examples/performance/compute_context_length/qwen2_5_vl_cb.py diff --git a/examples/qwen3moe_example/ccl_qwen3moe_inference.py b/examples/performance/compute_context_length/qwen3moe_example/ccl_qwen3moe_inference.py similarity index 100% rename from examples/qwen3moe_example/ccl_qwen3moe_inference.py rename to examples/performance/compute_context_length/qwen3moe_example/ccl_qwen3moe_inference.py diff --git a/examples/performance/compute_context_length/vlm_inference.py b/examples/performance/compute_context_length/vlm_inference.py new file mode 100644 index 000000000..0920ddf30 --- /dev/null +++ b/examples/performance/compute_context_length/vlm_inference.py @@ -0,0 +1,236 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +""" +Vision-Language Model (VLM) inference with Compute Context Length (CCL) optimization. + +This example demonstrates how to use CCL optimization for vision-language models. +CCL allows using different context lengths during prefill and decode phases, +reducing memory footprint and computation while maintaining support for longer contexts. +""" + +import argparse + +import requests +from PIL import Image +from transformers import AutoProcessor, TextStreamer + +from QEfficient import QEFFAutoModelForImageTextToText + + +def run_model( + model_name, + query, + image_url, + hf_token=None, + kv_offload=True, + prefill_seq_len=32, + ctx_len=8192, + comp_ctx_lengths_prefill=None, + comp_ctx_lengths_decode=None, + generation_len=128, + img_size=560, + num_cores=16, + num_devices=4, +): + """ + Run VLM inference with CCL optimization. + + Args: + model_name: HuggingFace model ID + query: Text query about the image + image_url: URL of the image to process + hf_token: HuggingFace token for gated models + kv_offload: Enable Dual QPC mode (vision encoder and LM in separate QPCs) + prefill_seq_len: Prefill sequence length + ctx_len: Maximum context length + comp_ctx_lengths_prefill: List of context lengths for prefill phase + comp_ctx_lengths_decode: List of context lengths for decode phase + generation_len: Number of tokens to generate + img_size: Image size for processing + num_cores: Number of cores for compilation + num_devices: Number of devices to use + """ + print(f"Loading model: {model_name}") + print(f"KV offload (Dual QPC mode): {kv_offload}") + print("CCL Configuration:") + print(f" - Prefill context lengths: {comp_ctx_lengths_prefill}") + print(f" - Decode context lengths: {comp_ctx_lengths_decode}") + print(f" - Max context length: {ctx_len}") + + ## STEP 1: Load the Processor and Model + + processor = AutoProcessor.from_pretrained(model_name, token=hf_token) + + # `kv_offload` determines Single QPC vs Dual QPC mode: + # - Single QPC (kv_offload=False): Entire model runs in one QPC + # - Dual QPC (kv_offload=True): Vision encoder and language model run in separate QPCs + # with outputs transferred via host for flexibility + + model = QEFFAutoModelForImageTextToText.from_pretrained( + model_name, + token=hf_token, + attn_implementation="eager", + kv_offload=kv_offload, + qaic_config={ + "comp_ctx_lengths_prefill": comp_ctx_lengths_prefill, + "comp_ctx_lengths_decode": comp_ctx_lengths_decode, + "ctx_len": ctx_len, + }, + ) + + ## STEP 2: Export & Compile the Model + + print("\nCompiling model...") + qpc_path = model.compile( + prefill_seq_len=prefill_seq_len, + ctx_len=ctx_len, + img_size=img_size, + num_cores=num_cores, + num_devices=num_devices, + mxfp6_matmul=False, + ) + print(f"Model compiled successfully to: {qpc_path}") + + ## STEP 3: Load and Process the Inputs for Inference + + print(f"\nLoading image from: {image_url}") + image = Image.open(requests.get(image_url, stream=True).raw) + + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": query}, + ], + } + ] + input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)] + + inputs = processor( + text=input_text, + images=image, + return_tensors="pt", + add_special_tokens=False, + padding="max_length", + max_length=prefill_seq_len, + ) + + ## STEP 4: Run Inference on the Compiled Model + + print(f"\nQuery: {query}") + print("Generated 
response:") + streamer = TextStreamer(processor.tokenizer) + output_statistics = model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len) + + print(f"Tokens generated: {len(output_statistics.generated_ids[0])}") + + +def main(): + parser = argparse.ArgumentParser( + description="Vision-Language Model (VLM) inference with Compute Context Length (CCL) optimization" + ) + parser.add_argument( + "--model-name", + type=str, + default="meta-llama/Llama-3.2-11B-Vision-Instruct", + help="HuggingFace VLM model ID", + ) + parser.add_argument( + "--query", + type=str, + default="Describe this image.", + help="Text query/question about the image", + ) + parser.add_argument( + "--image-url", + type=str, + default="https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + help="URL of the image to process", + ) + parser.add_argument( + "--hf-token", + type=str, + default=None, + help="HuggingFace token for accessing gated models", + ) + parser.add_argument( + "--kv-offload", + action="store_true", + default=True, + help="Enable Dual QPC mode (vision encoder and LM in separate QPCs)", + ) + parser.add_argument( + "--prefill-seq-len", + type=int, + default=32, + help="Prefill sequence length", + ) + parser.add_argument( + "--ctx-len", + type=int, + default=8192, + help="Maximum context length", + ) + parser.add_argument( + "--comp-ctx-lengths-prefill", + type=lambda x: [int(i) for i in x.split(",")], + default="4096", + help="Comma-separated list of context lengths for prefill phase (e.g., '4096')", + ) + parser.add_argument( + "--comp-ctx-lengths-decode", + type=lambda x: [int(i) for i in x.split(",")], + default="6144,8192", + help="Comma-separated list of context lengths for decode phase (e.g., '6144,8192')", + ) + parser.add_argument( + "--generation-len", + type=int, + default=128, + help="Number of tokens to generate", + ) + parser.add_argument( + "--img-size", + type=int, + default=336, + help="Image size for processing", + ) + parser.add_argument( + "--num-cores", + type=int, + default=16, + help="Number of cores for compilation", + ) + parser.add_argument( + "--num-devices", + type=int, + default=4, + help="Number of devices to use", + ) + args = parser.parse_args() + + run_model( + model_name=args.model_name, + query=args.query, + image_url=args.image_url, + hf_token=args.hf_token, + kv_offload=args.kv_offload, + prefill_seq_len=args.prefill_seq_len, + ctx_len=args.ctx_len, + comp_ctx_lengths_prefill=args.comp_ctx_lengths_prefill, + comp_ctx_lengths_decode=args.comp_ctx_lengths_decode, + generation_len=args.generation_len, + img_size=args.img_size, + num_cores=args.num_cores, + num_devices=args.num_devices, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/cpp_execution/CMakeLists.txt b/examples/performance/cpp_execution/CMakeLists.txt similarity index 100% rename from examples/cpp_execution/CMakeLists.txt rename to examples/performance/cpp_execution/CMakeLists.txt diff --git a/examples/cpp_execution/InferenceSetIOBuffer.cpp b/examples/performance/cpp_execution/InferenceSetIOBuffer.cpp similarity index 100% rename from examples/cpp_execution/InferenceSetIOBuffer.cpp rename to examples/performance/cpp_execution/InferenceSetIOBuffer.cpp diff --git a/examples/cpp_execution/README.md b/examples/performance/cpp_execution/README.md similarity index 81% rename from examples/cpp_execution/README.md rename to examples/performance/cpp_execution/README.md index 386921657..2d1c604e5 
100644 --- a/examples/cpp_execution/README.md +++ b/examples/performance/cpp_execution/README.md @@ -24,7 +24,7 @@ make -j 8 cd ../../../ # Need to be in base folder - efficient-transformers to run below cmd # Run the python script to get the generated text -python examples/cpp_execution/text_inference_using_cpp.py --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 14 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first +python examples/performance/cpp_execution/text_inference_cpp.py --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 14 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first ``` diff --git a/examples/cpp_execution/text_inference_using_cpp.py b/examples/performance/cpp_execution/text_inference_cpp.py similarity index 99% rename from examples/cpp_execution/text_inference_using_cpp.py rename to examples/performance/cpp_execution/text_inference_cpp.py index 072f2c57c..8355c1e44 100644 --- a/examples/cpp_execution/text_inference_using_cpp.py +++ b/examples/performance/cpp_execution/text_inference_cpp.py @@ -229,7 +229,7 @@ def tokenize_decode_output(tokenizer, generated_ids, prompt): "--prompts_txt_file_path", "--prompts-txt-file-path", type=str, - help="File path for taking input prompts from txt file, sample prompts.txt file present in examples folder", + help="File path for taking input prompts from txt file, sample prompts.txt file present in examples/sample_prompts folder", ) parser.add_argument("--generation_len", "--generation-len", type=int, help="Number of tokens to generate") parser.add_argument( diff --git a/examples/on_device_sampling.py b/examples/performance/on_device_sampling.py similarity index 99% rename from examples/on_device_sampling.py rename to examples/performance/on_device_sampling.py index 00d8c2430..6cc72b715 100644 --- a/examples/on_device_sampling.py +++ b/examples/performance/on_device_sampling.py @@ -177,7 +177,7 @@ def main(args, **kwargs): "--prompts_txt_file_path", "--prompts-txt-file-path", type=str, - help="File path for taking input prompts from txt file, sample prompts.txt file present in examples folder", + help="File path for taking input prompts from txt file, sample prompts.txt file present in examples/sample_prompts folder", ) parser.add_argument("--generation_len", "--generation-len", type=int, help="Number of tokens to generate") diff --git a/examples/performance/speculative_decoding/README.md b/examples/performance/speculative_decoding/README.md new file mode 100644 index 000000000..e03eb45be --- /dev/null +++ b/examples/performance/speculative_decoding/README.md @@ -0,0 +1,181 @@ +# Speculative Decoding Examples + +Accelerate text generation using speculative decoding techniques on Qualcomm Cloud AI 100. + +Speculative decoding improves inference speed by generating multiple candidate tokens in parallel and validating them with the target model, reducing sequential forward passes required for text generation. 
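+
+For orientation, the draft-then-verify idea behind these examples can be pictured with the framework-agnostic sketch below. It is an illustration only, not the scripts' actual implementation: `draft_next_token` and `target_next_token` are hypothetical stand-ins for the draft and target model calls, and the real scripts additionally handle KV caches, batching, and sampling on Cloud AI 100.
+
+```python
+def speculative_step(context_ids, draft_next_token, target_next_token, num_speculative_tokens):
+    """One greedy draft-then-verify iteration (conceptual sketch)."""
+    # 1) Draft phase: the small model proposes K candidate tokens sequentially.
+    candidates, ctx = [], list(context_ids)
+    for _ in range(num_speculative_tokens):
+        tok = draft_next_token(ctx)
+        candidates.append(tok)
+        ctx.append(tok)
+
+    # 2) Validation phase: in the real flow the target model scores all candidate
+    #    positions in a single forward pass; target_next_token(prefix) stands in
+    #    for reading the target's choice at each position.
+    accepted, prefix = [], list(context_ids)
+    for tok in candidates:
+        target_tok = target_next_token(prefix)
+        if target_tok != tok:
+            # First mismatch: keep the target's token and discard the remaining drafts.
+            accepted.append(target_tok)
+            break
+        accepted.append(tok)
+        prefix.append(tok)
+    else:
+        # All drafts accepted: the same target pass also yields one bonus token.
+        accepted.append(target_next_token(prefix))
+    return accepted
+```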
+ +## Authentication + +For private/gated models, export your HuggingFace token: +```bash +export HF_TOKEN=<your_hf_token> +``` + +## Quick Start + +```bash +# Draft-based: Use small draft model + large target model +python draft_based.py \ + --draft-model-name "meta-llama/Llama-3.2-1B" \ + --target-model-name "meta-llama/Llama-3.1-8B" \ + --num-speculative-tokens 4 + +# Prompt Lookup: N-gram matching without draft model +python prompt_lookup.py \ + --target-model-name "meta-llama/Llama-3.2-1B" \ + --num-speculative-tokens 3 \ + --max-ngram-size 3 + +# Multi-Projection: Built-in speculation for Turbo models (requires speculator_config.json) +# Note: TinyLlama does not support multi-projection - use actual Turbo models +python multi_projection.py \ + --pretrained-model-name-or-path "meta-llama/Llama-3.1-8B-Turbo" +``` + +## Available Scripts + +### draft_based.py - Two-Model Speculative Decoding + +**How It Works:** +1. **Draft Phase**: Small, fast model generates `N` candidate tokens sequentially +2. **Validation Phase**: Large target model scores all candidates in a single forward pass +3. **Acceptance**: Greedily accept tokens until first mismatch, then sample from target distribution +4. **Iteration**: Repeat with accepted tokens + one additional target token + +This approach achieves a speedup when the draft model is 3-8x faster than the target model. + +**Basic Usage:** +```bash +python draft_based.py \ + --draft-model-name "meta-llama/Llama-3.2-1B" \ + --target-model-name "meta-llama/Llama-3.2-8B" \ + --num-speculative-tokens 4 \ + --prefill-seq-len 32 \ + --ctx-len 128 +``` + +**Multi-Device Deployment:** +```bash +python draft_based.py \ + --draft-model-name "TinyLlama/TinyLlama-1.1B-Chat-v1.0" \ + --target-model-name "meta-llama/Llama-3.1-70B" \ + --target-device-group 0,1,2,3 \ + --draft-device-group 4,5 \ + --num-speculative-tokens 6 +``` + +**Key Features:** +- Uses `qaic_config={"speculative_model_type": "target"}` for target model compilation +- Draft model uses fewer cores (5) than the target model (11) by default +- Supports both regular batching and continuous batching modes +- Implements "bonus token" handling for multi-batch scenarios + +**Recommended Model Pairs:** +- `TinyLlama-1.1B` → `Llama-3.1-8B` (8x size ratio) +- `Llama-3.2-1B` → `Llama-3.2-8B` (8x size ratio) +- `Llama-3.1-8B` → `Llama-3.1-70B` (9x size ratio) + +### prompt_lookup.py - N-gram Pattern Matching + +**How It Works:** +1. **Pattern Search**: Sliding window searches input context for n-gram matches +2. **Candidate Generation**: When match found, extract following tokens as candidates +3. **Fallback**: If no match, pad with dummy tokens (no speculation benefit) +4. **Validation**: Target model scores candidates like draft-based approach + +Most effective for repetitive text patterns, code with common structures, or templated content.
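+
+To make the lookup concrete, here is a minimal NumPy sketch of the n-gram candidate search described above. It is an illustration of the idea only; the function name, signature, and padding details below are assumptions for this sketch and differ from the script's actual `find_candidate_pred_tokens()`.
+
+```python
+import numpy as np
+
+
+def ngram_candidates(all_ids, max_ngram_size=3, num_pred_tokens=3, fill_tok=-1):
+    """Find an earlier occurrence of the context's trailing n-gram and return
+    the tokens that followed it as speculative candidates. Falls back to a
+    fill_tok-padded vector when no match exists (no speculation benefit).
+    """
+    seq = np.asarray(all_ids, dtype=np.int64)
+    out = np.full(num_pred_tokens, fill_tok, dtype=np.int64)
+    for n in range(max_ngram_size, 0, -1):  # prefer longer n-grams
+        if len(seq) <= n:
+            continue
+        tail = seq[-n:]
+        # slide over earlier windows, newest first, excluding the tail itself
+        for start in range(len(seq) - n - 1, -1, -1):
+            if np.array_equal(seq[start : start + n], tail):
+                cand = seq[start + n : start + n + num_pred_tokens]
+                out[: len(cand)] = cand
+                return out
+    return out
+```
+
+The real script additionally maintains a per-slot `all_ids` array during continuous batching and feeds the returned candidates to the target model for the same accept/reject validation used in `draft_based.py`.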
+ +**Basic Usage:** +```bash +python prompt_lookup.py \ + --target-model-name "meta-llama/Llama-3.2-8B" \ + --num-speculative-tokens 3 \ + --max-ngram-size 3 \ + --prefill-seq-len 256 \ + --ctx-len 1024 +``` + +**Optimized for Repetitive Content:** +```bash +python prompt_lookup.py \ + --target-model-name "meta-llama/Llama-3.1-8B" \ + --prompts "Write code with repeated patterns: for i in range(10): print(i)" \ + --num-speculative-tokens 5 \ + --max-ngram-size 4 \ + --ctx-len 2048 +``` + +**Key Features:** +- Implements `find_candidate_pred_tokens()` for n-gram matching +- Maintains `all_ids` array to track full context for pattern matching +- Default prompts designed for repetitive patterns (e.g., "hello, good morning to you") +- Uses `fill_tok=-1` for padding when no matches found +- No separate draft model required - uses n-gram pattern matching instead + +**Key Parameters:** +- `--max-ngram-size`: Larger values (3-5) better for structured text +- `--num-speculative-tokens`: Reduce if acceptance rate is low +- Longer context lengths improve pattern matching opportunities + +### multi_projection.py - Turbo Model Speculation + +**How It Works:** +1. **Multi-Head Projection**: Model has multiple projection heads generating token candidates +2. **Single Forward Pass**: All candidates generated simultaneously in one inference +3. **Built-in Validation**: Model internally scores and ranks candidates +4. **Optimized Architecture**: Specifically designed for speculative decoding + +Requires models with `speculative_config` and multi-projection architecture. + +**Basic Usage:** +```bash +python multi_projection.py \ + --pretrained-model-name-or-path "meta-llama/Llama-3.1-8B-Turbo" \ + --prefill-seq-len 32 \ + --ctx-len 128 +``` + +**Continuous Batching:** +```bash +python multi_projection.py \ + --pretrained-model-name-or-path "meta-llama/Llama-3.1-8B-Turbo" \ + --full-batch-size 4 \ + --device-group 0,1,2,3 \ + --ignore-eos-token +``` + +**Key Features:** +- Uses `qaic_config={"speculative_model_type": "turbo"}` for compilation +- Automatically extracts `num_speculative_tokens` from model's `speculative_config` +- Generates 4D logits tensor: `[batch, num_logits, num_logits, vocab_size]` +- No separate draft model required - speculation built into architecture + + +## Common Parameters + +| Parameter | Description | Default | Recommended | +|-----------|-------------|---------|-------------| +| `--prefill-seq-len` | Prefill chunk size | 32 | 128-256 | +| `--ctx-len` | Max context length | 128 | 512-2048 | +| `--num-speculative-tokens` | Candidates per iteration | 3-4 | 3-6 | +| `--device-group` | Device allocation | `[0]` | Multi-device for large models | +| `--full-batch-size` | Continuous batching | None | 2-8 for throughput | + +## Performance Metrics Explained + +All scripts output detailed metrics: + +``` +Avg TLM+DLM TTFT = 0.15 # Time to first token (seconds) +Decode Throughput = 125.67 # Tokens/second during generation +E2E Throughput = 98.23 # Overall tokens/second including prefill +Avg number of accepted tokens = 2.8 # Speculation effectiveness +``` + + + +## Documentation + +- [Speculative Decoding Guide](https://quic.github.io/efficient-transformers/source/features_enablement.html#speculative-decoding) +- [QEff Auto Classes](https://quic.github.io/efficient-transformers/source/qeff_autoclasses.html) +- [Performance Optimization](https://quic.github.io/efficient-transformers/source/features_enablement.html) diff --git a/examples/draft_spd_inference.py 
b/examples/performance/speculative_decoding/draft_based.py similarity index 98% rename from examples/draft_spd_inference.py rename to examples/performance/speculative_decoding/draft_based.py index 9dccc2a1d..9e617663c 100644 --- a/examples/draft_spd_inference.py +++ b/examples/performance/speculative_decoding/draft_based.py @@ -200,7 +200,7 @@ def draft_spec_decode_inference( continuous_batching = full_batch_size is not None if target_model_session is None: target_model = AutoModelForCausalLM.from_pretrained( - target_model_name, continuous_batching=continuous_batching, is_tlm=True + target_model_name, continuous_batching=continuous_batching, qaic_config={"speculative_model_type": "target"} ) target_num_devices = len(target_device_group) target_model_qpc_path: str = target_model.compile( @@ -248,6 +248,7 @@ def draft_spec_decode_inference( p_tok: dict = tokenizer(p, return_tensors="np", padding="max_length", max_length=input_len_padded) position_ids = np.where(p_tok.pop("attention_mask"), np.arange(input_len_padded), -1) p_tok["position_ids"] = position_ids + p_tok["num_logits_to_keep"] = np.array([[1]], dtype=np.int64) prompts_tokenized.append(p_tok) # create caches to hold generated ids and input prompt lengths generated_ids = [[] for i in range(decode_batch_size)] @@ -264,6 +265,7 @@ def draft_spec_decode_inference( input_ids=np.zeros((decode_batch_size, num_speculative_tokens + 1), dtype=np.int64), position_ids=np.zeros((decode_batch_size, num_speculative_tokens + 1), dtype=np.int64), batch_index=np.arange(decode_batch_size, dtype=np.int64).reshape(-1, 1), + num_logits_to_keep=np.arange(num_speculative_tokens + 1, dtype=np.int64).reshape(-1, 1), ) max_gen_len = [ctx_len] * decode_batch_size num_logits_to_keep = num_speculative_tokens + 1 diff --git a/examples/multiprojs_spd_inference.py b/examples/performance/speculative_decoding/multi_projection.py similarity index 100% rename from examples/multiprojs_spd_inference.py rename to examples/performance/speculative_decoding/multi_projection.py diff --git a/examples/pld_spd_inference.py b/examples/performance/speculative_decoding/prompt_lookup.py similarity index 98% rename from examples/pld_spd_inference.py rename to examples/performance/speculative_decoding/prompt_lookup.py index 2b5baba18..53b1f4e85 100644 --- a/examples/pld_spd_inference.py +++ b/examples/performance/speculative_decoding/prompt_lookup.py @@ -103,7 +103,7 @@ def run_prefill_on_draft_and_target( prefill_seq_len: int, slot_idx: int, ): - input_len = inputs.input_ids.shape[1] + input_len = inputs["input_ids"].shape[1] num_chunks = input_len // prefill_seq_len cache_index = np.array([[0]], np.int64) batch_index = np.array([[slot_idx]], np.int64) @@ -234,7 +234,7 @@ def pld_spec_decode_inference( # export_and_compile tlm and dlm continuous_batching = full_batch_size is not None target_model = AutoModelForCausalLM.from_pretrained( - target_model_name, continuous_batching=continuous_batching, is_tlm=True + target_model_name, continuous_batching=continuous_batching, qaic_config={"speculative_model_type": "target"} ) num_devices = len(device_group) @@ -270,6 +270,7 @@ def pld_spec_decode_inference( p_tok: dict = tokenizer(p, return_tensors="np", padding="max_length", max_length=input_len_padded) position_ids = np.where(p_tok.pop("attention_mask"), np.arange(input_len_padded), -1) p_tok["position_ids"] = position_ids + p_tok["num_logits_to_keep"] = np.array([[1]], dtype=np.int64) prompts_tokenized.append(p_tok) # create caches to hold generated ids and input prompt lengths 
generated_ids = [[] for i in range(decode_batch_size)] @@ -280,6 +281,7 @@ def pld_spec_decode_inference( input_ids=np.zeros((decode_batch_size, num_speculative_tokens + 1), dtype=np.int64), position_ids=np.zeros((decode_batch_size, num_speculative_tokens + 1), dtype=np.int64), batch_index=np.arange(decode_batch_size, dtype=np.int64).reshape(-1, 1), + num_logits_to_keep=np.arange(num_speculative_tokens + 1, dtype=np.int64).reshape(-1, 1), ) num_logits_to_keep = num_speculative_tokens + 1 max_gen_len = [ctx_len] * decode_batch_size diff --git a/examples/qwen3moe_example/qwen3moe_inference.py b/examples/qwen3moe_example/qwen3moe_inference.py deleted file mode 100644 index 3bef3a1dc..000000000 --- a/examples/qwen3moe_example/qwen3moe_inference.py +++ /dev/null @@ -1,21 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -from transformers import AutoTokenizer - -from QEfficient import QEFFAutoModelForCausalLM -from QEfficient.utils.constants import Constants - -model_name = "Qwen/Qwen3-30B-A3B-Instruct-2507" -""" -# For CB inference, set continuous_batching to True and add full_batch_size,mxfp6,mint8 argument in compile function -# We will use prompt_len=1 for compilation for both cb and non-cb inference -""" -model = QEFFAutoModelForCausalLM.from_pretrained(model_name, continuous_batching=False) -model.compile(prefill_seq_len=1, ctx_len=256, num_cores=16, num_devices=4, mxfp6_matmul=False, mxint8_kv_cache=False) -tokenizer = AutoTokenizer.from_pretrained(model_name) -exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer) diff --git a/examples/prompts.txt b/examples/sample_prompts/prompts.txt similarity index 100% rename from examples/prompts.txt rename to examples/sample_prompts/prompts.txt diff --git a/examples/speech_to_text/README.md b/examples/speech_to_text/README.md deleted file mode 100644 index 4b091347b..000000000 --- a/examples/speech_to_text/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# Speech Seq2Seq -This directory contains an example script of how to use the AutoModelForSpeechSeq2Seq class. (for now, Whisper models on audio <30 seconds only has been validated) - -## Required packages: -- `librosa==0.10.2` -- `soundfile==0.13.1` - -You can install them using pip: -```sh -pip install librosa==0.10.2 soundfile==0.13.1 -``` - -To run example script after package installations: -```sh -python speech_seq2seq_models.py -``` - -Expected output for given data sample: -```sh -<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.<|endoftext|> -``` \ No newline at end of file diff --git a/examples/speech_to_text/run_whisper_speech_to_text.py b/examples/speech_to_text/run_whisper_speech_to_text.py deleted file mode 100644 index d24389e9e..000000000 --- a/examples/speech_to_text/run_whisper_speech_to_text.py +++ /dev/null @@ -1,36 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -from datasets import load_dataset -from transformers import AutoProcessor - -from QEfficient import QEFFAutoModelForSpeechSeq2Seq - -base_model_name = "openai/whisper-tiny" -ctx_len = 25 - -## STEP 1 -- load audio sample, using a standard english dataset, can load specific files if longer audio needs to be tested; also load initial processor -ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") -data = ds[0]["audio"]["array"] -# reshape to so shape corresponds to data with batch size 1 -data = data.reshape(-1) -sample_rate = ds[0]["audio"]["sampling_rate"] -processor = AutoProcessor.from_pretrained(base_model_name) - -## STEP 2 -- init base model -qeff_model = QEFFAutoModelForSpeechSeq2Seq.from_pretrained(base_model_name) - -## STEP 3 -- export and compile model -qeff_model.compile() - -## STEP 4 -- generate output for loaded input and processor -exec_info = qeff_model.generate( - inputs=processor(data, sampling_rate=sample_rate, return_tensors="pt"), generation_len=ctx_len -) - -## STEP 5 (optional) -- use processor to decode output -print(processor.batch_decode(exec_info.generated_ids)[0]) diff --git a/examples/text_generation/README.md b/examples/text_generation/README.md new file mode 100644 index 000000000..6b80442c2 --- /dev/null +++ b/examples/text_generation/README.md @@ -0,0 +1,314 @@ +# Text Generation Examples + +Examples for running inference on text-only language models on Qualcomm Cloud AI 100. + + +## Authentication + +For private/gated models, export your HuggingFace token: +```bash +export HF_TOKEN= +``` + +## Supported Models + +**QEff Auto Class:** `QEFFAutoModelForCausalLM` + +For the complete list of supported text generation models, see the [Validated Models - Text Generation Section](../../docs/source/validate.md#text-only-language-models). + +Popular model families include: +- Llama (2, 3, 3.1, 3.2, 3.3) +- Mistral, Mixtral, Codestral +- Qwen, Qwen2, Qwen3-MoE +- Gemma, CodeGemma +- GPT-2, GPT-J +- Falcon, MPT, Phi-3 +- Granite, StarCoder + +--- + +## Python Examples + +### basic_inference.py +Simple text generation with any supported language model. + +**Usage:** +```bash +python basic_inference.py \ + --model-name Qwen/Qwen2-1.5B-Instruct \ + --prompt "Hello, how are you?" \ + --prefill-seq-len 32 \ + --ctx-len 128 \ + --num-cores 16 +``` + +This example: +- Demonstrates basic text generation workflow +- Loads any HuggingFace text model +- Compiles and runs inference on Cloud AI 100 + +### continuous_batching.py +Dynamic batching for processing multiple prompts efficiently. + +**Usage:** +```bash +python continuous_batching.py \ + --model-name meta-llama/Llama-3.1-8B \ + --prompts "Hello|Hi there|Good morning|How are you" \ + --full-batch-size 4 \ + --prefill-seq-len 128 \ + --ctx-len 512 \ + --num-cores 16 +``` + +This example: +- Demonstrates continuous batching mode +- Processes multiple prompts in parallel +- Improves throughput for multi-request scenarios +- Uses pipe-separated prompts + +### gguf_models.py +GGUF format model support (quantized models). 
To run GGUF format models, you need to install the `gguf` package: + +```bash +pip install gguf +``` + +**Usage:** +```bash +# With default parameters +python gguf_models.py + +# With custom parameters +python gguf_models.py \ + --model-name MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF \ + --gguf-file Mistral-7B-Instruct-v0.3.fp16.gguf \ + --prompt "How are you?" \ + --prefill-seq-len 32 \ + --ctx-len 128 \ + --num-cores 16 +``` + +This example: +- Loads models in GGUF format (quantized models) +- Demonstrates GGUF file loading from HuggingFace +- Compiles and runs inference on Cloud AI 100 +- Supports custom GGUF files and prompts + +--- + + +### moe_inference.py +Mixture of Experts (MoE) model inference. + +**Usage:** +```bash +python moe_inference.py \ + --model-name Qwen/Qwen3-30B-A3B-Instruct-2507 \ + --prompt "Explain quantum computing" \ + --ctx-len 256 \ + --num-cores 16 +``` + +This example: +- Demonstrates MoE model inference +- Uses sparse expert activation for efficiency +- Works with Qwen, Mixtral, and other MoE models + + +## CLI Workflow + +The QEfficient CLI provides a streamlined workflow for running text generation models on Cloud AI 100. You can use individual commands for each step or the all-in-one `infer` command. + +### Quick Start: All-in-One Inference (Recommended) + +The `infer` command handles export, compile, and execute in a single step: + +```bash +python -m QEfficient.cloud.infer \ + --model_name meta-llama/Llama-3.1-8B \ + --batch_size 1 \ + --prompt_len 128 \ + --ctx_len 512 \ + --num_cores 16 \ + --device_group [0] \ + --prompt "Write a short story about AI" \ + --mxfp6 \ + --mxint8_kv_cache \ + --mos 1 \ + --aic_enable_depth_first +``` + +**What it does:** +1. Downloads and exports the model to ONNX +2. Compiles to QPC +3. Executes inference with your prompt + +**CLI API Reference:** [`QEfficient.cloud.infer`](https://quic.github.io/efficient-transformers/source/cli_api.html#qefficient-cloud-infer) + +### Step-by-Step Workflow + +For more control, you can execute each step individually: + +#### Step 1: Export Model to ONNX + +Export the HuggingFace model to ONNX format optimized for Cloud AI 100: + +```bash +python -m QEfficient.cloud.export \ + --model_name meta-llama/Llama-3.1-8B \ + --cache_dir ~/.cache/qeff_cache +``` + +This downloads the model and converts it to ONNX format. The ONNX model is saved in the QEfficient cache directory. + +**CLI API Reference:** [`QEfficient.cloud.export`](https://quic.github.io/efficient-transformers/source/cli_api.html#qefficient-cloud-export) + +#### Step 2: Compile Model to QPC + +Compile the ONNX model to Qualcomm Program Container (QPC) format: + +```bash +python -m QEfficient.cloud.compile \ + --onnx_path ~/.cache/qeff_cache/meta-llama/Llama-3.1-8B/onnx/model.onnx \ + --qpc_path ./qpc_output \ + --batch_size 1 \ + --prompt_len 128 \ + --ctx_len 512 \ + --num_cores 16 \ + --device_group [0] \ + --mxfp6 \ + --mos 1 \ + --aic_enable_depth_first +``` + +**Note:** The `compile` API is deprecated for direct use. Use the unified `infer` API instead for most use cases. 
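+
+If you would rather stay in Python for this step, the high-level `QEFFAutoModelForCausalLM` class performs the same export and compile work programmatically. The sketch below is based on the Python examples in this directory; treat the exact keyword names `mxfp6_matmul` and `mxint8_kv_cache` as assumptions mirroring the `--mxfp6` and `--mxint8_kv_cache` CLI flags.
+
+```python
+from transformers import AutoTokenizer
+
+from QEfficient import QEFFAutoModelForCausalLM
+
+model_name = "meta-llama/Llama-3.1-8B"
+model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
+
+# Export to ONNX and compile to a QPC in one call; the returned path points at
+# the compiled artifacts in the QEfficient cache.
+qpc_path = model.compile(
+    prefill_seq_len=128,
+    ctx_len=512,
+    num_cores=16,
+    num_devices=1,
+    mxfp6_matmul=True,
+    mxint8_kv_cache=True,
+)
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+exec_info = model.generate(tokenizer=tokenizer, prompts=["Write a short story about AI"])
+print(exec_info.generated_texts[0])
+```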
+ +**CLI API Reference:** [`QEfficient.cloud.compile`](https://quic.github.io/efficient-transformers/source/cli_api.html#qefficient-cloud-compile) + +#### Step 3: Execute Inference + +Run inference using the pre-compiled QPC: + +```bash +python -m QEfficient.cloud.execute \ + --model_name meta-llama/Llama-3.1-8B \ + --qpc_path ./qpc_output/qpcs \ + --prompt "Write a short story about AI" \ + --device_group [0] +``` + +This uses the pre-compiled QPC for fast inference. You can run this multiple times with different prompts without recompiling. + +**CLI API Reference:** [`QEfficient.cloud.execute`](https://quic.github.io/efficient-transformers/source/cli_api.html#qefficient-cloud-execute) + +### Common CLI Parameters + +| Parameter | Description | Default | Example | +|-----------|-------------|---------|---------| +| `--model_name` | HuggingFace model ID | Required | `meta-llama/Llama-3.1-8B` | +| `--prompt` | Input text prompt | Required | `"Hello, how are you?"` | +| `--prompt_len` | Maximum input sequence length | 32 | `128` | +| `--ctx_len` | Maximum context length (input + output) | 128 | `512` | +| `--batch_size` | Batch size for inference | 1 | `1` | +| `--num_cores` | AI 100 cores to use | 16 | `16` or `14` | +| `--device_group` | Device IDs to use | `[0]` | `[0]` or `[0,1,2,3]` | +| `--mxfp6` | Enable MXFP6 quantization | False | Add flag to enable | +| `--mxint8_kv_cache` | Enable MXINT8 KV cache | False | Add flag to enable | +| `--mos` | Memory optimization strategy | 1 | `1` or `2` | +| `--aic_enable_depth_first` | Enable depth-first execution | False | Add flag to enable | + + +### Advanced Features + +#### Multi-Device Inference (Multi-Qranium) + +Run models across multiple devices for better performance: + +```bash +python -m QEfficient.cloud.infer \ + --model_name meta-llama/Llama-3.1-8B \ + --batch_size 1 \ + --prompt_len 128 \ + --ctx_len 512 \ + --num_cores 16 \ + --device_group [0,1,2,3] \ + --prompt "Explain quantum computing" \ + --mxfp6 \ + --mxint8_kv_cache \ + --aic_enable_depth_first +``` + +**Documentation:** [Multi-Qranium Inference](https://quic.github.io/efficient-transformers/source/features_enablement.html#multi-qranium-inference) + +#### Continuous Batching + +Process multiple prompts efficiently with continuous batching: + +```bash +python -m QEfficient.cloud.infer \ + --model_name meta-llama/Llama-3.1-8B \ + --full_batch_size 4 \ + --prompt_len 128 \ + --ctx_len 512 \ + --num_cores 16 \ + --device_group [0] \ + --prompt "Hello|Hi there|Good morning|How are you" \ + --mxfp6 \ + --mxint8_kv_cache +``` + +**Note:** Use pipe (`|`) to separate multiple prompts. When using continuous batching, do not specify `--batch_size`. 
+ +**Documentation:** [Continuous Batching](https://quic.github.io/efficient-transformers/source/features_enablement.html#continuous-batching) + +#### Batch Processing from File + +Process multiple prompts from a text file: + +```bash +python -m QEfficient.cloud.infer \ + --model_name meta-llama/Llama-3.1-8B \ + --full_batch_size 8 \ + --prompt_len 128 \ + --ctx_len 512 \ + --num_cores 16 \ + --device_group [0] \ + --prompts_txt_file_path examples/sample_prompts/prompts.txt \ + --mxfp6 \ + --mxint8_kv_cache +``` + +### CLI Examples Script + +For a comprehensive collection of copy-paste ready CLI commands, run: + +```bash +bash cli_examples.sh +``` + +This script demonstrates: +- Complete 4-step workflow (Export → Compile → Execute → Infer) +- Multi-device inference +- Continuous batching +- Batch processing from file +- Parameter explanations and best practices + +--- + + +## Additional Resources + +### Documentation +- [CLI API Reference](https://quic.github.io/efficient-transformers/source/cli_api.html) - Complete CLI command documentation +- [Quick Start Guide](https://quic.github.io/efficient-transformers/source/quick_start.html) - Getting started with QEfficient +- [Features Enablement](https://quic.github.io/efficient-transformers/source/features_enablement.html) - Advanced features guide +- [QEff Auto Classes](https://quic.github.io/efficient-transformers/source/qeff_autoclasses.html) - Python API reference +- [Validated Models](https://quic.github.io/efficient-transformers/source/validate.html) - Supported models list + + +### Model Storage +By default, exported models and QPC files are stored in `~/.cache/qeff_cache`. Customize this with: +- `QEFF_HOME`: Primary cache directory +- `XDG_CACHE_HOME`: Alternative cache location + diff --git a/examples/text_generation/basic_inference.py b/examples/text_generation/basic_inference.py new file mode 100644 index 000000000..6340ec725 --- /dev/null +++ b/examples/text_generation/basic_inference.py @@ -0,0 +1,57 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import argparse + +from transformers import AutoTokenizer + +from QEfficient import QEFFAutoModelForCausalLM + + +def main(): + parser = argparse.ArgumentParser(description="Basic text generation inference") + parser.add_argument("--model-name", type=str, default="Qwen/Qwen2-1.5B-Instruct", help="HuggingFace model ID") + parser.add_argument("--prompt", type=str, default="Hello, how are you?", help="Input prompt") + parser.add_argument("--prefill-seq-len", type=int, default=32, help="Prefill sequence length") + parser.add_argument("--ctx-len", type=int, default=128, help="Context length") + parser.add_argument("--generation-len", type=int, default=100, help="Number of tokens to generate") + parser.add_argument("--num-cores", type=int, default=16, help="Number of cores") + parser.add_argument( + "--device-group", + type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")], + default=None, + help="Device IDs (comma-separated) e.g. 
[0,1]", + ) + args = parser.parse_args() + + # Load tokenizer and model + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + model = QEFFAutoModelForCausalLM.from_pretrained(args.model_name) + + # Compile the model + qpc_path = model.compile( + prefill_seq_len=args.prefill_seq_len, + ctx_len=args.ctx_len, + num_cores=args.num_cores, + num_devices=(1 if args.device_group is None else len(args.device_group)), + ) + print(f"Model compiled to: {qpc_path}") + + # Generate text + exec_info = model.generate( + tokenizer=tokenizer, + prompts=[args.prompt], + device_id=args.device_group, + generation_len=args.generation_len, + ) + + print(f"\nPrompt: {args.prompt}") + print(f"Generated: {exec_info.generated_texts[0]}") + + +if __name__ == "__main__": + main() diff --git a/examples/text_generation/cli_examples.sh b/examples/text_generation/cli_examples.sh new file mode 100755 index 000000000..12a426ebe --- /dev/null +++ b/examples/text_generation/cli_examples.sh @@ -0,0 +1,209 @@ +#!/bin/bash + +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +# QEfficient CLI Examples for Text Generation +# This script provides a simplified workflow for running text generation on Cloud AI 100 + +echo "QEfficient CLI Workflow for Text Generation" +echo "===========================================" +echo "" +echo "This example demonstrates the complete workflow using Llama-3.1-8B" +echo "" + +# ============================================================================ +# STEP 1: EXPORT MODEL TO ONNX +# ============================================================================ + +echo "Step 1: Export Model to ONNX" +echo "-----------------------------" +echo "Export the HuggingFace model to ONNX format optimized for Cloud AI 100" +echo "" +cat << 'EOF' +python -m QEfficient.cloud.export \ + --model_name meta-llama/Llama-3.1-8B \ + --cache_dir ~/.cache/qeff_cache +EOF +echo "" +echo "This will download the model and convert it to ONNX format." +echo "The ONNX model will be saved in the QEfficient cache directory." 
+echo "" + +# ============================================================================ +# STEP 2: COMPILE MODEL TO QPC +# ============================================================================ + +echo "Step 2: Compile Model to QPC" +echo "-----------------------------" +echo "Compile the ONNX model to Qualcomm Program Container (QPC) format" +echo "" +cat << 'EOF' +python -m QEfficient.cloud.compile \ + --onnx_path ~/.cache/qeff_cache/meta-llama/Llama-3.1-8B/onnx/model.onnx \ + --qpc_path ./qpc_output \ + --batch_size 1 \ + --prompt_len 128 \ + --ctx_len 512 \ + --num_cores 16 \ + --device_group [0] \ + --mxfp6 \ + --mos 1 \ + --aic_enable_depth_first +EOF +echo "" +echo "Compilation parameters:" +echo " --batch_size: Number of prompts to process simultaneously" +echo " --prompt_len: Maximum input prompt length" +echo " --ctx_len: Maximum context length (prompt + generation)" +echo " --num_cores: Number of AI 100 cores to use (typically 14 or 16)" +echo " --device_group: Device IDs to use (e.g., [0] for single device, [0,1,2,3] for multi-device)" +echo " --mxfp6: Enable MXFP6 quantization for better performance" +echo " --mos: Memory optimization strategy" +echo " --aic_enable_depth_first: Enable depth-first execution" +echo "" + +# ============================================================================ +# STEP 3: EXECUTE WITH COMPILED QPC +# ============================================================================ + +echo "Step 3: Execute Inference with Compiled QPC" +echo "--------------------------------------------" +echo "Run inference using the pre-compiled QPC" +echo "" +cat << 'EOF' +python -m QEfficient.cloud.execute \ + --model_name meta-llama/Llama-3.1-8B \ + --qpc_path ./qpc_output/qpcs \ + --prompt "Write a short story about AI" \ + --device_group [0] +EOF +echo "" +echo "This uses the pre-compiled QPC for fast inference." +echo "You can run this multiple times with different prompts without recompiling." +echo "" + +# ============================================================================ +# STEP 4: END-TO-END INFERENCE (ALL-IN-ONE) +# ============================================================================ + +echo "Step 4: End-to-End Inference (Recommended)" +echo "-------------------------------------------" +echo "The 'infer' command handles export, compile, and execute in one step" +echo "" +cat << 'EOF' +python -m QEfficient.cloud.infer \ + --model_name meta-llama/Llama-3.1-8B \ + --batch_size 1 \ + --prompt_len 128 \ + --ctx_len 512 \ + --num_cores 16 \ + --device_group [0] \ + --prompt "Write a short story about AI" \ + --mxfp6 \ + --mxint8_kv_cache \ + --mos 1 \ + --aic_enable_depth_first +EOF +echo "" +echo "This is the recommended approach for most use cases." +echo "It automatically:" +echo " 1. Downloads and exports the model to ONNX (if not cached)" +echo " 2. Compiles to QPC (if not already compiled with these settings)" +echo " 3. 
Executes inference with your prompt" +echo "" + +# ============================================================================ +# ADDITIONAL EXAMPLES +# ============================================================================ + +echo "" +echo "Additional Examples" +echo "===================" +echo "" + +echo "Multi-Device Inference (Multi-Qranium)" +echo "---------------------------------------" +cat << 'EOF' +python -m QEfficient.cloud.infer \ + --model_name meta-llama/Llama-3.1-8B \ + --batch_size 1 \ + --prompt_len 128 \ + --ctx_len 512 \ + --num_cores 16 \ + --device_group [0,1,2,3] \ + --prompt "Explain quantum computing" \ + --mxfp6 \ + --mxint8_kv_cache \ + --aic_enable_depth_first +EOF +echo "" + +echo "Continuous Batching (Multiple Prompts)" +echo "---------------------------------------" +cat << 'EOF' +python -m QEfficient.cloud.infer \ + --model_name meta-llama/Llama-3.1-8B \ + --full_batch_size 4 \ + --prompt_len 128 \ + --ctx_len 512 \ + --num_cores 16 \ + --device_group [0] \ + --prompt "Hello|Hi there|Good morning|How are you" \ + --mxfp6 \ + --mxint8_kv_cache +EOF +echo "" +echo "Note: Use pipe (|) to separate multiple prompts for continuous batching" +echo "" + +echo "Batch Processing from File" +echo "---------------------------" +cat << 'EOF' +python -m QEfficient.cloud.infer \ + --model_name meta-llama/Llama-3.1-8B \ + --full_batch_size 8 \ + --prompt_len 128 \ + --ctx_len 512 \ + --num_cores 16 \ + --device_group [0] \ + --prompts_txt_file_path examples/sample_prompts/prompts.txt \ + --mxfp6 \ + --mxint8_kv_cache +EOF +echo "" + +# ============================================================================ +# NOTES AND DOCUMENTATION +# ============================================================================ + +echo "" +echo "Important Notes" +echo "===============" +echo "" +echo "Terminal Compatibility:" +echo " - Use bash terminal for best compatibility" +echo " - If using ZSH, wrap device_group in single quotes: '--device_group [0]'" +echo "" +echo "Common Parameters:" +echo " --model_name: HuggingFace model ID (e.g., meta-llama/Llama-3.1-8B)" +echo " --prompt: Input text prompt" +echo " --prompt_len: Maximum input sequence length" +echo " --ctx_len: Maximum context length (input + output)" +echo " --num_cores: AI 100 cores (typically 14 or 16)" +echo " --device_group: Device IDs [0] for single, [0,1,2,3] for multi-device" +echo " --mxfp6: Enable MXFP6 quantization (recommended)" +echo " --mxint8_kv_cache: Enable MXINT8 KV cache (recommended)" +echo " --aic_enable_depth_first: Enable depth-first execution" +echo "" +echo "For More Information:" +echo " - Full CLI API Reference: https://quic.github.io/efficient-transformers/cli_api.html" +echo " - Quick Start Guide: https://quic.github.io/efficient-transformers/quick_start.html" +echo " - Features Guide: https://quic.github.io/efficient-transformers/features_enablement.html" +echo " - Supported Models: https://quic.github.io/efficient-transformers/validate.html" +echo " - Examples README: examples/text_generation/README.md" +echo "" diff --git a/examples/text_generation/continuous_batching.py b/examples/text_generation/continuous_batching.py new file mode 100644 index 000000000..ec3a36ea9 --- /dev/null +++ b/examples/text_generation/continuous_batching.py @@ -0,0 +1,72 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import argparse + +from transformers import AutoTokenizer + +from QEfficient import QEFFAutoModelForCausalLM + + +def main(): + parser = argparse.ArgumentParser(description="Continuous batching inference") + parser.add_argument("--model-name", type=str, default="Qwen/Qwen2-1.5B-Instruct", help="HuggingFace model ID") + parser.add_argument( + "--prompts", + type=str, + default="Hello! How can I help?|Hi there! What’s up?|Hey! Need assistance?|Welcome! How can I support you today?", + help="Pipe-separated prompts for batch processing", + ) + parser.add_argument("--prefill-seq-len", type=int, default=128, help="Prefill sequence length") + parser.add_argument("--ctx-len", type=int, default=512, help="Context length") + parser.add_argument("--full-batch-size", type=int, default=4, help="Full batch size for continuous batching") + parser.add_argument("--generation-len", type=int, default=100, help="Number of tokens to generate") + parser.add_argument("--num-cores", type=int, default=16, help="Number of cores") + parser.add_argument( + "--device-group", + type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")], + default=None, + help="Device IDs (comma-separated) e.g. [0,1]", + ) + args = parser.parse_args() + + # Parse prompts + prompt_list = args.prompts.split("|") + print(f"Processing {len(prompt_list)} prompts with continuous batching") + + # Load tokenizer and model with continuous batching enabled + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + model = QEFFAutoModelForCausalLM.from_pretrained(args.model_name, continuous_batching=True) + + # Compile the model with full_batch_size for continuous batching + qpc_path = model.compile( + prefill_seq_len=args.prefill_seq_len, + ctx_len=args.ctx_len, + full_batch_size=args.full_batch_size, + num_cores=args.num_cores, + num_devices=(1 if args.device_group is None else len(args.device_group)), + ) + print(f"Model compiled to: {qpc_path}") + + # Generate text for all prompts + exec_info = model.generate( + tokenizer=tokenizer, + prompts=prompt_list, + device_id=args.device_group, + generation_len=args.generation_len, + ) + + # Display results + print("\n" + "=" * 80) + for i, (prompt, generated) in enumerate(zip(prompt_list, exec_info.generated_texts)): + print(f"\nPrompt {i + 1}: {prompt}") + print(f"Generated: {generated}") + print("-" * 80) + + +if __name__ == "__main__": + main() diff --git a/examples/text_generation/gguf_models.py b/examples/text_generation/gguf_models.py new file mode 100644 index 000000000..2f81ef031 --- /dev/null +++ b/examples/text_generation/gguf_models.py @@ -0,0 +1,59 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import argparse + +from transformers import AutoTokenizer + +from QEfficient import QEFFAutoModelForCausalLM + + +def main(): + parser = argparse.ArgumentParser(description="GGUF model inference") + parser.add_argument( + "--model-name", + type=str, + default="Qwen/Qwen2-1.5B-Instruct-GGUF", + help="HuggingFace model ID for GGUF model", + ) + parser.add_argument( + "--gguf-file", + type=str, + default="qwen2-1_5b-instruct-q8_0.gguf", + help="GGUF file name within the model repository", + ) + parser.add_argument("--prompt", type=str, default="Hello! How are you?", help="Input prompt") + parser.add_argument("--prefill-seq-len", type=int, default=32, help="Prefill sequence length") + parser.add_argument("--ctx-len", type=int, default=128, help="Context length") + parser.add_argument("--num-cores", type=int, default=16, help="Number of cores") + parser.add_argument("--num-devices", type=int, default=1, help="Number of devices") + args = parser.parse_args() + + # Load the model and tokenizer + print(f"Loading GGUF model: {args.model_name}") + print(f"GGUF file: {args.gguf_file}") + + tokenizer = AutoTokenizer.from_pretrained(args.model_name, gguf_file=args.gguf_file) + model = QEFFAutoModelForCausalLM.from_pretrained(args.model_name, gguf_file=args.gguf_file) + + # Compile the model + generated_qpc_path = model.compile( + prefill_seq_len=args.prefill_seq_len, + ctx_len=args.ctx_len, + num_cores=args.num_cores, + num_devices=args.num_devices, + ) + print(f"Model compiled to: {generated_qpc_path}") + + # Generate text + exec_info = model.generate(prompts=[args.prompt], tokenizer=tokenizer) + print(f"\nPrompt: {args.prompt}") + print(f"Generated: {exec_info.generated_texts[0]}") + + +if __name__ == "__main__": + main() diff --git a/examples/text_generation/moe_inference.py b/examples/text_generation/moe_inference.py new file mode 100644 index 000000000..276c766dd --- /dev/null +++ b/examples/text_generation/moe_inference.py @@ -0,0 +1,66 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + + +import argparse + +from transformers import AutoTokenizer + +from QEfficient import QEFFAutoModelForCausalLM + + +def main(): + parser = argparse.ArgumentParser(description="MoE model inference") + parser.add_argument( + "--model-name", + type=str, + default="Qwen/Qwen3-30B-A3B-Instruct-2507", + help="HuggingFace MoE model ID", + ) + parser.add_argument("--prompt", type=str, default="Explain quantum computing", help="Input prompt") + parser.add_argument("--prefill-seq-len", type=int, default=32, help="Prefill sequence length") + parser.add_argument("--ctx-len", type=int, default=256, help="Context length") + parser.add_argument("--generation-len", type=int, default=None, help="Number of tokens to generate") + parser.add_argument("--num-cores", type=int, default=16, help="Number of cores") + parser.add_argument( + "--device-group", + type=lambda device_ids: [int(x) for x in device_ids.strip("[]").split(",")], + default=None, + help="Device IDs (comma-separated) e.g. 
[0,1]", + ) + args = parser.parse_args() + + print(f"Loading MoE model: {args.model_name}") + print("Note: MoE models use sparse expert activation for efficient inference") + + # Load tokenizer and model + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + model = QEFFAutoModelForCausalLM.from_pretrained(args.model_name) + + # Compile the model + qpc_path = model.compile( + prefill_seq_len=args.prefill_seq_len, + ctx_len=args.ctx_len, + num_cores=args.num_cores, + num_devices=(1 if args.device_group is None else len(args.device_group)), + ) + print(f"Model compiled to: {qpc_path}") + + # Generate text + exec_info = model.generate( + tokenizer=tokenizer, + prompts=[args.prompt], + device_id=args.device_group, + generation_len=args.generation_len, + ) + + print(f"\nPrompt: {args.prompt}") + print(f"Generated: {exec_info.generated_texts[0]}") + + +if __name__ == "__main__": + main() diff --git a/examples/wav2vec2_example/README.md b/examples/wav2vec2_example/README.md deleted file mode 100644 index fba8d9ad2..000000000 --- a/examples/wav2vec2_example/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# Speech Recognition with Wav2Vec2 -This directory contains an example script of how to use the AutoModelForCTC class. (for now, Wav2Vec2 models on audio <30 seconds only has been validated) - -## Required packages: -- `librosa==0.10.2` -- `soundfile==0.13.1` - -You can install them using pip: -```sh -pip install librosa==0.10.2 soundfile==0.13.1 -``` - -To run example script after package installations: -```sh -python run_wav2vec2_inference.py -``` - -Expected output for given data sample: -```sh -MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL -``` \ No newline at end of file diff --git a/examples/wav2vec2_example/run_wav2vec2_inference.py b/examples/wav2vec2_example/run_wav2vec2_inference.py deleted file mode 100644 index 961aabeb8..000000000 --- a/examples/wav2vec2_example/run_wav2vec2_inference.py +++ /dev/null @@ -1,24 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -from datasets import load_dataset -from transformers import AutoProcessor - -from QEfficient import QEFFAutoModelForCTC - -base_model_name = "facebook/wav2vec2-base-960h" - -ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") -data = ds[0]["audio"]["array"] -# reshape to so shape corresponds to data with batch size 1 -data = data.reshape(-1) -sample_rate = ds[0]["audio"]["sampling_rate"] -processor = AutoProcessor.from_pretrained(base_model_name) - -model = QEFFAutoModelForCTC.from_pretrained(base_model_name) -model.compile(num_cores=16) -print(model.generate(processor, inputs=data)) diff --git a/tests/cloud/test_export_compile_execute.py b/tests/cloud/test_export_compile_execute.py index f1c80a6b0..c2e77578a 100644 --- a/tests/cloud/test_export_compile_execute.py +++ b/tests/cloud/test_export_compile_execute.py @@ -76,7 +76,7 @@ def check_export_compile_execute(mocker, model_name, full_batch_size=None, enabl model_name=model_name, qpc_path=qpc_path, prompt="My name is", - prompts_txt_file_path="examples/prompts.txt", + prompts_txt_file_path="examples/sample_prompts/prompts.txt", generation_len=20, full_batch_size=full_batch_size, ) diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py index 9addc0a7b..e11f69017 100644 --- a/tests/cloud/test_infer.py +++ b/tests/cloud/test_infer.py @@ -24,7 +24,7 @@ def check_infer( num_cores=16, prompt=prompt, local_model_dir=None, - prompts_txt_file_path="examples/prompts.txt", + prompts_txt_file_path="examples/sample_prompts/prompts.txt", aic_enable_depth_first=True, mos=1, hf_token=None,