Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
158f9b9
Stage-1 export debug
quic-amitraj Jul 10, 2025
8690842
Stage-2 Export initial working version done
quic-amitraj Jul 16, 2025
072be0e
Stage-3 compilation work is under progress
quic-amitraj Jul 17, 2025
41503c2
Stage-4 Working pipeline with wrong output
quic-amitraj Jul 23, 2025
597c936
Testing
quic-amitraj Jul 30, 2025
565df68
Testing
quic-amitraj Jul 30, 2025
cb1591b
Working sd3-turbo
quic-amitraj Aug 3, 2025
5f43d40
Working with cleaned code
quic-amitraj Aug 5, 2025
9c77344
Working with cleaned code
quic-amitraj Aug 5, 2025
5f17d61
Working with vae_included
quic-amitraj Aug 6, 2025
7aed56f
Fix-1
quic-amitraj Aug 8, 2025
39b53d0
Fix-2
quic-amitraj Aug 10, 2025
83a2666
Fix-3
quic-amitraj Aug 13, 2025
e413646
Added readme for diffusers
quic-amitraj Aug 14, 2025
060f4d0
Code cleanup
quic-amitraj Aug 14, 2025
7cc9d28
Code cleanup-2
quic-amitraj Aug 15, 2025
b4aee48
Minor fix
quic-amitraj Aug 20, 2025
80a1bbb
Added Support of flux
quic-amitraj Sep 19, 2025
80ff141
Updated seq_len of flux transformers
tv-karthikeya Sep 24, 2025
7947495
Removing SD3, adding small fix for flux model hash
tv-karthikeya Oct 9, 2025
212be5d
adding device id support for flux for all stages
tv-karthikeya Oct 9, 2025
4464fb7
[WIP] Adding support for custom Height,width
tv-karthikeya Nov 3, 2025
12c3d3a
Flux support with Custom config
quic-amitraj Nov 4, 2025
75842f7
Added OnnxfunctionTransform and code cleanup while modifying compile …
quic-amitraj Nov 4, 2025
4ba0e9c
Compile fix
quic-amitraj Nov 5, 2025
7f6aac1
Modification of Pipeline-1
Nov 6, 2025
1409315
Modification of Pipeline-2
Nov 7, 2025
42ef5cd
Update readme for diffusers
Nov 10, 2025
ddbf534
Added support of output dataclass
Nov 11, 2025
a36f598
Replaced output dict with dataclass to make it more user friendly
Nov 11, 2025
753ae8d
Rebased with main and fixed some issues
Nov 12, 2025
35a4bb3
Code cleaning and removed redundant code
Nov 13, 2025
f491971
Code cleaning and removed redundant code-2
Nov 13, 2025
fd31a58
Added tqdm for export and compile
Nov 13, 2025
b91e2c9
Parallel compilation and onnx subfunction is added
Nov 14, 2025
8d78ac9
Height and width now can be passed from compile and __call__ method a…
Nov 24, 2025
fcec077
Removed redundant code
Nov 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions QEfficient/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ def check_qaic_sdk():
QEFFCommonLoader,
)
from QEfficient.compile.compile_helper import compile

# Imports for the diffusers
from QEfficient.diffusers.pipelines.flux.pipeline_flux import QEFFFluxPipeline
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.peft import QEffAutoPeftModelForCausalLM
Expand All @@ -70,6 +73,7 @@ def check_qaic_sdk():
"QEFFAutoModelForImageTextToText",
"QEFFAutoModelForSpeechSeq2Seq",
"QEFFCommonLoader",
"QEFFFluxPipeline",
]

else:
Expand Down
26 changes: 24 additions & 2 deletions QEfficient/base/modeling_qeff.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,30 @@ def _model_offloaded_check(self) -> None:
raise RuntimeError(error_msg)

@property
@abstractmethod
def model_name(self) -> str: ...
def model_name(self) -> str:
    """
    Name of the wrapped model class, stripped of any QEff/QEFF prefix.

    The wrapper classes in this project prepend ``QEff``/``QEFF`` to the
    underlying HuggingFace class name; this strips that prefix back off.

    Returns:
        str: Underlying model class name (e.g. "CLIPTextModel" for
        "QEffCLIPTextModel").
    """
    raw_name = type(self.model).__name__
    for prefix in ("QEff", "QEFF"):
        if raw_name.startswith(prefix):
            return raw_name[len(prefix):]
    return raw_name

@property
def get_model_config(self) -> Dict:
    """
    Configuration of the underlying HuggingFace model, as a plain dict.

    Returns:
        Dict: Attribute dictionary of ``self.model.config``.
    """
    # vars(cfg) is equivalent to cfg.__dict__ for a plain config object.
    return vars(self.model.config)

@abstractmethod
def export(self, export_dir: Optional[str] = None) -> Path:
Expand Down
108 changes: 108 additions & 0 deletions QEfficient/diffusers/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@

<div align="center">


# **Diffusion Models on Qualcomm Cloud AI 100**


<div align="center">

### 🎨 **Experience the Future of AI Image Generation**

*Optimized for Qualcomm Cloud AI 100*

<img src="../../docs/image/girl_laughing.png" alt="Sample Output" width="400">

**Generated with**: `black-forest-labs/FLUX.1-schnell` • `"A girl laughing"` • 4 steps • 0.0 guidance scale • ⚡



</div>



[![Diffusers](https://img.shields.io/badge/Diffusers-0.35.1-orange.svg)](https://github.com/huggingface/diffusers)
</div>

---

## ✨ Overview

QEfficient Diffusers brings the power of state-of-the-art diffusion models to Qualcomm Cloud AI 100 hardware for text-to-image generation. Built on top of the popular HuggingFace Diffusers library, our optimized pipeline provides seamless inference on Qualcomm Cloud AI 100 hardware.

## 🛠️ Installation

### Prerequisites

Ensure you have Python 3.8+ and the required dependencies:

```bash
# Create Python virtual environment (Recommended Python 3.10)
sudo apt install python3.10-venv
python3.10 -m venv qeff_env
source qeff_env/bin/activate
pip install -U pip
```

### Install QEfficient

```bash
# Install from GitHub (includes diffusers support)
pip install git+https://github.com/quic/efficient-transformers

# Or build from source
git clone https://github.com/quic/efficient-transformers.git
cd efficient-transformers
pip install build wheel
python -m build --wheel --outdir dist
pip install dist/qefficient-0.0.1.dev0-py3-none-any.whl
```

### Install Diffusers Dependencies

```bash
# Install diffusers optional dependencies
pip install "QEfficient[diffusers]"
```

---

## 🎯 Supported Models
- ✅ [`black-forest-labs/FLUX.1-schnell`](https://huggingface.co/black-forest-labs/FLUX.1-schnell)

---


## 📚 Examples

Check out our comprehensive examples in the [`examples/diffusers/`](../../examples/diffusers/) directory:

---

## 🤝 Contributing

We welcome contributions! Please see our [Contributing Guide](../../CONTRIBUTING.md) for details.

### Development Setup

```bash
git clone https://github.com/quic/efficient-transformers.git
cd efficient-transformers
pip install -e ".[diffusers,test]"
```

---

## 🙏 Acknowledgments

- **HuggingFace Diffusers**: For the excellent foundation library
- **Black Forest Labs**: For the FLUX family of models
---

## 📞 Support

- 📖 **Documentation**: [https://quic.github.io/efficient-transformers/](https://quic.github.io/efficient-transformers/)
- 🐛 **Issues**: [GitHub Issues](https://github.com/quic/efficient-transformers/issues)

---

6 changes: 6 additions & 0 deletions QEfficient/diffusers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# ----------------------------------------------------------------------------
6 changes: 6 additions & 0 deletions QEfficient/diffusers/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# ----------------------------------------------------------------------------
75 changes: 75 additions & 0 deletions QEfficient/diffusers/models/attention.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# ----------------------------------------------------------------------------

import torch
from diffusers.models.attention import JointTransformerBlock, _chunked_feed_forward


class QEffJointTransformerBlock(JointTransformerBlock):
    """
    QEfficient drop-in replacement for diffusers' ``JointTransformerBlock``.

    The forward pass mirrors the upstream implementation; the only functional
    difference visible here is that the non-chunked feed-forward calls pass an
    explicit ``block_size`` keyword (4096 for the image stream, 333 for the
    text/context stream). NOTE(review): the stock diffusers ``FeedForward``
    does not accept ``block_size`` — this presumably targets a patched
    feed-forward used for ONNX export chunking; confirm against that module.
    """

    def forward(
        self, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor, temb: torch.FloatTensor
    ):
        # AdaLayerNorm conditioned on `temb`; also yields the gate/shift/scale
        # modulation tensors applied after attention and feed-forward below.
        # The dual-attention variant returns a second normalized stream.
        if self.use_dual_attention:
            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp, norm_hidden_states2, gate_msa2 = self.norm1(
                hidden_states, emb=temb
            )
        else:
            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)

        # Context (text) stream normalization. When `context_pre_only` is set,
        # the context stream is consumed by this block, so no modulation
        # tensors are produced for it.
        if self.context_pre_only:
            norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states, temb)
        else:
            norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
                encoder_hidden_states, emb=temb
            )

        # Attention.
        attn_output, context_attn_output = self.attn(
            hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states
        )

        # Process attention outputs for the `hidden_states`.
        attn_output = gate_msa.unsqueeze(1) * attn_output
        hidden_states = hidden_states + attn_output

        if self.use_dual_attention:
            # Second (dual) self-attention path on the image stream only.
            attn_output2 = self.attn2(hidden_states=norm_hidden_states2)
            attn_output2 = gate_msa2.unsqueeze(1) * attn_output2
            hidden_states = hidden_states + attn_output2

        # Feed-forward on the image stream, with scale/shift modulation.
        norm_hidden_states = self.norm2(hidden_states)
        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
        if self._chunk_size is not None:
            # "feed_forward_chunk_size" can be used to save memory
            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
        else:
            # ff_output = self.ff(norm_hidden_states)
            # NOTE(review): block_size=4096 looks hard-coded for a specific
            # export configuration (image sequence length?) — confirm.
            ff_output = self.ff(norm_hidden_states, block_size=4096)
        ff_output = gate_mlp.unsqueeze(1) * ff_output

        hidden_states = hidden_states + ff_output

        # Process attention outputs for the `encoder_hidden_states`.
        if self.context_pre_only:
            # Context stream ends here; caller receives None for it.
            encoder_hidden_states = None
        else:
            context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
            encoder_hidden_states = encoder_hidden_states + context_attn_output

            # Feed-forward on the context stream, mirroring the image path.
            norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
            norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
            if self._chunk_size is not None:
                # "feed_forward_chunk_size" can be used to save memory
                context_ff_output = _chunked_feed_forward(
                    self.ff_context, norm_encoder_hidden_states, self._chunk_dim, self._chunk_size
                )
            else:
                # context_ff_output = self.ff_context(norm_encoder_hidden_states)
                # NOTE(review): block_size=333 looks hard-coded (text sequence
                # length?) — confirm against the export configuration.
                context_ff_output = self.ff_context(norm_encoder_hidden_states, block_size=333)
            encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output

        return encoder_hidden_states, hidden_states
Loading
Loading