20 changes: 10 additions & 10 deletions .github/workflows/pull.yml
@@ -481,7 +481,7 @@ jobs:
export MODEL_PATH=${PWD}/checkpoints/stories15M/stories15M.pt
export MODEL_NAME=stories15M

python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0
python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --device cpu

python export.py --checkpoint-path ${MODEL_PATH} --output-pte-path ${PWD}/${MODEL_NAME}.pte
python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${PWD}/${MODEL_NAME}.pte
@@ -618,7 +618,7 @@ jobs:

python torchchat.py list
python torchchat.py download stories15m
python torchchat.py generate stories15M
python torchchat.py generate stories15M --device cpu
python torchchat.py remove stories15m

test-mps:
@@ -832,30 +832,30 @@ jobs:
echo "******************************************"

echo "Running eager"
python3 torchchat.py generate --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0
python3 torchchat.py generate --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --device cpu

echo "Running compiled"
python3 torchchat.py generate --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --compile
python3 torchchat.py generate --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --compile --device cpu

echo "******************************************"
echo "******* Emb: channel-wise quantized ******"
echo "******************************************"

echo "Running eager"
python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0
python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --device cpu

echo "Running compiled"
python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --compile
python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --compile --device cpu

echo "******************************************"
echo "******** Emb: group-wise quantized *******"
echo "******************************************"

echo "Running eager"
python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0
python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --device cpu

echo "Running compiled"
python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --compile
python3 torchchat.py generate --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 --compile --device cpu

echo "tests complete"
echo "******************************************"
@@ -942,7 +942,7 @@ jobs:

export PRMT="Once upon a time in a land far away"

python torchchat.py generate stories15M --temperature 0 --prompt "${PRMT}"
python torchchat.py generate stories15M --temperature 0 --prompt "${PRMT}" --device cpu

python torchchat.py export stories15M --output-pte-path ./model.pte
./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
@@ -995,7 +995,7 @@ jobs:
export MODEL_DIR=${PWD}/checkpoints/stories15M
export PROMPT="Once upon a time in a land far away"

python torchchat.py generate --checkpoint-path ${MODEL_DIR}/stories15M.pt --temperature 0 --prompt "${PROMPT}"
python torchchat.py generate --checkpoint-path ${MODEL_DIR}/stories15M.pt --temperature 0 --prompt "${PROMPT}" --device cpu

for dtype in fp32 fp16 bf16 fast fast16; do
echo "Running export + runner with dtype=$dtype"
2 changes: 1 addition & 1 deletion .github/workflows/run-readme-pr-mps.yml
@@ -11,7 +11,7 @@ jobs:
with:
runner: macos-m1-14
script: |
conda create -y -n test-readme-mps-macos python=3.10.11
conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp
conda activate test-readme-mps-macos
set -x
# NS: Remove previous installation of torch first
2 changes: 1 addition & 1 deletion cli.py
@@ -165,7 +165,7 @@ def _add_export_output_path_args(parser, verb: str) -> None:
"Export Output Path" if is_export else None,
"Specify the output path for the exported model files" if is_export else None,
)
exclusive_parser = output_path_parser.add_mutually_exclusive_group()
exclusive_parser = output_path_parser.add_mutually_exclusive_group(required=is_export)
Contributor Author:
This is the only actual change. Everything else is a rebase artifact.

exclusive_parser.add_argument(
"--output-pte-path",
type=str,
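For context, here is a standalone sketch (not torchchat's actual `cli.py`, just an illustration of the argparse semantics behind `required=is_export`): when the group is marked required, the `export` verb must be given one of the mutually exclusive output paths.

```
import argparse

# Hypothetical reduced parser mirroring the change above (option names are illustrative).
parser = argparse.ArgumentParser(prog="export")
group = parser.add_mutually_exclusive_group(required=True)  # required only for the export verb
group.add_argument("--output-pte-path", type=str, help="ExecuTorch output path")
group.add_argument("--output-dso-path", type=str, help="AOTI output path")

# parser.parse_args([]) now exits with "one of the arguments ... is required",
# parser.parse_args(["--output-pte-path", "model.pte"]) succeeds, and
# passing both paths is still rejected as mutually exclusive.
```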
8 changes: 7 additions & 1 deletion docs/README.md
@@ -1,6 +1,12 @@
# Docs in this directory are unstable
# Most Docs in this directory are unstable

This is an explicit callout that the docs in this directory may be outdated, incomplete, scratch notes, or a work in progress.
torchchat provides no guarantees on these files as references.

Please refer to the root README for stable features and documentation.

---

Docs that are updated and used as **Source of Truth**:
- [Model Customization](model_customization.md)
- [Quantization](quantization.md)
12 changes: 11 additions & 1 deletion docs/model_customization.md
@@ -45,7 +45,17 @@ To reduce the memory bandwidth requirement and to take advantage of higher densi
the model can use lower precision floating point representations.
For example, many GPUs and some of the CPUs have good support for bfloat16 and float16.

See the [precision guide](quantization.md#model-precision-dtype-precision-setting) for more details.
Unlike gpt-fast, which uses bfloat16 by default, torchchat uses the dtype
"fast16". This picks the best-performing 16-bit floating point type
available (for execution with ExecuTorch, macOS/ARM and Linux/x86 platforms).
For example, on macOS support depends on the OS version: versions starting
with 14.0 use bfloat16, while earlier versions fall back to float16,
based on system support for these data types.

The "fast" data type is also provided as a virtual data type that defaults
to the best floating point data type available on the selected device.
Currently, this behaves the same as "fast16", but resolves to "fp32" when exporting
to ExecuTorch.
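
For illustration, here is a minimal sketch of selecting a dtype explicitly (it reuses the `--dtype` flag and the invocation style from the [quantization guide](quantization.md); the model name is only an example):

```
# Default virtual dtype: picks the best available 16-bit type on the device
python3 generate.py llama3 --dtype fast16 --prompt "Hello, my name is"

# Pin an explicit dtype instead
python3 generate.py llama3 --dtype bf16 --prompt "Hello, my name is"
```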


## Quantization
94 changes: 39 additions & 55 deletions docs/quantization.md
@@ -1,7 +1,3 @@
> [!WARNING]
> Files in this directory may be outdated, incomplete, scratch notes, or a WIP. torchchat provides no guarantees on these files as references. Please refer to the root README for stable features and documentation.


# Quantization

<!--
@@ -11,62 +7,41 @@
-->

## Introduction
Quantization focuses on reducing the precision of model parameters and computations from floating-point to lower-bit integers, such as 8-bit integers. This approach aims to minimize memory requirements, accelerate inference speeds, and decrease power consumption, making models more feasible for deployment on edge devices with limited computational resources. For high-performance devices such as GPUs, quantization provides a way to reduce the required memory bandwidth and take advantage of the massive compute capabilities provided by today's server-based accelerators such as GPUs.
Quantization focuses on reducing the precision of model parameters and computations from floating-point to lower-bit integers, such as 8-bit integers.
This approach aims to minimize memory requirements, accelerate inference speeds, and decrease power consumption, making models more feasible for
deployment on edge devices with limited computational resources. For high-performance devices such as GPUs, quantization provides a way to
reduce the required memory bandwidth and take advantage of the massive compute capabilities of today's server-based accelerators.

While quantization can potentially degrade the model's performance, the methods supported by torchchat are designed to mitigate this effect, maintaining a balance between efficiency and accuracy. In this document we provide details on the supported quantization schemes, how to quantize models with these schemes and a few example of running such quantized models on supported backends.
While quantization can potentially degrade the model's performance, the methods supported by torchchat are designed to mitigate this effect,
maintaining a balance between efficiency and accuracy. In this document we provide details on the supported quantization schemes, how to quantize
models with these schemes, and a few examples of running such quantized models on supported backends.

## Supported Quantization Schemes
### Weight Quantization
| compression | bitwidth| group size | dynamic activation quantization | Eager | AOTI | ExecuTorch |
|--|--|--|--|--|--|--|
| linear (asymmetric) | [8, 4]* | [32, 64, 128, 256]** | | ✅ | ✅ | 🚧 |
| linear (asymmetric) | [4, 8]* | [32, 64, 128, 256]^ | | ✅ | ✅ | 🚧 |
| linear with dynamic activations (symmetric) | | [32, 64, 128, 256]* | a8w4dq | 🚧 |🚧 | ✅ |

### Embedding Quantization

Due to the larger vocabulary size of llama3, we also recommend
To support the larger vocabularies (e.g. Llama 3), we also recommend
quantizing the embeddings to further reduce the model size for
on-device use cases.

| compression | weight quantization (bitwidth)| weight quantization (group size) | dynamic activation quantization | Eager | AOTI | ExecuTorch |
|--|--|--|--|--|--|--|
| embedding (symmetric) | [8, 4]* | [32, 64, 128, 256]+ | | ✅ | ✅ | ✅ |
| embedding (symmetric) | [4, 8]* | [32, 64, 128, 256]+ | | ✅ | ✅ | ✅ |


>\* These are the only valid bitwidth options.

* These are the only valid bitwidth options.

** There are many valid group size options, including 512, 1024,
>** There are many valid group size options, including 512, 1024,
etc. Note that smaller groupsize tends to be better for preserving
model quality and accuracy, and larger groupsize for further
improving performance. Set 0 for channelwise quantization.

+ Should support non-power-of-2-groups as well.

## Quantization Profiles

Torchchat quantization supports profiles with multiple settings such
as accelerator, dtype, and quantization specified in a JSON file.
Four sample profiles are included with the torchchat distribution in
config/data: `cuda.json`, `desktop.json`, `mobile.json`, `pi5.json`
with profiles optimizing for execution on cuda, desktop, mobile and
Raspberry Pi devices.

In addition to quantization recipes described below, the profiles also
enable developers to specify the accelerator and dtype to be used.

At present torchchat supports the fast, cuda, mps, and cpu devices.
The default device in torchchat is "fast". The "fast" device is a
virtual device that defaults to the fastest executor available in the
system, selecting cuda, mps, and cpu in this order.

At present torchchat supports the fast16, fast, bf16, fp16 and fp32
data types. The default data type for models is "fast16". The
"fast16" data type is a virtual data type that defaults to the best
16-bit floating point data type available on the selected device. The
"fast" data type is a virtual data type that defaults to the best
floating point data type available on the selected device. ("Best"
tangibly representing a combination of speed and accuracy.)
>\+ Should support non-power-of-2-groups as well.
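
To make the groupsize options concrete, here is a short sketch of the corresponding `--quantize` configurations (reusing the flag and JSON keys shown in the examples further below):

```
# Channelwise embedding quantization (groupsize 0)
--quantize '{"embedding": {"bitwidth": 8, "groupsize": 0}}'

# Group-wise 4-bit linear quantization (groupsize 256)
--quantize '{"linear:int4": {"groupsize": 256}}'
```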


## Quantization API
@@ -86,8 +61,19 @@ for valid `bitwidth` and `groupsize` values.

See the available quantization schemes [here](https://github.com/pytorch/torchchat/blob/main/quantization/quantize.py#L1260-L1266).

In addition to quantization, the [accelerator](model_customization.md#device)
and [precision](model_customization.md#model-precision) can also be specified.
When both are given, the values set in the quantization config take precedence
over the flags provided explicitly on the command line (e.g. `--device`).

The expected JSON format is described below. Refer to the links above for valid `device` and `dtype` values.
| config | JSON string |
|--|--|
| accelerator | `'{"executor": {"accelerator": <device>}}'` |
| precision | `'{"precision": {"dtype": <dtype>}}'`|
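
For example, a hypothetical invocation illustrating the precedence rule above (flag spellings follow the eager-mode example below): the run targets CUDA with bf16 even though `--device cpu` is passed, because the values in the quantization config win.

```
python3 generate.py llama3 --device cpu --prompt "Hello, my name is" \
  --quantize '{"executor": {"accelerator": "cuda"}, "precision": {"dtype": "bf16"}}'
```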

## Examples
We can mix and match weight quantization with embedding quantization.
Here are some examples of quantization configurations:

[skip default]: begin
* Config file
@@ -102,43 +88,41 @@ We can mix and match weight quantization with embedding quantization.
```
--quantize '{"embedding": {"bitwidth": 4, "groupsize":32}, "linear:a8w4dq": {"groupsize" : 256}}'
```
* Quantize linear layers with specified dtype and device
```
--quantize '{"executor": {"accelerator": "cuda"},
"precision": {"dtype": "bf16"},
"linear:int4": {"groupsize" : 256}}'
```
[skip default]: end

Quantization recipes can be applied in conjunction with any of the
`chat`, `generate`, `browser` and `export` commands. Below are
`chat`, `generate`, `browser`, `server`, and `export` commands.

Below are
examples showcasing eager mode with `generate`, and AOTI and ExecuTorch
with `export`.

### Eager mode
```
python3 generate.py [--compile] llama3 --prompt "Hello, my name is" --quantize '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --device cpu
python3 generate.py llama3 --prompt "Hello, my name is" --quantize '{"embedding" : {"bitwidth": 8, "groupsize": 0}}'
```
### AOTI
```
python3 torchchat.py export llama3 --quantize '{"embedding": {"bitwidth": 4, "groupsize":32}, "linear:int4": {"groupsize" : 256}}' --output-dso-path llama3.so

python3 generate.py llama3 --dso-path llama3.so --prompt "Hello my name is"
```
### ExecuTorch
```
python3 torchchat.py export llama3 --quantize '{"embedding": {"bitwidth": 4, "groupsize":32}, "linear:a8w4dq": {"groupsize" : 256}}' --output-pte-path llama3.pte

python3 generate.py llama3 --pte-path llama3.pte --prompt "Hello my name is"
```

## Model precision (dtype precision setting)
On top of quantizing models with integer quantization schemes mentioned above, models can be converted to lower bit floating point precision to reduce the memory bandwidth requirement and take advantage of higher density compute available. For example, many GPUs and some of the CPUs have good support for BFloat16 and Float16. This can be taken advantage of via `--dtype` arg as shown below.

[skip default]: begin
```
python3 generate.py --dtype [ fast16 | fast | bf16 | fp16 | fp32] ...
python3 export.py --dtype [ fast16 | fast | bf16 | fp16 | fp32] ...
```
[skip default]: end

Unlike gpt-fast, which uses bfloat16 by default, torchchat uses the dtype "fast16" as the default. Torchchat will pick the appropriate 16-bit floating point type available that offers the best performance (for execution with ExecuTorch, macOS/ARM and Linux/x86 platforms). On macOS, support depends on the OS version: versions starting with 14.0 use bfloat16, while earlier versions fall back to float16, based on system support for these data types.
## Quantization Profiles

Support for FP16 and BF16 is limited in many embedded processors, and `--dtype fp32` may be required in some environments. Additional ExecuTorch support for 16-bit floating point types may be added in the future based on hardware support.
Four [sample profiles](https://github.com/pytorch/torchchat/tree/main/config/data) are included with the torchchat distribution: `cuda.json`, `desktop.json`, `mobile.json`, and `pi5.json`,
optimized for execution on CUDA, desktop, mobile, and
Raspberry Pi devices, respectively.

## Adding additional quantization schemes
We invite contributors to submit established quantization schemes, with accuracy and performance results demonstrating soundness.