diff --git a/docs/docs/examples/llm/ipex_llm.ipynb b/docs/docs/examples/llm/ipex_llm.ipynb
index 172099cceb667..18b72b1752174 100644
--- a/docs/docs/examples/llm/ipex_llm.ipynb
+++ b/docs/docs/examples/llm/ipex_llm.ipynb
@@ -8,7 +8,9 @@
     "\n",
     "> [IPEX-LLM](https://github.com/intel-analytics/ipex-llm/) is a PyTorch library for running LLM on Intel CPU and GPU (e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max) with very low latency.\n",
     "\n",
-    "This example goes over how to use LlamaIndex to interact with [`ipex-llm`](https://github.com/intel-analytics/ipex-llm/) for text generation and chat on CPU."
+    "This example goes over how to use LlamaIndex to interact with [`ipex-llm`](https://github.com/intel-analytics/ipex-llm/) for text generation and chat on CPU. \n",
+    "\n",
+    "For more examples and usage, refer to [Examples](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples)."
    ]
   },
   {
diff --git a/llama-index-integrations/llms/llama-index-llms-ipex-llm/README.md b/llama-index-integrations/llms/llama-index-llms-ipex-llm/README.md
index d0698ba4cd373..e982c22f409d1 100644
--- a/llama-index-integrations/llms/llama-index-llms-ipex-llm/README.md
+++ b/llama-index-integrations/llms/llama-index-llms-ipex-llm/README.md
@@ -1,3 +1,22 @@
 # LlamaIndex Llms Integration: IPEX-LLM
 
-[IPEX-LLM](https://github.com/intel-analytics/ipex-llm) is a PyTorch library for running LLM on Intel CPU and GPU (e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max) with very low latency. This module allows loading LLMs with ipex-llm optimizations.
+[IPEX-LLM](https://github.com/intel-analytics/ipex-llm) is a PyTorch library for running LLM on Intel CPU and GPU (e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max) with very low latency. This module enables the use of LLMs optimized with `ipex-llm` in LlamaIndex pipelines.
+
+## Installation
+
+### On CPU
+
+```bash
+pip install llama-index-llms-ipex-llm
+```
+
+## Usage
+
+```python
+from llama_index.llms.ipex_llm import IpexLLM
+```
+
+## Examples
+
+- [Notebook Example](https://docs.llamaindex.ai/en/stable/examples/llm/ipex_llm/)
+- [More Examples](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples)
diff --git a/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/BUILD b/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/BUILD
new file mode 100644
index 0000000000000..db46e8d6c978c
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/README.md b/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/README.md
new file mode 100644
index 0000000000000..674ebbabacdce
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/README.md
@@ -0,0 +1,25 @@
+# IpexLLM Examples
+
+This folder contains examples showcasing how to use LlamaIndex with the `ipex-llm` LLM integration, `llama_index.llms.ipex_llm.IpexLLM`.
+
+## Installation
+
+### On CPU
+
+Install `llama-index-llms-ipex-llm`. This will also install `ipex-llm` and its dependencies.
+
+```bash
+pip install llama-index-llms-ipex-llm
+```
+
+## List of Examples
+
+### More Data Types Example
+
+By default, `IpexLLM` loads the model in int4 format. To load a model in different data formats like `sym_int5`, `sym_int8`, etc., you can use the `load_in_low_bit` option in `IpexLLM`.
+
+The example [more_data_type.py](./more_data_type.py) shows how to use the `load_in_low_bit` option. Run the example as follows:
+
+```bash
+python more_data_type.py -m <model_name> -t <tokenizer_name> -l <low_bit>
+```
diff --git a/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/more_data_type.py b/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/more_data_type.py
new file mode 100644
index 0000000000000..45ffdf2b57faf
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/more_data_type.py
@@ -0,0 +1,58 @@
+import argparse
+from llama_index.llms.ipex_llm import IpexLLM
+
+
+# Transform a string into llama2-specific prompt format
+def completion_to_prompt(completion):
+    return f"[INST] <<SYS>>\n \n<</SYS>>\n\n{completion} [/INST]"
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="More Data Types Example")
+    parser.add_argument(
+        "--model-name",
+        "-m",
+        type=str,
+        default="meta-llama/Llama-2-7b-hf",
+        help="The huggingface repo id for the large language model to be downloaded"
+        ", or the path to the huggingface checkpoint folder",
+    )
+    parser.add_argument(
+        "--tokenizer-name",
+        "-t",
+        type=str,
+        default="meta-llama/Llama-2-7b-hf",
+        help="The huggingface repo id or the path to the checkpoint containing the tokenizer, "
+        "usually it is the same as the model_name",
+    )
+    parser.add_argument(
+        "--low-bit",
+        "-l",
+        type=str,
+        default="asym_int4",
+        choices=["sym_int4", "asym_int4", "sym_int5", "asym_int5", "sym_int8"],
+        help="The quantization type the model will convert to.",
+    )
+
+    args = parser.parse_args()
+    model_name = args.model_name
+    tokenizer_name = args.tokenizer_name
+    low_bit = args.low_bit
+
+    # load the model with the specified low-bit format
+    llm = IpexLLM(
+        model_name=model_name,
+        tokenizer_name=tokenizer_name,
+        context_window=512,
+        max_new_tokens=64,
+        load_in_low_bit=low_bit,
+        completion_to_prompt=completion_to_prompt,
+        generate_kwargs={"temperature": 0.7, "do_sample": False},
+    )
+
+    print(
+        "\n----------------------- Text Stream Completion ---------------------------"
+    )
+    response_iter = llm.stream_complete("Explain what is AI?")
+    for response in response_iter:
+        print(response.delta, end="", flush=True)
diff --git a/llama-index-integrations/llms/llama-index-llms-ipex-llm/llama_index/llms/ipex_llm/base.py b/llama-index-integrations/llms/llama-index-llms-ipex-llm/llama_index/llms/ipex_llm/base.py
index 880677349279a..7101385c4a353 100644
--- a/llama-index-integrations/llms/llama-index-llms-ipex-llm/llama_index/llms/ipex_llm/base.py
+++ b/llama-index-integrations/llms/llama-index-llms-ipex-llm/llama_index/llms/ipex_llm/base.py
@@ -3,7 +3,6 @@
 from typing import Any, Callable, List, Optional, Sequence
 
 import torch
-
 from llama_index.core.base.llms.types import (
     ChatMessage,
     ChatResponse,
@@ -59,6 +58,20 @@ class IpexLLM(CustomLLM):
             "Unused if `model` is passed in directly."
         ),
     )
+    load_in_4bit: bool = Field(
+        default=True,
+        description=(
+            "Whether to load model in 4bit. " "Unused if `load_in_low_bit` is not None."
+        ),
+    )
+    load_in_low_bit: str = Field(
+        default=None,
+        description=(
+            "Which low bit precisions to use when loading model. "
+            "Example values: 'sym_int4', 'asym_int4', 'fp4', 'nf4', 'fp8', etc. "
+            "Will override `load_in_4bit` if this is specified."
+        ),
+    )
     context_window: int = Field(
         default=DEFAULT_CONTEXT_WINDOW,
         description="The maximum number of tokens available for input.",
@@ -124,6 +137,8 @@ def __init__(
         max_new_tokens: int = DEFAULT_NUM_OUTPUTS,
         tokenizer_name: str = DEFAULT_HUGGINGFACE_MODEL,
         model_name: str = DEFAULT_HUGGINGFACE_MODEL,
+        load_in_4bit: Optional[bool] = True,
+        load_in_low_bit: Optional[str] = None,
         model: Optional[Any] = None,
         tokenizer: Optional[Any] = None,
         device_map: Optional[str] = "auto",
@@ -176,19 +191,33 @@ def __init__(
             self._model = model
         else:
             try:
-                self._model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    load_in_4bit=True,
-                    use_cache=True,
-                    trust_remote_code=True,
-                    **model_kwargs,
-                )
+                if load_in_low_bit:
+                    self._model = AutoModelForCausalLM.from_pretrained(
+                        model_name,
+                        load_in_low_bit=load_in_low_bit,
+                        use_cache=True,
+                        trust_remote_code=True,
+                        **model_kwargs,
+                    )
+                else:
+                    self._model = AutoModelForCausalLM.from_pretrained(
+                        model_name,
+                        load_in_4bit=load_in_4bit,
+                        use_cache=True,
+                        trust_remote_code=True,
+                        **model_kwargs,
+                    )
             except Exception:
                 from ipex_llm.transformers import AutoModel
 
-                self._model = AutoModel.from_pretrained(
-                    model_name, load_in_4bit=True, **model_kwargs
-                )
+                if load_in_low_bit:
+                    self._model = AutoModel.from_pretrained(
+                        model_name, load_in_low_bit=load_in_low_bit, **model_kwargs
+                    )
+                else:
+                    self._model = AutoModel.from_pretrained(
+                        model_name, load_in_4bit=load_in_4bit, **model_kwargs
+                    )
 
         if "xpu" in device_map:
             self._model = self._model.to(device_map)
diff --git a/llama-index-integrations/llms/llama-index-llms-ipex-llm/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-ipex-llm/pyproject.toml
index 99a1ba65f0041..1b7dfb4a51d71 100644
--- a/llama-index-integrations/llms/llama-index-llms-ipex-llm/pyproject.toml
+++ b/llama-index-integrations/llms/llama-index-llms-ipex-llm/pyproject.toml
@@ -30,7 +30,7 @@ license = "MIT"
 name = "llama-index-llms-ipex-llm"
 packages = [{include = "llama_index/"}]
 readme = "README.md"
-version = "0.1.0"
+version = "0.1.1"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
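For reference, a minimal sketch of how the `load_in_low_bit` option introduced by this patch would be used directly, assuming the package installed above and a Llama-2 style checkpoint; the model id, context window, and prompt below are illustrative placeholders rather than part of the change:

```python
from llama_index.llms.ipex_llm import IpexLLM

# Sketch only: construct IpexLLM with a non-default low-bit format.
# The model/tokenizer ids here are assumptions for illustration.
llm = IpexLLM(
    model_name="meta-llama/Llama-2-7b-hf",
    tokenizer_name="meta-llama/Llama-2-7b-hf",
    context_window=512,
    max_new_tokens=64,
    load_in_low_bit="sym_int8",  # overrides the default load_in_4bit=True path
)

# Stream a completion, as in more_data_type.py.
response_iter = llm.stream_complete("What is low-bit quantization?")
for response in response_iter:
    print(response.delta, end="", flush=True)
```

If `load_in_low_bit` is left as `None`, the constructor falls back to the `load_in_4bit` flag, which defaults to `True`.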