In [1]:
from llama_index.llms import HuggingFaceLLM
import torch
from llama_index.prompts import PromptTemplate

In [2]:
system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""

# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")

In [3]:
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.7, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="StabilityAI/stablelm-tuned-alpha-3b",
    model_name="StabilityAI/stablelm-tuned-alpha-3b",
    device_map="auto",
    stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={"max_length": 4096},
    # uncomment this if using CUDA to reduce memory usage
    # model_kwargs={"torch_dtype": torch.float16}
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:46<00:00, 23.40s/it]
generation_config.json: 100%|██████████| 111/111 [00:00<00:00, 32.4kB/s]
tokenizer_config.json: 100%|██████████| 264/264 [00:00<00:00, 1.07MB/s]
tokenizer.json: 100%|██████████| 2.11M/2.11M [00:01<00:00, 1.55MB/s]
special_tokens_map.json: 100%|██████████| 99.0/99.0 [00:00<00:00, 219kB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
from llama_index import SimpleDirectoryReader, ServiceContext, SummaryIndex

In [14]:
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
    embed_model='local'
)

config.json: 100%|██████████| 684/684 [00:00<00:00, 3.49MB/s]
model.safetensors: 100%|██████████| 133M/133M [00:11<00:00, 12.1MB/s] 
tokenizer_config.json: 100%|██████████| 366/366 [00:00<00:00, 1.39MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 377kB/s]
tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 1.03MB/s]
special_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 440kB/s]


In [18]:
# Load the your data
documents = SimpleDirectoryReader("./documents").load_data()
index = SummaryIndex.from_documents(documents, service_context=service_context)

# Query and print response
query_engine = index.as_query_engine()
response = query_engine.query("<query_text>")
print(response)


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


RuntimeError: MPS backend out of memory (MPS allocated: 11.79 GB, other allocations: 6.22 GB, max allowed: 18.13 GB). Tried to allocate 1.93 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).