In [None]:
%pip install -U transformers llama-index accelerate pypdf einops bitsandbytes

In [None]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM

In [None]:
# Read every file found under ./data into a list of documents.
documents = SimpleDirectoryReader(input_dir="./data").load_data()

In [None]:
from llama_index.prompts import PromptTemplate

# System preamble handed to the LLM wrapper.
# NOTE(review): the <|SYSTEM|>/<|USER|>/<|ASSISTANT|> markers look like they were
# carried over from a different model's prompt format — confirm they suit this checkpoint.
system_prompt = (
    "<|SYSTEM|>#\n"
    "Mistral Research is an expert in the field of research\n"
)

# Template that surrounds each query (llama-index's built-in prompts are
# substituted for {query_str}).
query_wrapper_prompt = PromptTemplate(
    "<|USER|>{query_str}<|ASSISTANT|>"
)

In [None]:
import torch

# Wrap Mistral-7B (loaded 4-bit / NF4 quantized) as the llama-index LLM.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    # Greedy decoding. `temperature` only matters when sampling, so it is left
    # out instead of being set to 0 next to do_sample=False (the combination is
    # contradictory and makes transformers emit generation-config warnings).
    generate_kwargs={"do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="mistralai/Mistral-7B-v0.1",
    model_name="mistralai/Mistral-7B-v0.1",
    device_map="auto",
    tokenizer_kwargs={"max_length": 4096},
    # Extra keyword arguments for model loading: half-precision weights plus
    # bitsandbytes 4-bit quantization settings to reduce memory usage.
    # NOTE(review): torch_dtype is float16 while the 4-bit compute dtype is
    # bfloat16 — confirm the mix is intentional.
    # NOTE(review): recent transformers releases prefer passing these options
    # through a BitsAndBytesConfig as `quantization_config` — TODO confirm
    # against the installed version.
    model_kwargs={
        "torch_dtype": torch.float16,
        "llm_int8_enable_fp32_cpu_offload": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_compute_dtype": torch.bfloat16,
        "load_in_4bit": True,
    },
)

In [None]:
# Bundle the settings used for indexing and querying: 1024-sized chunks,
# the LLM built above, and the 'local' embedding model option.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local",
    chunk_size=1024,
)

In [None]:
# Build the vector index over the loaded documents using the service context above.
index = VectorStoreIndex.from_documents(
    documents=documents,
    service_context=service_context,
)

In [None]:
# Query engine over the index; streaming=True so the answer can be printed as a stream.
query_engine = index.as_query_engine(
    streaming=True,
)

In [None]:
# Ask a question against the indexed documents and print the streamed answer.
response_stream = query_engine.query(
    "explain about cross attention?"
)
response_stream.print_response_stream()