### Installing Important Modules

In [5]:
!pip install -U bitsandbytes



### Importing Libraries

In [None]:
import torch
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
from transformers.cache_utils import DynamicCache
import os
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

### Generate Function

In [9]:
# Minimal generate function for token-by-token (greedy) generation
def generate(model, input_ids: torch.Tensor, past_key_values, max_new_tokens: int = 50) -> torch.Tensor:
    """Greedily decode up to ``max_new_tokens`` tokens, continuing from an existing KV cache.

    Args:
        model: Causal LM exposing ``model.model.embed_tokens`` and ``model.config.eos_token_id``;
            calling it must return an object with ``logits`` and ``past_key_values``.
        input_ids: 2-D tensor ``(batch, seq)`` of token ids to append after the cached context.
        past_key_values: Cache object passed to the first forward call.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        Tensor ``(batch, n_generated)`` containing only the newly generated ids
        (the EOS token, when one is produced, is included).

    Notes:
        ``eos_token_id`` may be ``None``, a single id, or a collection of ids.
        With batch size > 1, decoding stops once every row has produced EOS;
        rows that finish early are padded with the first EOS id.
    """
    device = model.model.embed_tokens.weight.device
    origin_len = input_ids.shape[-1]
    input_ids = input_ids.to(device)
    output_ids = input_ids.clone()
    next_token = input_ids

    # Normalise the configured EOS id(s) to a tensor so the check works for int and list configs.
    eos = model.config.eos_token_id
    if eos is None:
        eos_list = []
    elif isinstance(eos, (list, tuple, set)):
        eos_list = list(eos)
    else:
        eos_list = [eos]
    eos_ids = torch.tensor(eos_list, dtype=torch.long, device=device) if eos_list else None

    # Per-row flag: True once that row has emitted an EOS token.
    finished = torch.zeros(input_ids.shape[0], dtype=torch.bool, device=device)

    with torch.no_grad():
        for _ in range(max_new_tokens):
            out = model(
                input_ids=next_token,
                past_key_values=past_key_values,
                use_cache=True
            )
            logits = out.logits[:, -1, :]
            # Move the token to the embedding device *before* concatenating: the logits may
            # live on a different device than the inputs when the model is sharded.
            token = torch.argmax(logits, dim=-1, keepdim=True).to(device)

            if eos_ids is not None:
                # Rows that already ended keep emitting EOS instead of arbitrary tokens.
                token = token.masked_fill(finished.unsqueeze(-1), eos_list[0])

            output_ids = torch.cat([output_ids, token], dim=-1)
            past_key_values = out.past_key_values
            next_token = token

            if eos_ids is not None:
                # (batch, 1) == (n_eos,) broadcasts to (batch, n_eos).
                finished |= (token == eos_ids).any(dim=-1)
                if bool(finished.all()):
                    break

    # Return just the newly generated part
    return output_ids[:, origin_len:]

### Dynamic Cache Setup

In [2]:
# Allow-list DynamicCache (and the built-in set type) for torch's restricted unpickler,
# so a saved KV cache can be passed through torch.load with weights_only=True.
torch.serialization.add_safe_globals([DynamicCache, set])

def get_kv_cache(model, tokenizer, prompt: str) -> DynamicCache:
    """Run ``prompt`` through ``model`` once and return the cache object used for that pass.

    The prompt is tokenized, moved to the device holding the model's input
    embeddings, and fed to the model together with a fresh ``DynamicCache``
    passed as ``past_key_values`` (with ``use_cache=True``). The model output
    itself is discarded; only the cache is returned.
    """
    target_device = model.model.embed_tokens.weight.device

    # Tokenize the prompt and place the ids next to the embedding weights.
    encoded = tokenizer(prompt, return_tensors="pt")
    prompt_ids = encoded.input_ids.to(target_device)

    kv_cache = DynamicCache()
    with torch.no_grad():
        # Forward pass is run only for its effect on kv_cache.
        model(input_ids=prompt_ids, past_key_values=kv_cache, use_cache=True)

    return kv_cache

# Remove tokens appended by user queries so the cache again holds only the original knowledge prompt
def clean_up(cache: "DynamicCache", origin_len: int):
    """Truncate ``cache`` in place to its first ``origin_len`` sequence positions.

    Args:
        cache: KV cache to truncate. If it exposes a ``crop`` method (the public
            truncation API on ``DynamicCache``), that method is used; otherwise the
            per-layer tensors in ``key_cache`` / ``value_cache`` are sliced directly.
        origin_len: Number of leading positions (along dim 2 of each 4-D tensor) to keep.

    Returns:
        None. The cache is modified in place.
    """
    # Prefer the public API: the key_cache/value_cache attributes are not available
    # on every transformers release.
    if hasattr(cache, "crop"):
        cache.crop(origin_len)
        return

    # Fallback for cache objects that only expose the per-layer tensor lists.
    for i in range(len(cache.key_cache)):
        cache.key_cache[i] = cache.key_cache[i][:, :, :origin_len, :]
        cache.value_cache[i] = cache.value_cache[i][:, :, :origin_len, :]

### Load LLM Model & Tokenizer

In [None]:
def load_quantized_model_and_tokenizer(
    model_name = "mistralai/Mistral-7B-Instruct-v0.1",
    hf_token = "enter your hugging face access token"  # Hugging Face token
    ):
    """Load a causal LM in 4-bit (NF4) precision together with its tokenizer.

    Args:
        model_name: Hugging Face Hub model id (or local path) to load.
        hf_token: Hugging Face access token. The default value is only a
            placeholder; when it (or a blank string) is received, ``None`` is
            passed on so the Hub client falls back to its own credential lookup
            (e.g. the ``HF_TOKEN`` environment variable or a cached login)
            instead of sending an invalid token.

    Returns:
        ``(tokenizer, model)`` -- note the order: tokenizer first.
    """
    # Never forward the placeholder text as a credential.
    placeholder = "enter your hugging face access token"
    if isinstance(hf_token, str) and hf_token.strip() in ("", placeholder):
        hf_token = None

    # Configure quantization for 4-bit loading
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Enable 4-bit quantization
        bnb_4bit_compute_dtype=torch.float16,  # Set computation precision
        bnb_4bit_quant_type="nf4",  # Use Normal Float 4 (NF4) quantization
        bnb_4bit_use_double_quant=True,  # Enable double quantization
    )
    # Load the pre-trained model with quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",  # Automatically allocate model to devices
        quantization_config=quantization_config,
        token=hf_token,
    )

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        token=hf_token,
    )
    return tokenizer, model

# The loader returns the tokenizer first and the model second.
(tokenizer, model) = load_quantized_model_and_tokenizer()

### Create a Knowledge Base from input file and prepare KV cache

In [8]:
def prepare_system_prompt(file_path, model, tokenizer):
    """Read a knowledge file, wrap it in the system prompt, and pre-compute its KV cache.

    Args:
        file_path: Path to a UTF-8 text file holding the context/knowledge.
        model: Model forwarded to ``get_kv_cache``.
        tokenizer: Tokenizer forwarded to ``get_kv_cache``.

    Returns:
        ``(kv_cache, origin_len)`` where ``origin_len`` is the sequence length
        reported by the cache right after the prompt was processed.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        Exception: Any other error is printed and re-raised unchanged.
    """
    try:
        # Ensure the file exists
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}. Please create a file with the necessary context.")

        # Read content from the file
        with open(file_path, "r", encoding="utf-8") as f:
            input_text = f.read().strip()

        # Create the system prompt
        # NOTE(review): the template lines keep their source indentation, so the prompt
        # contains leading spaces on every line after the first -- confirm this is intended.
        system_prompt = f"""
        <|system|>
        You are an assistant who provides concise factual answers.
        <|user|>
        Context:
        {input_text}
        Question:
        """.strip()

        # Build and return KV cache
        kv_cache = get_kv_cache(model, tokenizer, system_prompt)
        # Ask the cache for its length through the public API rather than reaching
        # into its per-layer tensor lists.
        origin_len = kv_cache.get_seq_length()
        print(f"KV cache built. Original length: {origin_len}")
        return kv_cache, origin_len

    except FileNotFoundError as e:
        print(e)
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        raise


# Specify file path and prepare KV cache
# The path is relative to the notebook's working directory. The returned cache and its
# length are reused by the query cells below (clean_up truncates the cache back to
# origin_len before each question).
file_path = "input doc.txt"
kV_cache,origin_len = prepare_system_prompt(file_path, model, tokenizer)

KV cache built. Original length: 6904


### Ask Questions Reusing the Cache

In [19]:
# 1st query
question1 = "what is capital of Tamil Nadu?"
# Truncate the cache back to the knowledge prompt so earlier questions do not leak in.
clean_up(kV_cache, origin_len)
# The question continues the cached prompt, so it is tokenized without special tokens;
# with the tokenizer defaults a BOS token (for tokenizers that add one) would be
# inserted in the middle of the sequence.
input_ids_q1 = tokenizer(question1 + "\n", return_tensors="pt", add_special_tokens=False).input_ids.to(device)
gen_ids_q1 = generate(model, input_ids_q1, kV_cache)
answer1 = tokenizer.decode(gen_ids_q1[0], skip_special_tokens=True)
print("Q1:", question1)
print(answer1)

Q1: what is capital of Tamil Nadu?
       Answer: Chennai


In [13]:
# 2nd query
question2 = "Tell me about its History"
# Truncate the cache back to the knowledge prompt so earlier questions do not leak in.
clean_up(kV_cache, origin_len)
# The question continues the cached prompt, so it is tokenized without special tokens;
# with the tokenizer defaults a BOS token (for tokenizers that add one) would be
# inserted in the middle of the sequence.
input_ids_q2 = tokenizer(question2 + "\n", return_tensors="pt", add_special_tokens=False).input_ids.to(device)
gen_ids_q2 = generate(model, input_ids_q2, kV_cache)
answer2 = tokenizer.decode(gen_ids_q2[0], skip_special_tokens=True)
print("Q2:", question2)
print(answer2)

Q2: Tell me about its History
       Answer: Tamil Nadu has a rich history dating back to the Indus Valley Civilization. Archaeological evidence suggests that the region was inhabited by the Dravidians, who are believed to have migrated from Africa to India.


In [22]:
# 3rd query
question3 = "Tell me about its Dravidians"
# Truncate the cache back to the knowledge prompt so earlier questions do not leak in.
clean_up(kV_cache, origin_len)
# The question continues the cached prompt, so it is tokenized without special tokens;
# with the tokenizer defaults a BOS token (for tokenizers that add one) would be
# inserted in the middle of the sequence.
input_ids_q3 = tokenizer(question3 + "\n", return_tensors="pt", add_special_tokens=False).input_ids.to(device)
gen_ids_q3 = generate(model, input_ids_q3, kV_cache)
answer3 = tokenizer.decode(gen_ids_q3[0], skip_special_tokens=True)
print("Q3:", question3)
print(answer3)

Q3: Tell me about its Dravidians
Answer: Dravidians are an ethnic group of people who are believed to have originated in the Dravidian Peninsula, which is now southern India. They are known for their distinctive language, culture, and customs, which are different from
