In [None]:
# FAILED with MS model

### Installing Important Modules

In [1]:
# !pip install -U bitsandbytes

### Importing Libraries

In [6]:
import torch
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
from transformers.cache_utils import DynamicCache
import os
device = "cuda" if torch.cuda.is_available() else "cpu"

In [7]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [8]:
model_name = "B:\\Work\\Code\\f\\4\\testing\\llm\\models\\unsloth-Llama-3.2-1B-Instruct"

### Generate Function

In [9]:
# Minimal generate function for token-by-token generation
def generate(model, input_ids: torch.Tensor, past_key_values, max_new_tokens: int = 50) -> torch.Tensor:
    device = model.model.embed_tokens.weight.device
    origin_len = input_ids.shape[-1]
    input_ids = input_ids.to(device)
    output_ids = input_ids.clone()
    next_token = input_ids

    with torch.no_grad():
        for _ in range(max_new_tokens):
            out = model(
                input_ids=next_token,
                past_key_values=past_key_values,
                use_cache=True
            )
            logits = out.logits[:, -1, :]
            token = torch.argmax(logits, dim=-1, keepdim=True)
            output_ids = torch.cat([output_ids, token], dim=-1)
            past_key_values = out.past_key_values
            next_token = token.to(device)

            if model.config.eos_token_id is not None and token.item() == model.config.eos_token_id:
                break

    # Return just the newly generated part
    return output_ids[:, origin_len:]

### Dynamic Cache Setup

In [10]:
# Initializing the DynamicCache mechanism for storing and reusing the model’s key/value states.
torch.serialization.add_safe_globals([DynamicCache])
torch.serialization.add_safe_globals([set])

def get_kv_cache(model, tokenizer, prompt: str) -> DynamicCache:
    # Encode prompt
    device = model.model.embed_tokens.weight.device
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    cache = DynamicCache()

    with torch.no_grad():
        _ = model(
            input_ids=input_ids,
            past_key_values=cache,
            use_cache=True
        )
    return cache

# Remove any extra tokens appended by user queries, appended to the original knowledge
def clean_up(cache: DynamicCache, origin_len: int):
    for i in range(len(cache.key_cache)):
        cache.key_cache[i] = cache.key_cache[i][:, :, :origin_len, :]
        cache.value_cache[i] = cache.value_cache[i][:, :, :origin_len, :]

### Load LLM Model & Tokenizer

In [11]:
def load_model_and_tokenizer():
    # model_name = "../model/bitnet-b1.58-2B-4T"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map="cuda", # <==== TODO change this
            )

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
    )
    return tokenizer, model

tokenizer,model = load_model_and_tokenizer()

### Create a Knowledge Base from input file and prepare KV cache

In [12]:
def prepare_system_prompt(file_path, model, tokenizer):
    try:
        # Ensure the file exists
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}. Please create a file with the necessary context.")

        # Read content from the file
        with open(file_path, "r", encoding="utf-8") as f:
            input_text = f.read().strip()

        # Create the system prompt
        system_prompt = f"""
        <|system|>
        You are an assistant who provides concise factual answers.
        <|user|>
        Context:
        {input_text}
        """.strip()

        # Build and return KV cache
        kv_cache = get_kv_cache(model, tokenizer, system_prompt)
        origin_len = kv_cache.key_cache[0].shape[-2]
        print(f"KV cache built. Original length: {origin_len}")
        return kv_cache,origin_len

    except FileNotFoundError as e:
        print(e)
        raise
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        raise


# Specify file path and prepare KV cache
file_path = "../input/text_1.txt"
kV_cache,origin_len = prepare_system_prompt(file_path, model, tokenizer)

An unexpected error occurred: CUDA out of memory. Tried to allocate 1.07 GiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 2.47 GiB is allocated by PyTorch, and 92.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.07 GiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 2.47 GiB is allocated by PyTorch, and 92.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### Ask Questions Reusing the Cache

In [None]:
%%time
# 1st query
question1 = "What caused the rain's taste?"
clean_up(kV_cache, origin_len)
input_ids_q1 = tokenizer(question1 + "\n", return_tensors="pt").input_ids.to(device)
gen_ids_q1 = generate(model, input_ids_q1, kV_cache)
answer1 = tokenizer.decode(gen_ids_q1[0], skip_special_tokens=True)
print("Q1:", question1)
print(answer1)

Q1: What caused the rain's taste?
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
CPU times: total: 5min 25s
Wall time: 55.7 s


In [None]:
%%time
# 2nd query
question2 = "What is Aerilon?"
clean_up(kV_cache, origin_len)
input_ids_q2 = tokenizer(question2 + "\n", return_tensors="pt").input_ids.to(device)
gen_ids_q2 = generate(model, input_ids_q2, kV_cache)
answer2 = tokenizer.decode(gen_ids_q2[0], skip_special_tokens=True)
print("Q2:", question2)
print(answer2)

Q2: What is Aerilon?
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
CPU times: total: 5min 19s
Wall time: 54.5 s


In [None]:
%%time
# 3rd query
question3 = "What resists understanding?"
clean_up(kV_cache, origin_len)
input_ids_q3 = tokenizer(question3 + "\n", return_tensors="pt").input_ids.to(device)
gen_ids_q3 = generate(model, input_ids_q3, kV_cache)
answer3 = tokenizer.decode(gen_ids_q3[0], skip_special_tokens=True)
print("Q3:", question3)
print(answer3)

Q3: What resists understanding?
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
CPU times: total: 5min 23s
Wall time: 55.3 s
