In [23]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch._dynamo
torch._dynamo.config.suppress_errors = True

# Non-cached version

In [None]:
# Relative path to the local checkpoint directory used by the non-cached baseline.
model_id = "../model/bitnet-b1.58-2B-4T"
# model_id = "B:\\Work\\Code\\f\\4\\testing\\llm\\models\\unsloth-Llama-3.2-1B-Instruct"

# Load tokenizer and model
# No device placement is requested here. The recorded output of this cell warns
# that the BitNet model was loaded on CPU although a CUDA device is available.
# NOTE(review): the CAG cell further down loads the same checkpoint with
# torch.float16 instead of torch.bfloat16 -- confirm which dtype is intended.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
)

You have loaded a BitNet model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model.


In [25]:
%%time
# Baseline run without a precomputed KV cache: the context paragraph is sent in
# the system message and processed as part of the prompt on this call.
# Apply the chat template
messages = [
    {"role": "system", "content": '''
     Answer the question based on context
     The rain in Veridia always tasted faintly of iron. This wasn't due to the mineral content of the soil \u2013 though it was unusually high \u2013 but to the constant, low-level hum emanating from the Chronarium, a structure dominating the western cliffs overlooking the city. It\u2019s a place I\u2019ve spent my entire life studying, and one that remains stubbornly resistant to complete understanding, even after decades of research. "
     '''},
    {"role": "user", "content": "What caused the rain's taste?"},
]
# Render the messages to a prompt string first (tokenize=False); the string is
# tokenized separately on the next line and moved to the model's device.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
chat_input = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate response
# max_new_tokens=50 caps the reply length; the recorded answer below is cut off
# mid-sentence.
chat_outputs = model.generate(**chat_input, max_new_tokens=50)
# The generated sequence starts with the prompt tokens, so slice them off using
# the prompt length before decoding.
response = tokenizer.decode(chat_outputs[0][chat_input['input_ids'].shape[-1]:], skip_special_tokens=True) # Decode only the response part
print("\nAssistant Response:", response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Assistant Response: The rain in Veridia tastes faintly of iron due to the constant, low-level hum emanating from the Chronarium, a structure dominating the western cliffs overlooking the city. This hum is not due to the mineral content of the soil, although it
CPU times: total: 3min 15s
Wall time: 33.9 s


# CAG (cache-augmented generation)

In [19]:
from time import time
from transformers.cache_utils import DynamicCache
import os
import logging 
from typing import Union
from util import get_training_data
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [2]:
# def preprocess_knowledge(
#     model,
#     tokenizer,
#     prompt: str,
# ) -> DynamicCache:
#     """
#     Prepare knowledge kv cache for CAG.
#     Args:
#         model: HuggingFace model with automatic device mapping
#         tokenizer: HuggingFace tokenizer
#         prompt: The knowledge to preprocess, which is basically a prompt

#     Returns:
#         DynamicCache: KV Cache
#     """
#     embed_device = model.model.embed_tokens.weight.device
#     input_ids = tokenizer.encode(prompt, return_tensors="pt").to(embed_device)
#     past_key_values = DynamicCache()
#     with torch.no_grad():
#         outputs = model(
#             input_ids=input_ids,
#             past_key_values=past_key_values,
#             use_cache=True,
#             output_attentions=False,
#             output_hidden_states=False
#         )
#     return outputs.past_key_values


In [3]:
# Configure root logging once for the notebook: INFO level, with timestamp and
# level name in front of every message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Logger named after this module; the (commented-out) cache preparation code
# reports its timing through it.
logger = logging.getLogger(__name__)

In [4]:
def write_kv_cache(kv: DynamicCache, path: str):
    """
    Write the KV Cache to a file.

    Args:
        kv: The cache object to serialize with ``torch.save``.
        path: Destination file. Missing parent directories are created. A bare
            file name (no directory component) is written relative to the
            current working directory.
    """
    parent_dir = os.path.dirname(path)
    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    torch.save(kv, path)

In [None]:
# def prepare_kvcache(documents, filepath: str = "./cache/cache_knowledges.pt", answer_instruction: Union[str , None] = None):
#     # Prepare the knowledges kvcache

#     if answer_instruction is None:
#         answer_instruction = "Answer the question with a super short answer."
#     knowledges = f"""
#     <|begin_of_text|>
#     <|start_header_id|>system<|end_header_id|>
#     You are an assistant for giving short answers based on given context.<|eot_id|>
#     <|start_header_id|>user<|end_header_id|>
#     Context information is bellow.
#     ------------------------------------------------
#     {documents}
#     ------------------------------------------------
#     {answer_instruction}
#     Question:
#     """
#     # Get the knowledge cache
#     t1 = time()
#     model_name = "../model/bitnet-b1.58-2B-4T"
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModelForCausalLM.from_pretrained(
#             model_name,
#             torch_dtype=torch.float16,
#             device_map="cpu", # <==== TODO change this
            
#         )
#     kv = preprocess_knowledge(model, tokenizer, knowledges)
#     print("kvlen: ", kv.key_cache[0].shape[-2])
#     write_kv_cache(kv, filepath)
#     t2 = time()
#     logger.info(f"KV cache prepared in {t2 - t1:.2f} seconds.")
#     return kv, t2 - t1

In [6]:
def clean_up(kv: DynamicCache, origin_len: int):
    """
    Cut every layer of the KV cache back to its first ``origin_len`` positions.

    The cache is modified in place and nothing is returned. Each entry of
    ``kv.key_cache`` / ``kv.value_cache`` is indexed as a 4-D tensor and
    sliced along its third axis.

    Args:
        kv: Cache exposing per-layer ``key_cache`` and ``value_cache`` lists.
        origin_len: Number of leading positions to keep in every layer.
    """
    keep = slice(None, origin_len)
    layer_count = len(kv.key_cache)
    for layer in range(layer_count):
        layer_keys = kv.key_cache[layer]
        kv.key_cache[layer] = layer_keys[:, :, keep, :]
        layer_values = kv.value_cache[layer]
        kv.value_cache[layer] = layer_values[:, :, keep, :]

In [21]:
def generate(
    model,
    input_ids: torch.Tensor,
    past_key_values,
    max_new_tokens: int = 300,
    eos_token_id: Union[int, list, None] = None,
) -> torch.Tensor:
    """
    Generate text with greedy decoding, stopping early at an end-of-sequence token.

    Args:
        model: HuggingFace model with automatic device mapping
        input_ids: Input token ids, shaped (batch, sequence)
        past_key_values: KV Cache for knowledge. It is handed to the model on
            every step. NOTE(review): cache objects such as DynamicCache are
            presumably updated in place by the model, so the caller's cache
            grows during generation -- confirm before reusing it.
        max_new_tokens: Maximum new tokens to generate
        eos_token_id: Optional stop token id (or list of ids). When omitted, the
            ids are taken from ``model.generation_config`` or ``model.config``
            if either defines ``eos_token_id``. If none is found, decoding
            runs for exactly ``max_new_tokens`` steps.

    Returns:
        The newly generated token ids only (the prompt is stripped), shaped
        (batch, generated_length). The stop token, when produced, is included.
    """
    embed_device = model.model.embed_tokens.weight.device
    eos_ids = _resolve_eos_ids(model, eos_token_id, embed_device)

    input_ids = input_ids.to(embed_device)
    prompt_len = input_ids.shape[-1]

    output_ids = input_ids.clone()
    # The first step feeds the whole prompt; later steps feed only the token
    # chosen in the previous step.
    next_token = input_ids
    # Tracks, per batch row, whether a stop token has already been produced.
    finished = torch.zeros(input_ids.shape[0], dtype=torch.bool, device=embed_device)

    with torch.no_grad():
        for _ in range(max_new_tokens):
            outputs = model(
                input_ids=next_token,
                past_key_values=past_key_values,
                use_cache=True
            )
            # Greedy choice: highest-scoring token at the last position.
            next_token = outputs.logits[:, -1, :].argmax(dim=-1, keepdim=True)
            next_token = next_token.to(embed_device)

            past_key_values = outputs.past_key_values

            output_ids = torch.cat([output_ids, next_token], dim=1)

            if eos_ids is not None:
                finished |= torch.isin(next_token.squeeze(-1), eos_ids)
                # Stop once every sequence in the batch has emitted a stop token.
                if bool(finished.all()):
                    break
    return output_ids[:, prompt_len:]


def _resolve_eos_ids(model, eos_token_id, device):
    """Return the stop-token ids as a 1-D long tensor on ``device``, or None if unknown."""
    if eos_token_id is None:
        # Prefer the generation config, then fall back to the model config.
        for holder_name in ("generation_config", "config"):
            holder = getattr(model, holder_name, None)
            eos_token_id = getattr(holder, "eos_token_id", None)
            if eos_token_id is not None:
                break
    if eos_token_id is None:
        return None
    # Accepts a single int, a list of ints, or a tensor of ids.
    return torch.as_tensor(eos_token_id, dtype=torch.long, device=device).reshape(-1)

In [8]:
# answer_instruction = "Answer the asked question."
# text_list, dataset = get_training_data("../input/training_data_1.json")
# kvcache_path = "./data_cache/cache_knowledges.pt"
# knowledges = '\n\n\n\n\n\n'.join(text_list)
# knowledge_cache, prepare_time = prepare_kvcache(knowledges, filepath=kvcache_path, answer_instruction=answer_instruction)
# kv_len = knowledge_cache.key_cache[0].shape[-2]
# print(f"KVcache prepared in {prepare_time} seconds")
# with open("./output/result_2.txt", "a") as f:
#     f.write(f"KVcache prepared in {prepare_time} seconds\n")

# results = {
#     "cache_time": [],
#     "generate_time": [],
#     "similarity": [],
#     "prompts": [],
#     "responses": []
# }

In [13]:
def read_kv_cache(path: str) -> Union[DynamicCache , None]:
    """
    Read the KV Cache from a file.

    Args:
        path: Location of a cache previously written with ``torch.save``.

    Returns:
        The deserialized cache, or None if the file is missing, is not a
        regular file, is empty, or cannot be deserialized. The caller is
        expected to rebuild the cache when None is returned.
    """
    import pickle  # local import: only needed to name the unpickling error below

    if not os.path.isfile(path) or os.path.getsize(path) == 0:
        return None
    try:
        # SECURITY: weights_only=False unpickles arbitrary Python objects and can
        # execute code embedded in the file. Only load cache files that were
        # produced locally by this notebook.
        return torch.load(path, weights_only=False)
    except (pickle.UnpicklingError, EOFError, RuntimeError) as exc:
        # A truncated or corrupt file is treated like a missing cache, as the
        # docstring promises, but the reason is logged rather than swallowed.
        logging.getLogger(__name__).warning("Ignoring unreadable KV cache %s: %s", path, exc)
        return None

In [22]:
%%time
text_list, dataset = get_training_data("../input/training_data_1.json")
dataset = list(dataset)  # Convert the dataset to a list
model_name = "../model/bitnet-b1.58-2B-4T"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="cpu", # <==== TODO change this
            
        )


# max_questions = min(len(dataset), args.maxQuestion) if args.maxQuestion is not None else len(dataset)
# Retrieve the knowledge from the vector database
for id, (question, ground_truth) in enumerate(dataset[:]):
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

    # Read the knowledge cache from the cache file
    # cache_t1 = time()
    # if args.kvcache == "file":
    #     knowledge_cache = read_kv_cache(kvcache_path)

    # Not a good idea to use this method, as it will consume a lot of memory
    # if args.kvcache == "variable":
    #     knowledge_cache = documents_cache
    # cache_t2 = time()

    # Generate Response for the question
    knowledges = '\n\n\n'.join(text_list)

#     if args.usePrompt:
#         prompt = f"""
# <|begin_of_text|>
# <|start_header_id|>system<|end_header_id|>
# You are an assistant for giving short answers based on given context.<|eot_id|>
# <|start_header_id|>user<|end_header_id|>
# Context information is bellow.
# ------------------------------------------------
# {knowledges}
# ------------------------------------------------
# {answer_instruction}
# Question:
# {question}<|eot_id|>
# <|start_header_id|>assistant<|end_header_id|>
# """
#         generate_t1 = time()
#         input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
#         output = generate(model, input_ids, DynamicCache()) 
#         generated_text = tokenizer.decode(output[0], skip_special_tokens=True, temperature=None)
#         generate_t2 = time()
#     else:
    prompt = f"""
        {question}<|eot_id|>
        <|start_header_id|>assistant<|end_header_id|>
        """
    generate_t1 = time()
    knowledge_cache = read_kv_cache("../data_cache/cache_knowledges_1.pt")
    kv_len = knowledge_cache.key_cache[0].shape[-2]
    clean_up(knowledge_cache, kv_len)
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    output = generate(model, input_ids, knowledge_cache)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True, temperature=None)
    generate_t2 = time()

    # print("D: ", knowledges)
    print("Q: ", question)
    print("A: ", generated_text)
    print(time())

#     # Evaluate bert-score similarity
#     similarity = cagsim.bert(generated_text, ground_truth)

#     print(f"[{id}]: Semantic Similarity: {round(similarity, 5)},",
#             f"cache time: {cache_t2 - cache_t1},",
#             f"generate time: {generate_t2 - generate_t1}")
#     with open(args.output, "a") as f:
#         f.write(f"[{id}]: Semantic Similarity: {round(similarity, 5)},\t cache time: {cache_t2 - cache_t1},\t generate time: {generate_t2 - generate_t1}\n")

#     results["prompts"].append(question)
#     results["responses"].append(generated_text)
#     results["cache_time"].append(cache_t2 - cache_t1)
#     results["generate_time"].append(generate_t2 - generate_t1)
#     results["similarity"].append(similarity)

#     with open(args.output, "a") as f:
#         f.write(f"[{id}]: [Cumulative]: "
#                 + f"Semantic Similarity: {round(sum(results['similarity']) / (len(results['similarity'])) , 5)},"
#                 + f"\t cache time: {sum(results['cache_time']) / (len(results['cache_time'])) },"
#                 + f"\t generate time: {sum(results['generate_time']) / (len(results['generate_time'])) }\n")

# avg_similarity = sum(results["similarity"]) / len(results["similarity"])
# avg_cache_time = sum(results["cache_time"]) / len(results["cache_time"])
# avg_generate_time = sum(results["generate_time"]) / len(results["generate_time"])
# print()
# print(f"Prepare time: {prepare_time}")
# print(f"Average Semantic Similarity: {avg_similarity}")
# print(f"cache time: {avg_cache_time},\t generate time: {avg_generate_time}")
# print()
# with open(args.output, "a") as f:
#     f.write("\n")
#     f.write(f"Result for {args.output}\n")
#     f.write(f"Prepare time: {prepare_time}\n")
#     f.write(f"Average Semantic Similarity: {avg_similarity}\n")
#     f.write(f"cache time: {avg_cache_time},\t generate time: {avg_generate_time}\n")

Q:  What caused the rain's taste?
A:  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1745052371.0781054
Q:  Where is the structure located?
A:  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1745052695.5691574
Q:  What resists understanding?
A:  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

KeyboardInterrupt: 

In [None]:
# model_id = "../output/bitnet-b1.58-2B-4T"
# # Load tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     torch_dtype=torch.bfloat16,
# )

In [None]:
# %%time
# # Apply the chat template
# messages = [
#     {"role": "system", "content": '''Answer the following questions'''},
#     {"role": "user", "content": "What caused the rain's taste?"},
# ]
# prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# chat_input = tokenizer(prompt, return_tensors="pt").to(model.device)

# # Generate response
# chat_outputs = model.generate(**chat_input, max_new_tokens=50)
# response = tokenizer.decode(chat_outputs[0][chat_input['input_ids'].shape[-1]:], skip_special_tokens=True) # Decode only the response part
# print("\nAssistant Response:", response)