In [1]:
import os
import re
import json
import pickle
from tempfile import mkdtemp
import shutil

import numpy as np

from huggingface_hub import login as hf_login
from sentence_transformers import SentenceTransformer
from langchain_mistralai import ChatMistralAI
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
import torch

from tqdm.notebook import tqdm




In [2]:
# Version tag of the artefact to load; it is interpolated into the versioned
# artefact folder name ('v<version>').
ARTEFACT_VERSION = '01'

In [3]:
# !pip install airllm==2.11.0

In [4]:
# Root directory for all artefacts; the ARTEFACT_ROOT_FOLDER environment
# variable overrides the '/artefact' default.
ARTEFACT_ROOT_FOLDER = os.getenv('ARTEFACT_ROOT_FOLDER', '/artefact')
# Versioned folder of the Eberron artefact: <root>/eberron/v<version>.
ARTEFACT_FOLDER = os.path.join(
    ARTEFACT_ROOT_FOLDER,
    'eberron',
    'v' + ARTEFACT_VERSION,
)

In [5]:
# Generated with ChatGPT 4.0 using the prompt: "Write a python class to temporarily set an environmental variable. I need to use the class with `with`. I want the variable to be unset upon exit if it was not set at entry, or set back to its original value."

class TempEnvVar:
    """Context manager that temporarily sets an environment variable.

    On exit the variable is set back to the value it had on entry, or removed
    again if it was not set on entry.

    Example:
        with TempEnvVar('TRANSFORMERS_OFFLINE', '1'):
            ...
    """

    def __init__(self, key, value):
        """
        Initialize the TempEnvVar object with the environment variable key and temporary value.

        Args:
            key (str): The environment variable name.
            value (str): The temporary value to set.
        """
        self.key = key
        self.value = value
        # Both fields are filled in by __enter__.
        self.original_value = None
        self.was_set = False

    def __enter__(self):
        """
        Set the environment variable when entering the with block.

        Returns:
            TempEnvVar: This instance, so `with TempEnvVar(...) as tmp:` binds
            a usable object instead of None.
        """
        # Record membership separately from the value so "unset" can be told
        # apart from any concrete value on exit.
        self.was_set = self.key in os.environ
        self.original_value = os.environ.get(self.key)
        os.environ[self.key] = self.value
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Restore or unset the environment variable when exiting the with block.

        Returns:
            bool: Always False, so an exception raised inside the block propagates.
        """
        if self.was_set:
            os.environ[self.key] = self.original_value
        else:
            # pop with a default: the block itself may already have removed the key.
            os.environ.pop(self.key, None)
        return False
            

# Load the Artefact

In [6]:
# Load the precomputed chunk embeddings shipped with the artefact.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# artefacts from a trusted source.
embeddings_path = os.path.join(ARTEFACT_FOLDER, 'embeddings.pkl')
with open(embeddings_path, 'rb') as embeddings_file:
    embeddings = pickle.load(embeddings_file)


In [7]:
# Load the metadata describing the models the artefact was built with.
# The encoding is explicit so the JSON is read the same way regardless of the
# platform's locale default.
with open(os.path.join(ARTEFACT_FOLDER, 'model_metadata.json'), 'r', encoding='utf-8') as f:
    model_metadata = json.load(f)

# The notebook instantiates the embedding model with SentenceTransformer, so
# fail early (with a readable message) if the artefact was built with something else.
embedding_model_repr = model_metadata['embedding_model']['str']
assert embedding_model_repr.startswith('SentenceTransformer'), (
    f"Unsupported embedding model in artefact metadata: {embedding_model_repr!r}"
)

In [8]:
# Load the per-chunk metadata; it is indexed by chunk position further down.
# The encoding is explicit so the JSON is read the same way regardless of the
# platform's locale default.
with open(os.path.join(ARTEFACT_FOLDER, 'chunk_metadata.json'), 'r', encoding='utf-8') as f:
    chunk_metadata = json.load(f)


In [9]:
# Read every markdown chunk into a list, positioned by the integer prefix of
# its file name (e.g. '12.md' -> chunks[12]).
chunks_dir = os.path.join(ARTEFACT_FOLDER, 'chunks')
file_names = sorted(name for name in os.listdir(chunks_dir) if name.endswith('.md'))
chunks = [None] * len(file_names)
for file_name in tqdm(file_names):
    file_path = os.path.join(chunks_dir, file_name)
    chunk_index = int(file_name.split('.')[0])
    # Explicit encoding: the locale default would make the text platform-dependent.
    with open(file_path, 'r', encoding='utf-8') as f:
        chunks[chunk_index] = f.read()

# Gaps or duplicate indices in the file names would leave None entries that only
# blow up later (e.g. at chunk.lower()); surface the problem here instead.
missing_indices = [index for index, chunk in enumerate(chunks) if chunk is None]
assert not missing_indices, f"No chunk file found for indices: {missing_indices[:10]}"

  0%|          | 0/7639 [00:00<?, ?it/s]

# Evaluate

In [10]:
# TODO: Experiment with BM25 and compare

## BM25


In [11]:
!cat /proc/meminfo | grep Mem


MemTotal:       32386472 kB
MemFree:         2863804 kB
MemAvailable:   30576364 kB


In [12]:
!nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv

memory.total [MiB], memory.used [MiB], memory.free [MiB]
15360 MiB, 1 MiB, 15095 MiB


In [13]:
# Build the BM25 index over lower-cased, whitespace-tokenised chunks.
# str.split() with no argument splits on any whitespace run; split(" ") would
# glue words together across the newlines in the markdown chunks and emit empty
# tokens for repeated spaces.
tokenized_chunks = [chunk.lower().split() for chunk in chunks]
bm25 = BM25Okapi(tokenized_chunks)


In [14]:
!cat /proc/meminfo | grep Mem


MemTotal:       32386472 kB
MemFree:         1694128 kB
MemAvailable:   29406908 kB


In [15]:
!nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv

memory.total [MiB], memory.used [MiB], memory.free [MiB]
15360 MiB, 1 MiB, 15095 MiB


In [16]:
# Show which embedding model name is recorded in the artefact metadata.
model_metadata['embedding_model']['name']

'Alibaba-NLP/gte-base-en-v1.5'

In [17]:
# Instantiate the embedding model pinned in the artefact metadata (name + revision).
# device="cpu" builds the model on the CPU from the start instead of creating it
# on the default device and moving it afterwards, which keeps the GPU free for
# the language model.
embedding_model = SentenceTransformer(model_metadata['embedding_model']['name'],
                                      trust_remote_code=True,
                                      revision=model_metadata['embedding_model']['revision'],
                                      device="cpu")

2025-01-10 02:10:17.560398: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-10 02:10:17.575265: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-10 02:10:17.594916: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-10 02:10:17.600868: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-10 02:10:17.615201: I tensorflow/core/platform/cpu_feature_guar

In [18]:
!cat /proc/meminfo | grep Mem


MemTotal:       32386472 kB
MemFree:          677232 kB
MemAvailable:   28565060 kB


In [19]:
!nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv

memory.total [MiB], memory.used [MiB], memory.free [MiB]
15360 MiB, 671 MiB, 14425 MiB


In [20]:
# query = "Who is Dash Donnigan?"
# query = "Who is Commander Iyanna?"
# query = "Tell me about Menthis Plateau."
# query = "Tell me about Eldeen Reaches."
# query = "Tell me about the rivers of Khorvaire."

# query = "Tell me about Xen'drik."
# query = "Tell me about fashion in Khorvaire."
# query = "Create a House Cannith item."
# Active query; the commented-out lines above are alternatives to try.
user_query = "Tell me about the languages of Eberron."
# Encode with normalize_embeddings=True; the dot product computed later is only a
# cosine similarity if the stored chunk embeddings are normalised too — TODO confirm.
query_embed = embedding_model.encode(user_query, normalize_embeddings=True)

In [21]:
!cat /proc/meminfo | grep Mem


MemTotal:       32386472 kB
MemFree:          674892 kB
MemAvailable:   28562720 kB


In [22]:
!nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv

memory.total [MiB], memory.used [MiB], memory.free [MiB]
15360 MiB, 671 MiB, 14425 MiB


In [23]:
# Number of chunks to retrieve.
k = 3

# Dot product of every chunk embedding with the query embedding, wrapped in a
# torch tensor so that topk() is available.
similarity_scores = np.dot(embeddings, query_embed.T)
similarities = torch.from_numpy(similarity_scores)

# Indices of the k highest-scoring chunks.
top_k_result = similarities.topk(k)
top_k_result.indices.tolist()

[2719, 4202, 4203]

In [24]:
# Print the metadata of each of the k best-matching chunks.
top_chunk_indices = similarities.topk(k).indices.tolist()
for chunk_index in top_chunk_indices:
    print(chunk_metadata[chunk_index])

{'filename': 'Eberron_ Rising From the Last War - Jeremy Crawford & James Wyatt & Keith Baker.pdf', 'edition': '5e', 'pdf/title': 'Eberron: Rising From the Last War', 'pdf/author': 'Jeremy Crawford & James Wyatt & Keith Baker', 'pages': [5, 6, 7]}
{'filename': '1598836-Languages_of_Eberron_2E.pdf', 'edition': '5e', 'pdf/title': 'Languages of Eberron 2E mk iii', 'pdf/author': '', 'pages': [2, 3, 4]}
{'filename': '1598836-Languages_of_Eberron_2E.pdf', 'edition': '5e', 'pdf/title': 'Languages of Eberron 2E mk iii', 'pdf/author': '', 'pages': [3, 4, 5]}


In [25]:
# Rank chunks with BM25 and show the metadata of the top k.
# Tokenise the query on any whitespace run so stray double spaces or newlines
# do not create empty or glued tokens.
query_tokens = user_query.lower().split()
bm25.get_top_n(query_tokens, chunk_metadata, n=k)

[{'filename': '1598836-Languages_of_Eberron_2E.pdf',
  'edition': '5e',
  'pdf/title': 'Languages of Eberron 2E mk iii',
  'pdf/author': '',
  'pages': [1, 2, 3]},
 {'filename': '1598836-Languages_of_Eberron_2E.pdf',
  'edition': '5e',
  'pdf/title': 'Languages of Eberron 2E mk iii',
  'pdf/author': '',
  'pages': [2, 3, 4]},
 {'filename': '1598836-Languages_of_Eberron_2E.pdf',
  'edition': '5e',
  'pdf/title': 'Languages of Eberron 2E mk iii',
  'pdf/author': '',
  'pages': [3, 4, 5]}]

# Answer Queries

In [26]:
# 4-bit NF4 quantisation with double quantisation for the language model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    # Match the float16 dtype the model is loaded with; without this the 4-bit
    # layers fall back to the library's default compute dtype.
    bnb_4bit_compute_dtype=torch.float16,
)


In [27]:
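# SECURITY: a Hugging Face access token was pasted here in plain text. It has been redacted and must be revoked. Supply credentials through the HF_TOKEN environment variable or the interactive hf_login() prompt instead.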
# hf_login()


In [28]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch


In [27]:
!cat /proc/meminfo | grep Mem


MemTotal:       32386472 kB
MemFree:          675552 kB
MemAvailable:   28563408 kB


In [28]:
!nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv

memory.total [MiB], memory.used [MiB], memory.free [MiB]
15360 MiB, 671 MiB, 14425 MiB


In [29]:
# from airllm import AutoModel

In [30]:
# Hub identifier of the instruct model. In this notebook it is only referenced
# by commented-out loading code; the active load uses a local snapshot path.
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

In [None]:
# model_path = "/jupyterlab/models/hf/hub/models--mistralai--Mistral-7B-Instruct-v0.3/snapshots/e0bc86c23ce5aae1db576c8cca6f06f1f73af2db"

In [None]:
# model = AutoModel.from_pretrained(model_path)

In [31]:
# Local snapshot directory of the model inside the Hugging Face hub cache.
model_path = "/jupyterlab/models/hf/hub/models--mistralai--Mistral-7B-Instruct-v0.3/snapshots/e0bc86c23ce5aae1db576c8cca6f06f1f73af2db"

# Load the quantised model in float16; device_map="auto" was the earlier
# alternative to "balanced_low_0".
model_load_kwargs = dict(
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map="balanced_low_0",
)
model = AutoModelForCausalLM.from_pretrained(model_path, **model_load_kwargs)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [32]:
!cat /proc/meminfo | grep Mem


MemTotal:       32386472 kB
MemFree:          361912 kB
MemAvailable:   27970812 kB


In [33]:
!nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv

memory.total [MiB], memory.used [MiB], memory.free [MiB]
15360 MiB, 4087 MiB, 11009 MiB


In [34]:
# with TempEnvVar('TRANSFORMERS_OFFLINE', '1'):
#     model = AutoModelForCausalLM.from_pretrained(model_id)

In [35]:
# Currently empty; it is interpolated at the top of the prompt template.
system_prompt = ""

In [36]:
# Context for the prompt: only the single best-matching chunk is used here
# (topk(1)), joined with blank lines in case more are requested later.
best_chunk_indices = similarities.topk(1).indices.tolist()
retrieved_docs = "\n\n\n".join(chunks[index] for index in best_chunk_indices)

In [37]:
# with TempEnvVar('TRANSFORMERS_OFFLINE', '1'):
#     model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)

In [38]:
# Assemble the prompt: system prompt, retrieved context, then the user's query.
prompt = f"""{system_prompt}

Here is the retrieved context:
{retrieved_docs}

Here is the users query:
{user_query}
"""

# Wrap the prompt in a plain "Q: ... A: " frame. The answer extraction further
# down splits the decoded text on 'A:  ', so the exact spacing here matters.
# NOTE(review): this is an instruct model; confirm whether the tokenizer's chat
# template should be used instead of this hand-written frame.
formatted_prompt = f"Q: {prompt} A: "

In [51]:
# input_tokens = model.tokenizer([prompt],
#     return_tensors="pt", 
#     return_attention_mask=False, 
#     truncation=True, 
#     max_length=128, 
#     padding=False)
# input_tokens

In [52]:
# generation_output = model.generate(
#     input_tokens['input_ids'].cuda(), 
#     max_new_tokens=20,
#     use_cache=True,
#     return_dict_in_generate=True)

In [None]:
# # model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, quantization_config=bnb_config, device_map="auto")
# with TempEnvVar('TRANSFORMERS_OFFLINE', '1'):
#     model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, quantization_config=bnb_config, device_map="balanced_low_0")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [33]:
# model.gradient_checkpointing_enable()

In [39]:
!cat /proc/meminfo | grep Mem


MemTotal:       32386472 kB
MemFree:          360924 kB
MemAvailable:   27969896 kB


In [40]:
!nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv

memory.total [MiB], memory.used [MiB], memory.free [MiB]
15360 MiB, 4087 MiB, 11009 MiB


In [41]:
# with TempEnvVar('TRANSFORMERS_OFFLINE', '1'):
#     model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
# model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)







In [44]:
# Check that every parameter of the language model sits on a CUDA device.
all(param.device.type == 'cuda' for param in model.parameters())

True

In [43]:
# Check that every parameter of the embedding model sits on the CPU.
all(param.device.type == 'cpu' for param in embedding_model.parameters())

True

In [45]:
!cat /proc/meminfo | grep Mem


MemTotal:       32386472 kB
MemFree:          359092 kB
MemAvailable:   27968448 kB


In [46]:
!nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv

memory.total [MiB], memory.used [MiB], memory.free [MiB]
15360 MiB, 4087 MiB, 11009 MiB


In [47]:
# Load the tokenizer from the same local snapshot directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [48]:
!cat /proc/meminfo | grep Mem


MemTotal:       32386472 kB
MemFree:          349772 kB
MemAvailable:   27959312 kB


In [49]:
!nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv

memory.total [MiB], memory.used [MiB], memory.free [MiB]
15360 MiB, 4087 MiB, 11009 MiB


In [73]:
# Tokenise the formatted prompt as a batch of one, request PyTorch tensors
# (return_tensors="pt") and move them to the GPU.
model_inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")
# Alternative without retrieved context:
# model_inputs = tokenizer([user_query], return_tensors="pt").to("cuda")


In [74]:
# Inspect the encoding of the first (and only) sequence in the batch.
model_inputs[0]


Encoding(num_tokens=4667, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [75]:
# Prompt length in characters, number of entries in the tokenizer output, and
# length of the first encoding.
len(formatted_prompt), len(model_inputs), len(model_inputs[0])

(16604, 2, 4667)

In [76]:
!nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv

memory.total [MiB], memory.used [MiB], memory.free [MiB]
15360 MiB, 5957 MiB, 9139 MiB


In [77]:
# generated_ids = model.generate(**model_inputs, max_new_tokens=50, do_sample=True)

In [78]:
!nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv

memory.total [MiB], memory.used [MiB], memory.free [MiB]
15360 MiB, 5957 MiB, 9139 MiB


In [91]:
# Earlier attempt without a token budget:
# generated_ids = model.generate(**model_inputs, do_sample=True)

# Generate at most 512 new tokens. pad_token_id is passed explicitly (the EOS
# token, which is what generate falls back to anyway) so the "Setting
# `pad_token_id` to `eos_token_id`" warning is not emitted on every call.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512,
    pad_token_id=tokenizer.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [92]:
# Number of sequences returned by generate.
len(generated_ids)

1

In [93]:
# Token count of the first returned sequence.
len(generated_ids[0])

5179

In [94]:
# Full decoded text (prompt followed by the generated continuation).
output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Extract the answer by slicing off the prompt tokens instead of splitting the
# decoded text on 'A:  ': the retrieved context is arbitrary document text and
# may itself contain that marker, and the split also depends on the exact
# spacing the model happens to produce after "A: ".
prompt_token_count = model_inputs["input_ids"].shape[1]
answer_text = tokenizer.decode(
    generated_ids[0][prompt_token_count:],
    skip_special_tokens=True,
).strip()
answer_text

"In Eberron, languages reflect culture and geography; a dwarf raised in Breland might not know Dwarvish, but a halfling raised in the Mror Holds might. The historical development of languages and cultures also explains the scripts used to write various languages. For example, the Orc language is written using the Goblin script (rather than Dwarvish, as stated in the Player's Handbook), because the orcs of Khorvaire learned writing from the goblins. Common is the language of the Five Nations and the language of trade in Khorvaire, known by most of its people. Goblin was the trade language of the goblin empire of Dhakaan and survives as the primary language in Darguun, Droaam, and the Shadow Marches regions. Goblin displaced the Orc language; the people of the Shadow Marches typically speak Goblin, and Orc is an exotic language. Members of all races in Xen'drik speak Giant and use it as their trade language. Abyssal is the common tongue of all fiends. Abyssal is sometimes called “Khyber’

In [70]:
!nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv

memory.total [MiB], memory.used [MiB], memory.free [MiB]
15360 MiB, 5957 MiB, 9139 MiB


In [71]:
# OutOfMemoryError: CUDA out of memory. Tried to allocate 56.35 GiB. GPU 0 has a total capacity of 14.74 GiB of which 5.05 GiB is free. Process 14461 has 9.68 GiB memory in use. Of the allocated memory 8.48 GiB is allocated by PyTorch, and 1.08 GiB is reserved by PyTorch but unallocated. 
# If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [72]:
!ps -a

    PID TTY          TIME CMD
    210 pts/0    00:00:00 ps


In [None]:
# OutOfMemoryError: CUDA out of memory. Tried to allocate 482.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 310.12 MiB is free. Process 31577 has 14.44 GiB memory in use. Of the allocated memory 13.09 GiB is allocated by PyTorch, and 1.22 GiB is reserved by PyTorch but unallocated. 
# If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Decode the whole batch (no skip_special_tokens argument is passed here) and
# show the first sequence.
tokenizer.batch_decode(generated_ids)[0]

In [None]:
# The previous call used a llama-cpp style signature (string prompt, max_tokens,
# stop, echo), which a Transformers causal LM does not accept. This is the
# equivalent with tokenizer + generate.
response_inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")
response_ids = model.generate(
    **response_inputs,
    max_new_tokens=800,
    pad_token_id=tokenizer.eos_token_id,
)

# Keep only the newly generated tokens, then cut at the first stop sequence to
# mirror stop=["Q:", "\n"].
response_prompt_length = response_inputs["input_ids"].shape[1]
completion = tokenizer.decode(
    response_ids[0][response_prompt_length:],
    skip_special_tokens=True,
)
for stop_sequence in ("Q:", "\n"):
    completion = completion.split(stop_sequence, 1)[0]

# echo=True: the response contains the prompt followed by the completion.
response = formatted_prompt + completion

In [None]:
# Display the response built in the previous cell.
response

# Create Characters

In [3]:
!df -ah | grep jupyter


/dev/nvme2n1     25G   15G   11G  59% /jupyterlab
