## Load "Vector DB" and run RAG query on pre-trained instruct LLM
Simply appends retrieved chunks to query. No fancy pipeline steps or dedicated Vector DB.

In [1]:
import gc
import os
import pickle
import sys
from threading import Thread
import time
from unsloth import FastLanguageModel

import faiss
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
    TextStreamer,
    set_seed,
)

set_seed(1234)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


### Load chunk contents and embedding index
* TIL that WSL2 only has 50% of the system RAM by default. I had to increase it via `.wslconfig` so we can load both the search index and the raw chunks

In [2]:
# TODO: Database for chunks
index = faiss.read_index("wikipedia-en.index")
gc.collect()

60

In [3]:
chunks = []    
embeddings_path = '/home/stefanwebb/embeddings/wikimedia/wikipedia/20231101.en'
files = [f"train-{idx:05d}-of-00041.parquet" for idx in range(41)]

print("Reading in Chunks")
for idx in range(len(files)):
    print(f"x", end="")
    chunks_file = os.path.join(embeddings_path, f'chunks-{idx:05d}-of-00041.pkl')
    with open(chunks_file, 'rb') as f:
        chunks.extend(pickle.load(f))
    gc.collect()

Reading in Chunks
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

In [4]:
index.ntotal, len(chunks)

(49522046, 49522046)

In [5]:
# NOTE: Following code doesn't seem to get size of all objects references by these
# sys.getsizeof(index) / 10**9, sys.getsizeof(chunks) / 10**9

(4.8e-08, 0.430605624)

### Load LLM to run queries

In [6]:
# query_model = "google/gemma-2b-it"
query_model = "/home/stefanwebb/models/llms/Mistral-7B-Instruct-v0.3"

max_seq_length = 2048
dtype = None
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/home/stefanwebb/models/llms/mistral-7b-instruct-v0.3-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.8: Fast Mistral patching. Transformers = 4.43.4.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.988 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0.dev20240829+cu124. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28+d444815.d20240829. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth




### Construct RAG Query and Run

In [7]:
FastLanguageModel.for_inference(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (

In [8]:
def run_query_streamed(query, query_model):
    system_prompt = "You are a helpful assistant who answers question truthfully to the best of your knowledge. You decline to answer if you do not know the answer."

    chat = [
        {
            "role": "system",
            "content": system_prompt,
        },
        
        {

            "role": "user",
            "content": f"{query}",
        },
    ]

    formatted_prompt = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True, return_tensors="pt"
    )
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda")

    streamer = TextStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=512)

In [9]:
query_encoder = SentenceTransformer("/home/stefanwebb/models/llms/multi-qa-MiniLM-L6-cos-v1")

In [10]:
def top_k_chunks(query: str, k=1) -> str:
    """
    Find closest chunk for a given query.
    """
    embeddings = query_encoder.encode([query])
    D, I = index.search(embeddings, k)
    return D, I


def run_rag_query_streamed(query, query_model, k=3):
    # Retrieve most similar chunks
    D, I = top_k_chunks(query, k=k)
    # formatted_chunks = '\n\n'.join(["Document: " + chunks[i] for i in I[0]])
    formatted_chunks = ' '.join([chunks[i] for i in I[0]])
    
    # rag_query = f"Answer the query below and ground your answer in facts contained in the documents below:\n\nQuery: {query}\n\n{formatted_chunks}"

    rag_query = f"{formatted_chunks}\n\nAnswer the following question: {query}"

    # DEBUG
    # print(rag_query)
    # print("\n")

    run_query_streamed(rag_query, query_model)

    # for i, d in zip(I[0], D[0]):
    #     print(d, chunks[i])
    #     print("")

### Debug Examples
Compare answers to questions, with and without context from most similar document chunks.

In [11]:
query = "Why did Abraham Lincoln grow a beard?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model)
print()

Mistral-7B-Instruct-v0.3
Abraham Lincoln grew a beard primarily for practical reasons. In the mid-19th century, beards were common among men, but Lincoln did not have a beard during his first term as a U.S. Representative. After losing the Senate race in 1858, he grew a beard as a way to differentiate himself from his opponents in his presidential campaign in 1860. The beard also helped to hide a condition he had called "tuberculosis laryngitis," which caused his voice to be weak and hoarse. Additionally, the beard may have helped to make him appear more mature and serious, which could have been beneficial in a time of national crisis.

Mistral-7B-Instruct-v0.3 + RAG
Abraham Lincoln grew a beard in response to a letter he received from a young girl named Grace Bedell during the 1860 presidential campaign. She suggested that he grow a beard to improve his appearance and make him look more presidential. Lincoln responded to the letter, but made no promises. However, within a month, he gr

In [12]:
query = "How are glacier caves formed?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model)
print()

Mistral-7B-Instruct-v0.3
Glacier caves, also known as moulins or ice caves, are formed in glaciers due to the melting of ice. Here's a simplified explanation of the process:

1. Meltwater: As snow accumulates on a glacier, it eventually turns into ice. Over time, this ice melts due to various factors like geothermal heat, precipitation, and air temperature. This meltwater collects in crevasses and other low spots within the glacier.

2. Enlargement of crevasses: The meltwater continues to flow, enlarging the crevasses. As the crevasse deepens, it may eventually connect with the surface of the glacier, forming a vertical shaft known as a moulin.

3. Drainage: The meltwater drains into the underlying rock or bedrock, carving out a tunnel or cave-like structure. This process continues as long as the conditions (temperature, precipitation, etc.) allow for melting and drainage.

4. Collapse: Over time, the roof of the cave may become too thin to support itself, causing it to collapse. This 

In [19]:
query = "What is a beer can pyramid?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model)
print()

Mistral-7B-Instruct-v0.3
A beer can pyramid, also known as a can pyramid or a can sculpture, is a structure made by stacking beer cans in a specific pattern to create a pyramid shape. This is often done as a fun and creative activity, and the stability of the structure depends on the number of cans used and the arrangement of the cans in the pyramid. The cans are usually empty, but some people use full cans to create a more stable structure. It's important to note that this activity should be done responsibly and safely, and the cans should be disposed of properly after the pyramid is complete.

Mistral-7B-Instruct-v0.3 + RAG
A beer can pyramid, often referred to as a beeramid, is a pyramid structure made from empty beer cans. It is built as more empty cans become available over the course of a night, a week, or a month. These structures are usually temporary, and are either cleaned up or knocked over eventually.



In [18]:
query = "What is a beer can pyramid?"
D, I = top_k_chunks(query, k=10)
formatted_chunks = ' '.join([chunks[i] for i in I[0]])
print(formatted_chunks)

[CLS] a beer can pyramid, often called a beeramid as a portmanteau, is a pyramid made from discarded beer cans. beer can pyramids are built as empty beer cans became available, slowly growing as the night ( or week or month ) wears on. in most cases, though, they are temporary structures, eventually being cleaned up or accidentally knocked over. [SEP] [CLS] beeramid may refer to : beer can pyramid, a pyramid made of empty cans of beer beeramid ( comic ), a comic in the daily cardinal, a student newspaper for uw - madison [SEP] [CLS] from their introduction in the 1930s up until the 1960s, most beer cans were made of steel and had a flat top into which one needed to punch one or two holes with a can piercer, euphemistically called a " churchkey ". with the advent of the " pop - top " aluminum can, this type of beverage can has disappeared almost entirely. the reception for churchkey's retro style can has not been entirely positive, with some beer enthusiasts dubbing it " the most hipste

In [20]:
query = "What is the current world record for a beer can pyramid?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model)
print()

Mistral-7B-Instruct-v0.3
As of my last update, the world record for the largest beer can pyramid was set by the University of Wisconsin-La Crosse Eagle's Nest Co-ed Fraternity and Sorority Life on March 23, 2019. They stacked 10,000 cans to create a pyramid that was 11.5 meters tall and 7.6 meters wide. This record was verified by Guinness World Records. However, records can change over time, so it's always a good idea to check the latest records from a reliable source.

Mistral-7B-Instruct-v0.3 + RAG
I don't have real-time data or the ability to check the current world record for a beer can pyramid. However, as of the information provided, the beer can pyramid built by the Melbourne University Student Union in 2005 contained 10,660 cans and was over 5 meters high. It was submitted for a place in the Guinness Book of Records, but I don't have the information to confirm if it actually set a record. For the most accurate and up-to-date information, I would recommend checking the Guinness

In [24]:
query = "What is the current world record for a can pyramid?"
D, I = top_k_chunks(query, k=10)
formatted_chunks = ' '.join([chunks[i] for i in I[0]])
print(formatted_chunks)

[CLS] as of july 2010, layar had 1, 000 layers. as of september 2011, layar had 2, 993 layers. [SEP] [CLS] size ( 300, 300 ) ; [SEP] [CLS] top 10 in order of height [SEP] [CLS] trivia the " world's longest churro " was produced during the 2000 festival. it was 77 m long and weighed 30 kg. [SEP] [CLS] in mathematics 204 is a refactorable number. 204 is a square pyramidal number : 204 balls may be stacked in a pyramid whose base is an 8 × 8 square. its square, 2042 = 41616, is the fourth square triangular number. as a figurate number, 204 is also a nonagonal number and a truncated triangular pyramid number. 204 is a member of the mian - chowla sequence. [SEP] [CLS] description annual, c. 100 – 1000 cm tall. [SEP] [CLS] here, the valid instances are those graphs whose maximum independent set size is either at most 5 or at least 10. [SEP] [CLS] maximum dimensions are shown in metres, after 10 – 20 years. [SEP] [CLS] see also ramesseum magician's box list of largest monoliths in the world [

In [30]:
query = "Which Simpsons episode featured a beer can pyramid outside of Duff Gardens?"
D, I = top_k_chunks(query, k=3)
formatted_chunks = ' '.join([chunks[i] for i in I[0]])
print(formatted_chunks)

[CLS] it was featured in season 13 the simpsons episode " weekend at burnsie's " where homer simpson ( after he smokes medicinal marijuana ) gets ready for work and pictures his world as a psychedelic wonderland. [SEP] [CLS] couch gag this episode's couch gag features an ident commissioned by the uk network channel 4, in 2006 ( though first aired in july 2007 ), for use before its broadcasts of simpsons episodes. the writers added in all - new sections in order to theme the gag around the super bowl which was broadcast near the airing of this episode. in the original ident, homer places his six - pack of duff beer on a hammock in the simpson's backyard. when he sits down, he inadvertently fires the six - pack into the sky, only for them to get caught on the power lines hanging over the garden. when homer manages to get up to the beer cans, climbing up the family's garden tree, he is shocked by the electricity now running through the six - pack. he is continuously shocked as the camera 

In [35]:
query = "Who is Benjamin Geza Affleck?"
D, I = top_k_chunks(query, k=10)
formatted_chunks = ' '.join([chunks[i] for i in I[0]])
print(formatted_chunks)

[CLS] z billy zaharopoulos [SEP] [CLS] robert myron zarem ( september 30, 1936 – september 26, 2021 ), known as bobby zarem, was an american publicist. after starting his own publicity agency in 1974, zarem created lengthy, personalized pitch letters, a business style, and many campaigns. his former clients included dustin hoffman, cher, arnold schwarzenegger, michael jackson, diana ross, michael douglas, michael caine, sophia loren, ann - margret, and alan alda, among others. [SEP] [CLS] z mike zaher morgan zeba [SEP] [CLS] z joe zawinul lev zhurbin torrie zito bob zurke [SEP] [CLS] mohed altrad ( ) is a french - syrian billionaire businessman, rugby chairman and writer, born c. march 1948. he was born to a very young mother and his bedouin father gave him away to his grandparents at age four following his mother's death. in 2015, altrad was named ernst & young world entrepreneur of the year. [SEP] [CLS] luciano belviso ( born august 18, 1983 ) is an italian entrepreneur, manager and 

In [36]:
query = "Who is Benjamin Geza Affleck?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model)
print()

Mistral-7B-Instruct-v0.3
Benjamin Geza Affleck is a well-known American actor, filmmaker, and screenwriter. He was born on August 15, 1972. Affleck gained fame for his roles in films such as "Good Will Hunting," for which he won an Academy Award for Best Original Screenplay along with Matt Damon, and "Argo," for which he won the Academy Award for Best Picture as a producer. He is also known for his roles in films like "Dazed and Confused," "Chasing Amy," "Gone Baby Gone," and "The Town." In addition to acting, Affleck has directed films such as "Gone Baby Gone," "The Town," and "Argo." He is also a political activist and has been involved in various philanthropic efforts.

Mistral-7B-Instruct-v0.3 + RAG
Benjamin Geza Affleck is an American actor, film director, screenwriter, and producer. He is known for his work in films such as "Good Will Hunting," "Argo," and "Gone Baby Gone." He has won multiple awards, including two Academy Awards, for his acting and directing work. He is also kno

### Investigating whether failure to retrieve relevant chunks is due to search index or document/query encoder

In [37]:
query = "What is the current world record for a beer can pyramid?"
D, I = top_k_chunks(query, k=3)
formatted_chunks = ' '.join([chunks[i] for i in I[0]])
print(formatted_chunks)


[CLS] another attempt to break the world record beer can pyramid was made with beer cans over 5 metres high and contained 10, 660 cans. it was built by the melbourne university student union in 2005, and was featured on blokesworld and in mx. [SEP] [CLS] it has five hand pumps serving real ale and a beer garden, and was submitted for a place in the guinness book of records. [SEP] [CLS] to date the brewery has created over 300 different beers. [SEP]


In [38]:
I[0]

array([44419780, 15182634, 32287273])

In [39]:
chunks[44419780 - 1]

'[CLS] on 23 september 2000, the malaysian can team, consisting of 12 college students from the inti college subang jaya, malaysia built a free standing can pyramid created from 9, 455 empty aluminium drink cans in 24 minutes at the mid valley megamall in kuala lumpur, malaysia. it had a square base of cans, measuring. this feat made a successful entry into the guinness world record and to - date this record has yet to be broken. [SEP]'

In [42]:
import numpy as np

In [41]:
# Comparing similarity of query to retrieved chunks and the non-retrieved chunk that contains the necessary fact
emb_query = query_encoder.encode([query])
emb_chunk0 = query_encoder.encode([chunks[44419780]])
emb_chunk1 = query_encoder.encode([chunks[44419780 - 1]])
emb_chunk2 = query_encoder.encode([chunks[15182634]])
emb_chunk3 = query_encoder.encode([chunks[32287273]])

In [45]:
for emb in [emb_chunk0, emb_chunk1, emb_chunk2, emb_chunk3]:
    score1 = np.dot(emb_query.flatten(), emb.flatten())
    print(score1) 

0.80006427
0.6656804
0.5796839
0.61451036


In [57]:
query = "What is the current world record for a beer can pyramid?"
D, I = top_k_chunks(query, k=1000)
I[0]

array([44419780, 15182634, 32287273, 44019041, 10667840, 17866603,
       44684914, 16419147, 48056004, 47488175, 20744553, 23933341,
       34792865, 32287276,  5588295, 38474929,  6724587, 26383078,
       47576167,  6152881, 17653053, 39803373,  3323697, 26014709,
       35153054,  5640383, 28316068, 49381089, 23267869, 38134791,
       36622100, 21392435,  7595671, 47448553, 21191890, 15855049,
       29760714,  4217265, 46842457, 36622101, 26496274, 11810654,
       35455283,   894014,  6663165, 46791145,  6663167, 46057018,
       47219191, 31450256, 27735882, 41131056, 45473629, 24821782,
        5322344, 48682325, 41026486, 33623596, 26089945, 10221309,
       14322709, 27243660, 45435603, 46047691, 46057025, 31411528,
       42145038, 14774270, 46338357, 21991699, 41376693, 12958489,
        9885952, 25444862, 36622096,   872448, 36575452, 37899982,
       29996711, 45095973, 23698841, 45152050, 46057017, 33671998,
       18100694, 23784030, 42820844, 35918301, 21798614, 27002

In [59]:
# Even fetching the top 1000 matches in index, required chunk isn't found... :(
44419780 in I[0], 44419779 in I[0]

(True, False)

In [62]:
query = "What is the current world record for a beer can pyramid?"
D, I = top_k_chunks(query, k=1000)

scores = []
emb_query = query_encoder.encode([query])
for idx in I[0]:
    emb= query_encoder.encode([chunks[idx]])
    score = np.dot(emb_query.flatten(), emb.flatten())
    scores.append(score)

print(list(reversed(list(sorted(scores)))))

[0.80006427, 0.6291253, 0.6221438, 0.61451036, 0.5936669, 0.58356094, 0.5796839, 0.5711893, 0.56738615, 0.5638323, 0.56263685, 0.5614915, 0.56146127, 0.5585642, 0.5566888, 0.55643344, 0.5559496, 0.55501854, 0.5523143, 0.5492588, 0.5476148, 0.5469228, 0.54552424, 0.54455084, 0.5438782, 0.5436713, 0.5435318, 0.5431305, 0.5429449, 0.54248977, 0.54077667, 0.53899646, 0.53842926, 0.5371756, 0.5366755, 0.53484356, 0.53484356, 0.53484356, 0.5346839, 0.5345394, 0.53318346, 0.53226984, 0.5312338, 0.53103507, 0.53093576, 0.53033006, 0.53018284, 0.53007925, 0.52949023, 0.52894855, 0.5287861, 0.5262939, 0.5258082, 0.5226944, 0.5223705, 0.5221437, 0.5216145, 0.52136683, 0.52118266, 0.5208185, 0.5206651, 0.5203744, 0.5202805, 0.52012444, 0.5199131, 0.5193835, 0.5193835, 0.5193835, 0.5193835, 0.5193835, 0.5193835, 0.51920784, 0.5175997, 0.517492, 0.5167329, 0.51660883, 0.51641655, 0.5157879, 0.51542795, 0.51530284, 0.5147531, 0.51470494, 0.5142274, 0.514047, 0.51338375, 0.5130265, 0.5119648, 0.511876