## Load "Vector DB" and run RAG query on pre-trained instruct LLM
Simply appends retrieved chunks to query. No fancy pipeline steps or dedicated Vector DB.

In [1]:
import gc
import os
import pickle
import sys
from threading import Thread
import time
from unsloth import FastLanguageModel

import faiss
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
    TextStreamer,
    set_seed,
)

set_seed(1234)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


### Load chunk contents and embedding index
* TIL that WSL2 only has 50% of the system RAM by default. I had to increase it via `.wslconfig` so we can load both the search index and the raw chunks

In [2]:
# TODO: Database for chunks
# index = faiss.read_index("wikipedia-en.index")
index = faiss.read_index("wikipedia-en-simplifying.index")
gc.collect()

60

In [3]:
chunks = []    
embeddings_path = '/home/stefanwebb/embeddings/wikimedia/wikipedia/20231101.en'
files = [f"train-{idx:05d}-of-00041.parquet" for idx in range(41)]

print("Reading in Chunks")
for idx in range(len(files)):
    print(f"x", end="")
    chunks_file = os.path.join(embeddings_path, f'chunks-{idx:05d}-of-00041.pkl')
    with open(chunks_file, 'rb') as f:
        chunks.extend(pickle.load(f))
    gc.collect()

Reading in Chunks
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

In [4]:
index.ntotal, len(chunks)

(49522046, 49522046)

In [5]:
# NOTE: Following code doesn't seem to get size of all objects references by these
# sys.getsizeof(index) / 10**9, sys.getsizeof(chunks) / 10**9

### Load LLM to run queries

In [6]:
# query_model = "google/gemma-2b-it"
query_model = "/home/stefanwebb/models/llms/Mistral-7B-Instruct-v0.3"

max_seq_length = 2048
dtype = None
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/home/stefanwebb/models/llms/mistral-7b-instruct-v0.3-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.8: Fast Mistral patching. Transformers = 4.43.4.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.988 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0.dev20240829+cu124. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28+d444815.d20240829. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth




### Construct RAG Query and Run

In [7]:
FastLanguageModel.for_inference(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (

In [8]:
def run_query_streamed(query, query_model):
    system_prompt = "You are a helpful assistant who answers question truthfully to the best of your knowledge. You decline to answer if you do not know the answer."

    chat = [
        {
            "role": "system",
            "content": system_prompt,
        },
        
        {

            "role": "user",
            "content": f"{query}",
        },
    ]

    formatted_prompt = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True, return_tensors="pt"
    )
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda")

    streamer = TextStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=512)

In [9]:
query_encoder = SentenceTransformer("/home/stefanwebb/models/llms/multi-qa-MiniLM-L6-cos-v1")

In [10]:
import numpy as np

In [33]:
def top_k_chunks(query: str, k=1) -> str:
    """
    Find closest chunk for a given query.
    """
    embeddings = query_encoder.encode([query])
    # embeddings = embeddings / np.linalg.norm(embeddings)
    D, I = index.search(embeddings, k, params = faiss.SearchParametersIVF(nprobe=100))
    return D, I


def run_rag_query_streamed(query, query_model, k=3):
    # Retrieve most similar chunks
    D, I = top_k_chunks(query, k=k)
    # formatted_chunks = '\n\n'.join(["Document: " + chunks[i] for i in I[0]])
    formatted_chunks = ' '.join([chunks[i] for i in I[0]])
    
    # rag_query = f"Answer the query below and ground your answer in facts contained in the documents below:\n\nQuery: {query}\n\n{formatted_chunks}"

    rag_query = f"{formatted_chunks}\n\nAnswer the following question: {query}"

    # DEBUG
    # print(rag_query)
    # print("\n")

    run_query_streamed(rag_query, query_model)

    # for i, d in zip(I[0], D[0]):
    #     print(d, chunks[i])
    #     print("")

### Debug Examples
Compare answers to questions, with and without context from most similar document chunks.

In [34]:
query = "Why did Abraham Lincoln grow a beard?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model)
print()

Mistral-7B-Instruct-v0.3
Abraham Lincoln grew a beard primarily for practical reasons. In the mid-19th century, beards were common among men, but Lincoln did not have a beard during his first term as a U.S. Representative. After losing the Senate race in 1858, he grew a beard as a way to differentiate himself from his opponents in his presidential campaign in 1860. The beard also helped to hide a condition he had called "tuberculosis laryngitis," which caused his voice to be weak and hoarse. Additionally, the beard may have helped to make him appear more mature and serious, which could have been beneficial in a time of national crisis.

Mistral-7B-Instruct-v0.3 + RAG
Abraham Lincoln grew a beard in response to a written request from an 11-year-old girl named Grace Bedell. She suggested that he grow a beard to improve his appearance, and he complied with her request a few weeks before he was elected president in 1860. Lincoln met Grace Bedell in person in 1861, and he had a full beard b

In [13]:
query = "Who was Abraham Lincoln?"
D, I = top_k_chunks(query, k=100)
formatted_chunks = ' '.join([chunks[i] for i in I[0]])
print(formatted_chunks)

[CLS] abraham lincoln lincoln, abraham lincoln, abraham [SEP] [CLS] abraham lincoln lincoln, abraham lincoln, abraham [SEP] [CLS] abraham lincoln ( 1861 – 1865 ) [SEP] [CLS] abraham lincoln may also refer to : [SEP] [CLS] presidency of abraham lincoln ( 1861 - 1865 ) [SEP] [CLS] thomas lincoln ( 1778 – 1851 ) was the father of abraham lincoln. [SEP] [CLS] presidency of abraham lincoln ( 1861 – 1865 ) [SEP] [CLS] lincoln, abraham lincoln, abraham outlines [SEP] [CLS] abraham lincoln early lives of the presidents of the united states [SEP] [CLS] presidency of abraham lincoln abraham lincoln - related lists [SEP] [CLS] abraham lincoln ( 1809 – 1865 ) was the president of the united states from 1861 to 1865. [SEP] [CLS] people of power : abraham lincoln [SEP] [CLS] abraham lincoln ( ; february 12, 1809 – april 15, 1865 ) was an american lawyer, politician, and statesman who served as the 16th president of the united states from 1861 until his assassination in 1865. lincoln led the union th

In [14]:
query = "Who was Abraham Lincoln?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model, k=32)
print()

Mistral-7B-Instruct-v0.3
Abraham Lincoln was the 16th President of the United States, serving from 1861 until his assassination in 1865. He is best known for leading the nation during the American Civil War and for his role in the abolition of slavery. Lincoln was born in a log cabin in Kentucky in 1809 and grew up in Indiana and Illinois. He began his political career as a Whig, but later joined the newly formed Republican Party. Lincoln is often regarded as one of the greatest U.S. presidents and is remembered for his leadership during a time of great national crisis. He was assassinated by John Wilkes Booth at Ford's Theatre in Washington, D.C. on April 14, 1865.

Mistral-7B-Instruct-v0.3 + RAG
Abraham Lincoln was an American lawyer, politician, and statesman who served as the 16th president of the United States from 1861 to 1865. He was born on February 12, 1809, in a one-room log cabin near Hodgenville, Kentucky, to Thomas Lincoln and Nancy Hanks. Lincoln led the Union through the

In [44]:
query = "Did Abraham Lincoln have a beard?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model, k=32)
print()

Mistral-7B-Instruct-v0.3
Yes, Abraham Lincoln grew a beard during his presidency. He is often depicted with a beard in images and is known for it. Before his presidency, he did not have a beard.

Mistral-7B-Instruct-v0.3 + RAG
Yes, Abraham Lincoln grew a beard after being elected president in response to a request from an 11-year-old girl named Grace Bedell. He is known for being the first president to have any facial hair beyond sideburns.



In [16]:
query = "How are glacier caves formed?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model)
print()

Mistral-7B-Instruct-v0.3
Glacier caves, also known as moulins or ice caves, are formed in glaciers due to the melting of ice. Here's a simplified explanation of the process:

1. Meltwater: As snow accumulates on a glacier, it eventually turns into ice. Over time, this ice melts due to various factors like geothermal heat, precipitation, and air temperature. This meltwater collects in crevasses and other low spots within the glacier.

2. Enlargement of crevasses: The meltwater continues to flow, enlarging the crevasses. As the crevasse deepens, it may eventually connect with the surface of the glacier, forming a vertical shaft known as a moulin.

3. Drainage: The meltwater drains into the underlying rock or bedrock, carving out a tunnel or cave-like structure. This process continues as long as the conditions (temperature, precipitation, etc.) allow for melting and drainage.

4. Collapse: Over time, the roof of the cave may become too thin to support itself, causing it to collapse. This 

In [18]:
query = "What is a beer can pyramid?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model)
print()

Mistral-7B-Instruct-v0.3
A beer can pyramid, also known as a can pyramid or a can sculpture, is a structure made by stacking beer cans in a specific pattern to create a pyramid shape. This is often done as a fun and creative activity, and the stability of the structure depends on the number of cans used and the arrangement of the cans in the pyramid. The cans are usually empty, but some people use full cans to create a more stable structure. It's important to note that this activity should be done responsibly and safely, and the cans should be disposed of properly after the pyramid is complete.

Mistral-7B-Instruct-v0.3 + RAG
A beer can pyramid, often referred to as a beeramid, is a pyramid structure made from discarded beer cans. It is built as empty beer cans become available, and it grows gradually as the night, week, or month wears on. Typically, beeramids are temporary structures, either cleaned up or knocked over eventually.



In [19]:
query = "What is a beer can pyramid?"
D, I = top_k_chunks(query, k=10)
formatted_chunks = ' '.join([chunks[i] for i in I[0]])
print(formatted_chunks)

[CLS] a beer can pyramid, often called a beeramid as a portmanteau, is a pyramid made from discarded beer cans. beer can pyramids are built as empty beer cans became available, slowly growing as the night ( or week or month ) wears on. in most cases, though, they are temporary structures, eventually being cleaned up or accidentally knocked over. [SEP] [CLS] beeramid may refer to : beer can pyramid, a pyramid made of empty cans of beer beeramid ( comic ), a comic in the daily cardinal, a student newspaper for uw - madison [SEP] [CLS] another attempt to break the world record beer can pyramid was made with beer cans over 5 metres high and contained 10, 660 cans. it was built by the melbourne university student union in 2005, and was featured on blokesworld and in mx. [SEP] [CLS] beers pyramid once featured beers including, ales, lager, weizens, ciders, porters, and ipas. nowadays, pyramid offers six year - round beers, including hefeweizen, apricot ale, thunderhead ipa and outburst imper

In [36]:
query = "What is the current world record for a beer can pyramid?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model)
print()

Mistral-7B-Instruct-v0.3
As of my last update, the world record for the largest beer can pyramid was set by the University of Wisconsin-La Crosse Eagle's Nest Co-ed Fraternity and Sorority Life on March 23, 2019. They stacked 10,000 cans to create a pyramid that was 11.5 meters tall and 7.6 meters wide. This record was verified by Guinness World Records. However, records can change over time, so it's always a good idea to check the latest records from a reliable source.

Mistral-7B-Instruct-v0.3 + RAG
The current world record for a beer can pyramid, as of September 2000, is 9,455 cans, built by a team of 12 college students from the INTI College Subang Jaya, Malaysia, in 24 minutes at the Mid Valley Megamall in Kuala Lumpur, Malaysia. This record has yet to be broken, according to the information provided.



In [35]:
query = "What is the current world record for a beer can pyramid?"
D, I = top_k_chunks(query, k=3)
formatted_chunks = ' '.join([chunks[i] for i in I[0]])
print(formatted_chunks)

[CLS] another attempt to break the world record beer can pyramid was made with beer cans over 5 metres high and contained 10, 660 cans. it was built by the melbourne university student union in 2005, and was featured on blokesworld and in mx. [SEP] [CLS] on 23 september 2000, the malaysian can team, consisting of 12 college students from the inti college subang jaya, malaysia built a free standing can pyramid created from 9, 455 empty aluminium drink cans in 24 minutes at the mid valley megamall in kuala lumpur, malaysia. it had a square base of cans, measuring. this feat made a successful entry into the guinness world record and to - date this record has yet to be broken. [SEP] [CLS] a beer can pyramid, often called a beeramid as a portmanteau, is a pyramid made from discarded beer cans. beer can pyramids are built as empty beer cans became available, slowly growing as the night ( or week or month ) wears on. in most cases, though, they are temporary structures, eventually being clean

In [45]:
query = "Which Simpsons episode featured a beer can pyramid outside of Duff Gardens?"
D, I = top_k_chunks(query, k=10)
formatted_chunks = ' '.join([chunks[i] for i in I[0]])
print(formatted_chunks)

[CLS] in the couch gag for the simpsons episode " the bob next door ", harold is shown drawing the simpson family living room during the regular title sequence. homer also asks harold to draw him a can of duff beer after he finishes with the living room. [SEP] [CLS] it was featured in season 13 the simpsons episode " weekend at burnsie's " where homer simpson ( after he smokes medicinal marijuana ) gets ready for work and pictures his world as a psychedelic wonderland. [SEP] [CLS] in popular media in the 22nd episode of season 3 of the simpsons, " the otto show, " homer simpson pulls a can of billy beer from the pocket of his " concert - going jacket, " presumably from the last concert he had attended when he was younger, and drinks the beer. [SEP] [CLS] duff in the simpsons duff is homer simpson's beer of choice. it is a parody of stereotypical mass - market american lager : cheap, poor - quality, and heavily marketed everywhere. [SEP] [CLS] in the eighth episode of season 9 of the si

In [46]:
query = "Which Simpsons episode featured a beer can pyramid outside of Duff Gardens?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model, k=100)
print()

Mistral-7B-Instruct-v0.3
The Simpsons episode where a beer can pyramid is featured outside Duff Gardens is "Homer the Great" from Season 4, Episode 13. In this episode, Homer becomes a carnival strongman and builds a beer can pyramid as part of a competition. The pyramid collapses, causing chaos at the carnival.

Mistral-7B-Instruct-v0.3 + RAG
The Simpsons episode that featured a beer can pyramid outside of Duff Gardens is "Selma's Choice" from season 4, episode 13. In this episode, Selma takes Bart and Lisa to Duff Gardens, a parody of Busch Gardens, where they encounter various attractions, including a beer can pyramid made of Duff beer cans.



In [23]:
query = "Who is Benjamin Geza Affleck?"
D, I = top_k_chunks(query, k=10)
formatted_chunks = ' '.join([chunks[i] for i in I[0]])
print(formatted_chunks)

[CLS] as middle name benjamin geza affleck ( born 1972 ), american actor and filmmaker [SEP] [CLS] ben geurens ( born 24 december 1979 ) is an australian actor. [SEP] [CLS] bob lechevalier : [SEP] [CLS] his younger brother faizat ghazli is also a footballer. [SEP] [CLS] he is the younger half - brother of fellow director jake nava. [SEP] [CLS] his grandson yannick cahuzac is a professional footballer. [SEP] [CLS] ben zabo ( born arouna moussa coulibaly, january 24, 1979 ) is a malian afrobeat - musician from bamako. he was born at tominian in the segou region. he released his self - titled debut album on the international record company glitterhouse records in 2012. [SEP] [CLS] he is the brother of renato marazzi. [SEP] [CLS] in 1988 meretzky had his son daniel meretzky. [SEP] [CLS] he is the son of late broadcaster bob arbogast. [SEP]


In [24]:
query = "Who is Benjamin Geza Affleck?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model)
print()

Mistral-7B-Instruct-v0.3
Benjamin Geza Affleck is a well-known American actor, filmmaker, and screenwriter. He was born on August 15, 1972. Affleck gained fame for his roles in films such as "Good Will Hunting," for which he won an Academy Award for Best Original Screenplay along with Matt Damon, and "Argo," for which he won the Academy Award for Best Picture as a producer. He is also known for his roles in films like "Dazed and Confused," "Chasing Amy," "Gone Baby Gone," and "The Town." In addition to acting, Affleck has directed films such as "Gone Baby Gone," "The Town," and "Argo." He is also a political activist and has been involved in various philanthropic efforts.

Mistral-7B-Instruct-v0.3 + RAG
Benjamin Geza Affleck is an American actor and filmmaker, born in 1972. He is known for his work in films such as "Good Will Hunting," "Argo," and "Gone Baby Gone." He has won multiple awards, including two Academy Awards, for his acting and directing work.



In [47]:
query = "Why are orange cats mostly male?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model, k=100)
print()

Mistral-7B-Instruct-v0.3
Actually, orange cats are not mostly male. The orange color in cats is genetically linked to a specific allele (variant) of the Agouti gene, which is on the X chromosome. Female cats have two X chromosomes, while males have one X and one Y chromosome.

Since the orange gene is on the X chromosome, a male cat will always be orange if it carries the orange gene on its single X chromosome. Female cats, having two X chromosomes, can carry two orange genes, one on each chromosome. However, because of a process called X-inactivation, only one of the two X chromosomes is active in each cell of a female cat. If a female cat has one active X chromosome with the orange gene and one without it, she will not be orange.

So, the misconception that orange cats are mostly male comes from the fact that males carry the orange gene on their single X chromosome, while females need to have the gene on at least one of their two X chromosomes for the cat to be orange. However, when 

In [48]:
query = "Why are orange cats mostly female?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model, k=100)
print()

Mistral-7B-Instruct-v0.3
Actually, there is no truth to the statement that orange cats are mostly female. The color of a cat's fur, including orange, is determined by a gene called the "orange tabby" gene. This gene has no relation to the cat's gender. The distribution of orange cats among males and females is the same as any other color or pattern.

Mistral-7B-Instruct-v0.3 + RAG
Orange cats are mostly female because the gene for the orange coloring is located on the X chromosome. Female cats have two X chromosomes, so they can have both the orange and non-orange genes. In contrast, male cats have one X and one Y chromosome, so they can only have the non-orange gene. Therefore, for a male cat to be orange, it would require a mutation or an extra X chromosome, which is rare.



In [49]:
query = "Are orange cats more likely to be male or female?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model, k=100)
print()

Mistral-7B-Instruct-v0.3
The gender of a cat, whether it's orange or any other color, is not determined by its fur color. Cats can be either male (tom) or female (queen), and they come in a variety of colors and patterns. The orange color in cats is due to a gene called the "orange gene" (O) and is recessive, meaning that a cat must inherit two copies of this gene (OO) to be orange. However, the gender of a cat is determined by other genes, not the orange gene.

Mistral-7B-Instruct-v0.3 + RAG
Orange cats are more likely to be female because the gene for the orange coloring is on the X chromosome, and females have two X chromosomes, while males have one X and one Y chromosome. Therefore, for a female cat to be orange, both of her X chromosomes must carry the orange gene. For a male cat to be orange, he would need to inherit the orange gene from his mother, as males only have one X chromosome. However, it's important to note that not all orange cats are female, and not all female cats ar

In [50]:
query = "What percentage of orange cats are male?"

print("Mistral-7B-Instruct-v0.3")
run_query_streamed(query, model)
print()

print("Mistral-7B-Instruct-v0.3 + RAG")
run_rag_query_streamed(query, model, k=100)
print()

Mistral-7B-Instruct-v0.3
The exact percentage can vary, but generally, about 50% of orange cats are male and 50% are female. This is because the orange coloration in cats is determined by a gene on the X chromosome. Females have two X chromosomes (XX), while males have one X and one Y chromosome (XY). Since the orange gene is on the X chromosome, a female cat needs to inherit the orange gene from both parents to be orange, while a male cat only needs to inherit it from one parent. Therefore, if a cat is orange, it is more likely to be male, but half of all orange cats will still be female. However, it's important to note that this is a simplification, as there are other genes that can influence coat color in cats.

Mistral-7B-Instruct-v0.3 + RAG
Approximately 80% of orange tabby cats are male. This is because the orange coloring is a gene found on the X chromosome, and males have one X and one Y chromosome, while females have two X chromosomes. Therefore, for a female cat to be orange,

### Investigating whether failure to retrieve relevant chunks is due to search index or document/query encoder

In [25]:
query = "What is the current world record for a beer can pyramid?"
D, I = top_k_chunks(query, k=10)
formatted_chunks = ' '.join([chunks[i] for i in I[0]])
print(formatted_chunks)


[CLS] another attempt to break the world record beer can pyramid was made with beer cans over 5 metres high and contained 10, 660 cans. it was built by the melbourne university student union in 2005, and was featured on blokesworld and in mx. [SEP] [CLS] a beer can pyramid, often called a beeramid as a portmanteau, is a pyramid made from discarded beer cans. beer can pyramids are built as empty beer cans became available, slowly growing as the night ( or week or month ) wears on. in most cases, though, they are temporary structures, eventually being cleaned up or accidentally knocked over. [SEP] [CLS] olympia beer was praised as one of the top 25 beers in the world in a 2012 mensjournal. com review article. [SEP] [CLS] beers pyramid once featured beers including, ales, lager, weizens, ciders, porters, and ipas. nowadays, pyramid offers six year - round beers, including hefeweizen, apricot ale, thunderhead ipa and outburst imperial ipa. pyramid also offers seasonal beers, including curv

In [26]:
I[0]

array([44419780, 44419778, 46332100, 42621542, 32287273, 48056004,
       10667840, 44684914, 15182634,  6031383])

In [27]:
chunks[44419780 - 1]

'[CLS] on 23 september 2000, the malaysian can team, consisting of 12 college students from the inti college subang jaya, malaysia built a free standing can pyramid created from 9, 455 empty aluminium drink cans in 24 minutes at the mid valley megamall in kuala lumpur, malaysia. it had a square base of cans, measuring. this feat made a successful entry into the guinness world record and to - date this record has yet to be broken. [SEP]'

In [28]:
import numpy as np

In [29]:
# Comparing similarity of query to retrieved chunks and the non-retrieved chunk that contains the necessary fact
emb_query = query_encoder.encode([query])
emb_chunk0 = query_encoder.encode([chunks[44419780]])
emb_chunk1 = query_encoder.encode([chunks[44419780 - 1]])
emb_chunk2 = query_encoder.encode([chunks[15182634]])
emb_chunk3 = query_encoder.encode([chunks[32287273]])

In [30]:
for emb in [emb_chunk0, emb_chunk1, emb_chunk2, emb_chunk3]:
    score1 = np.dot(emb_query.flatten(), emb.flatten())
    print(score1) 

0.80006427
0.6656804
0.5796839
0.61451036


In [41]:
query = "What is the current world record for a beer can pyramid?"
D, I = top_k_chunks(query, k=1000)
I[0]

array([44419780, 44419779, 44419778, 46332100, 42621542, 32287273,
       48056004, 10667840, 44684914, 15182634,  6031383,  6724587,
       16419147, 25060650, 42621541,  5588298,  5588295, 47488175,
       47576167, 23784030, 20744553,  6416773, 19033151, 17484807,
       10596834, 22633185,   894007, 12054762, 16875596, 26014709,
       41376691, 44019041,  9782289,  7595671, 48056002, 27735882,
       45963830, 30801274, 36622101,   712694,  7595676, 34792865,
        3406768, 26232251,   193948, 34361474,  5401486, 42621090,
       37327053, 19695556, 29996711, 48932338, 48056001, 26496274,
       42685118, 39727124,  9961092, 14564167,   124767, 47039145,
       22494061, 21606770,  5640383, 30201479, 29734994, 46352283,
       44085428,  5263339, 42090121, 12540172,  2523641,  7819492,
       47576162, 39232830, 17866603, 44843512, 42247104, 14812288,
       39286935,  5322344, 23933341, 32908930, 46775527, 27447164,
       32287276,   973686, 45964882,  2429968, 40666904, 38134

In [42]:
# Even fetching the top 1000 matches in index, required chunk isn't found... :(
44419780 in I[0], 44419779 in I[0], 44419778 in I[0]

(True, True, True)

In [43]:
query = "What is the current world record for a beer can pyramid?"
D, I = top_k_chunks(query, k=1000)

scores = []
emb_query = query_encoder.encode([query])
for idx in I[0]:
    emb= query_encoder.encode([chunks[idx]])
    score = np.dot(emb_query.flatten(), emb.flatten())
    scores.append(score)

print(list(reversed(list(sorted(scores)))))

[0.80006427, 0.6656804, 0.65565145, 0.6291253, 0.6221438, 0.61819845, 0.61451036, 0.60374725, 0.5936669, 0.58356094, 0.58112395, 0.57989484, 0.5796839, 0.5774149, 0.5757826, 0.5742325, 0.5711893, 0.57098013, 0.56738615, 0.56650317, 0.5638323, 0.56263685, 0.5614915, 0.56146127, 0.5585642, 0.5571245, 0.5566888, 0.55643344, 0.55630404, 0.5559496, 0.55562025, 0.55555916, 0.5553293, 0.55501854, 0.5550179, 0.554696, 0.55399704, 0.5532268, 0.5531702, 0.5523143, 0.55080813, 0.5503337, 0.5493349, 0.5492588, 0.5481219, 0.54803777, 0.5476148, 0.5469228, 0.54689753, 0.54689753, 0.5460844, 0.5457463, 0.54552424, 0.5454178, 0.5452452, 0.5449614, 0.54455084, 0.5442039, 0.5438782, 0.5436713, 0.5435318, 0.5431305, 0.5429449, 0.54290843, 0.54290843, 0.54248977, 0.54209405, 0.54198927, 0.5415858, 0.54077667, 0.5407106, 0.53951395, 0.53948474, 0.53899646, 0.5385741, 0.53842926, 0.538288, 0.5382086, 0.5371756, 0.5369379, 0.5369189, 0.5366755, 0.53631496, 0.535685, 0.5356747, 0.53484356, 0.53484356, 0.53484