# quantize BGE

In [2]:
!pip install datasets
!pip install haystack-ai

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from tqdm import tqdm
import time
from haystack.components.generators import HuggingFaceLocalGenerator

In [2]:
# Load the BGE small model and tokenizer
model_name = "BAAI/bge-small-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Quantize the model to int8
model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

# Move model to CPU (assuming CPU optimization)
model = model.to('cpu')

In [4]:
# Load a sample dataset (you can replace this with your own dataset)
dataset = load_dataset("bilgeyucel/seven-wonders", split="train")

In [5]:
# Function to embed text
def embed_text(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Embed the dataset
embeddings = []
for item in tqdm(dataset, desc="Embedding documents"):
    embedding = embed_text(item['content'])
    embeddings.append(embedding)

print(f"Embedded {len(embeddings)} documents.")
print(f"Embedding shape: {embeddings[0].shape}")

Embedding documents: 100%|██████████| 151/151 [00:23<00:00,  6.40it/s]

Embedded 151 documents.
Embedding shape: (384,)





In [6]:
from sklearn.metrics.pairwise import cosine_similarity
prompt_template = """

You are a helpful AI assistant. You are given contexts and a question.
You must answer the question using the information given in the context ONLY.
If you don't know the answer say 'I don't know!'.

Context: {context}

Question: {question}

Answer:
"""

def get_relevant_context(question, embeddings, top_k=5):
    context_start = time.time()
    question_embedding = embed_text(question)
    similarities = cosine_similarity([question_embedding], embeddings)
    top_indices = similarities.argsort()[0][-top_k:][::-1]
    top_context = [dataset[int(i)]['content'] for i in top_indices]
    context_time = time.time() - context_start
    print(f"Context retrieval time: {context_time:.4f} seconds")
    return top_context


In [10]:
embed_text(question)

array([-5.52479684e-01, -2.10879296e-01,  6.11634910e-01,  1.67584822e-01,
        1.58225179e-01, -7.78047293e-02,  4.19106543e-01,  2.70687997e-01,
        4.06441092e-02,  2.95371562e-02,  1.24708109e-01, -5.93429923e-01,
        1.89952299e-01,  2.54397303e-01, -1.35276005e-01, -4.15784627e-01,
        2.98529088e-01,  2.77787626e-01, -7.09237456e-01,  1.44595534e-01,
        4.44652647e-01, -2.63073772e-01,  6.01049550e-02, -1.03174555e+00,
       -1.69600621e-01,  7.79938161e-01, -1.45345286e-01, -6.67331461e-03,
       -2.29361698e-01, -1.60577774e+00,  1.39026135e-01, -4.86054093e-01,
        2.69018024e-01, -2.30139166e-01, -6.78300625e-03, -1.57196730e-01,
       -7.14094281e-01,  3.94277543e-01, -2.27301821e-01,  1.80117577e-01,
        4.27449614e-01,  2.70393878e-01, -2.81613380e-01, -3.63092959e-01,
       -3.64223450e-01, -7.72437155e-01, -2.02933982e-01, -5.23339920e-02,
        5.06054521e-01,  5.29578030e-02,  4.09504563e-01, -2.11529821e-01,
        3.91138464e-01,  

In [7]:
def generate_answer(question, context):
    start_time = time.time()
    prompt = prompt_template.format(context=context, question=question)
    checkpoint_1 = time.time() - start_time
    print(f"Checkpoint 1: {checkpoint_1:.4f} seconds")

    start_time = time.time()
    generator = HuggingFaceLocalGenerator(model="HuggingFaceTB/SmolLM-1.7B-Instruct",
                                      task="text-generation",
                                      generation_kwargs={
                                        "max_new_tokens": 150,
                                        "do_sample": False,
                                      })
    checkpoint_2 = time.time() - start_time
    print(f"Checkpoint 2: {checkpoint_2:.4f} seconds")

    start_time = time.time()
    generator.warm_up()
    checkpoint_3 = time.time() - start_time
    print(f"Checkpoint 3: {checkpoint_3:.4f} seconds")

    start_time = time.time()
    response = generator.run(prompt)
    checkpoint_4 = time.time() - start_time
    print(f"Checkpoint 4: {checkpoint_4:.4f} seconds")

    replies = response['replies']

    generate_time = sum([checkpoint_1, checkpoint_2, checkpoint_3, checkpoint_4])
    print(f"Answer generation time: {generate_time:.4f} seconds")
    return replies

In [8]:
def answer_question(question, embeddings):
    answer_start = time.time()
    context = get_relevant_context(question, embeddings)
    context_text = " ".join(context[0])
    answer = generate_answer(question, context_text)
    answer_time = time.time() - answer_start
    print(f"Total time: {answer_time:.4f} seconds")
    return answer

In [9]:
# colab
question = "What does The Colossus of Rhodes Statue look like?"
answer = answer_question(question, embeddings)
print(answer)

Context retrieval time: 0.0328 seconds
Checkpoint 1: 0.0000 seconds
Checkpoint 2: 0.0005 seconds


Device set to use cuda:0


Checkpoint 3: 16.5065 seconds
Checkpoint 4: 7.5763 seconds
Answer generation time: 24.0833 seconds
Total time: 24.1166 seconds
['\nThe Colossus of Rhodes Statue is a massive bronze statue of the Greek god Helios, located in the city of Rhodes, Greece. It is one of the Seven Wonders of the Ancient World and is considered one of the largest statues of the ancient world. The statue is over 100 feet (30 meters) tall and weighs around 100 tons. It is made of bronze and is adorned with 230 marble and alabaster columns. The statue is located on the island of Rhodes, which was an important center of trade and commerce in the ancient world.']


In [None]:
# lightning
question = "What does The Colossus of Rhodes Statue look like?"
answer = answer_question(question, embeddings)
print(answer)

Context retrieval time: 0.0222 seconds
Checkpoint 1: 0.0000 seconds
Checkpoint 2: 0.0008 seconds
Checkpoint 3: 7.8145 seconds


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Checkpoint 4: 68.4128 seconds
Answer generation time: 76.2282 seconds
Total time: 76.2525 seconds
['\nThe Colossus of Rhodes Statue is a massive bronze statue of the Greek god Helios, located in the city of Rhodes, Greece. It is one of the Seven Wonders of the Ancient World and is considered one of the largest statues of the ancient world. The statue is over 100 feet (30 meters) tall and weighs around 100 tons. It is made of bronze and is adorned with 230 marble and alabaster columns. The statue is located on the island of Rhodes, which was an important center of trade and commerce in the ancient world.']


In [None]:
get_relevant_context(question, embeddings)

Context retrieval time: 0.0450 seconds


['The Colossus of Rhodes (Ancient Greek: ὁ Κολοσσὸς Ῥόδιος, romanized:\xa0ho Kolossòs Rhódios Greek: Κολοσσός της Ρόδου, romanized:\xa0Kolossós tes Rhódou)[a] was a statue of the Greek sun-god Helios, erected in the city of Rhodes, on the Greek island of the same name, by Chares of Lindos in 280\xa0BC. One of the Seven Wonders of the Ancient World, it was constructed to celebrate the successful defence of Rhodes city against an attack by Demetrius Poliorcetes, who had besieged it for a year with a large army and navy.\nAccording to most contemporary descriptions, the Colossus stood approximately 70 cubits, or 33 metres (108 feet) high – approximately the height of the modern Statue of Liberty from feet to crown – making it the tallest statue in the ancient world.[2] It collapsed during the earthquake of 226 BC, although parts of it were preserved. In accordance with a certain oracle, the Rhodians did not build it again.[3] John Malalas wrote that Hadrian in his reign re-erected the Col

In [None]:
question = "What is the history of the Library of Alexandria"
answer = answer_question(question, embeddings)
print(answer)

Context retrieval time: 0.0170 seconds
Checkpoint 1: 0.0000 seconds
Checkpoint 2: 0.0003 seconds
Checkpoint 3: 9.3937 seconds
Checkpoint 4: 76.4054 seconds
Answer generation time: 85.7994 seconds
Total time: 85.8186 seconds
["\nThe Library of Alexandria was a major center of learning and scholarship in ancient Alexandria, Egypt. It was founded in the 3rd century BC and was destroyed by fire in the 5th century AD. The library was a collection of manuscripts and scrolls containing works of Greek philosophers, scientists, and mathematicians, as well as texts on mathematics, astronomy, and medicine. The library was also a center of learning for the study of philosophy, mathematics, and astronomy.\n\nThe library was founded by Ptolemy I Soter, one of Alexander the Great's generals, and was built on the site of an earlier temple dedicated to the Muses. The library was a major center of learning and scholarship, attracting scholars from all over the Mediterranean world. It"]
