# Base

In [1]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch

  from tqdm.autonotebook import tqdm, trange
2025-03-19 20:04:38.341632: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load from csv to dict
def load_csv_to_dict(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
        dict = {}
        for line in lines[1:]:
            line = line.strip().split(',')
            dict[line[0]] = line[1]

    return dict

In [3]:
# Load the data
file_path = 'data/sample_qa_pairs.csv'
qa_dict = load_csv_to_dict(file_path)
qa_dict

{'Was Abraham Lincoln the sixteenth President of the United States?': 'Yes.',
 'Did Lincoln sign the National Banking Act of 1863?': 'Yes.',
 'Did his mother die of pneumonia?': 'No.',
 "How many long was Lincoln's formal education?": '18 months.',
 'When did Lincoln begin his political career?': '1832.',
 'What did The Legal Tender Act of 1862 establish?': '"The United States Note',
 'Who suggested Lincoln grow a beard?': 'Grace Bedell.',
 'When did the Gettysburg address argue that America was born?': '1776.',
 'Did Lincoln beat John C. Breckinridge in the 1860 election?': 'Yes.',
 'Was Abraham Lincoln the first President of the United States?': 'No',
 'Did Lincoln start his political career in 1832?': 'Yes',
 'Did Lincoln ever represent Alton & Sangamon Railroad?': 'Yes',
 'Which county was Lincoln born in?': '"Southeast Hardin County',
 'When did Lincoln first serve as President?': '"March 4',
 'Who assassinated Lincoln?': 'John Wilkes Booth',
 'Did Lincoln win the election of 1860

In [4]:
# Create embeddings
def create_embeddings(questions):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(questions, convert_to_tensor=True)
    return model, embeddings

# Search for best matches
def search(model, embeddings, questions, query, top_k=3):
    query_embedding = model.encode(query, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
    top_results = scores.topk(top_k)
    return [(questions[idx], top_results.values[i].item()) for i, idx in enumerate(top_results.indices)]

In [5]:
questions = list(qa_dict.keys())

# Create embeddings
model, embeddings = create_embeddings(questions)

# Save embeddings to disk
np.save('embeddings/embeddings.npy', embeddings.cpu().numpy())

In [6]:
# Load embeddings from disk
embeddings = np.load('embeddings/embeddings.npy')
embeddings = torch.tensor(embeddings)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embeddings = embeddings.to(device)
print("Loaded", embeddings.shape[0], "entries")

Loaded 918 entries


In [7]:
query = "Who was Grant in WW USA?"

# Example query
results = search(model, embeddings, questions, query)

# Init context
context = '[Original Question]: ' + query + '\n'

# Print top results
context += "\nTop matches:"
for i, (question, score) in enumerate(results):
    context += f"\n {question} -> Answer: {qa_dict[question]}"

print(context)

[Original Question]: Who was Grant in WW USA?

Top matches:
 What was Grant's political affiliation? -> Answer: Republican
 Where was Grant born? -> Answer: "A log cabin in Point Pleasant
 Who was Grant's brother in law? -> Answer: Fred Dent


In [8]:
system_prompt = "You are a helpful assistant. You provide the answer to a question based on context, if the answer is not available, please type 'Sorry I can't answer that question based on my available information'"

In [None]:
from transformers import AutoTokenizer, BitsAndBytesConfig, Gemma3ForCausalLM
import torch

model_id = "google/gemma-3-1b-it"

quantization_config = BitsAndBytesConfig(load_in_8bit=True)

model = Gemma3ForCausalLM.from_pretrained(
    model_id, quantization_config=quantization_config
).eval()

tokenizer = AutoTokenizer.from_pretrained(model_id)

messages = [
    [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": context}]
        },
    ],
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device).to(torch.bfloat16)


with torch.inference_mode():
    outputs = model.generate(**inputs, max_new_tokens=64)

outputs = tokenizer.batch_decode(outputs)

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Attempting to cast a BatchEncoding to type torch.bfloat16. This is not supported.


In [12]:
outputs[-1].split('<start_of_turn>model\nSorry')[1]

" I can't answer that question based on my available information.<end_of_turn>"