# Base

In [32]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch

In [20]:
def load_qa_pairs(file_path):
    """Load a question -> answer mapping from a tab-separated text file.

    The first line of the file is treated as a header and skipped. Every
    other line is stripped of surrounding whitespace and split on tabs; only
    lines that yield exactly six fields are used. The second field is the
    question and the third field is the answer.

    Rows whose answer is the literal string ``'NULL'`` are ignored. When the
    same question appears on several rows, the last non-NULL answer wins.

    Args:
        file_path: Path to the UTF-8 encoded, tab-separated file.

    Returns:
        dict mapping each question string to its answer string. Empty when
        the file is empty, contains only a header, or has no usable rows.
    """
    qa_dict = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        # A default of None keeps an empty file from raising StopIteration.
        next(f, None)  # Skip header

        # Iterate the handle directly so the file is streamed line by line
        # instead of being read into memory in one go.
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) != 6:
                continue  # malformed or blank line

            _article_title, question, answer, _diff_q, _diff_a, _article_file = parts

            # Questions are repeated across rows; dropping NULL answers and
            # overwriting on repeats keeps the last valid answer per question.
            if answer != 'NULL':
                qa_dict[question] = answer
    return qa_dict


In [22]:
# Load the data
# The path is relative to the notebook's working directory.
file_path = 'sample_data/S08_question_answer_pairs.txt'
# Build the question -> answer lookup that the later cells read from.
qa_dict = load_qa_pairs(file_path)
# Bare last expression: the notebook renders the full dict as this cell's output.
qa_dict

{'Was Abraham Lincoln the sixteenth President of the United States?': 'Yes.',
 'Did Lincoln sign the National Banking Act of 1863?': 'Yes.',
 'Did his mother die of pneumonia?': 'No.',
 "How many long was Lincoln's formal education?": '18 months.',
 'When did Lincoln begin his political career?': '1832.',
 'What did The Legal Tender Act of 1862 establish?': 'The United States Note, the first paper currency in United States history.',
 'Who suggested Lincoln grow a beard?': 'Grace Bedell.',
 'When did the Gettysburg address argue that America was born?': '1776.',
 'Did Lincoln beat John C. Breckinridge in the 1860 election?': 'Yes.',
 'Was Abraham Lincoln the first President of the United States?': 'No',
 'Did Lincoln start his political career in 1832?': 'Yes',
 'Did Lincoln ever represent Alton & Sangamon Railroad?': 'Yes',
 'Which county was Lincoln born in?': 'Southeast Hardin County, Kentucky',
 'When did Lincoln first serve as President?': 'March 4, 1861',
 'Who assassinated Linco

In [43]:
# Create embeddings
def create_embeddings(questions, model_name='all-MiniLM-L6-v2'):
    """Instantiate a sentence-embedding model and encode the given questions.

    Args:
        questions: The sentences to embed, passed straight to ``model.encode``.
        model_name: Name or path handed to ``SentenceTransformer``. Defaults
            to ``'all-MiniLM-L6-v2'``, the model this notebook originally
            hard-coded, so existing calls behave as before.

    Returns:
        A ``(model, embeddings)`` tuple: the loaded model, so the caller can
        encode queries with the same model later, and the result of
        ``model.encode(questions, convert_to_tensor=True)``.
    """
    model = SentenceTransformer(model_name)
    embeddings = model.encode(questions, convert_to_tensor=True)
    return model, embeddings

# Search for best matches
def search(model, embeddings, questions, query, top_k=3):
    """Return the stored questions most similar to ``query``.

    The query is encoded with ``model`` and compared against ``embeddings``
    using cosine similarity; the highest-scoring entries are returned.

    Args:
        model: Object whose ``encode(query, convert_to_tensor=True)`` returns
            the query embedding as a tensor.
        embeddings: Embeddings of ``questions``; row ``i`` must correspond to
            ``questions[i]``.
        questions: Sequence of question strings aligned with ``embeddings``.
        query: The free-text query to look up.
        top_k: Maximum number of matches to return. Values larger than the
            number of available scores are clamped instead of raising.

    Returns:
        List of ``(question, score)`` tuples ordered from best to worst match,
        where ``score`` is a Python float. Empty when there is nothing to
        search or ``top_k`` is not positive.
    """
    if len(questions) == 0:
        return []

    query_embedding = model.encode(query, convert_to_tensor=True)

    # Embeddings reloaded from disk may sit on a different device than the
    # one the model encodes on; cosine similarity needs both on one device.
    if isinstance(embeddings, torch.Tensor):
        query_embedding = query_embedding.to(embeddings.device)

    scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]

    # topk raises if k exceeds the number of scores, so clamp it.
    k = min(top_k, scores.shape[0])
    if k <= 0:
        return []

    top_results = scores.topk(k)
    # tolist() yields plain ints/floats, so the list is indexed with real
    # integers rather than 0-dim tensors.
    return [
        (questions[idx], score)
        for idx, score in zip(top_results.indices.tolist(), top_results.values.tolist())
    ]

In [44]:
# Question order matters: row i of the embeddings corresponds to questions[i].
questions = list(qa_dict.keys())

# Create embeddings
model, embeddings = create_embeddings(questions)

# Save embeddings to disk
# np.save does not create missing parent directories, so make sure the
# target folder exists before writing (a fresh checkout has none).
os.makedirs('embeddings', exist_ok=True)
np.save('embeddings/embeddings.npy', embeddings.cpu().numpy())

In [55]:
# Use the GPU when torch can see one, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Read the cached array back from disk and place it on that device as a tensor
embeddings = torch.tensor(np.load('embeddings/embeddings.npy')).to(device)
print("Loaded", embeddings.shape[0], "entries")

Loaded 918 entries


In [56]:
# Free-text question to look up among the stored questions
query = "Who was Grant in WW USA?"

# Rank the stored questions by similarity to the query
results = search(model, embeddings, questions, query)

# Show each hit with a 1-based rank, its score and the stored answer
print("\nTop matches:")
for rank, (question, score) in enumerate(results, start=1):
    answer = qa_dict[question]
    print(f"{rank}. {question} (score: {score:.4f})")
    print(f"   -> Answer: {answer}")


Top matches:
1. What was Grant's political affiliation? (score: 0.6950)
   -> Answer: Republican
2. Where was Grant born? (score: 0.6782)
   -> Answer: A log cabin in Point Pleasant, Clermont County, Ohio
3. Who was Grant's brother in law? (score: 0.6080)
   -> Answer: Fred Dent
