In [2]:
import json
import os

import requests
from sentence_transformers import SentenceTransformer


In [None]:

# Loaded models keyed by model name, so repeated calls reuse one instance
# instead of re-initialising the model for every query.
_EMBEDDING_MODELS = {}


def text_to_embedding(text, model_name='all-MiniLM-L6-v2'):
    """Encode ``text`` and return the vector as a bracketed string.

    Args:
        text: The text passed to ``SentenceTransformer.encode``.
        model_name: Name of the sentence-transformers model to use. The
            default keeps the model this notebook originally hard-coded.

    Returns:
        A string of the form ``"[v1,v2,...]"`` where each component is the
        ``str()`` of the corresponding value in the encoded vector.
    """
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        # Constructing the model is the expensive step; do it once per name.
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model

    embedding = model.encode(text, convert_to_tensor=False).tolist()

    # Serialise as "[a,b,c]" with no spaces between components.
    return "[" + ",".join(map(str, embedding)) + "]"

def solr_knn_query(endpoint, collection, embedding, top_k=10, rows=40,
                   fields="id,title,description", vector_field="vector",
                   timeout=30):
    """POST a ``{!knn}`` query to a Solr collection and return the parsed JSON.

    Args:
        endpoint: Base Solr URL; a trailing slash is tolerated.
        collection: Name of the collection whose ``/select`` handler is called.
        embedding: Query vector already serialised as a string; it is appended
            verbatim after the ``{!knn ...}`` local params.
        top_k: Value sent as the ``topK`` local param.
        rows: Value sent as the ``rows`` request parameter.
        fields: Comma-separated field list sent as ``fl``. ``score`` is not in
            the default list, so returned documents will not carry a score
            unless it is requested here.
        vector_field: Name of the field sent as the ``f`` local param.
        timeout: Seconds passed to ``requests.post`` so an unresponsive server
            raises instead of blocking the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx
            (via ``raise_for_status``).
    """
    # NOTE(review): with topK=10 the knn parser presumably yields at most 10
    # hits per shard, so rows=40 is likely never reached -- confirm against the
    # Solr docs and pass a larger top_k if 40 results are wanted.
    url = f"{endpoint.rstrip('/')}/{collection}/select"

    payload = {
        "q": f"{{!knn f={vector_field} topK={top_k}}}{embedding}",
        "fl": fields,
        "rows": rows,
        "wt": "json",
    }

    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }

    # Sent as a form-encoded POST body rather than a query string.
    response = requests.post(url, data=payload, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()

def display_results(results):
    """Print one line per document found under ``results["response"]["docs"]``.

    Args:
        results: Dict shaped like a decoded Solr response. A missing or null
            ``response`` / ``docs`` entry is treated as "no documents".

    Each line shows the document's ``id``, ``title`` and ``score``. Documents
    without a numeric ``score`` are printed with ``n/a`` instead of raising a
    ``TypeError`` from the ``.2f`` format.
    """
    docs = (results.get("response") or {}).get("docs") or []
    if not docs:
        print("No results found.")
        return

    for doc in docs:
        score = doc.get("score")
        # Only format numerically when a score was actually returned.
        score_text = f"{score:.2f}" if isinstance(score, (int, float)) else "n/a"
        print(f"* {doc.get('id')} {doc.get('title')} [score: {score_text}]")


In [None]:
# Connection settings for the kNN queries: target collection and Solr base URL.
collection = "books_m3_embeddings"
solr_endpoint = "http://localhost:8983/solr"


In [3]:

# Read the query definitions. JSON text is UTF-8, so decode it explicitly
# rather than relying on the platform's default encoding.
with open("../queries.json", "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# A missing "information_needs" key falls back to an empty list.
information_needs = data.get("information_needs", [])


In [5]:
print(information_needs)
# Index the query text of every information need by its id.
queries = {need.get("id"): need.get("query") for need in information_needs}


[{'id': 'IN_1', 'query': 'artificial intelligence', 'qrels_file': 'evaluation/IN_1/qrels.txt', 'systems': [{'name': 'base_system', 'query_url': 'http://localhost:8983/solr/books_m2/select?defType=edismax&indent=true&q.op=OR&q=artificial%20intelligence&qf=book_title%20reviews.text%20reviews.summary%20description%20categories&rows=40&useParams=', 'query_params': {'qf': 'book_title reviews.text reviews.summary description categories', 'rows': 40}}, {'name': 'enhanced_system', 'query_url': 'http://localhost:8983/solr/books_m2/select?defType=edismax&indent=true&q.op=OR&q=artificial%20intelligence&qf=book_title%5E2%20reviews.text%5E2%20reviews.summary%20description%20categories&rows=40&useParams=', 'query_params': {'qf': 'book_title^2 reviews.text^2 reviews.summary description categories', 'rows': 40}}, {'name': 'new_schema_system', 'query_url': 'http://localhost:8983/solr/books_m3_new_schema/select?defType=edismax&indent=true&q.op=OR&q=artificial%20intelligence&qf=book_title%5E2%20reviews.t

In [None]:

# Map each information-need id to the embedding string of its query text.
embeddings = {
    need_id: text_to_embedding(query_text)
    for need_id, query_text in queries.items()
}


In [None]:

# Run one kNN search per information need and keep the response under its id.
results = {}
for need_id, query_vector in embeddings.items():
    response_json = solr_knn_query(solr_endpoint, collection, query_vector)
    results[need_id] = response_json


In [None]:

# Save each result as JSON, one file per information-need id.
# open(..., "w") does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("./results", exist_ok=True)
for need_id, result in results.items():
    with open(f"./results/{need_id}.json", "w", encoding="utf-8") as outfile:
        json.dump(result, outfile)

