In [None]:
import duckdb
from pathlib import Path
import pandas as pd
from sentence_transformers import SentenceTransformer
from typing import List, Any
from IPython.display import Markdown as md

In [None]:
# Source data: survey responses from students.
df = pd.read_csv(Path("data") / 'reflections.csv')

# Load the model used to compute the embedding of the query string.
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

# Embeddings for the text responses in the survey data have been pre-computed
# and are published as a parquet file; a local copy is kept under "outputs/".
embeddings_url = "https://dreamlab-public.s3.us-west-2.amazonaws.com/sorapure/mxbai_embeddings.parquet"
embeddings_file = Path("outputs") / 'mxbai_embeddings.parquet'

# Download the embeddings file only if there is no local copy yet.
if not embeddings_file.exists():
    # Make sure the target directory exists before DuckDB writes into it;
    # on a fresh checkout "outputs/" may be missing and the COPY would fail.
    embeddings_file.parent.mkdir(parents=True, exist_ok=True)
    duckdb.execute(f"COPY (SELECT * from read_parquet('{embeddings_url}')) TO '{embeddings_file}' (FORMAT PARQUET);")

In [None]:
def search_df(q: str, limit: int = 10) -> pd.DataFrame:
    """Find the stored survey responses whose embeddings are closest to a query.

    The query is wrapped in an instruction prefix, encoded with the
    module-level ``model`` and compared against every embedding stored in
    ``embeddings_file`` using DuckDB's ``array_distance``.

    Args:
        q: Free-text search query.
        limit: Maximum number of rows to return.

    Returns:
        A DataFrame ordered by ascending ``distance`` with the columns
        ``perm``, ``question_id``, ``distance`` and ``text``. ``text`` is the
        value found in ``df`` on the row matching ``perm``, in the column
        named by ``question_id``; it is missing (NaN) when ``df`` has no row
        for that ``perm``.

    Raises:
        KeyError: If a returned ``question_id`` is not a column of ``df``.
    """
    query = f"Represent this sentence for searching relevant passages: {q}"
    query_embed = model.encode(query)
    # int() guarantees that only a number is ever interpolated into the SQL.
    sql = f"""
        FROM read_parquet('{embeddings_file}')
        SELECT 
            student_id as perm,
            question_id,
            array_distance(
                CAST(embedding as FLOAT[1024]),
                CAST($embed as FLOAT[1024])
            ) AS distance
        ORDER BY distance ASC
        LIMIT {int(limit)};
    """
    result = duckdb.execute(sql, {"embed": query_embed}).fetch_df()

    # A left join against one row per ``perm`` yields exactly one merged row
    # per search hit, in the same order as ``result``. The previous inner join
    # dropped hits without a match in ``df`` (and multiplied hits for repeated
    # ``perm`` values), so texts could be attached to the wrong rows.
    # If ``df`` repeats a ``perm``, the first occurrence is used.
    responses = df.drop_duplicates(subset="perm")
    merged = result.merge(responses, on="perm", how="left")

    # Positional assignment via a list: no index alignment involved, and it
    # also works when the search returned no rows.
    result['text'] = [row[row['question_id']] for _, row in merged.iterrows()]
    return result

def search_display(q: str, limit: int = 10):
    """Render the text of each search hit for ``q`` as Markdown.

    Calls ``search_df(q, limit)`` and displays, in result order, every entry
    of its ``text`` column whose type is exactly ``str``; any other entry
    (for example a missing value) is skipped.
    """
    hits = search_df(q, limit)
    # Lazily filter so each string is displayed as soon as it is reached.
    string_texts = (text for text in hits['text'] if type(text) is str)
    for text in string_texts:
        display(md(text))


In [None]:
# Example search: render the stored responses closest to this query as
# Markdown (search_display's default limit of 10 applies).
search_display("athletics and sports")