In [4]:
import duckdb
from pathlib import Path
import pandas as pd
from sentence_transformers import SentenceTransformer
from typing import List, Any

In [5]:
# Source data: survey responses from students.
df = pd.read_csv(Path("data") / "reflections.csv")

# Model used to embed the query string at search time.
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

# Embeddings for the text responses in the survey data have been pre-computed
# and are published as a parquet file; a local copy is kept under outputs/.
embeddings_url = "https://dreamlab-public.s3.us-west-2.amazonaws.com/sorapure/mxbai_embeddings.parquet"
embeddings_file = Path("outputs") / "mxbai_embeddings.parquet"

# Download the embeddings file only if there is no local copy yet.
if not embeddings_file.exists():
    # COPY ... TO does not create missing parent directories, so make sure
    # the target folder exists before DuckDB tries to write into it.
    embeddings_file.parent.mkdir(parents=True, exist_ok=True)
    duckdb.execute(f"COPY (SELECT * from read_parquet('{embeddings_url}')) TO '{embeddings_file}' (FORMAT PARQUET);")

In [25]:
def search(q: str, limit: int = 10) -> pd.DataFrame:
    """Rank the pre-computed response embeddings by distance to a query.

    The query text is prefixed with a retrieval prompt, encoded with the
    notebook-level ``model``, and compared against every row of the parquet
    file at ``embeddings_file`` using DuckDB's ``array_distance``.

    Args:
        q: Free-text search query.
        limit: Maximum number of rows to return.

    Returns:
        A DataFrame with the columns ``perm`` (the file's ``student_id``),
        ``question_id`` and ``distance``, sorted by ascending distance.

    Raises:
        TypeError, ValueError: If ``limit`` cannot be converted to an integer.
    """
    # LIMIT is spliced into the SQL text below, so coerce it to a plain
    # integer first; anything that is not a number is rejected here rather
    # than being pasted into the statement.
    limit = int(limit)

    # Prompt prefix placed in front of the query before encoding.
    # NOTE(review): presumably the retrieval prompt recommended for this
    # embedding model -- confirm against the model card.
    query = f"Represent this sentence for searching relevant passages: {q}"
    query_embed = model.encode(query)

    # The query vector is passed as the named parameter $embed; both sides
    # are cast to fixed-size FLOAT[1024] arrays for array_distance.
    sql = f"""
        FROM read_parquet('{embeddings_file}')
        SELECT 
            student_id as perm,
            question_id,
            array_distance(
                CAST(embedding as FLOAT[1024]),
                CAST($embed as FLOAT[1024])
            ) AS distance
        ORDER BY distance ASC
        LIMIT {limit};
    """
    return duckdb.execute(sql, {"embed": query_embed}).fetch_df()

In [29]:
# Example query: the ten closest responses (default limit), shown as the cell output.
result = search(q="students with english as a second language")
result


Unnamed: 0,perm,question_id,distance
0,Student2372,r1,11.986319
1,Student1950,r1,12.008486
2,Student3002,r1,12.017325
3,Student1531,r4,12.053126
4,Student153,r2,12.093024
5,Student162,r1,12.108867
6,Student2395,r1,12.13909
7,Student277,r4,12.205295
8,Student2555,r1,12.239036
9,Student2464,r4,12.290321


In [31]:
# Attach each hit's survey row from `df` by matching on the shared "perm"
# column (pandas' default inner join).
merged_result = result.merge(df, on="perm")

In [33]:
# Truncate long text cells for this one display only. option_context restores
# the previous max_colwidth on exit, so later cells are not silently affected.
# The frame must be rendered inside the block, hence the explicit display()
# (provided by the IPython kernel) instead of a bare last expression.
with pd.option_context("display.max_colwidth", 10):
    display(merged_result)

Unnamed: 0,perm,question_id,distance,survey_50,HSGPA,student_selection,final_placement,auto_read,cwp_quarter,data_permission,...,r1,r2,r3,r4,EOPGRP,Gender,ETH_GRP,ETH,DEPT0,first_gen_coll_student
0,Studen...,r1,11.986319,25.0,4.08,W1,W1,auto,F22,My dat...,...,I am a...,I thin...,They a...,I feel...,EOP,F,3,Latino,MATH/S...,1
1,Studen...,r1,12.008486,36.0,3.95,W1,W1,auto,F22,My dat...,...,Having...,The rh...,The cl...,I feel...,NOT_EOP,M,3,Latino,Global...,1
2,Studen...,r1,12.017325,33.0,3.89,W1,W1,auto,M2-23,My dat...,...,I read...,Back w...,These ...,I do n...,NOT_EOP,M,4,Asian,Statis...,0
3,Studen...,r4,12.053126,42.0,4.23,W2,W2,read,M2-21,My dat...,...,I read...,In my ...,When I...,As an ...,NOT_EOP,F,10,Foreign,Chemis...,0
4,Studen...,r2,12.093024,46.0,0.0,W2,W2,auto,W23,My dat...,...,i've a...,Most h...,Throug...,i beli...,NOT_EOP,M,8,White,Psycho...,1
5,Studen...,r1,12.108867,45.0,4.31,W2,W2,auto,F22,My dat...,...,During...,These ...,These ...,I thin...,NOT_EOP,F,8,White,Colleg...,0
6,Studen...,r1,12.13909,42.0,4.07,W2,W1,read,F20,My dat...,...,I have...,I have...,I have...,I beli...,NOT_EOP,M,10,Foreign,MATH/S...,0
7,Studen...,r4,12.205295,42.0,4.27,W2,W2,read,M2-22,My dat...,...,I was ...,Almost...,I have...,Writin...,NOT_EOP,F,4,Asian,EEMB/MCDB,0
8,Studen...,r1,12.239036,44.0,,W2,W2,read,W22,My dat...,...,As a S...,The fi...,I cons...,I thin...,NOT_EOP,M,3,Latino,Statis...,0
9,Studen...,r4,12.290321,40.0,3.92,W1,W1,auto,M2-22,My dat...,...,Both i...,While ...,I have...,In hig...,NOT_EOP,F,8,White,History,0
