In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer

In [3]:
# Instantiate the "all-MiniLM-L6-v2" sentence-embedding model by name, then
# write a copy to model/all-MiniLM-L6-v2 so it can be loaded from that path.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")
model.save(path="model/all-MiniLM-L6-v2")


In [4]:
# Encode a short biography and convert the result to a plain Python list.
# The adjacent string literals are concatenated by the parser into the single
# text that is passed to encode().
embedding = model.encode(
    "joe is a student at the university of california. "
    "He majors in computer science and mathematics. "
    "He is on the swim team and is a sophomore. "
    "After his junior year he will be able to study abroad in Europe. "
    "He is looking forward to that. "
    "He is also a member of the math club and the computer science club. "
    "He is a very good student and has a 3.8 GPA. "
    "He is also a member of the honor society. "
    "He is very active in his church and volunteers at the local food bank. "
    "He is a very nice person and gets along with everyone."
).tolist()

In [5]:
# Echo the embedding list; as the cell's last expression it is rendered as the
# cell output (every component is shown, so the output is long).
embedding

[0.003803565399721265,
 0.02723400481045246,
 -0.018084663897752762,
 -0.08185137063264847,
 -0.014777103438973427,
 0.014277405105531216,
 0.013470258563756943,
 -0.01963549479842186,
 -0.040503546595573425,
 -0.01851978898048401,
 0.025883087888360023,
 -0.07763375341892242,
 -0.0033295254688709974,
 0.03704119846224785,
 0.037440843880176544,
 0.019637469202280045,
 -0.03518371284008026,
 -0.03535875678062439,
 -0.06973189115524292,
 -0.016227807849645615,
 -0.08545004576444626,
 -0.08302445709705353,
 -0.03220454230904579,
 -0.018492160364985466,
 0.010004537180066109,
 0.0334843285381794,
 0.0822862833738327,
 -0.04860715568065643,
 -0.03615349158644676,
 0.01669810712337494,
 -0.04814154654741287,
 0.028977753594517708,
 0.02729959227144718,
 0.024720842018723488,
 0.002880653366446495,
 0.0308060385286808,
 0.046716369688510895,
 0.06554529070854187,
 0.001784137450158596,
 0.042290281504392624,
 -0.05554564669728279,
 -0.04643436521291733,
 0.06983175873756409,
 0.0385190211236

In [None]:
import re
from collections import Counter

import numpy as np
import pandas as pd

# Sample people records: each row lists the values for the field names in the
# tuple below, and is expanded into one dict per person (same key order).
people = [
    dict(zip(
        ("id", "name", "university", "major", "hobbies", "sports_teams", "description"),
        row,
    ))
    for row in (
        (1, "Alice", "Brown University", "Computer Science",
         "rowing, chess", "rowing team", "Enjoys algorithms"),
        (2, "Bob", "Harvard University", "Mathematics",
         "soccer, reading", "soccer team", "Loves numbers"),
        (3, "Carol", "Brown University", "Biology",
         "photography, rowing", "rowing team", "Biology enthusiast"),
    )
]

# Free-text queries that are scored against every person record.
queries = [
    "brown university rowing computer science major",
    "harvard math soccer",
    "biology photography",
]

# Tokenizer
def tokenize(text):
    """Split ``text`` into lowercase word tokens.

    The text is lowercased first; every maximal run of regex word characters
    then becomes one token, so whitespace and punctuation act as separators.

    Returns:
        A list of token strings in order of appearance (empty for empty text).
    """
    lowered = text.lower()
    return [match.group() for match in re.finditer(r"\w+", lowered)]

# Gather one combined text per person (descriptive fields separated by single
# spaces), followed by the raw queries.
all_texts = [
    f"{p['university']} {p['major']} {p['hobbies']} {p['sports_teams']} {p['description']}"
    for p in people
]
all_texts += queries

# The vocabulary is the sorted set of distinct tokens across all those texts.
vocab = sorted({token for text in all_texts for token in tokenize(text)})

# Vectorize function: simple token-count embedding
def vectorize(text):
    """Turn ``text`` into a token-count vector over the module-level ``vocab``.

    Args:
        text: String to vectorize; it is split with ``tokenize``.

    Returns:
        A float NumPy array with one entry per ``vocab`` term, in ``vocab``
        order, holding how many times that term occurs in ``text``. Tokens
        that are not in ``vocab`` contribute nothing.
    """
    # Tally all tokens in one pass; the previous tokens.count(tok) rescanned
    # the token list once for every vocabulary term.
    counts = Counter(tokenize(text))
    # Counter yields 0 for terms that never occurred in the text.
    return np.array([counts[tok] for tok in vocab], dtype=float)

# Map each person's id to the count vector of their combined profile text, so
# the vectors are computed once and reused for every query.
person_vectors = dict(
    (
        person['id'],
        vectorize(
            f"{person['university']} {person['major']} {person['hobbies']} {person['sports_teams']} {person['description']}"
        ),
    )
    for person in people
)

# Cosine similarity function
def cosine_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``, or ``0.0`` when either vector
        has zero norm.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        # A zero vector (e.g. a text containing no vocabulary tokens) would
        # otherwise give 0/0, i.e. NaN plus a RuntimeWarning, and NaN scores
        # cannot be ranked meaningfully. Treat it as "no similarity".
        return 0.0
    return np.dot(a, b) / denom

# For each query: vectorize it, score it against every precomputed person
# vector, and print the people ranked from most to least similar.
for q in queries:
    q_vec = vectorize(q)
    sims = [
        {
            'query': q,
            'person': p['name'],
            'similarity': cosine_sim(q_vec, person_vectors[p['id']]),
        }
        for p in people
    ]
    df = pd.DataFrame(sims).sort_values('similarity', ascending=False)
    print(f"\nQuery: {q}")
    print(df)



Query: brown university rowing computer science major
                                            query person  similarity
0  brown university rowing computer science major  Alice    0.707107
2  brown university rowing computer science major  Carol    0.452911
1  brown university rowing computer science major    Bob    0.123091

Query: harvard math soccer
                 query person  similarity
1  harvard math soccer    Bob    0.522233
0  harvard math soccer  Alice    0.000000
2  harvard math soccer  Carol    0.000000

Query: biology photography
                 query person  similarity
2  biology photography  Carol    0.588348
0  biology photography  Alice    0.000000
1  biology photography    Bob    0.000000


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

# Load the embedding model from the local directory model/all-MiniLM-L6-v2.
# (The original line read `model = model = ...`; the repeated target was a
# harmless but confusing typo.)
model = SentenceTransformer('model/all-MiniLM-L6-v2')


# Sample people records: each row lists the values for the field names in the
# tuple below, and is expanded into one dict per person (same key order).
people = [
    dict(zip(
        ("id", "name", "university", "major", "hobbies", "sports_teams", "description"),
        row,
    ))
    for row in (
        (1, "Alice", "Brown University", "Computer Science",
         "rowing, chess", "rowing team", "Enjoys algorithms"),
        (2, "Bob", "Harvard University", "Mathematics",
         "soccer, reading", "soccer team", "Loves numbers"),
        (3, "Carol", "Brown University", "Biology",
         "photography, rowing", "rowing team", "Biology enthusiast"),
    )
]

# Free-text queries that are scored against every person record.
queries = [
    "brown university rowing computer science major",
    "harvard math soccer",
    "biology photography",
]

# Build one text per person from the descriptive fields, separated by single
# spaces; the name and id are deliberately not part of the embedded text.
person_texts = []
for p in people:
    person_texts.append(
        f"{p['university']} {p['major']} {p['hobbies']} {p['sports_teams']} {p['description']}"
    )

# Encode all person texts in a single call, requesting NumPy output.
person_embeddings = model.encode(sentences=person_texts, convert_to_numpy=True)

# Encode all queries the same way.
query_embeddings = model.encode(sentences=queries, convert_to_numpy=True)

# Cosine similarity function
def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Args:
        a: First vector.
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``, or ``0.0`` when either vector
        has zero norm.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        # An all-zero vector would otherwise give 0/0, i.e. NaN plus a
        # RuntimeWarning, and NaN scores cannot be ranked meaningfully.
        return 0.0
    return np.dot(a, b) / denom

# For each query, pair its embedding with every person embedding, score the
# pair with cosine_sim, and print the people ranked by descending similarity.
for q, q_emb in zip(queries, query_embeddings):
    sims = [
        {'person': p['name'], 'similarity': cosine_sim(q_emb, p_emb)}
        for p, p_emb in zip(people, person_embeddings)
    ]
    df = pd.DataFrame(sims).sort_values('similarity', ascending=False)
    print(f"\nQuery: {q}")
    # to_string(index=False) omits the DataFrame index column from the output.
    print(df.to_string(index=False))


SyntaxError: incomplete input (3238129069.py, line 67)