In [14]:
import openai
import qdrant_client
import json
from dotenv import load_dotenv
import os

load_dotenv(override=True)
openai.api_key = os.getenv("OPENAI_API_KEY")

openai_client = openai.Client(
    api_key=openai.api_key,
)

qdrant = qdrant_client.QdrantClient(":memory:")
embedding_model = "text-embedding-3-small"

# 1) Load your JSON list
with open("linkedin_profiles_raw.json", "r") as f:
    profiles = json.load(f)


In [13]:
from qdrant_client.models import PointStruct
# 3) Prepare your Qdrant collection
collection_name = "linkedin_profiles"
qdrant.recreate_collection(
    collection_name=collection_name,
    vectors_config={"size": 1536, "distance": "Cosine"},
)

def chunk_text(text, max_chars=2000):
    """Yield successive chunks of at most max_chars characters."""
    for i in range(0, len(text), max_chars):
        yield text[i : i + max_chars]

def profile_to_text(profile: dict) -> str:
    """Format only the key fields of a LinkedIn profile for embedding."""
    parts = []
    name = profile.get("name", "")
    title = profile.get("position", "")
    parts.append(f"{name} — {title}")
    
    if about := profile.get("about"):
        parts.append(about)
    
    if curr := profile.get("current_company", {}) or {}:
        if curr_name := curr.get("name"):
            parts.append(f"Current company: {curr_name}")
    
    # Safely handle experience possibly being None
    exp_list = profile.get("experience") or []
    exp_entries = []
    for exp in exp_list:
        t = exp.get("title")
        c = exp.get("company")
        if t and c:
            exp_entries.append(f"{t} at {c}")
    if exp_entries:
        parts.append("Experience: " + "; ".join(exp_entries))
    
    return "\n\n".join(parts)



# 4) Loop: serialize, embed, upsert
global_idx = 0
for profile in profiles:
    if "id" not in profile:
        print(f"{profile} is missing 'id', skipping.")
        continue
    print(profile)
    text = profile_to_text(profile)
    # serialized = json.dumps(profile, ensure_ascii=False)
    
    for chunk_idx, chunk in enumerate(chunk_text(text, max_chars=2000)):
        resp = openai_client.embeddings.create(
            model=embedding_model,
            input=chunk,
        )
        vector = resp.data[0].embedding
        
        point = PointStruct(
            id=global_idx,
            vector=vector,
            payload={
                "profile_id": profile["id"],
                "chunk_index": chunk_idx,
                "current_company": profile.get("current_company", {}).get("name"),
                "experience_companies": [
                    e.get("company")
                    for e in (profile.get("experience") or [])
                    if e.get("company")
                ],
                "text": chunk,
                # …other metadata…
            },
        )
        qdrant.upsert(collection_name=collection_name, points=[point])
        global_idx += 1


  qdrant.recreate_collection(


{'id': 'violet-johnson-156b831b5', 'name': 'Violet Johnson', 'city': 'Berkeley, California, United States', 'country_code': 'US', 'position': 'ECE Teacher and Nature Lover', 'about': 'I am currently working as an Early Childhood Education Teacher. With my degree in Conservation & Resource Studies with a minor in Education, I focus on outdoor education, community engagement, accessibility and equity, indigenous interactions with nature, and public land management. All in all, I am passionate about connecting people with nature, especially youth. I have experience leading teenage crews in State and National Parks with The Conservation Corps, managing teenage employees in several different settings, and of course classroom experience as a preschool teacher. I believe in the transformative power of nature to educate, uplift, and heal— and that needs to be an opportunity accessible for all.', 'current_company': {'link': 'https://www.linkedin.com/school/uc-berkeley/', 'name': 'University of 

In [None]:
# 2) Build a payload filter
from qdrant_client.http.models import Filter, FieldCondition, MatchValue
payload_filter = Filter(
    must=[
        FieldCondition(
            key="current_company",
            match=MatchValue(value="Apple")
        )
    ]
)

qdrant.search(
    collection_name=collection_name,
    query_vector=openai_client.embeddings.create(
        input=["Find me some teachers"],
        model=embedding_model,
    )
    .data[0]
    .embedding,
    # query_filter=payload_filter,
)


  qdrant.search(


[ScoredPoint(id=2, version=0, score=0.3731542906738558, payload={'profile_id': 'margaret-peterson-2531561b2', 'chunk_index': 0, 'current_company': 'BERKELEY UNIFIED SCHOOL DISTRICT', 'experience_companies': ['BERKELEY UNIFIED SCHOOL DISTRICT', 'Modesto City Schools', 'University of California, Berkeley', 'Brenda Athletic Clubs'], 'text': 'Margaret Peterson — Student in the Berkeley Teacher Education Program\n\nCurrently, I am studying in the Berkeley Teacher Education Program (BTEP) to get my single subject teaching credential in Social Studies and my masters in Education. I am looking for secondary social studies teaching positions for the 25-26 school year.\n\nCurrent company: BERKELEY UNIFIED SCHOOL DISTRICT\n\nExperience: Student Teacher at BERKELEY UNIFIED SCHOOL DISTRICT; Modesto City Schools at Modesto City Schools; University of California, Berkeley at University of California, Berkeley; Brenda Athletic Clubs at Brenda Athletic Clubs'}, vector=None, shard_key=None, order_value=