In [3]:
from ollama import Client

# Talk to the Ollama server on this machine (the default Ollama host).
client = Client(host="http://localhost:11434")

# Ask the nomic-embed-text model for the embedding of a single prompt.
resp = client.embeddings(
    model="nomic-embed-text",
    prompt="What is dimensionality reduction?",
)
vec = resp["embedding"]

# Report the vector length together with its first eight components.
print(len(vec), vec[:8])


768 [1.5742555856704712, 1.3248658180236816, -3.011218547821045, -1.5568761825561523, 0.3055654764175415, 0.9362785816192627, 0.8319445848464966, 1.15894615650177]


In [4]:
from langchain_community.embeddings import OllamaEmbeddings

# LangChain wrapper around the Ollama embedding model (or use 'mxbai-embed-large').
# NOTE(review): the stray `emb = OllamaEmbeddings(...)` line in this cell's output
# looks like the tail of a deprecation warning; this class is reportedly superseded
# by the one in the separate langchain-ollama package -- confirm before upgrading.
emb = OllamaEmbeddings(model="nomic-embed-text")

# A single query string yields one vector.
q_vec = emb.embed_query("What is dimensionality reduction?")

# A list of texts yields one vector per text.
docs_vecs = emb.embed_documents(
    ["Who is Laurens van der Maaten?", "What is dimensionality reduction?"]
)

# Query vector length, number of document vectors, length of the first document vector.
print(len(q_vec), len(docs_vecs), len(docs_vecs[0]))


768 2 768


  emb = OllamaEmbeddings(model='nomic-embed-text')  # or 'mxbai-embed-large'


In [None]:
import os
# Make sure the output folder exists; no error if it is already there.
os.makedirs("figures", exist_ok=True)

# Matplotlib export. `plt` is not imported in this cell, so it has to be defined
# by an earlier cell.
# NOTE(review): savefig writes pyplot's *current* figure, so this presumably needs
# to run in the same cell that draws the plot -- confirm.
plt.savefig(
    "figures/umap_education_resumes.png",
    bbox_inches="tight",
    dpi=300,
)
print("Saved: figures/umap_education_resumes.png")

In [None]:
import os
# Make sure the output folder exists; no error if it is already there.
os.makedirs("figures", exist_ok=True)

# Export `fig` (defined elsewhere; presumably a Plotly figure -- TODO confirm) as a
# 1000x700 image rendered at 2x scale.
# This is the same output path the matplotlib export cell uses, so whichever of the
# two cells runs last overwrites the other's file.
fig.write_image(
    "figures/umap_education_resumes.png",
    scale=2,
    width=1000,
    height=700,
)
print("Saved: figures/umap_education_resumes.png")

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import os
from collections import OrderedDict

# Step 1: Configure the sentence-transformers embedding model
print("🔄 Loading embedding model...")
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    # Runs on the 'mps' device; change to 'cuda' if you have a CUDA GPU.
    model_kwargs=dict(device="mps"),
    # Ask the encoder to normalize the embeddings it returns.
    encode_kwargs=dict(normalize_embeddings=True),
)

# Step 2: Embed the document chunks into a Chroma collection
print("🔄 Creating Chroma vector store...")
# `split_documents` is not defined in this cell; it comes from a previous step.
# NOTE(review): re-running this cell against an existing ./chroma_db presumably
# inserts the same chunks a second time -- confirm, and clear the directory or
# reload the existing store if so.
vectorstore = Chroma.from_documents(
    documents=split_documents,
    embedding=embeddings,
    collection_name="resume_collection",
    persist_directory="./chroma_db",  # on-disk location of the database
)

def deduplicate_by_resume_id(docs, k=5):
    """
    Collapse retrieved chunks to one per resume and return at most ``k`` of them.

    The first chunk seen for each ``resume_id`` is kept and later chunks with
    the same id are dropped. This keeps the highest-scoring chunk per resume
    only if ``docs`` is ordered best match first (assumed; verify against the
    retriever that produced ``docs``).

    Args:
        docs: Iterable of document objects, each exposing a ``metadata``
            mapping that contains a ``'resume_id'`` key.
        k: Maximum number of unique resumes to return. Values <= 0 give an
            empty list.

    Returns:
        list: Up to ``k`` documents, one per distinct ``resume_id``, in the
        order their resume was first encountered.

    Raises:
        KeyError: If an examined document's metadata lacks ``'resume_id'``.
    """
    if k <= 0:
        return []

    seen_resumes = OrderedDict()

    for doc in docs:
        resume_id = doc.metadata['resume_id']
        if resume_id not in seen_resumes:
            seen_resumes[resume_id] = doc
            # Stop as soon as k distinct resumes are collected; anything
            # further down the ranking could not make it into the result.
            if len(seen_resumes) >= k:
                break
        # A repeated resume_id is skipped: the earlier chunk stays.

    return list(seen_resumes.values())

# Report on the store that was just built. The count is len(split_documents),
# i.e. the number of chunks handed to Chroma, not a figure read back from the
# collection itself.
print(f"✅ Created Chroma vector store with {len(split_documents)} documents")
# The path below is a literal; it mirrors the persist_directory passed to
# Chroma.from_documents in this cell.
print(f"📁 Database saved to: ./chroma_db")

# Step 3: Test the vector store
# Smoke test: run one similarity search and print the three best matches.
print("\n🧪 Testing vector store...")
test_query = "Python developer with machine learning experience"
results = vectorstore.similarity_search(test_query, k=3)

print(f"Query: '{test_query}'")
print("Top 3 matches:")
# deduplicate_by_resume_id is defined in this cell but is not applied here, so
# the matches are raw chunks.
# NOTE(review): presumably several of them can come from the same resume -- confirm.
# The 'category', 'resume_id' and 'word_count' metadata keys are assumed to have
# been attached when split_documents was built -- TODO confirm; a missing key
# raises KeyError part-way through the printout.
for i, doc in enumerate(results):
    print(f"\n{i+1}. Category: {doc.metadata['category']}")
    print(f"   Resume ID: {doc.metadata['resume_id']}")
    print(f"   Word count: {doc.metadata['word_count']}")
    # Only the first 150 characters of the chunk text are shown.
    print(f"   Content preview: {doc.page_content[:150]}...")