# Vector Classifier + RAG SDK Demo

This notebook shows how to:

1. Index structured data (animals)
2. Classify structured queries
3. Index images (PlantNet-like)
4. Classify images
5. Index documents (URL/local file/GitHub)
6. Query documents using Python AI SDK tools

Each step includes a short explanation and concise code you can adapt.


In [1]:
# Setup: install optional deps (uncomment if needed)
# %pip install ai-sdk-python pinecone openai pydantic python-dotenv huggingface_hub requests pillow

import os
from vector_classifier import VectorClassifier

# Best-effort .env loading: python-dotenv is optional, so any failure while
# importing or calling it is deliberately ignored.
try:
    from dotenv import load_dotenv  # type: ignore
    load_dotenv()
except Exception:
    pass


def _read_env(name: str) -> str:
    """Return env var `name` stripped of whitespace and surrounding quote characters ('' if unset)."""
    raw_value = os.getenv(name) or ""
    return raw_value.strip().strip('"').strip("'")


PINECONE_API_KEY = _read_env("PINECONE_API_KEY")
OPENAI_API_KEY = _read_env("OPENAI_API_KEY")
# The index name is optional; fall back to the demo default when it is empty.
PINECONE_INDEX_NAME = _read_env("PINECONE_INDEX_NAME") or "vcp-1536-cosine"

# Fail fast: every later cell needs both keys.
assert PINECONE_API_KEY, "Set PINECONE_API_KEY"
assert OPENAI_API_KEY, "Set OPENAI_API_KEY"



  from .autonotebook import tqdm as notebook_tqdm


## 1) Indexing data (animals)
We convert simple labeled records into embeddings and upsert them to Pinecone. The classifier stores metadata (e.g., label, description) with each record.


In [None]:
import asyncio

# Classifier for the text demo: Pinecone settings, OpenAI embedding settings,
# and the default topK / threshold / vote values handed to VectorClassifier.
classifier = VectorClassifier(
    dict(
        pinecone=dict(
            apiKey=PINECONE_API_KEY,
            indexName=PINECONE_INDEX_NAME,
            metric="cosine",
            namespace="demo-sdk",
        ),
        embedding=dict(
            provider="openai",
            model="text-embedding-3-small",
            apiKey=OPENAI_API_KEY,
        ),
        defaults=dict(topK=5, threshold=0.1, vote="weighted"),
    )
)

async def index_animals():
    """Index the three demo animal records via `classifier.index_data` and return its result."""
    # (id, label, description, metadata) for each demo record.
    animal_rows = (
        ("1", "Cat", "Small domestic cat", {"color": "gray"}),
        ("2", "Dog", "Friendly domestic dog", {"size": "medium"}),
        ("3", "Rabbit", "Small herbivorous mammal", {"color": "white"}),
    )
    payload = [
        {"id": record_id, "label": label, "description": text, "metadata": extra}
        for record_id, label, text, extra in animal_rows
    ]
    return await classifier.index_data(payload)

# asyncio.run() raises RuntimeError when an event loop is already running, as
# it is inside a Jupyter/IPython kernel. Detect that case and run the coroutine
# on a worker thread (which owns no loop), so this cell works both in a
# notebook and as a plain script. In a notebook, `await index_animals()` is an
# equivalent shorthand.
try:
    asyncio.get_running_loop()
except RuntimeError:
    # No loop in this thread: plain-script execution.
    animal_index_stats = asyncio.run(index_animals())
else:
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as _runner:
        animal_index_stats = _runner.submit(asyncio.run, index_animals()).result()
animal_index_stats


## 2) Classifying data (basic usage)
We embed a query and retrieve the nearest labeled records. The predicted label is voted from the top matches.


In [None]:
async def classify_query():
    """Classify the sample description "Playful feline" with `classifier` and return the result."""
    sample = {"description": "Playful feline"}
    call_options = {"topK": 3, "threshold": 0.1}
    return await classifier.classify(sample, call_options)

# asyncio.run() raises RuntimeError when an event loop is already running, as
# it is inside a Jupyter/IPython kernel. Detect that case and run the coroutine
# on a worker thread (which owns no loop), so this cell works both in a
# notebook and as a plain script. In a notebook, `await classify_query()` is an
# equivalent shorthand.
try:
    asyncio.get_running_loop()
except RuntimeError:
    # No loop in this thread: plain-script execution.
    query_result = asyncio.run(classify_query())
else:
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as _runner:
        query_result = _runner.submit(asyncio.run, classify_query()).result()
query_result


## 3) Indexing images (PlantNet-like)
For the image modality, give each record an `imageUrl` and set the embedding provider to `hf` (Hugging Face) with an image-capable model, or keep `openai` if you use an image-embedding model. Vectors are stored together with the image metadata.


In [None]:
from vector_classifier import VectorClassifier as ImageClassifier

# Image classifier: same Pinecone index as the text demo but its own
# "demo-images" namespace. Provider, model and token come from the environment
# so they can be swapped without editing the cell (e.g. set IMG_EMBED_MODEL to
# "openai/clip-vit-base-patch32" for another CLIP checkpoint on Hugging Face).
# NOTE(review): CLIP ViT-B/32 embeddings are presumably 512-dimensional, while
# the default index name suggests 1536 dimensions. Confirm the index dimension
# matches the image model before indexing.
image_classifier = ImageClassifier(
    dict(
        pinecone=dict(
            apiKey=PINECONE_API_KEY,
            indexName=PINECONE_INDEX_NAME,
            metric="cosine",
            namespace="demo-images",
        ),
        embedding=dict(
            provider=os.getenv("IMG_EMBED_PROVIDER", "hf"),
            model=os.getenv("IMG_EMBED_MODEL", "sentence-transformers/clip-ViT-B-32"),
            apiKey=os.getenv("HUGGINGFACEHUB_API_TOKEN", ""),
            inputType="image",
        ),
        defaults=dict(topK=5, threshold=0.1, vote="weighted"),
    )
)

async def index_images():
    """Index the two demo plant images via `image_classifier.index_data` and return its result."""
    # id -> (label, image URL)
    labelled_urls = {
        "img1": ("daisy", "https://images.plantnet.org/1.jpg"),
        "img2": ("dandelion", "https://images.plantnet.org/2.jpg"),
    }
    payload = [
        {"id": image_id, "label": label, "imageUrl": url}
        for image_id, (label, url) in labelled_urls.items()
    ]
    return await image_classifier.index_data(payload)

# asyncio.run() raises RuntimeError when an event loop is already running, as
# it is inside a Jupyter/IPython kernel. Detect that case and run the coroutine
# on a worker thread (which owns no loop), so this cell works both in a
# notebook and as a plain script. In a notebook, `await index_images()` is an
# equivalent shorthand.
try:
    asyncio.get_running_loop()
except RuntimeError:
    # No loop in this thread: plain-script execution.
    image_index_stats = asyncio.run(index_images())
else:
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as _runner:
        image_index_stats = _runner.submit(asyncio.run, index_images()).result()
image_index_stats


## 4) Classifying images
We embed a query image and retrieve nearest labeled images. The top matches suggest the predicted label.


In [None]:
async def classify_image():
    """Classify one image URL with `image_classifier` and return the result.

    The URL is read from the TEST_IMAGE_URL environment variable, falling back
    to the second demo image when the variable is not set.
    """
    image_url = os.getenv("TEST_IMAGE_URL", "https://images.plantnet.org/2.jpg")
    call_options = {"topK": 3, "threshold": 0.1}
    return await image_classifier.classify({"imageUrl": image_url}, call_options)

# asyncio.run() raises RuntimeError when an event loop is already running, as
# it is inside a Jupyter/IPython kernel. Detect that case and run the coroutine
# on a worker thread (which owns no loop), so this cell works both in a
# notebook and as a plain script. In a notebook, `await classify_image()` is an
# equivalent shorthand.
try:
    asyncio.get_running_loop()
except RuntimeError:
    # No loop in this thread: plain-script execution.
    image_result = asyncio.run(classify_image())
else:
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as _runner:
        image_result = _runner.submit(asyncio.run, classify_image()).result()
image_result


## 5) Indexing documents
Use `RAGClient` and helpers to index a URL, a local file, or a GitHub repository. The content is chunked, embedded, and upserted.


In [None]:
from vector_classifier import RAGClient
from vector_classifier.rag.index import index_url, index_file, index_github

# RAG client used by the document-indexing helpers below; it targets the same
# Pinecone index and OpenAI embedding model as the text classifier.
rag_options = {
    "pinecone_api_key": PINECONE_API_KEY,
    "pinecone_index": PINECONE_INDEX_NAME,
    "embedding_provider": "openai",
    "embedding_model": "text-embedding-3-small",
}
rag = RAGClient(**rag_options)

async def index_docs_demo():
    """Index https://example.com through `index_url` (1024 tokens per chunk) and return its result.

    Alternative sources, shown for reference (swap in one of them if preferred):
        await index_file(rag, "./README.md", tokens_per_chunk=1024)
        await index_github(rag, "https://github.com/owner/repo", tokens_per_chunk=1024)
    """
    url_result = await index_url(rag, "https://example.com", tokens_per_chunk=1024)
    return url_result

# asyncio.run() raises RuntimeError when an event loop is already running, as
# it is inside a Jupyter/IPython kernel. Detect that case and run the coroutine
# on a worker thread (which owns no loop), so this cell works both in a
# notebook and as a plain script. In a notebook, `await index_docs_demo()` is
# an equivalent shorthand.
try:
    asyncio.get_running_loop()
except RuntimeError:
    # No loop in this thread: plain-script execution.
    docs_index_result = asyncio.run(index_docs_demo())
else:
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as _runner:
        docs_index_result = _runner.submit(asyncio.run, index_docs_demo()).result()
docs_index_result


## 6) Querying documents (ai-sdk-python tools)
We define two tools: `search_documents` (nearest neighbors) and `extract_content` (fetch chunk content). Then we let the model decide when to call them.


In [None]:
from ai_sdk import generate_text, openai, tool, embed_many
from pydantic import BaseModel, Field
from pinecone import Pinecone, ServerlessSpec

# Make sure the Pinecone index exists before querying it. When it is missing,
# create a serverless index with the demo defaults (1536 dimensions, cosine).
pc = Pinecone(api_key=PINECONE_API_KEY)
idx_name = PINECONE_INDEX_NAME or "vcp-1536-cosine"
existing = pc.list_indexes()
known_index_names = {entry.name for entry in (existing.indexes or [])}
if idx_name not in known_index_names:
    pc.create_index(
        name=idx_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
index = pc.Index(idx_name)

# Embedding model handle used to embed query text for the tools below.
embed_model = openai.embedding("text-embedding-3-small")

def _embed_query(text: str):
    """Embed a single string with `embed_model` and return the first embedding as a list."""
    batch = embed_many(model=embed_model, values=[text])
    first_vector = batch.embeddings[0]
    return list(first_vector)

# Argument model for the `search_documents` tool; converted to a JSON schema
# further down and mirrored by the parameters of `_exec_search_documents`.
# (Documented with `#` comments on purpose, so the text does not become part of
# the class itself.)
class SearchParams(BaseModel):
    # Free-text query; embedded before the vector search.
    query: str
    # Number of matches requested from the index (1-200, default 8).
    top_k: int = Field(8, ge=1, le=200)
    # Optional list of values matched against the `sourceUrl` metadata field.
    sources: list[str] | None = None
    # Optional minimum score in [0, 1]; lower-scoring matches are dropped.
    threshold: float | None = Field(None, ge=0.0, le=1.0)

def _exec_search_documents(query: str, top_k: int = 8, sources: list[str] | None = None, threshold: float | None = None) -> dict:
    """Run a vector search over `index` and return lightweight hit descriptors.

    Args:
        query: Text to embed and search with.
        top_k: Number of matches requested from the index.
        sources: When non-empty, restricts matches to these `sourceUrl` values.
        threshold: When given, matches scoring below it are dropped.

    Returns:
        {"results": [...], "totalResults": n}; each result carries the match
        id, its score, and a "chunk" dict with id and source metadata (no content).
    """
    source_fields = ("sourceUrl", "sourcePath", "sourceTitle")
    query_vector = _embed_query(query)
    source_filter = {"sourceUrl": {"$in": sources}} if sources else None
    response = index.query(
        top_k=top_k,
        vector=query_vector,
        include_metadata=True,
        include_values=False,
        filter=source_filter,
    )

    hits = []
    for match in response.matches or []:
        # A missing or None score is treated as 0.0.
        score = float(getattr(match, "score", 0.0) or 0.0)
        if threshold is None or not (score < threshold):
            meta = getattr(match, "metadata", {}) or {}
            chunk = {"id": match.id, **{field: meta.get(field) for field in source_fields}}
            hits.append({"id": match.id, "score": score, "chunk": chunk})
    return {"results": hits, "totalResults": len(hits)}

# Argument model for the `extract_content` tool; converted to a JSON schema
# further down and mirrored by the parameters of `_exec_extract_content`.
# (Documented with `#` comments on purpose, so the text does not become part of
# the class itself.)
class ExtractParams(BaseModel):
    # Source URL; used as the `sourceUrl` filter when no explicit id is given.
    url: str
    # Single chunk id to fetch; takes precedence over the other selectors.
    chunk_id: str | None = None
    # Flag checked together with `chunk_ids` by `_exec_extract_content`.
    include_all_chunks: bool = False
    # Explicit list of chunk ids to fetch.
    chunk_ids: list[str] | None = None

def _exec_extract_content(url: str, chunk_id: str | None = None, include_all_chunks: bool = False, chunk_ids: list[str] | None = None) -> dict:
    """Fetch stored chunk content from `index`.

    Which ids are fetched, in priority order:
      1. `chunk_id`, when given.
      2. `chunk_ids`, when given. Previously these were silently ignored unless
         `include_all_chunks` was also True; they are now always honoured.
      3. Otherwise, the ids of up to 100 matches of a query filtered on
         `sourceUrl == url`.

    Args:
        url: Source URL; used for the metadata filter in case 3.
        chunk_id: Single chunk id to fetch.
        include_all_chunks: Kept for interface compatibility. All chunks found
            for `url` are returned whenever no explicit id is supplied.
        chunk_ids: Explicit chunk ids to fetch.

    Returns:
        {"chunks": [...]} with id, content and source metadata for each id that
        the fetch returned, in the order the ids were requested.
    """
    if chunk_id:
        ids = [chunk_id]
    elif chunk_ids:
        ids = list(chunk_ids)
    else:
        # No explicit ids: run a filtered vector query (the URL text itself is
        # embedded as the query vector) just to collect ids for this source.
        vec = _embed_query(url)
        qr = index.query(top_k=100, vector=vec, include_metadata=True, filter={"sourceUrl": url})
        ids = [m.id for m in (qr.matches or [])]
    if not ids:
        return {"chunks": []}

    fetched = index.fetch(ids=ids)
    vectors = getattr(fetched, "vectors", {}) or {}
    chunks = []
    for cid in ids:
        record = vectors.get(cid)
        if not record:
            # Requested id was not returned by the fetch; skip it.
            continue
        md = getattr(record, "metadata", None) or {}
        chunks.append({
            "id": cid,
            "content": md.get("content"),
            "sourceUrl": md.get("sourceUrl"),
            "sourcePath": md.get("sourcePath"),
            "sourceTitle": md.get("sourceTitle"),
        })
    return {"chunks": chunks}

# Build JSON schemas for the tool helper. `model_json_schema()` (Pydantic v2)
# is tried first; on any error both schemas fall back to the older `.schema()`.
try:
    search_schema, extract_schema = [
        params_model.model_json_schema() for params_model in (SearchParams, ExtractParams)
    ]
except Exception:
    search_schema, extract_schema = [
        params_model.schema() for params_model in (SearchParams, ExtractParams)
    ]

# Tool wrappers handed to the model: a name, a description, the argument
# schema, and the Python callable that executes the tool.
search_tool = tool(
    name="search_documents",
    description="Search indexed documents",
    parameters=search_schema,
    execute=_exec_search_documents,
)
extract_tool = tool(
    name="extract_content",
    description="Extract chunk content",
    parameters=extract_schema,
    execute=_exec_extract_content,
)

# Ask the chat model to answer using the two tools defined above and print the
# final text.
model = openai("gpt-4o-mini")
prompt = " ".join(
    [
        "You can search a documentation corpus.",
        "1) Use search_documents to find relevant chunks about 'Example'.",
        "2) Call extract_content for the best chunk to cite exact text. Answer briefly.",
    ]
)
res = generate_text(
    model=model,
    prompt=prompt,
    tools=[search_tool, extract_tool],
)
print(res.text)
