In [1]:
!pip install scikit-learn pypdf pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
!pip install -U "qdrant-client>=1.8.2"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
!pip show qdrant-client

Name: qdrant-client
Version: 1.15.1
Summary: Client library for the Qdrant vector search engine
Home-page: 
Author: Andrey Vasnetsov
Author-email: andrey@qdrant.tech
License: Apache-2.0
Location: /Users/slincastro/Projects/Master_IA/final_project/saucode/sauco-env/lib/python3.12/site-packages
Requires: grpcio, httpx, numpy, portalocker, protobuf, pydantic, urllib3
Required-by: 


In [23]:
from qdrant_client import QdrantClient

COLLECTION = "my_collection_tfidf"

# HTTP connection to the local Qdrant server; the client/server version
# compatibility check is switched off.
client = QdrantClient(
    url="http://localhost:6333",
    prefer_grpc=False,
    check_compatibility=False,
)

# Remove the collection when it is present and report which case occurred.
if not client.collection_exists(COLLECTION):
    print(f"Collection '{COLLECTION}' does not exist.")
else:
    client.delete_collection(COLLECTION)
    print(f"Collection '{COLLECTION}' deleted.")

Collection 'my_collection_tfidf' deleted.


In [7]:
from pathlib import Path
import math, uuid
import pandas as pd
from pypdf import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer

from qdrant_client import QdrantClient
from qdrant_client.http.models import (
    PointStruct, SparseVector, SparseVectorParams, NamedSparseVector
)

# PDF that this cell reads and indexes (relative path).
PDF_PATH = Path("./knowledge/cc_knowledge_book.pdf")
# Name of the Qdrant collection this cell recreates, fills and queries.
COLLECTION = "my_collection_tfidf"
# Maximum number of points sent in one upsert request.
BATCH_SIZE = 512

def csr_row_to_sparse(row_csr):
    """Return ``(indices, values)`` of a sparse row as two plain Python lists.

    ``row_csr`` must expose ``.indices`` and ``.data`` attributes that support
    ``.tolist()``.
    """
    indices = row_csr.indices.tolist()
    values = row_csr.data.tolist()
    return indices, values

# Stable ID: UUID v5 derived from the original string id (e.g. "p2_c1")
def to_uuid(s: str) -> uuid.UUID:
    """Map a chunk id to a deterministic UUIDv5.

    The module-level ``COLLECTION`` name is prefixed to ``s`` so that the same
    chunk id yields a different UUID in a different collection.
    """
    scoped_name = f"{COLLECTION}:{s}"
    return uuid.uuid5(uuid.NAMESPACE_URL, scoped_name)

def read_pdf_pages(pdf_path: Path) -> list[dict]:
    """Extract the text of every page of the PDF at ``pdf_path``.

    Returns one ``{"page": <1-based page number>, "text": <str>}`` dict per
    page; a page whose ``extract_text()`` result is falsy gets ``""``.
    """
    reader = PdfReader(str(pdf_path))
    extracted = []
    for number, pdf_page in enumerate(reader.pages, start=1):
        extracted.append({"page": number, "text": pdf_page.extract_text() or ""})
    return extracted

def chunk_text(text: str, chunk_size: int = 900, overlap: int = 150) -> list[str]:
    """Split ``text`` into overlapping chunks of whitespace-separated tokens.

    Args:
        text: Input text; it is tokenized with ``str.split()``.
        chunk_size: Maximum number of tokens per chunk (must be >= 1).
        overlap: Number of trailing tokens of one chunk repeated at the start
            of the next one (must be smaller than ``chunk_size``).

    Returns:
        The chunks, each re-joined with single spaces. Empty or
        whitespace-only text yields ``[]``.

    Raises:
        ValueError: If ``chunk_size < 1`` or ``overlap >= chunk_size``. With
            such values the window would never advance, so the function
            would loop forever instead of returning.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    toks = text.split()
    n = len(toks)
    step = chunk_size - overlap  # strictly positive thanks to the checks above
    out = []
    for start in range(0, n, step):
        out.append(" ".join(toks[start:start + chunk_size]))
        # Stop as soon as a chunk reaches the last token, so no trailing
        # chunk made only of already-emitted overlap is produced.
        if start + chunk_size >= n:
            break
    return out

# Fail early with a clear message when the input PDF is missing.
if not PDF_PATH.exists():
    raise FileNotFoundError(f"No se encontró el PDF en {PDF_PATH}.")

# One row per chunk; the id encodes page number and chunk position, e.g. "p2_c1".
pages = read_pdf_pages(PDF_PATH)
rows = []
for p in pages:
    for j, c in enumerate(chunk_text(p["text"], chunk_size=300, overlap=50), start=1):
        rows.append({"id": f"p{p['page']}_c{j}", "page": p["page"], "text": c})

# A PDF without extractable text produces no rows; without this guard the
# DataFrame would have no columns and df["text"] below would raise KeyError.
if not rows:
    raise ValueError(f"No chunks produced from {PDF_PATH}. Check that the PDF contains extractable text.")

df = pd.DataFrame(rows)
print("Corpus:", df.shape)
print(df.head(2))

# TF-IDF over unigrams and bigrams; row i of X is the vector of df row i.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True)
X = vectorizer.fit_transform(df["text"])

# Connect to the local Qdrant server over HTTP.
client = QdrantClient(
    url="http://localhost:6333",
    prefer_grpc=False,
    # remove this once server and client are aligned on the same major version (ideal)
    check_compatibility=False,
)

# Preferred over recreate_collection (deprecated): delete explicitly, then create.
if client.collection_exists(COLLECTION):
    client.delete_collection(COLLECTION)

# The collection holds only one named sparse vector, "text".
client.create_collection(
    collection_name=COLLECTION,
    vectors_config={},  # no dense vectors
    sparse_vectors_config={"text": SparseVectorParams()},
)

# Batched upsert: the point id is a UUID and the original id is kept in the payload.
n = len(df)
for b, lo in enumerate(range(0, n, BATCH_SIZE)):
    hi = min(lo + BATCH_SIZE, n)
    points = []
    for i in range(lo, hi):
        row = df.iloc[i]
        orig_id = str(row["id"])  # e.g. "p2_c1"
        idx, vals = csr_row_to_sparse(X[i])
        point = PointStruct(
            # stable UUID, passed to Qdrant in string form
            id=str(to_uuid(orig_id)),
            # the original id is preserved as metadata under "chunk_id"
            payload={"page": int(row["page"]), "chunk_id": orig_id, "text": row["text"]},
            # the sparse vector named "text" goes inside the "vector" dict
            vector={"text": SparseVector(indices=idx, values=vals)},
        )
        points.append(point)
    client.upsert(collection_name=COLLECTION, points=points)
    print(f"Batch {b+1}: upserted {hi - lo} points")

print(f"Upserted {n} chunks in collection: {COLLECTION}")

# Search
def search_tfidf(query: str, top_k: int = 5):
    """Run a sparse TF-IDF search against the module-level collection.

    Relies on the globals ``vectorizer``, ``client`` and ``COLLECTION``.
    Returns a list of dicts with the keys ``id``, ``score``, ``page``,
    ``chunk_id`` and ``text``, one per hit (at most ``top_k``).
    """
    query_row = vectorizer.transform([query])[0]
    q_idx, q_vals = csr_row_to_sparse(query_row)
    sparse_query = NamedSparseVector(
        name="text",
        vector=SparseVector(indices=q_idx, values=q_vals),
    )
    hits = client.search(
        collection_name=COLLECTION,
        query_vector=sparse_query,
        limit=top_k,
        with_payload=True,
    )

    found = []
    for h in hits:
        payload = h.payload
        found.append({
            "id": str(h.id),
            "score": h.score,
            "page": payload.get("page"),
            "chunk_id": payload.get("chunk_id"),
            "text": payload.get("text"),
        })
    return found

# Smoke-test query: print score, chunk id, page and a 120-character preview per hit.
for r in search_tfidf("introducción conceptos clave", top_k=5):
    preview = (r["text"] or "")[:120]
    page_tag = f"(p{r['page']})"
    print(r["score"], r["chunk_id"], page_tag, preview, "…")

Corpus: (668, 3)
      id  page                                               text
0  p2_c1     2                                         Clean Code
1  p3_c1     3  Robert C. Martin Series The mission of this se...
Batch 1: upserted 512 points
Batch 2: upserted 156 points
Upserted 668 chunks in collection: my_collection_tfidf


  hits = client.search(


In [25]:
from __future__ import annotations

from pathlib import Path
from typing import Iterable, Union
import uuid
import pandas as pd
from pypdf import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer

from qdrant_client import QdrantClient
# HTTP models → for upserts/collection schema
from qdrant_client.http.models import (
    PointStruct, SparseVector, SparseVectorParams,
)
# Top-level models → for search (resolver-friendly in your client)
from qdrant_client import models as qm

# =========================
# Config
# =========================
# Default Qdrant collection name used by build_and_index and the example below.
COLLECTION = "my_collection_tfidf"
# Default number of points sent per upsert request.
BATCH_SIZE = 512
SPARSE_NAME = "text"  # named sparse vector key in Qdrant

# =========================
# Helpers
# =========================
def is_existing_file(p: Union[str, Path]) -> bool:
    """Return True only when ``p`` names an existing regular file.

    Directories return False: callers read the path as a file, so a
    directory would fail later. Values that cannot be a path at all
    (for example a long raw-text string, a string with an embedded NUL,
    or a non-path object) also return False instead of raising.
    """
    try:
        return Path(p).is_file()
    except (OSError, ValueError, TypeError):
        # OSError: e.g. name too long; ValueError: embedded NUL byte;
        # TypeError: ``p`` is not str / path-like.
        return False

def read_pdf_pages(pdf_path: Path) -> list[dict]:
    """Read a PDF and return its pages as ``{"page": n, "text": str}`` dicts.

    Page numbers start at 1. A page whose ``extract_text()`` result is falsy
    is represented with an empty string.
    """
    reader = PdfReader(str(pdf_path))
    return [
        {"page": number, "text": pdf_page.extract_text() or ""}
        for number, pdf_page in enumerate(reader.pages, start=1)
    ]

def chunk_text(text: str, chunk_size: int = 300, overlap: int = 50) -> list[str]:
    """Split ``text`` into overlapping chunks of whitespace-separated tokens.

    Args:
        text: Input text; it is tokenized with ``str.split()``.
        chunk_size: Maximum number of tokens per chunk (must be >= 1).
        overlap: Number of trailing tokens of one chunk repeated at the start
            of the next one (must be smaller than ``chunk_size``).

    Returns:
        The chunks, each re-joined with single spaces. Empty or
        whitespace-only text yields ``[]``.

    Raises:
        ValueError: If ``chunk_size < 1`` or ``overlap >= chunk_size``. With
            such values the window would never advance, so the function
            would loop forever instead of returning.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    toks = text.split()
    n = len(toks)
    step = chunk_size - overlap  # strictly positive thanks to the checks above
    out = []
    for start in range(0, n, step):
        out.append(" ".join(toks[start:start + chunk_size]))
        # Stop as soon as a chunk reaches the last token, so no trailing
        # chunk made only of already-emitted overlap is produced.
        if start + chunk_size >= n:
            break
    return out

def csr_row_to_sparse(row_csr):
    """Convert a sparse row into ``(indices, values)`` Python lists.

    ``row_csr`` must expose ``.indices`` and ``.data`` attributes that support
    ``.tolist()``.
    """
    idx_list = row_csr.indices.tolist()
    val_list = row_csr.data.tolist()
    return idx_list, val_list

def to_uuid_str(namespace: str, local_id: str) -> str:
    """Return a deterministic UUIDv5, as a string, for ``namespace:local_id``.

    The same pair always yields the same id; the string form is what gets
    passed to Qdrant as the point id.
    """
    scoped_name = f"{namespace}:{local_id}"
    point_uuid = uuid.uuid5(uuid.NAMESPACE_URL, scoped_name)
    return str(point_uuid)

# =========================
# Build corpus from many docs
# =========================
def build_corpus_from_documents(
    documents: Iterable[Union[str, Path]],
    chunk_size: int = 300,
    overlap: int = 50,
) -> pd.DataFrame:
    """
    Build a chunk-level corpus from a mix of files and raw text.

    Accepts:
      - Path/str to PDF files (chunked page by page)
      - Path/str to any other existing file (read as UTF-8 text, undecodable
        bytes ignored)
      - raw text strings (treated as virtual docs)

    Returns DataFrame: id, source, page, text
      - ``page`` is the 1-based page number for PDFs and None otherwise.
      - File chunk ids are built from the file *name* (not the full path), so
        two files with the same name in different directories share ids.
      - Raw-text ids use a CRC32 of the text. Unlike the built-in ``hash()``,
        which is salted per process for strings, CRC32 is stable across
        kernel restarts, so ids (and UUIDs derived from them) are reproducible.

    Raises:
      ValueError: if no chunk was produced from any input.
    """
    # Function-scope import keeps this block self-contained.
    import zlib

    rows = []
    for doc in documents:
        if is_existing_file(doc):
            p = Path(doc)
            if p.suffix.lower() == ".pdf":
                for pg in read_pdf_pages(p):
                    for j, c in enumerate(chunk_text(pg["text"], chunk_size, overlap), start=1):
                        rows.append({
                            "id": f"{p.name}:p{pg['page']}_c{j}",
                            "source": str(p),
                            "page": int(pg["page"]),
                            "text": c,
                        })
            else:
                txt = p.read_text(encoding="utf-8", errors="ignore")
                for j, c in enumerate(chunk_text(txt, chunk_size, overlap), start=1):
                    rows.append({
                        "id": f"{p.name}:c{j}",
                        "source": str(p),
                        "page": None,
                        "text": c,
                    })
        else:
            # Treat plain string as a virtual doc
            txt = str(doc)
            # errors="replace" so unusual code points cannot make encoding fail.
            digest = zlib.crc32(txt.encode("utf-8", errors="replace"))
            pseudo = f"raw:{digest:x}"
            for j, c in enumerate(chunk_text(txt, chunk_size, overlap), start=1):
                rows.append({
                    "id": f"{pseudo}:c{j}",
                    "source": pseudo,
                    "page": None,
                    "text": c,
                })

    df = pd.DataFrame(rows)
    if df.empty:
        raise ValueError("No chunks produced. Check your inputs.")
    return df

# =========================
# Qdrant collection helpers
# =========================
def create_sparse_collection(client: QdrantClient, collection: str, sparse_name: str = SPARSE_NAME):
    """Drop ``collection`` if it exists, then create it with one named sparse vector.

    The new collection has no dense vectors; its only vector is the sparse
    one registered under ``sparse_name``.
    """
    already_there = client.collection_exists(collection)
    if already_there:
        client.delete_collection(collection)

    # Sparse vectors must be named in the schema
    sparse_schema = {sparse_name: SparseVectorParams()}
    client.create_collection(
        collection_name=collection,
        vectors_config={},  # no dense vectors
        sparse_vectors_config=sparse_schema,
    )

def upsert_dataframe_tfidf_sparse(
    client: QdrantClient,
    collection: str,
    df: pd.DataFrame,
    vectorizer: TfidfVectorizer,
    batch_size: int = BATCH_SIZE,
    sparse_name: str = SPARSE_NAME,
    namespace_for_uuid: str | None = None,
):
    """
    Vectorize ``df["text"]`` with ``vectorizer`` and upsert the rows in batches.

    Each row (id, source, page, text) becomes one point:
      - id: UUIDv5 string derived from df["id"], scoped by
        ``namespace_for_uuid`` (falls back to ``collection`` when falsy)
      - payload: source, chunk_id (the readable df["id"]), text, and page
        only when the row has a non-null page
      - vector: the TF-IDF row stored under ``sparse_name``
    Prints one progress line per batch.
    """
    X = vectorizer.transform(df["text"])  # row i of X belongs to row i of df
    n = len(df)
    total = (n + batch_size - 1) // batch_size
    uuid_namespace = namespace_for_uuid or collection

    for b in range(total):
        lo = b * batch_size
        hi = min(lo + batch_size, n)
        points = []
        for i in range(lo, hi):
            row = df.iloc[i]
            local_id = str(row["id"])  # human-readable

            payload = {"source": row["source"], "chunk_id": local_id, "text": row["text"]}
            page_raw = row["page"]  # may be None/NaN/int
            if pd.notna(page_raw):
                payload["page"] = int(page_raw)

            idx, vals = csr_row_to_sparse(X[i])
            points.append(
                PointStruct(
                    id=to_uuid_str(uuid_namespace, local_id),
                    payload=payload,
                    # the named sparse vector goes inside the "vector" dict
                    vector={sparse_name: SparseVector(indices=idx, values=vals)},
                )
            )
        client.upsert(collection_name=collection, points=points)
        print(f"Upserted batch {b+1}/{total} ({hi - lo} points)")

# =========================
# Build+Index end-to-end
# =========================
def build_and_index(
    documents: Iterable[Union[str, Path]],
    collection: str = COLLECTION,
    chunk_size: int = 300,
    overlap: int = 50,
    sparse_name: str = SPARSE_NAME,
):
    """Chunk the documents, fit TF-IDF, recreate the collection and index it.

    Returns ``(client, vectorizer, df)``: the Qdrant client that was used, the
    fitted ``TfidfVectorizer`` and the chunk DataFrame. Any existing
    collection with the same name is deleted first.
    """
    # Chunk every input into one DataFrame.
    corpus = build_corpus_from_documents(documents, chunk_size, overlap)
    print("Corpus shape:", corpus.shape)

    # A single vocabulary fitted over all chunks (unigrams and bigrams).
    tfidf = TfidfVectorizer(ngram_range=(1, 2), lowercase=True).fit(corpus["text"])

    # HTTP client for the local server; the version compatibility check is disabled.
    qdrant = QdrantClient(
        url="http://localhost:6333",
        prefer_grpc=False,
        check_compatibility=False,
    )

    # Fresh sparse-only collection, then batched upsert of every chunk.
    create_sparse_collection(qdrant, collection, sparse_name)
    upsert_dataframe_tfidf_sparse(
        qdrant,
        collection,
        corpus,
        tfidf,
        batch_size=BATCH_SIZE,
        sparse_name=sparse_name,
        namespace_for_uuid=collection,
    )

    return qdrant, tfidf, corpus

# =========================
# Retrieval (use deprecated but stable .search to avoid resolver issues)
# =========================
def query_tfidf(
    client: QdrantClient,
    collection: str,
    vectorizer: TfidfVectorizer,
    query: str,
    top_k: int = 5,
    sparse_name: str = SPARSE_NAME,
    source_filter: str | None = None,
    page_eq: int | None = None,
):
    """
    Search ``collection`` with the TF-IDF sparse vector of ``query``.

    Args:
        client: Qdrant client to query.
        collection: Collection name.
        vectorizer: Fitted TF-IDF vectorizer used to encode ``query``.
        query: Free-text query.
        top_k: Maximum number of hits.
        sparse_name: Name of the sparse vector to search.
        source_filter: When truthy, keep only points whose payload ``source``
            equals this value.
        page_eq: When not None, keep only points whose payload ``page``
            equals this value.

    Returns:
        A list of dicts (id, score, source, page, chunk_id, text), one per
        hit. The hits are also printed.

    Notes:
        - Uses ``client.search``, which the file itself marks as deprecated.
        - Exact-match conditions are built with ``MatchValue``. ``Match`` is
          only a type alias for the union of match classes and cannot be
          instantiated, so filtering through it raised ``TypeError``.
    """
    # TF-IDF → sparse indices/values
    q = vectorizer.transform([query])
    idx = q.indices.tolist()
    vals = q.data.tolist()

    # Optional filters (all conditions must hold)
    must = []
    if source_filter:
        must.append(qm.FieldCondition(key="source", match=qm.MatchValue(value=source_filter)))
    if page_eq is not None:
        must.append(qm.FieldCondition(key="page", match=qm.MatchValue(value=int(page_eq))))
    q_filter = qm.Filter(must=must) if must else None

    results = client.search(
        collection_name=collection,
        query_vector=qm.NamedSparseVector(
            name=sparse_name,
            vector=qm.SparseVector(indices=idx, values=vals),
        ),
        query_filter=q_filter,
        limit=top_k,
        with_payload=True,
    )

    out = []
    print(f"\n🔎 '{query}' → top {top_k}\n")
    for p in results:
        pl = p.payload or {}  # tolerate hits that come back without a payload
        out.append({
            "id": str(p.id),
            "score": p.score,
            "source": pl.get("source"),
            "page": pl.get("page"),
            "chunk_id": pl.get("chunk_id"),
            "text": pl.get("text"),
        })
        print(f"• {p.score:.4f} | {pl.get('source')} | p{pl.get('page')} | {pl.get('chunk_id')}")
        print(f"  {(pl.get('text') or '')[:170]}...\n")
    return out

# =========================
# Example usage
# =========================
if __name__ == "__main__":
    # Two PDFs plus one raw string that is ingested as a virtual document.
    docs = [
        Path("./knowledge/cc_knowledge_book.pdf"),
        Path("./knowledge/Fluent.Python.2nd.Edition.(z-lib.org).pdf"),
        "This is an extra note about Clean Code principles to test multi-doc ingestion.",
    ]

    # Rebuild the collection from scratch and keep the handles for querying.
    client, vectorizer, df = build_and_index(
        docs,
        collection=COLLECTION,
        chunk_size=300,
        overlap=50,
    )

    # Global retrieval (no filters)
    query_tfidf(
        client=client,
        collection=COLLECTION,
        vectorizer=vectorizer,
        query="principios de código limpio y mantenibilidad",
        top_k=5,
    )

    # Example with a filter by source and/or page
    # query_tfidf(
    #     client, COLLECTION, vectorizer,
    #     query="naming functions and readability",
    #     top_k=5,
    #     source_filter=str(Path("./knowledge/cc_knowledge_book.pdf")),
    #     page_eq=3,
    # )

Corpus shape: (2144, 4)
Upserted batch 1/5 (512 points)
Upserted batch 2/5 (512 points)
Upserted batch 3/5 (512 points)
Upserted batch 4/5 (512 points)
Upserted batch 5/5 (96 points)

🔎 'principios de código limpio y mantenibilidad' → top 5

• 0.1600 | knowledge/Fluent.Python.2nd.Edition.(z-lib.org).pdf | p1082 | Fluent.Python.2nd.Edition.(z-lib.org).pdf:p1082_c2
  python3 flags_threadpool.py DE BD CN JP ID EG NG BR RU CD IR MX US PH FR PK VN IN ET TR 20 flags downloaded in 1.37s $ python3 flags_threadpool.py EG BR FR IN BD JP DE RU...

• 0.1311 | knowledge/Fluent.Python.2nd.Edition.(z-lib.org).pdf | p1082 | Fluent.Python.2nd.Edition.(z-lib.org).pdf:p1082_c1
  To make this last point with code, I wrote three simple programs to download images of 20 country flags from the Web. The first one, flags.py, runs sequentially: it only ...

• 0.0932 | knowledge/Fluent.Python.2nd.Edition.(z-lib.org).pdf | p641 | Fluent.Python.2nd.Edition.(z-lib.org).pdf:p641_c2
  once you’ve bagged it, for examp

  results = client.search(


In [12]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import NamedSparseVector, SparseVector

def search_tfidf(
    client: QdrantClient,
    collection_name: str,
    query: str,
    vectorizer,
    top_k: int = 5,
    sparse_name: str = "text",
):
    """Search a collection with the TF-IDF sparse vector of ``query`` and print the hits.

    Args:
        client: Qdrant client to query.
        collection_name: Collection to search.
        query: Free-text query.
        vectorizer: Fitted vectorizer; ``transform([query])`` must return a
            sparse row exposing ``.indices`` and ``.data``.
        top_k: Maximum number of hits.
        sparse_name: Name of the sparse vector to search; it must match the
            key used in the collection's ``sparse_vectors_config``
            (defaults to ``"text"``, the previously hard-coded value).

    Returns:
        The hits exactly as returned by ``client.search``.
    """
    # Convert query text into TF-IDF sparse vector
    q = vectorizer.transform([query])
    idx = q.indices.tolist()
    vals = q.data.tolist()

    # Perform sparse vector search
    results = client.search(
        collection_name=collection_name,
        query_vector=NamedSparseVector(
            name=sparse_name,
            vector=SparseVector(indices=idx, values=vals),
        ),
        limit=top_k,
        with_payload=True,
    )

    # Pretty-print results
    print(f"\n🔍 Query: '{query}' → Top {top_k} results:\n")
    for r in results:
        # A hit may carry no payload, and "text" may be missing or None;
        # neither should crash the printout.
        pl = r.payload or {}
        print(f"• Score: {r.score:.4f} | Page: {pl.get('page')} | Chunk: {pl.get('chunk_id')}")
        print(f"  {(pl.get('text') or '')[:180]}...\n")

    return results

In [26]:
# Open a fresh HTTP client to the local Qdrant server.
client = QdrantClient(url="http://localhost:6333", prefer_grpc=False)

# Collection to search. `vectorizer` is not defined in this cell: the fitted
# TF-IDF vectorizer must already exist in the kernel.
COLLECTION = "my_collection_tfidf"

# NOTE(review): "efects" looks like a typo for "effects"; a misspelled term
# cannot match the TF-IDF vocabulary, so confirm it is intentional.
results = search_tfidf(
    client,
    COLLECTION,
    "side efects",
    vectorizer,
    top_k=5,
)


🔍 Query: 'side efects' → Top 5 results:

• Score: 0.1718 | Page: 75 | Chunk: cc_knowledge_book.pdf:p75_c1
  44 Chapter 3: Functions Have No Side Effects Side effects are lies. Y our function promises to do one thing, but it also does other hidden things. Sometimes it will make unexpected...

• Score: 0.1188 | Page: 328 | Chunk: Fluent.Python.2nd.Edition.(z-lib.org).pdf:p328_c1
  object created or referenced on the right-hand side. And the object must exist before a name can be bound to it, as Example 6-2 proves. Example 6-2. Variables are bound to objects ...

• Score: 0.1059 | Page: 1384 | Chunk: Fluent.Python.2nd.Edition.(z-lib.org).pdf:p1384_c1
  i. Pipelines of coroutines i. The Meaning of yield from i. Basic behavior of yield from ii. Exception handling in yield from j. Use Case: Coroutines for Discrete Event Simulation i...

• Score: 0.1045 | Page: 1061 | Chunk: Fluent.Python.2nd.Edition.(z-lib.org).pdf:p1061_c1
  Figure 20-3. One possible architecture for a data system that com

  results = client.search(
