## Carga de datos

In [None]:
import pandas as pd

# Load the curated Shopify product table and make sure the product id is an integer.
# NOTE(review): read_pickle can execute arbitrary code on load; only use trusted files.
df = pd.read_pickle("../data/curated/shopify_data_to_index.pkl").astype(
    {"id_producto": int}
)
df.head(2)

In [None]:
# Show the first record as a plain dict to inspect the available fields.
df.iloc[0].to_dict()

In [None]:
# One-off step, kept commented out: draw 5000 random rows from `df` and persist them
# to the pickle file that the next cell reads.
# NOTE(review): no random_state is given, so re-running would produce a different sample.
# df_sample = df.sample(5000)
# df_sample.to_pickle("../data/curated/shopify_data_to_index_sample.pkl")

In [None]:
from pathlib import Path

# Location of the persisted random sample of `df`.
SAMPLE_PATH = Path("../data/curated/shopify_data_to_index_sample.pkl")

if SAMPLE_PATH.exists():
    # NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
    df_sample = pd.read_pickle(SAMPLE_PATH)
else:
    # The code that writes this file is commented out in the notebook, so a fresh
    # checkout would fail with FileNotFoundError. Build the sample once (seeded for
    # reproducibility, capped at the table size) and cache it for later runs.
    df_sample = df.sample(min(5000, len(df)), random_state=42)
    df_sample.to_pickle(SAMPLE_PATH)

## Creación del índice

In [None]:
from qdrant_client import QdrantClient, models

# Connection settings: local Qdrant instance and target collection name.
QDRANT_URL = "http://localhost:6333"
COLLECTION = "repuesto_motos_mundibot"

# Qdrant client
client = QdrantClient(url=QDRANT_URL)

# Start from scratch: drop the collection when it already exists.
# WARNING: this deletes the collection and everything stored in it.
if client.collection_exists(COLLECTION):
    client.delete_collection(COLLECTION)

# One dense and one sparse named vector (hybrid search, e.g. BM25-like + cosine).
dense_params = models.VectorParams(size=1536, distance=models.Distance.COSINE)
sparse_params = models.SparseVectorParams()

client.create_collection(
    collection_name=COLLECTION,
    vectors_config={"dense": dense_params},
    sparse_vectors_config={"sparse": sparse_params},
)

# Payload indexes used for filtering: payload field name -> index schema type.
indices = {
    # Basic identity.
    # The id is stored as an integer (it is cast with astype(int) and reused as the
    # point id), so it needs an INTEGER index: a KEYWORD index only covers string
    # values and would leave this field unindexed.
    "id_producto": models.PayloadSchemaType.INTEGER,
    "titulo": models.PayloadSchemaType.TEXT,

    # Brand and model (fast filters)
    "marca_original": models.PayloadSchemaType.KEYWORD,
    "marca":          models.PayloadSchemaType.KEYWORD,   # optional if marcas_lista is used
    "marcas_lista":   models.PayloadSchemaType.KEYWORD,
    "modelo":         models.PayloadSchemaType.KEYWORD,   # optional
    "modelos_lista":  models.PayloadSchemaType.KEYWORD,

    # Categorisation
    "categoria":      models.PayloadSchemaType.KEYWORD,
    "subcategoria":   models.PayloadSchemaType.KEYWORD,

    # Flags
    "tipo_repuesto":  models.PayloadSchemaType.KEYWORD,   # "ORIGINAL"/"GENERICO"
    "es_llanta":      models.PayloadSchemaType.BOOL,

    # Dimensions (range filters), addressed as nested keys of the `dimensiones` payload.
    # NOTE(review): assumes these nested values are stored as integers — confirm.
    "dimensiones.ancho": models.PayloadSchemaType.INTEGER,
    "dimensiones.alto":  models.PayloadSchemaType.INTEGER,
    "dimensiones.rin":   models.PayloadSchemaType.INTEGER,
    "dimensiones.li":    models.PayloadSchemaType.INTEGER,
    # sr is a string (not numeric)
    "dimensiones.sr":    models.PayloadSchemaType.KEYWORD,

    # Tags (exact include/exclude) and price
    "etiquetas":      models.PayloadSchemaType.KEYWORD,
    "precio":         models.PayloadSchemaType.FLOAT,

    # Full-text index, for word/phrase search in addition to the sparse vector
    "texto":  models.PayloadSchemaType.TEXT,
}

# Create one payload index per entry.
for field, schema in indices.items():
    client.create_payload_index(collection_name=COLLECTION, field_name=field, field_schema=schema)

## Indexación de documentos

In [None]:
import numpy as np
from openai import OpenAI
from fastembed import SparseTextEmbedding
import os


# Names of the named vectors in the collection and of the embedding models.
DENSE_NAME = "dense"
SPARSE_NAME = "sparse"
SPARSE_MODEL = "prithivida/Splade_PP_en_v1"
DENSE_MODEL = "text-embedding-3-small"

# NOTE(review): load_dotenv() is only called in a later cell of this notebook, so on
# a fresh kernel OPENAI_API_KEY must already be exported in the environment here.
client_oa = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def embed_dense(texts: list[str], batch_size: int = 256) -> list[np.ndarray]:
    """Embed ``texts`` with the OpenAI model named by ``DENSE_MODEL``.

    The texts are sent in consecutive batches so that a long list does not go out as
    one oversized request (the embeddings endpoint limits the size of ``input``).

    Args:
        texts: Texts to embed.
        batch_size: Maximum number of texts per API request (must be >= 1).

    Returns:
        One float32 vector per text, in the order the API returns them for each
        batch; an empty list when ``texts`` is empty (no request is made).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    texts = list(texts)
    vectors: list[np.ndarray] = []
    for start in range(0, len(texts), batch_size):
        resp = client_oa.embeddings.create(
            model=DENSE_MODEL,
            input=texts[start:start + batch_size],
        )
        vectors.extend(np.array(d.embedding, dtype=np.float32) for d in resp.data)
    return vectors

# Sparse (SPLADE) embeddings with FastEmbed.
# NOTE(review): the model name ends in "en_v1" while the data looks Spanish — confirm
# that this model is the intended choice.
sparse_model = SparseTextEmbedding(model_name=SPARSE_MODEL)


def embed_sparse(texts: list[str]):
    """Return an ``(indices, values)`` pair for every embedding yielded by ``sparse_model.embed(texts)``."""
    return [(emb.indices, emb.values) for emb in sparse_model.embed(texts)]


# Index the first 1000 rows of the sample (the old comment said 10 documents).
docs = df_sample.head(1000).to_dict(orient="records")

# Embeddings of the `texto` field, dense and sparse.
texts = [d["texto"] for d in docs]
dense_vecs = embed_dense(texts)
sparse_vecs = embed_sparse(texts)
assert len(docs) == len(dense_vecs) == len(sparse_vecs), "one embedding per document expected"

# Record fields stored in the payload, in this order.
PAYLOAD_FIELDS = (
    "id_producto", "titulo", "url",
    "marca", "marca_original", "marcas_lista",
    "modelo", "modelos_lista",
    "categoria", "subcategoria",
    "tipo_repuesto", "es_llanta",
    "dimensiones", "dimensiones_str",
    "etiquetas", "precio", "texto",
)

# Build the points. Vectors are keyed by DENSE_NAME / SPARSE_NAME instead of repeating
# the literals, and numpy arrays are converted to plain lists before they are handed
# to the request models.
points = []
for d, dense_vec, (idxs, vals) in zip(docs, dense_vecs, sparse_vecs):
    payload = {field: d[field] for field in PAYLOAD_FIELDS}
    payload["es_llanta"] = bool(d["es_llanta"])  # force a plain Python bool
    points.append(
        models.PointStruct(
            id=d["id_producto"],
            vector={
                DENSE_NAME: np.asarray(dense_vec).tolist(),
                SPARSE_NAME: models.SparseVector(
                    indices=np.asarray(idxs).tolist(),
                    values=np.asarray(vals).tolist(),
                ),
            },
            payload=payload,
        )
    )

# Upsert in chunks so that a large batch is not sent as one very large request.
UPSERT_BATCH_SIZE = 256
for start in range(0, len(points), UPSERT_BATCH_SIZE):
    client.upsert(COLLECTION, points=points[start:start + UPSERT_BATCH_SIZE])

## Recuperación

In [None]:
# Free-text query used to try out the retrieval cell below.
query = "Llanta S1 90/90-10 Delantera"

# Embed the single query with both encoders and keep the first (only) result of each.
dense_results = embed_dense([query])
sparse_results = embed_sparse([query])
query_emb = dense_results[0]
idx, vals = sparse_results[0]

In [None]:
# Candidate stages: top 20 by sparse vector and top 20 by dense vector.
sparse_prefetch = models.Prefetch(
    query=models.SparseVector(indices=idx, values=vals),
    using="sparse",
    limit=20,
)
dense_prefetch = models.Prefetch(
    query=query_emb,
    using="dense",
    limit=20,
)

# Metadata filter: brand list must contain MICHELIN or HONDA, and the part must be ORIGINAL.
filtro = models.Filter(
    must=[
        models.FieldCondition(
            key="marcas_lista",
            match=models.MatchAny(any=["MICHELIN", "HONDA"]),
        ),
        models.FieldCondition(
            key="tipo_repuesto",
            match=models.MatchValue(value="ORIGINAL"),
        ),
    ]
)

# Fuse both candidate lists with RRF and keep the 5 best points.
respuesta = client.query_points(
    collection_name=COLLECTION,
    prefetch=[sparse_prefetch, dense_prefetch],
    query=models.FusionQuery(fusion=models.Fusion.RRF),
    limit=5,
    query_filter=filtro,
)
resultados = respuesta.points

# Print rank, score, id, title and URL of every hit.
for rank, hit in enumerate(resultados, start=1):
    info = hit.payload
    print(f"{rank}. score={hit.score:.3f} | id={hit.id}")
    print("   ", info.get("titulo"))
    print("   ", info.get("url"))

In [None]:
# Display the full product table for inspection.
df

In [None]:
# 100 random rows (unseeded) of the main descriptive columns, as a list of dicts.
columnas = ["id_producto", "titulo", "texto", "marcas_lista", "modelo", "categoria", "tipo_repuesto"]
df[columnas].sample(100).to_dict(orient="records")

In [None]:
# Frequency of each value of `tipo_repuesto`.
df["tipo_repuesto"].value_counts()

In [None]:
# Frequency of each value of `modelo`.
df["modelo"].value_counts()

In [None]:
# `resultados` already holds the list taken from `.points` of the query response, so
# it has no `.points` attribute of its own (the old `resultados.points` raised
# AttributeError). Display the hits directly.
resultados

In [None]:
def extract_metadata(search_result):
    """Return the payload of every point in ``search_result``, in order.

    The cell originally held only these two body lines: an indented ``return`` outside
    any function (a syntax error) reading ``search_result``, a name that is never
    defined in the notebook. The missing function header is restored here.
    NOTE(review): the function name is new — rename it if the original helper had one.

    Args:
        search_result: Iterable of objects exposing a ``payload`` attribute.

    Returns:
        List with one payload per point.
    """
    metadata = [point.payload for point in search_result]
    return metadata

In [None]:
# Inspect `hit`, the loop variable left over from the results loop (holds the last
# hit printed; it is not defined if that loop never iterated).
hit

In [None]:
# === 1) Imports y clients ===
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding, SparseTextEmbedding
from openai import OpenAI
import numpy as np

from collections import defaultdict

# NOTE(review): this cell repeats settings, clients and helpers that earlier cells of
# the notebook already define.
QDRANT_URL = "http://localhost:6333"
COLLECTION = "repuesto_motos_mundibot"

client_q = QdrantClient(url=QDRANT_URL)

# --- Option A: dense embeddings with OpenAI (1536 dims) ---
client_oa = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def embed_dense(texts: list[str]) -> list[np.ndarray]:
    """Embed ``texts`` with ``text-embedding-3-small``; returns one float32 array per item of the response."""
    response = client_oa.embeddings.create(model="text-embedding-3-small", input=texts)
    vectors = []
    for item in response.data:
        vectors.append(np.array(item.embedding, dtype=np.float32))
    return vectors

# Sparse (SPLADE) embeddings with FastEmbed
sparse_model = SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1")


def embed_sparse(texts: list[str]):
    """Return an ``(indices, values)`` pair for every embedding yielded by ``sparse_model.embed(texts)``."""
    return [(emb.indices, emb.values) for emb in sparse_model.embed(texts)]

# === 2) Create the collection ===
# WARNING: an existing collection is deleted first, together with its contents.
if client_q.collection_exists(COLLECTION):
    client_q.delete_collection(COLLECTION)

dense_params = models.VectorParams(size=1536, distance=models.Distance.COSINE)
sparse_params = models.SparseVectorParams()
client_q.create_collection(
    collection_name=COLLECTION,
    vectors_config={"dense": dense_params},
    sparse_vectors_config={"sparse": sparse_params},
)

# Typical payload indexes: (field name, schema type)
payload_index_specs = (
    ("marca", models.PayloadSchemaType.KEYWORD),
    ("categoria", models.PayloadSchemaType.KEYWORD),
    ("tipo_repuesto", models.PayloadSchemaType.KEYWORD),
    ("es_llanta", models.PayloadSchemaType.BOOL),
)
for field_name, field_schema in payload_index_specs:
    client_q.create_payload_index(COLLECTION, field_name, field_schema)

In [None]:
# NOTE(review): this cell used to contain only the undefined name `aaa`, which raised
# NameError — presumably a deliberate barrier so that "Run All" stops before the
# cells below, which drop and recreate the collection. The stop is now explicit;
# delete this cell if no barrier is wanted.
raise RuntimeError("Stop: the cells below drop and recreate the collection.")

In [None]:
## == Imports ==

import os
import openai
from qdrant_client import QdrantClient, models
import pandas as pd
from dotenv import load_dotenv
load_dotenv()


## == Configuration ==

# Qdrant endpoint and collection name
QDRANT_URL = "http://localhost:6333"
COLLECTION = "repuesto_motos_mundibot"

# Named vectors of the collection
DENSE_NAME = "dense"
SPARSE_NAME = "sparse"

# Embedding model identifiers
SPARSE_MODEL = "prithivida/Splade_PP_en_v1"
DENSE_MODEL = "text-embedding-3-small"


## == Clients ==
client = QdrantClient(url=QDRANT_URL)



In [None]:
client = QdrantClient(url=QDRANT_URL)

# Recreate the collection from scratch (an existing one is deleted with its contents).
if client.collection_exists(COLLECTION):
    client.delete_collection(COLLECTION)

dense_config = {
    DENSE_NAME: models.VectorParams(size=1536, distance=models.Distance.COSINE),
}
sparse_config = {
    SPARSE_NAME: models.SparseVectorParams(),
}
client.create_collection(
    collection_name=COLLECTION,
    vectors_config=dense_config,
    sparse_vectors_config=sparse_config,
)

# Payload indexes used for filtering: (field name, schema type)
for field_name, field_schema in (
    ("marca", models.PayloadSchemaType.KEYWORD),
    ("categoria", models.PayloadSchemaType.KEYWORD),
    ("tipo_repuesto", models.PayloadSchemaType.KEYWORD),
    ("es_llanta", models.PayloadSchemaType.BOOL),
):
    client.create_payload_index(COLLECTION, field_name, field_schema)

print("✅ Colección creada")


In [None]:
import os
from openai import OpenAI
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding, SparseTextEmbedding

# Qdrant endpoint / collection and the two API clients.
# NOTE(review): `np` is not imported in this cell; it relies on an earlier import.
QDRANT_URL = "http://localhost:6333"
COLLECTION = "repuesto_motos_mundibot"

client_oa = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
client_q = QdrantClient(url=QDRANT_URL)


def embed_dense(texts: list[str]) -> list[np.ndarray]:
    """Embed ``texts`` with ``text-embedding-3-small``; returns one float32 array per item of the response."""

    def to_vector(item) -> np.ndarray:
        # Convert one response item into a float32 numpy vector.
        return np.array(item.embedding, dtype=np.float32)

    api_response = client_oa.embeddings.create(model="text-embedding-3-small", input=texts)
    return list(map(to_vector, api_response.data))


# Sparse (SPLADE) encoder loaded through FastEmbed.
sparse_model = SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1")


def embed_sparse(texts: list[str]):
    """Return an ``(indices, values)`` pair for every embedding yielded by ``sparse_model.embed(texts)``."""
    pairs = [(emb.indices, emb.values) for emb in sparse_model.embed(texts)]
    return pairs

In [None]:
# Quick manual check: sparse-embed a short phrase and display the (indices, values) pairs.
embed_sparse(["hola mundo"])

In [None]:
# Check that the API key is configured WITHOUT echoing the secret: the value of the
# last expression is stored in the notebook output, so displaying the key itself
# would leak it to anyone who receives the file.
openai_key_is_set = bool(os.getenv("OPENAI_API_KEY"))
openai_key_is_set

In [None]:
# === 1) Imports y clients ===
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding, SparseTextEmbedding
from openai import OpenAI
import numpy as np

from collections import defaultdict

# NOTE(review): this cell repeats settings, clients and helpers that earlier cells of
# the notebook already define.
QDRANT_URL = "http://localhost:6333"
COLLECTION = "repuesto_motos_mundibot"

client_q = QdrantClient(url=QDRANT_URL)

# --- Option A: dense embeddings with OpenAI (1536 dims) ---
client_oa = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def embed_dense(texts: list[str]) -> list[np.ndarray]:
    """Embed ``texts`` with ``text-embedding-3-small``; returns one float32 array per item of the response."""
    response = client_oa.embeddings.create(model="text-embedding-3-small", input=texts)
    embeddings = []
    for entry in response.data:
        embeddings.append(np.array(entry.embedding, dtype=np.float32))
    return embeddings

# Sparse (SPLADE) embeddings with FastEmbed
sparse_model = SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1")


def embed_sparse(texts: list[str]):
    """Return an ``(indices, values)`` pair for every embedding yielded by ``sparse_model.embed(texts)``."""
    return [(vec.indices, vec.values) for vec in sparse_model.embed(texts)]

# === 2) Create the collection ===
# WARNING: an existing collection is deleted first, together with its contents.
if client_q.collection_exists(COLLECTION):
    client_q.delete_collection(COLLECTION)

vectors_config = {
    "dense": models.VectorParams(size=1536, distance=models.Distance.COSINE),
}
sparse_vectors_config = {"sparse": models.SparseVectorParams()}
client_q.create_collection(
    collection_name=COLLECTION,
    vectors_config=vectors_config,
    sparse_vectors_config=sparse_vectors_config,
)

# Typical payload indexes; every KEYWORD field first, then the boolean flag.
for keyword_field in ("marca", "categoria", "tipo_repuesto"):
    client_q.create_payload_index(COLLECTION, keyword_field, models.PayloadSchemaType.KEYWORD)
client_q.create_payload_index(COLLECTION, "es_llanta", models.PayloadSchemaType.BOOL)

In [None]:
# First 10 rows of the full table, as a list of record dicts.
docs = df.head(10).to_dict(orient="records")

# Embeddings of the `texto` field
texts = [doc["texto"] for doc in docs]
dense_vecs = embed_dense(texts)
sparse_vecs = embed_sparse(texts)

# Record fields stored in the payload, in this order (no `precio` in this cell).
payload_keys = (
    "id_producto", "titulo", "url",
    "marca", "marca_original", "marcas_lista",
    "modelo", "modelos_lista",
    "categoria", "subcategoria",
    "tipo_repuesto", "es_llanta",
    "dimensiones", "dimensiones_str",
    "etiquetas", "texto",
)

# Build one point per document: product id as point id, dense + sparse named vectors.
points = []
for pos, doc in enumerate(docs):
    idxs, vals = sparse_vecs[pos]
    vector = {
        "dense": dense_vecs[pos],
        "sparse": models.SparseVector(indices=idxs, values=vals),
    }
    payload = {
        # `es_llanta` is forced to a plain bool; every other field is copied as is.
        key: (bool(doc[key]) if key == "es_llanta" else doc[key])
        for key in payload_keys
    }
    points.append(models.PointStruct(id=doc["id_producto"], vector=vector, payload=payload))

# Upsert
client_q.upsert(COLLECTION, points=points)

In [None]:
# Show the first record as a plain dict.
df.iloc[0].to_dict()

In [None]:
from qdrant_client import QdrantClient, models
import numpy as np

client_qdrant = QdrantClient(url="http://localhost:6333")  # or your own endpoint

# Create the collection only when it does not exist yet. The original comment already
# said "if it does not exist", but the call was unconditional and create_collection
# fails for a collection that is already present.
if not client_qdrant.collection_exists("repuesto_motos_mundibot"):
    client_qdrant.create_collection(
        collection_name="repuesto_motos_mundibot",
        vectors_config={
            "dense": models.VectorParams(size=1536, distance=models.Distance.COSINE)
        },
        sparse_vectors_config={"sparse": models.SparseVectorParams()},
    )

In [None]:

from qdrant_client import QdrantClient, models
from openai import OpenAI
import numpy as np
import os

# Qdrant endpoint, collection name and client
QDRANT_URL = "http://localhost:6333"
COLLECTION = "repuesto_motos_mundibot"

client_qdrant = QdrantClient(url=QDRANT_URL)

# OpenAI client (dense embeddings)
client_openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def embed_dense_openai(texts: list[str]) -> list[np.ndarray]:
    """Dense embeddings with OpenAI ``text-embedding-3-small`` (1536 dims).

    Returns a list with one ``np.float32`` vector per item of the API response.
    """
    response = client_openai.embeddings.create(
        input=texts,
        model="text-embedding-3-small",
    )
    vectors = []
    for item in response.data:
        vectors.append(np.array(item.embedding, dtype=np.float32))
    return vectors


# --- Create the collection (dense vector only) ---

# Drop any existing collection first so the demo is reproducible.
# WARNING: this deletes the collection together with its contents.
if client_qdrant.collection_exists(COLLECTION):
    client_qdrant.delete_collection(COLLECTION)

dense_only_config = {
    "dense": models.VectorParams(size=1536, distance=models.Distance.COSINE),
}
client_qdrant.create_collection(
    collection_name=COLLECTION,
    vectors_config=dense_only_config,
    # For hybrid search with sparse/BM25 later on, add:
    # sparse_vectors_config={"sparse": models.SparseVectorParams()},
)

# Payload indexes used for filtering: (field name, schema type), created in this order.
payload_index_specs = [
    ("marca", models.PayloadSchemaType.KEYWORD),
    ("modelos_lista", models.PayloadSchemaType.KEYWORD),
    ("categoria", models.PayloadSchemaType.KEYWORD),
    ("tipo_repuesto", models.PayloadSchemaType.KEYWORD),
    ("es_llanta", models.PayloadSchemaType.BOOL),
]
for field_name, field_schema in payload_index_specs:
    client_qdrant.create_payload_index(
        collection_name=COLLECTION,
        field_name=field_name,
        field_schema=field_schema,
    )

In [None]:
# First 10 rows as record dicts; display the count plus the first and last product ids.
docs = df.iloc[:10].to_dict(orient="records")
primer_doc, ultimo_doc = docs[0], docs[-1]
len(docs), primer_doc["id_producto"], ultimo_doc["id_producto"]

In [None]:
# --- Index the documents in `docs` (dense embeddings + payload) ---

# 1) Dense embeddings of the `texto` field
textos = [doc["texto"] for doc in docs]
vecs = embed_dense_openai(textos)

# 2) Record fields stored in the payload, in this order. `texto` is stored too so it
#    can be seen in the results.
payload_keys = (
    "id_producto", "titulo", "url",
    "marca", "marca_original", "marcas_lista",
    "modelo", "modelos_lista",
    "categoria", "subcategoria",
    "tipo_repuesto", "es_llanta",
    "dimensiones", "dimensiones_str",
    "etiquetas", "texto",
)

points = [
    models.PointStruct(
        id=doc["id_producto"],
        vector={"dense": vecs[pos]},
        payload={
            # `es_llanta` is forced to a plain bool; every other field is copied as is.
            key: (bool(doc[key]) if key == "es_llanta" else doc[key])
            for key in payload_keys
        },
    )
    for pos, doc in enumerate(docs)
]

# 3) Upsert
client_qdrant.upsert(collection_name=COLLECTION, points=points)

In [None]:
# Show the first 10 rows of the table.
df.iloc[:10]

In [None]:
# Query text and its dense embedding.
consulta = "Nombre: llanta Michelin 120/90-17 Trasera - Original. \nMarca: MICHELIN  \nMarca: Scooter"
q_vec = embed_dense_openai([consulta])[0]

# Metadata filter: brand must be MICHELIN and the part must be ORIGINAL.
filtro = models.Filter(
    must=[
        models.FieldCondition(
            key="marca",
            match=models.MatchAny(any=["MICHELIN"]),
        ),
        models.FieldCondition(
            key="tipo_repuesto",
            match=models.MatchValue(value="ORIGINAL"),
        ),
    ]
)

# Use query_points (as the hybrid query earlier in this notebook does) instead of the
# deprecated `search` method; `using` selects the named dense vector and `.points`
# yields the scored hits.
resultados = client_qdrant.query_points(
    collection_name=COLLECTION,
    query=q_vec.tolist(),
    using="dense",
    limit=5,
    with_payload=True,
    query_filter=filtro,
).points

# Print rank, score, id, title and URL of every hit.
for i, hit in enumerate(resultados, 1):
    p = hit.payload
    print(f"{i}. score={hit.score:.3f} | id={hit.id}")
    print("   ", p.get("titulo"))
    print("   ", p.get("url"))