In [None]:
# %% 📝 Sourcing Path
import os
import sys

# Resolve <working directory>/../../src to an absolute path and put it at the
# front of sys.path exactly once (re-running the cell adds nothing).
# Presumably this is what lets the `utils.*` imports further down resolve.
src_candidate = os.path.join(os.getcwd(), "..", "..", "src")
SRC_PATH = os.path.abspath(src_candidate)
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)
print(f"✅ SRC Path: {SRC_PATH}")


In [None]:
# Python imports & libraries
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from qdrant_client.http.models import Range
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PayloadSchemaType, FieldCondition, MatchValue, Filter
from typing import Dict, Union, Any
import pandas as pd
import re
import json

# 🚀 Import your utility loaders
from utils.qdrant_client_loader import get_qdrant_collection_name
from utils.path_config import get_base_dir, get_data_path, get_qdrant_store_path, get_schema_path

# %% 📁 Paths
# Resolve every location/name once through the project's config helpers.
BASE_DIR = get_base_dir()
DATA_PATH = get_data_path()
SCHEMA_OUTPUT_PATH = get_schema_path()
qdrant_store_path = get_qdrant_store_path()
COLLECTION_NAME = get_qdrant_collection_name()

# Echo the resolved configuration, one entry per line, in a single call.
print(
    f"📌 Base Dir: {BASE_DIR}",
    f"📌 CSV Path: {DATA_PATH}",
    f"📌 Qdrant Local Path: {qdrant_store_path}",
    f"📌 Collection Name: {COLLECTION_NAME}",
    f"📌 Schema Path: {SCHEMA_OUTPUT_PATH}",
    sep="\n",
)



In [None]:

# --- Utility: Normalize ---
def normalize_field_name(field: str) -> str:
    """Turn a raw column header into a lowercase, underscore-separated key.

    The header is trimmed and lowercased; every space, parenthesis or slash
    is replaced by ``_``; finally any character that is not an ASCII letter,
    digit or underscore is removed. Consecutive separators are not collapsed,
    so ``"Company Description (Long)"`` becomes
    ``"company_description__long_"``.
    """
    cleaned = field.strip().lower()
    with_underscores = re.sub(r"[ ()/]", "_", cleaned)
    return re.sub(r"[^a-zA-Z0-9_]", "", with_underscores)

def normalize_field_value(value) -> str:
    """Return ``value`` as trimmed, lowercase text.

    Any input is first converted with ``str()``, so numbers, booleans and
    ``None`` come back as their lowercase string form (``42`` -> ``"42"``).
    """
    as_text = str(value)
    return as_text.strip().lower()


In [None]:
# --- 1. Load CSV ---
# Read the CSV, keep only the rows that have a long company description
# (that column supplies each document's text) and renumber the index from 0.
df = (
    pd.read_csv(DATA_PATH)
    .dropna(subset=["Company Description (Long)"])
    .reset_index(drop=True)
)

# --- 2. Prepare Documents ---
def load_documents_from_df(
    df: pd.DataFrame,
    content_column: str = "Company Description (Long)",
):
    """Convert every row of ``df`` into a ``Document``.

    Args:
        df: Source table; one document is produced per row.
        content_column: Name of the column whose value becomes the document's
            ``page_content``. Defaults to the long company description.

    Returns:
        A list of ``Document`` objects in row order. Each document's metadata
        holds all non-missing cells of the row (including the content column),
        with keys passed through ``normalize_field_name`` and values through
        ``normalize_field_value``.

    Notes:
        If ``content_column`` is absent from the row, or its value is missing
        (NaN/None), ``page_content`` is the empty string rather than the
        literal text ``"nan"``.
    """
    docs = []
    for _, row in df.iterrows():
        # Missing cells are skipped so they never show up as "nan" metadata.
        metadata = {
            normalize_field_name(str(k)): normalize_field_value(v)
            for k, v in row.items()
            if pd.notna(v)
        }
        # Only the main description is used as page_content.
        raw_content = row[content_column] if content_column in row else None
        content = str(raw_content) if pd.notna(raw_content) else ""
        docs.append(Document(page_content=content, metadata=metadata))
    return docs

# Build the documents and print a quick sanity check: how many there are and
# which metadata keys the first one carries.
documents = load_documents_from_df(df)
print(f"📚 Loaded {len(documents)} documents")
first_metadata = documents[0].metadata
print("🔑 Sample Metadata Keys:", list(first_metadata.keys()))



In [None]:
# Preview up to five points that are already stored, before the cells below
# drop and recreate the collection.
# This cell runs ahead of the cell that otherwise creates `client`, so the
# client is built here (otherwise a fresh kernel fails with NameError), and
# the preview is skipped when the collection does not exist yet.
client = QdrantClient(host="localhost", port=6333)
if client.collection_exists(COLLECTION_NAME):
    docs = client.scroll(collection_name=COLLECTION_NAME, with_payload=True, limit=5)[0]
    for d in docs:
        print(d.payload)
else:
    docs = []
    print(f"ℹ️ Collection '{COLLECTION_NAME}' does not exist yet - nothing to preview.")

In [None]:
# %% ✅ Explicitly Create Qdrant Collection
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance

# %% 🔁 Recreate Qdrant Collection (Optional but Recommended)
# Connect to the Qdrant server on localhost:6333 and start from an empty
# collection: drop it if it is already there, then create it again.
client = QdrantClient(host="localhost", port=6333)

collection_already_present = client.collection_exists(COLLECTION_NAME)
if collection_already_present:
    client.delete_collection(collection_name=COLLECTION_NAME)

# NOTE(review): 1536 presumably matches the output size of the embedding
# model used at ingestion time - confirm if the model is ever changed.
vector_settings = VectorParams(size=1536, distance=Distance.COSINE)
client.create_collection(collection_name=COLLECTION_NAME, vectors_config=vector_settings)
print(f"✅ Re-created collection: {COLLECTION_NAME}")

In [None]:
# --- 4. Define Payload Schema ---
# Categorical fields that are filtered by exact value: each one is mapped to
# the KEYWORD schema type. Fields needing another type (TEXT, INTEGER,
# FLOAT, ...) are not listed here and would have to be added explicitly.
payload_schema = {
    field_name: PayloadSchemaType.KEYWORD
    for field_name in (
        "company_name",
        "legal_entity_type",
        "state",
        "headquarters_city",
        "industry_sector",
    )
}


In [None]:

# --- 5. Create Payload Indexes for filtered fields ---
# Documents written through LangChain's Qdrant wrapper keep their metadata
# nested under the "metadata" payload key, so each index has to target the
# dotted path "metadata.<field>". An index on the bare field name would not
# cover any ingested point.
for field, schema in payload_schema.items():
    client.create_payload_index(
        collection_name=COLLECTION_NAME,
        field_name=f"metadata.{field}",
        field_schema=schema,
    )

print(f"✅ Created payload indexes.")


In [None]:
# --- 6. Ingest documents into Qdrant ---
# Embed every document with a default-configured OpenAIEmbeddings model and
# write the results to the collection on the Qdrant server at localhost:6333.
embedding_model = OpenAIEmbeddings()
qdrant = Qdrant.from_documents(
    documents,
    embedding_model,
    url="http://localhost:6333",
    collection_name=COLLECTION_NAME,
)

print(f"✅ Ingested {len(documents)} documents.")


In [None]:
# Spot-check the ingestion: print the payload of up to five stored points.
docs = client.scroll(
    collection_name=COLLECTION_NAME,
    limit=5,
    with_payload=True,
)[0]
for d in docs:
    print(d.payload)

In [None]:
# Collect every distinct `state` value stored in the collection.
# - Points ingested through LangChain's Qdrant wrapper carry their metadata
#   under the nested "metadata" payload key, so that is checked first; a
#   top-level "state" key is still honoured for points written directly.
# - scroll() returns one page plus the offset of the next page; keep going
#   until that offset is None so collections larger than one page are covered.
states = set()
next_offset = None
while True:
    docs, next_offset = client.scroll(
        collection_name=COLLECTION_NAME,
        with_payload=True,
        limit=1000,
        offset=next_offset,
    )
    for d in docs:
        payload = d.payload or {}
        for source in (payload.get("metadata"), payload):
            if isinstance(source, dict) and "state" in source:
                states.add(source["state"])
                break
    if next_offset is None:
        break
print(states)

In [None]:

# --- 7. Test: Filter Query for 'state' = 'delhi' ---
from qdrant_client.http.models import Filter, FieldCondition, MatchValue

# Documents ingested through LangChain's Qdrant wrapper store their metadata
# under the nested "metadata" payload key, so the field is addressed with the
# dotted path "metadata.state"; a bare "state" key matches no ingested point.
# The value is lowercase because metadata values are lowercased on ingestion.
q_filter = Filter(must=[
    FieldCondition(key="metadata.state", match=MatchValue(value="delhi"))
])

# scroll() returns (points, next_page_offset); only the first page is shown.
result = client.scroll(
    collection_name=COLLECTION_NAME,
    scroll_filter=q_filter,
    limit=10,
    with_payload=True
)

print("Number of results:", len(result[0]))
for pt in result[0]:
    print(pt.payload)


In [None]:
# Debug aid: print the `q_filter` object to inspect the condition it holds.
print("Filter:", q_filter)

In [None]:
# Control query without any filter: list the payload of up to ten points so
# the stored payload layout can be compared with the filtered query.
result = client.scroll(collection_name=COLLECTION_NAME, limit=10, with_payload=True)
print("All results (no filter):")
for pt in result[0]:
    print(pt.payload)

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance

# Minimal, self-contained experiment: a throwaway collection holding a single
# hand-written point whose payload has `state` at the top level.
# NOTE(review): this rebinds the notebook-wide COLLECTION_NAME, so any cell
# executed after this one operates on "demo_payload" instead of the main
# collection.
client = QdrantClient(host="localhost", port=6333)
COLLECTION_NAME = "demo_payload"

# Recreate collection
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(collection_name=COLLECTION_NAME)
demo_vectors = VectorParams(size=3, distance=Distance.COSINE)
client.create_collection(collection_name=COLLECTION_NAME, vectors_config=demo_vectors)

# Upsert ONE point
demo_point = {
    "id": 1,
    "vector": [0.1, 0.2, 0.3],
    "payload": {"state": "delhi", "company_name": "test"},
}
client.upsert(collection_name=COLLECTION_NAME, points=[demo_point])

# Check
docs = client.scroll(collection_name=COLLECTION_NAME, with_payload=True, limit=5)[0]
for d in docs:
    print(d.payload)  # << Should print {'state': 'delhi', ...}
