In [1]:
import sys, os

# Pick the directory to anchor on: the folder holding this file when run as a
# script, or the current working directory when __file__ is undefined
# (e.g. inside a Jupyter kernel).
try:
    _anchor_dir = os.path.dirname(__file__)
except NameError:
    _anchor_dir = os.getcwd()

# The import root sits two levels above the anchor directory.
base_path = os.path.abspath(os.path.join(_anchor_dir, "..", ".."))
SRC_PATH = base_path

# Put the import root at the front of sys.path exactly once.
if SRC_PATH in sys.path:
    print(f"🔁 SRC path already in sys.path: {SRC_PATH}")
else:
    sys.path.insert(0, SRC_PATH)
    print(f"✅ SRC path added: {SRC_PATH}")

✅ SRC path added: /home/prashant-agrawal/projects/company_talk2data/src


In [2]:
# 🚀 Import your utility loaders
from utils.qdrant_client_loader import get_qdrant_collection_name


# 📂 Configuration: name of the Qdrant collection targeted by the search tool
# built later in this notebook. The value is returned by the project helper
# get_qdrant_collection_name(); how it resolves the name is not shown here.

COLLECTION_NAME = get_qdrant_collection_name()

# Echo the resolved name so the reader can confirm which collection is used.
print(f"📌 Collection Name: {COLLECTION_NAME}")


📌 Collection Name: indian_startups


In [3]:
# --- Utility: Normalization ---
def normalize_field_name(field: str) -> str:
    """Turn a raw field label into a lower-case, underscore-separated key.

    The label is trimmed and lower-cased; every space and "/" becomes "_",
    and parentheses are removed.
    """
    # One translation table covers all four single-character substitutions.
    substitutions = str.maketrans({" ": "_", "/": "_", "(": None, ")": None})
    cleaned = field.strip().lower()
    return cleaned.translate(substitutions)

def normalize_field_value(value) -> str:
    """Stringify ``value``, trim surrounding whitespace and lower-case it."""
    as_text = str(value)
    trimmed = as_text.strip()
    return trimmed.lower()

In [4]:
# src/tools/qdrant_tool.py

import re
from typing import List, Dict, Any, Union

from qdrant_client import QdrantClient
from qdrant_client.http.models import FieldCondition, MatchValue, Range, Filter
from langchain_openai import OpenAIEmbeddings


class QdrantSearchTool:
    """
    Tool for performing hybrid semantic + metadata searches against a Qdrant collection.

    The query text is embedded with the supplied embedding model and sent to
    Qdrant as a vector search, optionally restricted by payload filters built
    from a plain dict.
    """

    # Comparison operators recognised inside a range-style filter dict.
    _RANGE_OPS = ("gt", "gte", "lt", "lte")

    def __init__(
        self,
        host: str,
        port: int,
        collection_name: str,
        embedding_model: OpenAIEmbeddings,
    ):
        """
        Args:
            host: Hostname of the Qdrant server.
            port: Port of the Qdrant server.
            collection_name: Collection that every search is run against.
            embedding_model: Object whose ``embed_query(text)`` returns the
                query vector.
        """
        self.client = QdrantClient(host=host, port=port)
        self.collection = collection_name
        self.embedding_model = embedding_model

    @staticmethod
    def _normalize_field_name(field: str) -> str:
        """
        Normalize a user-supplied field label into a payload key.

        The label is trimmed and lower-cased, spaces and "/" become "_", and
        every remaining character outside ``[a-z0-9_]`` (parentheses included)
        is dropped, e.g. "Total Funding (INR)" -> "total_funding_inr".
        """
        f = field.strip().lower()
        # Only separators map to "_"; parentheses are removed by the final
        # substitution instead of leaving doubled/trailing underscores.
        f = re.sub(r"[ /]", "_", f)
        return re.sub(r"[^a-z0-9_]", "", f)

    @staticmethod
    def _normalize_field_value(value: Any) -> str:
        """Stringify ``value``, trim surrounding whitespace and lower-case it."""
        return str(value).strip().lower()

    def _build_filter(self, filters: Dict[str, Union[str, int, float, Dict[str, Any]]]) -> Filter:
        """
        Convert a user-provided dict of filters into a Qdrant Filter object.
        Supports:
          - exact match: {"state": "delhi"}
          - range match: {"year_founded": {"gte": 2000, "lte": 2010}}
            (any of "gt", "gte", "lt", "lte" may be given)

        All conditions are combined with ``must`` (logical AND). Exact-match
        values are normalized to lower-case strings before matching.
        """
        conditions = []
        for raw_field, cond in filters.items():
            key = self._normalize_field_name(raw_field)

            # Collect whichever range operators the caller supplied; a dict
            # without any of them falls through to the exact-match branch.
            range_bounds = (
                {op: cond[op] for op in self._RANGE_OPS if op in cond}
                if isinstance(cond, dict)
                else {}
            )

            if range_bounds:
                conditions.append(
                    FieldCondition(key=key, range=Range(**range_bounds))
                )
            else:
                val = self._normalize_field_value(cond)
                conditions.append(
                    FieldCondition(key=key, match=MatchValue(value=val))
                )

        return Filter(must=conditions)

    def search(
        self,
        query: str,
        filters: Union[Dict[str, Union[str, int, float, Dict[str, Any]]], None] = None,
        k: int = 5,
    ) -> List[Dict[str, Any]]:
        """
        Perform a similarity search with optional metadata filtering.

        Args:
            query: Text to embed and search with.
            filters: Optional filter dict (see ``_build_filter``); ``None`` or
                an empty dict means no filtering.
            k: Maximum number of hits to return.

        Returns:
            A list of dicts: { "id", "score", "payload" }.
        """
        # 1. Embed the query
        vector = self.embedding_model.embed_query(query)

        # 2. Build Qdrant filter if provided
        q_filter = self._build_filter(filters) if filters else None

        # 3. Execute search. Prefer query_points(), the replacement for the
        #    deprecated client.search(); fall back for older client versions.
        if hasattr(self.client, "query_points"):
            response = self.client.query_points(
                collection_name=self.collection,
                query=vector,
                query_filter=q_filter,
                limit=k,
                with_payload=True,
            )
            results = response.points
        else:
            results = self.client.search(
                collection_name=self.collection,
                query_vector=vector,
                query_filter=q_filter,
                limit=k,
                with_payload=True,
            )

        # 4. Format output
        return [
            {"id": pt.id, "score": pt.score, "payload": pt.payload}
            for pt in results
        ]

# 4️⃣ Instantiate once: one embedding model and one search tool, reused by the
# test functions defined later in this notebook. Host and port are hard-coded
# to localhost:6333; edit them here to target a different Qdrant server.
# NOTE(review): OpenAIEmbeddings() is built with no arguments, so credentials
# presumably come from the environment — confirm.
embedding_model = OpenAIEmbeddings()
qdrant_search_tool = QdrantSearchTool(
    host="localhost",
    port=6333,
    collection_name=COLLECTION_NAME,
    embedding_model=embedding_model
)

In [5]:
# 5️⃣ Test functions
def test_semantic():
    """Run a filter-free semantic query and print score and company name per hit."""
    print("🔍 Test: pure semantic (no filters)")
    hits = qdrant_search_tool.search(query="emerging fintech startups", k=3)
    for hit in hits:
        print(f" • [{hit['score']:.4f}] {hit['payload'].get('company_name')}")

def test_metadata():
    """Search with an empty query string and a state=delhi filter, printing each hit.

    The empty string is still passed to the tool's search() as the query.
    """
    print("🔍 Test: metadata-only filter state=delhi")
    state_filter = {"state": "delhi"}
    for hit in qdrant_search_tool.search(query="", filters=state_filter, k=5):
        payload = hit['payload']
        print(f" • {payload['company_name']} (state={payload['state']})")

def test_full_query():
    """Run a natural-language query combined with exact-match metadata filters.

    Prints the company name and founding year of each hit, then a separator
    line. The location wording in the query only influences the embedding;
    the filters restrict hits by hiring_status and industry_sector alone.
    """
    print("🔍 Test: semantic query + metadata filters")
    hits = qdrant_search_tool.search(
        query="List D2C or SaaS companies in Delhi or Hyderabad",
        filters={
            "hiring_status": "actively hiring",
            "industry_sector": "saas",
        },
        k=5,
    )
    for hit in hits:
        # Tolerate hits with a missing payload or missing keys instead of
        # aborting the whole listing with a KeyError.
        payload = hit['payload'] or {}
        print(f" • {payload.get('company_name')} (founded={payload.get('year_founded')})")

    print("\n─────────────────────────────────────────────────────\n")


# 6️⃣ Run the combined query + filter test only; test_semantic() and
# test_metadata() are defined above but are not invoked here.
# NOTE: the recorded run of this cell ended in a ConnectionError while fetching
# the cl100k_base tiktoken encoding, so this call needs network access.
test_full_query()


ConnectionError: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /encodings/cl100k_base.tiktoken (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x77283b5c4730>: Failed to resolve 'openaipublic.blob.core.windows.net' ([Errno -3] Temporary failure in name resolution)"))

In [8]:
def print_structured_results(results):
    """Pretty-print search hits as numbered, labelled blocks.

    Args:
        results: Iterable of hit dicts with optional "payload" (dict or None)
            and "score" keys. A falsy value (None or empty) prints
            "No results found." and returns.

    Missing payload fields, a missing payload, or a payload of None are all
    rendered as "N/A" rather than raising.
    """
    if not results:
        print("No results found.")
        return

    # (label, payload key) pairs, in display order.
    payload_fields = (
        ("Company Name", "company_name"),
        ("Industry Sector", "industry_sector"),
        ("State", "state"),
        ("Funding (INR)", "total_funding_raised_inr"),
        ("Hiring Status", "hiring_status"),
        ("Lead Investors", "lead_investors"),
    )

    for i, r in enumerate(results, 1):
        # `or {}` also covers an explicit payload=None, which .get's default
        # alone would not.
        payload = r.get("payload") or {}
        print(f"\nResult {i}:")
        for label, key in payload_fields:
            print(f"  {label}: {payload.get(key, 'N/A')}")
        print(f"  Score: {r.get('score', 'N/A')}")
        print("-" * 40)

In [9]:
if __name__ == "__main__":
    # Example query and filters (adjust as needed).
    # The filters below pin state to "delhi" only; the "or Hyderabad" and
    # funding wording in the query text is not enforced by any filter.
    query = "SaaS startups in Delhi or Hyderabad with funding above 100 crore and actively hiring"
    filters = {
        "industry_sector": "saas",
        "state": "delhi",
        "hiring_status": "actively hiring",
    }
    # Single source of truth for the result count (the value actually passed
    # to the tool was 3; a separate unused k = 5 used to be assigned here).
    k = 3

    # Call the tool directly.
    # NOTE(review): wrapped_qdrant_search is not defined in the cells shown
    # here (execution counts jump from 5 to 8); it must exist in the kernel
    # before this cell runs.
    print("=== Qdrant Tool Direct Test ===")
    result = wrapped_qdrant_search({
        "query": query,
        "filters": filters,
        "k": k,
    })

    print_structured_results(result)

=== Qdrant Tool Direct Test ===

[DEBUG] Query: SaaS startups in Delhi or Hyderabad with funding above 100 crore and actively hiring
[DEBUG] Filters: {'industry_sector': 'saas', 'state': 'delhi', 'hiring_status': 'actively hiring'}
[DEBUG] Top K: 3
[DEBUG] Raw results: [{'id': 406, 'score': 0.7650749, 'payload': {'company_name': 'nobroker', 'legal_entity_type': 'pvt ltd', 'state': 'delhi', 'headquarters_city': 'hyderabad', 'year_founded': '2013', 'company_website': 'https://wardinc.in', 'logo_url': 'https://logo.clearbit.com/wardinc.in', 'company_description_short': 'integrated background standardization', 'company_description_long': 'news far arm or season place. life simply rather outside.\r\ntrue office training environment. idea reveal my newspaper source.\r\npaper road air box also clear reach town. door research free hand data.', 'industry_sector': 'saas', 'total_funding_raised_inr': '₹153 cr', 'number_of_funding_rounds': '3', 'latest_funding_round_type': 'series b', 'latest_fund

  results = self.client.search(
