In [25]:
# %%
import sys, os

# Pick the anchor directory: this file's folder when run as a script,
# or the current working directory inside a notebook (no __file__ there).
try:
    base_path = os.path.dirname(__file__)
except NameError:
    base_path = os.getcwd()

# The source root is the parent of the anchor directory.
base_path = os.path.abspath(os.path.join(base_path, ".."))
SRC_PATH = base_path

# Put the source root at the front of sys.path exactly once.
if SRC_PATH in sys.path:
    print(f"🔁 SRC path already in sys.path: {SRC_PATH}")
else:
    sys.path.insert(0, SRC_PATH)
    print(f"✅ SRC path added: {SRC_PATH}")

🔁 SRC path already in sys.path: /home/prashant-agrawal/projects/netflix_talk2data/src


In [32]:
from enhancer_agent_tools.keyword_extractor import keyword_extractor_fn
from enhancer_agent_tools.numeric_extractor import extract_numeric_constraints
from enhancer_agent_tools.classify_category import classify_categories
from enhancer_agent_tools.entity_extractor import extract_entities
from enhancer_agent_tools.filter_composer import compose_filters

In [33]:
# Keyword extractor tool
# This tool is designed to extract relevant company-related fields from the user's query based on keyword matching.

from langchain.agents import Tool
# Registers `keyword_extractor_fn` as a LangChain tool; the description is the
# text the agent reads when deciding whether to call it.
keyword_extractor_tool = Tool(
    description="""Extracts relevant company-related fields from the user's query based on keyword matching. Use this tool to identify which structured metadata fields—like headquarters city, tech stack, product category, funding stage, hiring status, or founder information—are being asked about. 
    Returns a dictionary mapping each detected field to the matched keyword.""".strip(),
    func=keyword_extractor_fn,
    name="keyword_extractor",
)


In [8]:
# Extract Numeric Constraints
# This tool identifies numeric constraints in the user's query, such as funding amount, revenue, employee size, valuation, or year of founding.

from langchain.agents import Tool

# Registers `extract_numeric_constraints` as a LangChain tool.
# Fix: the description used to open with four quote characters, which left a
# stray leading `"` in the text shown to the agent.
# NOTE(review): unlike `keyword_extractor` / `filter_composer`, this tool name
# contains spaces; some function-calling backends reject such names. Confirm
# nothing references the current name before renaming it.
extract_numeric_constraints_tool = Tool(
    name="Numeric Constraint Extraction",
    func=extract_numeric_constraints,
    description="""Extract numeric constraints from the query such as funding amount, revenue, employee size, valuation, or year of founding. 
    Use this tool to detect conditions like 'raised over $5 million', 'less than 100 employees', or 
    'founded after 2018' and convert them into structured filters for downstream company search.""".strip()

)

In [30]:
# Entity extraction tool
# This tool is used to extract named entities from the user's query.

from langchain.agents import Tool
# Registers `extract_entities` as a LangChain tool; the description tells the
# agent when named-entity extraction is appropriate.
entity_extractor_tool = Tool(
    description="""Extracts named entities like actor names, directors, production companies, locations, or specific movie/show titles from the input query. 
                    Use this when the query mentions people, places, or titles explicitly.""".strip(),
    func=extract_entities,
    name="Entities Extraction",
)

In [10]:
# Category Classification Tool

# This tool classifies the user query into one or more predefined company categories
# such as Fintech, SaaS, HealthTech, B2B, Logistics, etc.

from langchain.agents import Tool
# Registers `classify_categories` as a LangChain tool.
# Fix: the description used to end with a leftover `",` (residue of an earlier
# single-line string), which leaked into the text shown to the agent.
classify_categories_tool = Tool(
    name="Classify Categories",
    func=classify_categories,
    description="""Classifies the input query into predefined categories like SaaS, FinTech, Edtech, etc. 
                    Returns a dictionary with the key 'industry_category'.""".strip()
)

In [11]:
# Compose Filters Tool

##  This tool combines the outputs of the previous tools into a structured filter object.
##  This is the final step in the pipeline, so it should be called after all other tools have been applied.
##  It takes the outputs of the keyword extractor, numeric constraints, entity extractor, and category classifier
##  and combines them into a single filter object that can be used for downstream search agents. 
from langchain.agents import Tool

# Registers `compose_filters` as a LangChain tool; per its description it is
# meant to run last, after the extraction tools.
filter_composer_tool = Tool(
    description="""Combines extracted metadata from enhancer tools into a unified dictionary of filters. It merges outputs from keyword, numeric, entity, and category extractors into a structured format. 
                    Should be used after all extraction tools have run. 
                    Accepts valid JSON-like inputs and returns a single filter object for downstream search agents.""".strip(),
    func=compose_filters,
    name="filter_composer",
)

In [12]:
# RAG Search Tool (disabled)
# Semantic search over startup documents using FAISS. The registration below is commented out, so this tool is not active.


#from langchain_core.tools import Tool
#from enhancer_agent_tools.rag_search_tool import RagSearchInput  # Import RagSearchInput

#rag_search_tool = Tool(
#    name="rag_search_tool",
#    func=rag_search_fn,
#    description="Semantic search tool over startup documents using FAISS. Supports filters like sector, funding, location.",
#    args_schema=RagSearchInput,
#)

In [34]:
# Qdrant Search Tool
from langchain_openai import OpenAIEmbeddings
from langchain.tools import Tool
from tools.qdrant_tools.qdrant_server_tool import QdrantSearchTool

# Embedding model handed to the Qdrant search tool below.
embedding_model = OpenAIEmbeddings()

# Search tool bound to the local Qdrant server and the `indian_startups` collection.
qdrant_tool = QdrantSearchTool(
    embedding_model=embedding_model,
    collection_name="indian_startups",
    port=6333,
    host="localhost",
)

# Wrapper for LangChain agent compatibility
def wrapped_qdrant_search(inputs: "dict | str") -> list:
    """Run a Qdrant search from an agent- or caller-supplied payload.

    Args:
        inputs: Either a dict with the optional keys ``query`` (default ``""``),
            ``filters`` (default ``None``) and ``k`` (default ``5``), or a
            string. A string that parses as a JSON object is treated like that
            dict; any other string is used verbatim as ``query``. ``None`` is
            treated as an empty dict.

    Returns:
        Whatever ``qdrant_tool.search`` returns, or ``[]`` when the search
        raises (the error is printed instead of being propagated).
    """
    import json  # local import keeps this cell independent of other cells' imports

    if isinstance(inputs, str):
        # A single-input LangChain Tool passes `func` a string, so calling
        # `.get` on it would raise AttributeError before the search even starts.
        text = inputs.strip()
        try:
            decoded = json.loads(text)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            decoded = None
        inputs = decoded if isinstance(decoded, dict) else {"query": text}
    elif inputs is None:
        inputs = {}

    query = inputs.get("query", "")
    filters = inputs.get("filters", None)
    k = inputs.get("k", 5)
    print(f"\n[DEBUG] Query: {query}")
    print(f"[DEBUG] Filters: {filters}")
    print(f"[DEBUG] Top K: {k}")
    try:
        results = qdrant_tool.search(query=query, filters=filters, k=k)
        print(f"[DEBUG] Raw results: {results}")
        return results
    except Exception as e:
        # Deliberate best-effort: report the failure and return no hits so the
        # calling agent can continue.
        print(f"[ERROR] Qdrant search failed: {e}")
        return []

# LangChain Tool registration
# Fix: the description used to stop at a dangling "`filters` dict:" with no
# example. It now ends with a concrete payload using the keys that
# `wrapped_qdrant_search` reads (`query`, `filters`, `k`).
qdrant_search_tool = Tool(
    name="qdrant_search",
    func=wrapped_qdrant_search,
    description=(
        """Perform hybrid semantic + metadata searches over our SuperVator startup knowledge base 🔍🚀

• **Semantic Retrieval** – Leverage OpenAI embeddings + Qdrant’s cosine‐distance vector index to find the most contextually relevant companies for ANY natural-language query (“emerging fintech players”, “best agritech startups”, etc.).

• **Metadata & Keyword Filters** – Narrow down results by exact or fuzzy matching on structured fields:
    – ▶️ *Categorical*: `state`, `industry_sector`, `hiring_status`, `tech_stack`, `founders`, etc.  
    – ▶️ *Numeric Ranges*: `year_founded`, `total_funding_raised_inr`, `number_of_employees_current`, etc. (supports `gte`/`lte` filters)

• **Fully Hybrid** – Mix & match: e.g. “top funded SaaS companies in Delhi founded after 2015”  
simply by passing your free-text query plus a `filters` dict:
`{"query": "top funded SaaS companies", "filters": {"state": "delhi", "year_founded": {"gte": 2015}}, "k": 5}`
`filters` and `k` are optional (`k` defaults to 5)."""
    )
)


In [29]:
def test_semantic():
    print("🔍 Test: pure semantic (no filters)")
    results = wrapped_qdrant_search({"query": "emerging fintech startups", "filters": None, "k": 3})
    for r in results:
        print(f" • [{r['score']:.4f}] {r['payload'].get('company_name', '[No Name]')}")

def test_metadata():
    print("🔍 Test: metadata-only filter state=delhi")
    results = wrapped_qdrant_search({"query": "", "filters": {"state": "delhi"}, "k": 5})
    for r in results:
        payload = r['payload']
        print(f" • {payload.get('company_name', '[No Name]')} (state={payload.get('state', '[No State]')})")

def test_range():
    print("🔍 Test: range filter year_founded in [2000,2010]")
    results = wrapped_qdrant_search({
        "query": "",
        "filters": {"year_founded": {"gte": 2000, "lte": 2010}},
        "k": 5
    })
    for r in results:
        payload = r['payload']
        print(f" • {payload.get('company_name', '[No Name]')} (founded={payload.get('year_founded', '[No Year]')})")

# Run the three smoke tests in order, with a blank line between consecutive tests.
for run_test in (test_semantic, test_metadata):
    run_test()
    print()
test_range()



🔍 Test: pure semantic (no filters)

[DEBUG] Query: emerging fintech startups
[DEBUG] Filters: None
[DEBUG] Top K: 3
[DEBUG] Raw results: [{'id': 118, 'score': 0.7725888, 'payload': {'company_name': 'boat', 'legal_entity_type': 'pvt ltd', 'state': 'uttar pradesh', 'headquarters_city': 'delhi', 'year_founded': '2019', 'company_website': 'https://woodsllc.in', 'logo_url': 'https://logo.clearbit.com/woodsllc.in', 'company_description_short': 'total needs-based hardware', 'company_description_long': 'certain young water until able ball art join. similar meet decade trial speak station write.\r\narrive throw data. take still worry sign create risk.\r\nissue discover white. government cell live capital option eight onto. degree effort else. as discussion near environmental.', 'industry_sector': 'healthtech', 'total_funding_raised_inr': '₹35 cr', 'number_of_funding_rounds': '2', 'latest_funding_round_type': 'pre-seed', 'latest_funding_date': '2022-11-21', 'lead_investors': 'hartman, santana an

In [20]:
from qdrant_client import QdrantClient
# Sanity check against the local Qdrant server: list its collections, then
# print the point count of the `indian_startups` collection.
client = QdrantClient(port=6333, host="localhost")
print(client.get_collections())
print(client.count(collection_name="indian_startups"))

collections=[CollectionDescription(name='terraforming'), CollectionDescription(name='demo_payload'), CollectionDescription(name='indian_startups'), CollectionDescription(name='star_charts')]
count=1500
