In [173]:
# %% 📝 Souring Path
import sys, os
SRC_PATH = os.path.abspath(os.path.join(os.getcwd(), "..", "..", "src"))
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)
print(f"✅ SRC Path: {SRC_PATH}")


✅ SRC Path: /home/prashant-agrawal/projects/netflix_talk2data/src


In [174]:
# Python imports & libraries
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from qdrant_client.http.models import Range
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PayloadSchemaType, FieldCondition, MatchValue, Filter
from typing import Dict, Union, Any
import pandas as pd
import re
import json

# 🚀 Import your utility loaders
from utils.qdrant_client_loader import get_qdrant_collection_name
from utils.path_config import get_base_dir, get_data_path, get_qdrant_store_path, get_schema_path

# %% 📁 Paths
BASE_DIR = get_base_dir()
DATA_PATH = get_data_path()
SCHEMA_OUTPUT_PATH = get_schema_path()
qdrant_store_path = get_qdrant_store_path()
COLLECTION_NAME = get_qdrant_collection_name()

print(f"📌 Base Dir: {BASE_DIR}")
print(f"📌 CSV Path: {DATA_PATH}")
print(f"📌 Qdrant Local Path: {qdrant_store_path}")
print(f"📌 Collection Name: {COLLECTION_NAME}")
print(f"📌 Schema Path: {SCHEMA_OUTPUT_PATH}")



📌 Base Dir: /home/prashant-agrawal/projects/netflix_talk2data/src
📌 CSV Path: /home/prashant-agrawal/projects/netflix_talk2data/src/Data/Enriched_Indian_Startup_Dataset.csv
📌 Qdrant Local Path: /home/prashant-agrawal/projects/netflix_talk2data/src/database/qdrant_store_local_db/collection
📌 Collection Name: indian_startups
📌 Schema Path: /home/prashant-agrawal/projects/netflix_talk2data/src/schema/payload_schema.json


In [175]:
# %% 🔍 Normalize field names
def normalize_field_name(field: str) -> str:
    field = field.strip().lower()
    field = re.sub(r"[ ()/]", "_", field)
    return re.sub(r"[^a-zA-Z0-9_]", "", field)

def normalize_field_value(value: Any) -> str:
    return str(value).strip().lower()


In [176]:
# %% 🔍 Load CSV
df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=["Company Description (Long)"]).reset_index(drop=True)

# %% 📄 Convert to LangChain Documents
def load_documents_from_df(df: pd.DataFrame) -> list:
    documents = []
    for _, row in df.iterrows():
        metadata = {
            normalize_field_name(str(k)): str(v).strip().lower()
            for k, v in row.items() if pd.notna(v)
        }
        content = "\n".join(f"{k}: {v}" for k, v in metadata.items())
        documents.append(Document(page_content=content, metadata=metadata))
    return documents

documents = load_documents_from_df(df)
print(f"📚 Loaded {len(documents)} documents")
print("🔑 Sample Metadata Keys:", list(documents[0].metadata.keys()))

# %% 🧠 Embedding Model
embedding_model = OpenAIEmbeddings()


📚 Loaded 500 documents
🔑 Sample Metadata Keys: ['company_name', 'legal_entity_type', 'state', 'headquarters_city', 'year_founded', 'company_website', 'logo_url', 'company_description__short_', 'company_description__long_', 'industry_sector', 'total_funding_raised__inr_', 'number_of_funding_rounds', 'latest_funding_round_type', 'latest_funding_date', 'lead_investors', 'revenue_estimate__annual_', 'valuation_estimate__if_available_', 'number_of_employees__current_', 'number_of_employees__estimate_range_', 'key_people', 'founders', 'board_members___advisors', 'employee_growth__yoy__', 'hiring_status', 'popular_roles_open', 'primary_products___services', 'product_categories', 'tech_stack', 'integrations___apis_offered', 'target_market', 'major_customers___logos', 'press_mentions___recent_news', 'competitors']


In [168]:
# %% ✅ Explicitly Create Qdrant Collection
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance

# %% 🔁 Recreate Qdrant Collection (Optional but Recommended)
client = QdrantClient(host="localhost", port=6333)

if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(collection_name=COLLECTION_NAME)

client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
)
print(f"✅ Re-created collection: {COLLECTION_NAME}")

✅ Re-created collection: indian_startups


In [None]:
# --- Qdrant Payload Schema Indexing ---
# ✅ Define schema (range-enabled where needed)
from qdrant_client.models import PayloadSchemaType

payload_schema = {
    "Company Name": PayloadSchemaType.TEXT,
    "Legal Entity Type": PayloadSchemaType.TEXT,
    "State": PayloadSchemaType.TEXT,
    "Headquarters City": PayloadSchemaType.TEXT,

    "Year Founded": {
        "type": PayloadSchemaType.INTEGER,
        "params": {"range": True}
    },

    "Company Website": PayloadSchemaType.TEXT,
    "Logo URL": PayloadSchemaType.TEXT,
    "Company Description (Short)": PayloadSchemaType.TEXT,
    "Company Description (Long)": PayloadSchemaType.TEXT,
    "Industry Sector": PayloadSchemaType.TEXT,

    "Total Funding Raised (INR)": {
        "type": PayloadSchemaType.INTEGER,
        "params": {"range": True}
    },

    "Number of Funding Rounds": {
        "type": PayloadSchemaType.INTEGER,
        "params": {"range": True}
    },

    "Latest Funding Round Type": PayloadSchemaType.TEXT,
    "Latest Funding Date": PayloadSchemaType.TEXT,
    "Lead Investors": PayloadSchemaType.TEXT,

    "Revenue Estimate (Annual)": {
        "type": PayloadSchemaType.INTEGER,
        "params": {"range": True}
    },

    "Valuation Estimate (if available)": {
        "type": PayloadSchemaType.INTEGER,
        "params": {"range": True}
    },

    "Number of Employees (Current)": {
        "type": PayloadSchemaType.INTEGER,
        "params": {"range": True}
    },

    "Number of Employees (Estimate Range)": PayloadSchemaType.TEXT,
    "Key People": PayloadSchemaType.TEXT,
    "Founders": PayloadSchemaType.TEXT,
    "Board Members / Advisors": PayloadSchemaType.TEXT,

    "Employee Growth (YoY %)": {
        "type": PayloadSchemaType.FLOAT,
        "params": {"range": True}
    },

    "Hiring Status": PayloadSchemaType.TEXT,
    "Popular Roles Open": PayloadSchemaType.TEXT,
    "Primary Products / Services": PayloadSchemaType.TEXT,
    "Product Categories": PayloadSchemaType.TEXT,
    "Tech Stack": PayloadSchemaType.TEXT,
    "Integrations / APIs offered": PayloadSchemaType.TEXT,
    "Target Market": PayloadSchemaType.TEXT,
    "Major Customers / Logos": PayloadSchemaType.TEXT,
    "Press Mentions / Recent News": PayloadSchemaType.TEXT,
    "Competitors": PayloadSchemaType.TEXT
}


✅ Created payload schema for collection: indian_startups


In [177]:
# ✅ Sanitize field names and create payload indexes
import re

for field, schema in payload_schema.items():
    safe_field = normalize_field_name(field)   #Using the normalize function to ensure field names are safe
    
    # If schema is a dict (with 'type'), extract just the type
    if isinstance(schema, dict) and "type" in schema:
        field_schema = schema["type"]
    else:
        field_schema = schema
    client.create_payload_index(
        collection_name=COLLECTION_NAME,
        field_name=safe_field,
        field_schema=field_schema
    )

print(f"✅ Created payload schema for collection: {COLLECTION_NAME}")


✅ Created payload schema for collection: indian_startups


In [178]:
# %% 📤 Ingest Documents into Qdrant

qdrant = Qdrant.from_documents(
    documents=documents,
    embedding=embedding_model,
    url="http://localhost:6333",
    collection_name=COLLECTION_NAME,
)

# Verify insertion
print(f"✅ Ingested {len(documents)} documents.")
print(f"📊 Total in Qdrant: {qdrant.client.count(COLLECTION_NAME, exact=True).count}")

✅ Ingested 500 documents.
📊 Total in Qdrant: 500


In [186]:
from qdrant_client.http.models import FieldCondition, MatchText, Range, Filter

def build_filter_conditions(filters: dict):
    conditions = []
    for key, value in filters.items():
        norm_key = normalize_field_name(key)

        if isinstance(value, dict) and ("gte" in value or "lte" in value):
            # 📏 Range filter
            conditions.append(
                FieldCondition(
                    key=norm_key,
                    range=Range(
                        gte=value.get("gte"),
                        lte=value.get("lte")
                    )
                )
            )
        else:
            # 🔤 Textual match (case-insensitive)
            norm_value = normalize_field_value(value)
            conditions.append(
                FieldCondition(
                    key=norm_key,
                    match=MatchText(text=norm_value)
                )
            )
    return conditions

def search_with_metadata_and_range_filter(qdrant, query: str, k: int, filters: dict):
    print(f"🔍 Query: {query}")
    print(f"📎 Filters: {filters}")
    print(f"🔢 Top K: {k}")

    conditions = build_filter_conditions(filters)
    q_filter = Filter(must=conditions)

    results = qdrant.similarity_search_with_score(query=query, k=k, filter=q_filter)

    if not results:
        print("⚠️ No results found.")
    else:
        for doc, score in results:
            print(f"\n🎯 Score: {score:.4f}")
            print(doc.page_content)
            print("-" * 60)


In [187]:
# --- Example Query ---
search_with_metadata_and_range_filter(
    qdrant,
    query="growing startups in delhi",
    k=5,
    filters={
        "state": "delhi",
    }
)


🔍 Query: growing startups in delhi
📎 Filters: {'state': 'delhi'}
🔢 Top K: 5
⚠️ No results found.


In [190]:
print(df['State'].unique())

print(df[df['State'].str.lower().str.contains('delhi', na=False)])

['Telangana' 'Rajasthan' 'Tamil Nadu' 'Karnataka' 'Uttar Pradesh' 'Delhi'
 'West Bengal' 'Punjab' 'Maharashtra' 'Gujarat']
    Company Name Legal Entity Type  State Headquarters City  Year Founded  \
6          Noise    Proprietorship  Delhi            Mumbai          1996   
8    Practically               LLC  Delhi         Ahmedabad          2010   
16   Tork Motors    Proprietorship  Delhi         Hyderabad          2004   
27     Mamaearth        Public Ltd  Delhi             Delhi          2012   
28      Scripbox               LLC  Delhi        Chandigarh          2013   
..           ...               ...    ...               ...           ...   
482     Cashfree           Pvt Ltd  Delhi         Bengaluru          2005   
484        Udaan    Proprietorship  Delhi           Kolkata          2010   
488     Ola Cabs    Proprietorship  Delhi         Ahmedabad          2004   
493    BigBasket               LLC  Delhi           Kolkata          2003   
497    ShareChat              

In [192]:
from qdrant_client.http.models import Filter, FieldCondition, MatchText

q_filter = Filter(must=[
    FieldCondition(key="state", match=MatchText(text="delhi"))
])
result = client.scroll(
    collection_name=COLLECTION_NAME,
    scroll_filter=q_filter,
    limit=10,
    with_payload=True
)
for pt in result[0]:
    print(pt.payload)

In [193]:
# After ingestion
docs = qdrant.client.scroll(collection_name=COLLECTION_NAME, limit=5, with_payload=True)[0]
for doc in docs:
    print(doc.payload)

{'page_content': 'company_name: policybazaar\nlegal_entity_type: proprietorship\nstate: gujarat\nheadquarters_city: jaipur\nyear_founded: 2002\ncompany_website: https://jones,berryandcordova.in\nlogo_url: https://logo.clearbit.com/jones,berryandcordova.in\ncompany_description__short_: synergized asynchronous function\ncompany_description__long_: machine your weight seat.\r\nlife however ok. hope where for seat once share. boy sport under important agreement decision purpose rest.\r\nstudent with claim. plant value serve federal.\nindustry_sector: logistics\ntotal_funding_raised__inr_: ₹443 cr\nnumber_of_funding_rounds: 2\nlatest_funding_round_type: pre-seed\nlatest_funding_date: 2024-06-18\nlead_investors: gilbert, leblanc and morgan\nrevenue_estimate__annual_: ₹136 cr\nvaluation_estimate__if_available_: ₹1573 cr\nnumber_of_employees__current_: 77\nnumber_of_employees__estimate_range_: 16-848\nkey_people: ceo: karen king, cto: mr. russell graham\nfounders: edward harper (https://linked

In [194]:
from qdrant_client.http.models import Filter, FieldCondition, MatchText

q_filter = Filter(must=[
    FieldCondition(key="state", match=MatchText(text="delhi"))
])
result = client.scroll(
    collection_name=COLLECTION_NAME,
    scroll_filter=q_filter,
    limit=10,
    with_payload=True
)
print("Number of results:", len(result[0]))
for pt in result[0]:
    print(pt.payload)

Number of results: 0


In [195]:
# Inspect all 'state' values in Qdrant directly (no filters)
all_states = set()
for pt in client.scroll(collection_name=COLLECTION_NAME, with_payload=True, limit=1000)[0]:
    if 'state' in pt.payload:
        all_states.add(pt.payload['state'])
print("All states stored in Qdrant:", all_states)

All states stored in Qdrant: set()
