In [1]:
# %% 📝 Souring Path
import sys, os
SRC_PATH = os.path.abspath(os.path.join(os.getcwd(), "..", "..", "src"))
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)
print(f"✅ SRC Path: {SRC_PATH}")


✅ SRC Path: /home/prashant-agrawal/projects/netflix_talk2data/src


In [4]:
# Python imports & libraries
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from qdrant_client.http.models import Range
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PayloadSchemaType, FieldCondition, MatchValue, Filter
from typing import Dict, Union, Any
import pandas as pd
import re
import json

# 🚀 Import your utility loaders
from utils.qdrant_client_loader import get_qdrant_collection_name
from utils.path_config import get_base_dir, get_data_path, get_qdrant_store_path, get_schema_path

# %% 📁 Paths
BASE_DIR = get_base_dir()
DATA_PATH = get_data_path()
SCHEMA_OUTPUT_PATH = get_schema_path()
qdrant_store_path = get_qdrant_store_path()
COLLECTION_NAME = get_qdrant_collection_name()

print(f"📌 Base Dir: {BASE_DIR}")
print(f"📌 CSV Path: {DATA_PATH}")
print(f"📌 Qdrant Local Path: {qdrant_store_path}")
print(f"📌 Collection Name: {COLLECTION_NAME}")
print(f"📌 Schema Path: {SCHEMA_OUTPUT_PATH}")

Qdrant store path: /home/prashant-agrawal/projects/netflix_talk2data/src/database/qdrant_store_local_db/collection
Data path: /home/prashant-agrawal/projects/netflix_talk2data/src/Data/Enriched_Indian_Startup_Dataset.csv
Schema path: /home/prashant-agrawal/projects/netflix_talk2data/src/schema/payload_schema.json
📌 Base Dir: /home/prashant-agrawal/projects/netflix_talk2data/src
📌 CSV Path: /home/prashant-agrawal/projects/netflix_talk2data/src/Data/Enriched_Indian_Startup_Dataset.csv
📌 Qdrant Local Path: /home/prashant-agrawal/projects/netflix_talk2data/src/database/qdrant_store_local_db/collection
📌 Collection Name: indian_startups
📌 Schema Path: /home/prashant-agrawal/projects/netflix_talk2data/src/schema/payload_schema.json


In [8]:
# --- Utility: Normalization ---
def normalize_field_name(field: str) -> str:
    return (
        field.strip().lower()
        .replace(" ", "_").replace("(", "").replace(")", "")
        .replace("/", "_")
    )

def normalize_field_value(value) -> str:
    return str(value).strip().lower()

In [5]:
# --- 1. Load & Process Data ---
df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=["Company Description (Long)"]).reset_index(drop=True)

def build_points(df):
    for idx, row in df.iterrows():
        metadata = {
            normalize_field_name(str(k)): normalize_field_value(v)
            for k, v in row.items() if pd.notna(v)
        }
        # Use only main description as page_content
        content = str(row["Company Description (Long)"]) if "Company Description (Long)" in row else ""
        yield {
            "id": int(idx),
            "vector": embedding_model.embed_query(content),
            "payload": metadata
        }

embedding_model = OpenAIEmbeddings()

In [6]:
# --- 2. Qdrant Setup ---
client = QdrantClient(host="localhost", port=6333)
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(collection_name=COLLECTION_NAME)

client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
)

True

In [11]:
import os
from qdrant_client.http.models import PayloadSchemaType
from schema.qdrant_schema import PAYLOAD_SCHEMA

for field, schema in PAYLOAD_SCHEMA.items():
    # Normalize names (snake_case, all lowercase)
    if isinstance(schema, dict) and "type" in schema:
        field_schema = schema["type"]
    else:
        field_schema = schema
    client.create_payload_index(
        collection_name=COLLECTION_NAME,
        field_name=field,
        field_schema=field_schema
    )

✅ SRC Path: /home/prashant-agrawal/projects/netflix_talk2data/src


In [12]:
# --- 3. Upload Data ---
points = list(build_points(df))
client.upsert(collection_name=COLLECTION_NAME, points=points)
print(f"✅ Ingested {len(points)} points into {COLLECTION_NAME}.")

✅ Ingested 500 points into indian_startups.


In [23]:
## Inspecting the Collection & Payload Schema

# 1️⃣ See the vector-config & overall collection info
collection_info = client.get_collection(collection_name=COLLECTION_NAME)
print(collection_info)

# 2️⃣ Peek at the payload schema you just registered
print(collection_info.payload_schema)

status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=1000 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=1536, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=No

In [14]:
from qdrant_client.http.models import Filter, FieldCondition, MatchValue

qf = Filter(must=[
    FieldCondition(key="state", match=MatchValue(value="delhi"))
])

hits = client.scroll(
    collection_name=COLLECTION_NAME,
    scroll_filter=qf,
    with_payload=True,
    limit=5
)[0]

print(f"Exact‐match ‘state=delhi’ → {len(hits)} hit(s):")
for pt in hits:
    print(" ", pt.payload)

Exact‐match ‘state=delhi’ → 5 hit(s):
  {'company_name': 'noise', 'legal_entity_type': 'proprietorship', 'state': 'delhi', 'headquarters_city': 'mumbai', 'year_founded': '1996', 'company_website': 'https://perkinsllc.in', 'logo_url': 'https://logo.clearbit.com/perkinsllc.in', 'company_description_short': 'streamlined needs-based flexibility', 'company_description_long': 'again customer performance director sure media. boy seat however road area shake if.\r\nmouth chance believe fill sometimes those necessary various. serve quality happy under.\r\nhistory full energy our allow. under marriage last represent night.\r\nstate vote heavy art hope political five.', 'industry_sector': 'e-commerce', 'total_funding_raised_inr': '₹115 cr', 'number_of_funding_rounds': '2', 'latest_funding_round_type': 'series b', 'latest_funding_date': '2020-07-18', 'lead_investors': 'smith, zhang and walker', 'revenue_estimate_annual': '₹108 cr', 'valuation_estimate_if_available': '₹679 cr', 'number_of_employees

In [17]:
from qdrant_client.http.models import Filter, FieldCondition, Range

def test_range_filter(field: str, gte=None, lte=None, limit=10):
    r = Range(gte=gte, lte=lte)
    qf = Filter(must=[ FieldCondition(key=field, range=r) ])
    hits = client.scroll(
        collection_name=COLLECTION_NAME,
        scroll_filter=qf,
        with_payload=True,
        limit=limit,
    )[0]
    print(f"Filter {field} in [{gte},{lte}] → {len(hits)} hits")
    for pt in hits:
        print(" ", pt.payload)

# e.g. companies founded between 2000 and 2005
test_range_filter("year_founded", gte=2000, lte=2005)


Filter year_founded in [2000,2005] → 0 hits


In [24]:
# if you also want to test your similarity_search_with_score call:
from qdrant_client.http.models import FieldCondition, MatchValue
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings

# Convert DataFrame rows to langchain Document objects, ensuring page_content is a string
documents = [
    Document(
        page_content=str(row["Company Description (Long)"]) if pd.notna(row["Company Description (Long)"]) else "",
        metadata={col: row[col] for col in df.columns if col != "Company Description (Long)"}
    )
    for _, row in df.iterrows()
]

qdrant = Qdrant.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(),
    url="http://localhost:6333",
    collection_name=COLLECTION_NAME,
)

def hybrid_search(query: str, filters: dict, k: int = 5):
    from qdrant_client.http.models import Filter, FieldCondition, MatchValue
    conds = [
        FieldCondition(key=f, match=MatchValue(value=v))
        for f,v in filters.items()
    ]
    qf = Filter(must=conds)
    results = qdrant.similarity_search_with_score(query=query, k=k, filter=qf)
    print(f"Query={query!r}, filters={filters} → {len(results)} hits")
    for doc,score in results:
        print(f"  {score:.3f}", doc.metadata)

# e.g.
hybrid_search("fintech startups", {"state":"delhi"}, k=3)


ValidationError: 1 validation error for Document
page_content
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.11/v/string_type