In [1]:
# %% 📝 Sourcing Path
# Make the project's `src` directory importable from this notebook.
import os
import sys

# `src` is resolved two directory levels above the kernel's current working
# directory, so the result depends on where the notebook is launched from.
_src_candidate = os.path.join(os.getcwd(), "..", "..", "src")
SRC_PATH = os.path.abspath(_src_candidate)

# Prepend only when absent so re-running the cell does not grow sys.path.
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)

print(f"✅ SRC Path: {SRC_PATH}")


✅ SRC Path: /home/prashant-agrawal/projects/netflix_talk2data/src


In [2]:
# Python imports & libraries
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from qdrant_client.http.models import Range
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PayloadSchemaType, FieldCondition, MatchValue, Filter
from typing import Dict, Union, Any
import pandas as pd
import re
import json

# 🚀 Import your utility loaders
from utils.qdrant_client_loader import get_qdrant_collection_name
from utils.path_config import get_base_dir, get_data_path, get_qdrant_store_path, get_schema_path

# %% 📁 Paths
# Every location is resolved through the project's `utils` helpers instead of
# being spelled out in the notebook.
BASE_DIR = get_base_dir()
DATA_PATH = get_data_path()
SCHEMA_OUTPUT_PATH = get_schema_path()
qdrant_store_path = get_qdrant_store_path()
COLLECTION_NAME = get_qdrant_collection_name()

# Echo the resolved configuration, one entry per line.
print(
    f"📌 Base Dir: {BASE_DIR}",
    f"📌 CSV Path: {DATA_PATH}",
    f"📌 Qdrant Local Path: {qdrant_store_path}",
    f"📌 Collection Name: {COLLECTION_NAME}",
    f"📌 Schema Path: {SCHEMA_OUTPUT_PATH}",
    sep="\n",
)



Qdrant store path: /home/prashant-agrawal/projects/netflix_talk2data/src/database/qdrant_store_local_db/collection
Data path: /home/prashant-agrawal/projects/netflix_talk2data/src/Data/Enriched_Indian_Startup_Dataset.csv
Schema path: /home/prashant-agrawal/projects/netflix_talk2data/src/schema/payload_schema.json
📌 Base Dir: /home/prashant-agrawal/projects/netflix_talk2data/src
📌 CSV Path: /home/prashant-agrawal/projects/netflix_talk2data/src/Data/Enriched_Indian_Startup_Dataset.csv
📌 Qdrant Local Path: /home/prashant-agrawal/projects/netflix_talk2data/src/database/qdrant_store_local_db/collection
📌 Collection Name: indian_startups
📌 Schema Path: /home/prashant-agrawal/projects/netflix_talk2data/src/schema/payload_schema.json


In [None]:
import pandas as pd
import re

def normalize_field_name(field: str) -> str:
    """Turn a raw column header into a payload key.

    The header is trimmed and lower-cased, every space, parenthesis, or
    slash becomes an underscore, and any character still outside
    ``[a-zA-Z0-9_]`` is removed.

    Example: ``"Company Description (Long)"`` -> ``"company_description__long_"``.
    """
    key = field.strip().lower()
    # Order matters: separators become underscores before the final cleanup,
    # otherwise they would simply be deleted.
    for pattern, replacement in ((r"[ ()/]", "_"), (r"[^a-zA-Z0-9_]", "")):
        key = re.sub(pattern, replacement, key)
    return key

def normalize_field_value(value) -> str:
    """Render ``value`` as trimmed, lower-cased text.

    Every input goes through ``str()`` first, so non-string values such as
    numbers also come back as strings (``1996`` -> ``"1996"``).
    """
    text = str(value)
    return text.strip().lower()

# Load and clean your data
# Rows with no long description are dropped, then the index is rebuilt so it
# runs 0..n-1 again.
df = (
    pd.read_csv(DATA_PATH)
    .dropna(subset=["Company Description (Long)"])
    .reset_index(drop=True)
)



In [5]:
# Embed the long description of every row with a default-configured
# OpenAIEmbeddings model. One vector per description is expected back, in row
# order (later code indexes `vectors` by row index).
embedding_model = OpenAIEmbeddings()
descriptions = df['Company Description (Long)'].tolist()
vectors = embedding_model.embed_documents(descriptions)

In [6]:
## Build Points
# One point per DataFrame row: the row index is the point id, the vector at
# that same index is its embedding, and the payload holds every non-missing
# column under a normalized key with a normalized (string) value.
points = []
for idx, row in df.iterrows():
    payload = {}
    for k, v in row.items():
        if pd.isna(v):
            # Missing cells are left out of the payload entirely.
            continue
        payload[normalize_field_name(str(k))] = normalize_field_value(v)
    vector = vectors[idx]
    points.append({"id": idx, "vector": vector, "payload": payload})

In [7]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PayloadSchemaType

# Connect to the Qdrant server at localhost:6333.
client = QdrantClient(host="localhost", port=6333)

# Size the collection from the embeddings that were actually produced instead
# of a hard-coded literal, so changing the embedding model cannot create a
# collection whose dimension disagrees with the vectors being upserted.
# 1536 (the previously hard-coded size) remains the fallback when there are
# no vectors to inspect.
DEFAULT_VECTOR_SIZE = 1536
VECTOR_SIZE = len(vectors[0]) if vectors else DEFAULT_VECTOR_SIZE

# Delete and re-create the collection.
# NOTE: destructive - any points already stored under this name are removed.
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(collection_name=COLLECTION_NAME)

# Left as the cell's last expression so the returned status is displayed.
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
)

True

In [8]:
# Payload fields that receive a KEYWORD index; extend the list to index more.
payload_schema = dict.fromkeys(
    [
        "company_name",
        "legal_entity_type",
        "state",
        "headquarters_city",
        "industry_sector",
    ],
    PayloadSchemaType.KEYWORD,
)

# One index-creation request per field.
for field, schema in payload_schema.items():
    client.create_payload_index(
        collection_name=COLLECTION_NAME, field_name=field, field_schema=schema
    )

In [9]:
# Ingest points in chunks if you have a large dataset
batch_size = 100
for i in range(0, len(points), batch_size):
    # Slicing past the end is safe, so the final batch may simply be shorter.
    batch = points[i:i + batch_size]
    client.upsert(collection_name=COLLECTION_NAME, points=batch)

print(f"✅ Ingested {len(points)} documents.")

✅ Ingested 500 documents.


In [10]:
from qdrant_client.http.models import Filter, FieldCondition, MatchValue

# Payload values were lower-cased at ingestion, so the match value is
# lower-case as well.
q_filter = Filter(
    must=[FieldCondition(key="state", match=MatchValue(value="delhi"))]
)

# Only the first element of the scroll result (the matching points) is used.
result = client.scroll(
    collection_name=COLLECTION_NAME,
    scroll_filter=q_filter,
    with_payload=True,
    limit=10,
)

print("Number of results:", len(result[0]))
for pt in result[0]:
    print(pt.payload)

Number of results: 10
{'company_name': 'noise', 'legal_entity_type': 'proprietorship', 'state': 'delhi', 'headquarters_city': 'mumbai', 'year_founded': '1996', 'company_website': 'https://perkinsllc.in', 'logo_url': 'https://logo.clearbit.com/perkinsllc.in', 'company_description__short_': 'streamlined needs-based flexibility', 'company_description__long_': 'again customer performance director sure media. boy seat however road area shake if.\r\nmouth chance believe fill sometimes those necessary various. serve quality happy under.\r\nhistory full energy our allow. under marriage last represent night.\r\nstate vote heavy art hope political five.', 'industry_sector': 'e-commerce', 'total_funding_raised__inr_': '₹115 cr', 'number_of_funding_rounds': '2', 'latest_funding_round_type': 'series b', 'latest_funding_date': '2020-07-18', 'lead_investors': 'smith, zhang and walker', 'revenue_estimate__annual_': '₹108 cr', 'valuation_estimate__if_available_': '₹679 cr', 'number_of_employees__curren

In [11]:
# Unfiltered peek at up to five stored points, to eyeball their payloads.
first_page = client.scroll(
    collection_name=COLLECTION_NAME,
    limit=5,
    with_payload=True,
)
docs = first_page[0]
for d in docs:
    print(d.payload)


{'company_name': 'swiggy', 'legal_entity_type': 'llc', 'state': 'telangana', 'headquarters_city': 'lucknow', 'year_founded': '2018', 'company_website': 'https://nichols,castilloandjones.in', 'logo_url': 'https://logo.clearbit.com/nichols,castilloandjones.in', 'company_description__short_': 'front-line multi-tasking flexibility', 'company_description__long_': 'type once whatever trouble. executive raise nation writer why.\r\nwin style her window your pick. goal stay fire hope around. order provide gun go attention scene myself.\r\nindustry fund man behavior theory student. firm day recognize order.', 'industry_sector': 'e-commerce', 'total_funding_raised__inr_': '₹457 cr', 'number_of_funding_rounds': '2', 'latest_funding_round_type': 'pre-seed', 'latest_funding_date': '2022-08-31', 'lead_investors': 'nicholson-rogers', 'revenue_estimate__annual_': '₹202 cr', 'valuation_estimate__if_available_': '₹770 cr', 'number_of_employees__current_': '303', 'number_of_employees__estimate_range_': '1

In [16]:
from qdrant_client.http.models import Filter, FieldCondition, MatchValue

# Semantic search restricted by a payload filter: embed the question, then
# rank only the points whose headquarters_city matches exactly.
query = "fintech startups in bengaluru"
query_vector = embedding_model.embed_query(query)

# Payload values were lower-cased at ingestion, hence the lower-case value.
q_filter = Filter(must=[
    FieldCondition(key="headquarters_city", match=MatchValue(value="bengaluru"))
])

# `client.search` is deprecated in qdrant-client and warns on every run;
# `query_points` is its replacement. It returns a response object whose
# `.points` list holds the scored points, so `results` keeps the same shape
# (items exposing `.score` and `.payload`) as before.
response = client.query_points(
    collection_name=COLLECTION_NAME,
    query=query_vector,
    query_filter=q_filter,
    limit=5,
    with_payload=True
)
results = response.points

for pt in results:
    print("Score:", pt.score)
    print("Payload:", pt.payload)
    print()


Score: 0.75562274
Payload: {'company_name': "byju's", 'legal_entity_type': 'proprietorship', 'state': 'punjab', 'headquarters_city': 'bengaluru', 'year_founded': '2020', 'company_website': 'https://brooks,wrightandsullivan.in', 'logo_url': 'https://logo.clearbit.com/brooks,wrightandsullivan.in', 'company_description__short_': 'stand-alone coherent software', 'company_description__long_': 'way rich travel hope. challenge business up deal technology.\r\neconomic quite leader realize for. policy behind feeling so.\r\npage decide term place source ground. situation student hotel more.\r\nbecome among discuss its hope including adult. most ground why many without color.', 'industry_sector': 'e-commerce', 'total_funding_raised__inr_': '₹86 cr', 'number_of_funding_rounds': '1', 'latest_funding_round_type': 'seed', 'latest_funding_date': '2021-04-23', 'lead_investors': 'clark-russell', 'revenue_estimate__annual_': '₹184 cr', 'valuation_estimate__if_available_': '₹194 cr', 'number_of_employees_

  results = client.search(
