In [1]:
##%% [markdown]
# # Qdrant Ingestion & Indexing Pipeline
#
# **What this notebook does**:
# 1. Creates a Qdrant collection with 384‑dim vectors and an HNSW graph configured for fast search  
# 2. Registers payload indexes on your metadata fields  
# 3. Embeds and ingests your dataset of 500 startups from the CSV  
# 4. Polls the collection until the HNSW index is fully built  
# 5. Demonstrates vector similarity and metadata filtering  
#  
# **Key parameters**:
# - **Vector dim**: 384 (MiniLM)  
# - **HNSW**: `m=16`, `ef_construct=100`  
# - **Metadata indexes**: all fields in `PAYLOAD_SCHEMA`  
#  
# This notebook is meant to serve as both runnable code and living documentation.

##%% [markdown]

In [2]:
# %% 📝 Souring Path
import sys, os
SRC_PATH = os.path.abspath(os.path.join(os.getcwd(), "..", "..", "src"))
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)
print(f"✅ SRC Path: {SRC_PATH}")


✅ SRC Path: /home/prashant-agrawal/projects/company_talk2data/src


In [3]:
# Python imports & libraries
from langchain_community.vectorstores import Qdrant
#from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from qdrant_client.http.models import Range
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PayloadSchemaType, FieldCondition, MatchValue, Filter,HnswConfigDiff
from typing import Dict, Union, Any
from langchain_together.embeddings import TogetherEmbeddings
import pandas as pd
import re
import json

# 🚀 Import your utility loaders
from utils.qdrant_client_loader import get_qdrant_collection_name
from utils.path_config import get_base_dir, get_data_path, get_qdrant_store_path, get_schema_path

# %% 📁 Paths
BASE_DIR = get_base_dir()
DATA_PATH = get_data_path()
SCHEMA_OUTPUT_PATH = get_schema_path()
qdrant_store_path = get_qdrant_store_path()
COLLECTION_NAME = get_qdrant_collection_name()

print(f"📌 Base Dir: {BASE_DIR}")
print(f"📌 CSV Path: {DATA_PATH}")
print(f"📌 Qdrant Local Path: {qdrant_store_path}")
print(f"📌 Collection Name: {COLLECTION_NAME}")
print(f"📌 Schema Path: {SCHEMA_OUTPUT_PATH}")

📌 Base Dir: /home/prashant-agrawal/projects/company_talk2data/src
📌 CSV Path: /home/prashant-agrawal/projects/company_talk2data/src/Data/Enriched_Indian_Startup_Dataset.csv
📌 Qdrant Local Path: /home/prashant-agrawal/projects/company_talk2data/src/database/qdrant_store_local_db/collection
📌 Collection Name: indian_startups
📌 Schema Path: /home/prashant-agrawal/projects/company_talk2data/src/schema/payload_schema.json


In [4]:
# --- Utility: Normalization ---
def normalize_field_name(field: str) -> str:
    return (
        field.strip().lower()
        .replace(" ", "_").replace("(", "").replace(")", "")
        .replace("/", "_")
    )

def normalize_field_value(value) -> str:
    return str(value).strip().lower()

In [5]:
#%pip install sentence-transformers
#%pip install -U langchain-huggingface
from langchain_huggingface import HuggingFaceEmbeddings
#embedding_model = OpenAIEmbeddings()

embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


In [6]:
# --- 1. Load & Process Data ---
from langchain.embeddings import HuggingFaceEmbeddings
df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=["Company Description (Long)"]).reset_index(drop=True)

def build_points(df):
    for idx, row in df.iterrows():
        metadata = {
            normalize_field_name(str(k)): normalize_field_value(v)
            for k, v in row.items() if pd.notna(v)
        }
        # Use only main description as page_content
        content = str(row["Company Description (Long)"]) if "Company Description (Long)" in row else ""
        yield {
            "id": int(idx),
            "vector": embeddings_model.embed_query(content),
            "payload": metadata
        }



In [7]:
# --- 2. Qdrant Setup ---
from schema.qdrant_schema import PAYLOAD_SCHEMA
from qdrant_client.http.models import HnswConfig

client = QdrantClient(host="localhost", port=6333)
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(collection_name=COLLECTION_NAME)

client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=384, distance=Distance.COSINE)
)



✅ SRC Path: /home/prashant-agrawal/projects/company_talk2data/src


True

In [8]:
import os
from qdrant_client.http.models import PayloadSchemaType
from schema.qdrant_schema import PAYLOAD_SCHEMA

for field, schema in PAYLOAD_SCHEMA.items():
    # Normalize names (snake_case, all lowercase)
    if isinstance(schema, dict) and "type" in schema:
        field_schema = schema["type"]
    else:
        field_schema = schema
    client.create_payload_index(
        collection_name=COLLECTION_NAME,
        field_name=field,
        field_schema=field_schema
    )

In [9]:
# --- 3. Upload Data ---
points = list(build_points(df))
client.upsert(collection_name=COLLECTION_NAME, points=points)
print(f"✅ Ingested {len(points)} points into {COLLECTION_NAME}.")

✅ Ingested 500 points into indian_startups.


In [77]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import (
    VectorParams, Distance,
    HnswConfigDiff, OptimizersConfig
)

client = QdrantClient(host="localhost", port=6333)
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(COLLECTION_NAME)

client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(
        size=384, distance=Distance.COSINE,
        hnsw_config=HnswConfigDiff(
            m=16,
            ef_construct=100,
            full_scan_threshold=10     # minimum allowed value is 10
        )
    )
)

# upsert your 500 points
points = list(build_points(df))
client.upsert(collection_name=COLLECTION_NAME, points=points)

# poll until all are indexed
import time
while True:
    info = client.get_collection(COLLECTION_NAME)
    print(f"Indexed {info.indexed_vectors_count}/{info.points_count}")
    if info.indexed_vectors_count >= info.points_count:
        break
    time.sleep(1)
print("✅ HNSW graph now built and ready!")


Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500
Indexed 0/500


KeyboardInterrupt: 

In [71]:
# 3) **Explicitly build the HNSW graph**
client.update_collection(
    collection_name=COLLECTION_NAME,
    hnsw_config=HnswConfigDiff(
        m=16,
        ef_construct=100,
        full_scan_threshold=100
    )
)


True

In [74]:
import requests, time
from qdrant_client import QdrantClient

# — after your upsert(…) call —
#
# Trigger the HNSW index build via the official REST path
resp = requests.put(
    f"http://localhost:6333/collections/{COLLECTION_NAME}/index/hnsw",
    json={
      "field_name": "vector",
      "hnsw_config": {
         "m": 16,
         "ef_construct": 100,
         "full_scan_threshold": 100
      }
    },
)
resp.raise_for_status()
print("🔍 HNSW index build started:", resp.json())



HTTPError: 404 Client Error: Not Found for url: http://localhost:6333/collections/indian_startups/index/hnsw

In [None]:

# 4) Poll until it's done
import time
while True:
    info = client.get_collection(COLLECTION_NAME)
    print(f"Indexed {info.indexed_vectors_count}/{info.points_count}")
    if info.indexed_vectors_count >= info.points_count:
        break
    time.sleep(1)
print("✅ HNSW graph fully built—ready for low‑latency searches!")

In [41]:
from qdrant_client import QdrantClient

client = QdrantClient(host="localhost", port=6333)
info = client.get_collection(collection_name=COLLECTION_NAME)

print("Vector config:      ", info.config.params.vectors)
print("Total points:       ", info.points_count)
print("Indexed vectors:    ", info.indexed_vectors_count)
print("HNSW parameters M:  ", info.config.hnsw_config.m)
print("HNSW ef_construct:  ", info.config.hnsw_config.ef_construct)
print("HNSW built segments:", info.segments_count)

Vector config:       size=384 distance=<Distance.COSINE: 'Cosine'> hnsw_config=HnswConfigDiff(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=None, on_disk=None, payload_m=None) quantization_config=None on_disk=None datatype=None multivector_config=None
Total points:        500
Indexed vectors:     0
HNSW parameters M:   16
HNSW ef_construct:   100
HNSW built segments: 8


In [42]:
# ## 5. Verify HNSW Index Build

# %%
import time
while True:
    info = client.get_collection(collection_name=COLLECTION_NAME)
    done, total = info.indexed_vectors_count, info.points_count
    print(f"🔄 Indexed {done}/{total}")
    if done >= total:
        break
    time.sleep(2)
print("✅ HNSW graph fully built!")

🔄 Indexed 0/500
🔄 Indexed 0/500
🔄 Indexed 0/500
🔄 Indexed 0/500
🔄 Indexed 0/500
🔄 Indexed 0/500
🔄 Indexed 0/500


KeyboardInterrupt: 

In [26]:
## Inspecting the Collection & Payload Schema

# 1️⃣ See the vector-config & overall collection info
collection_info = client.get_collection(collection_name=COLLECTION_NAME)
print(collection_info)

# 2️⃣ Peek at the payload schema you just registered
print(collection_info.payload_schema)

status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=0 points_count=500 segments_count=8 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=384, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None

In [27]:
from qdrant_client.http.models import Filter, FieldCondition, MatchValue

qf = Filter(must=[
    FieldCondition(key="state", match=MatchValue(value="Maharashtra"))
])

hits = client.scroll(
    collection_name=COLLECTION_NAME,
    scroll_filter=qf,
    with_payload=True,
    limit=5
)[0]

print(f"Exact‐match ‘state=Maharashtra’ → {len(hits)} hit(s):")
for pt in hits:
    print(" ", pt.payload)

Exact‐match ‘state=Maharashtra’ → 0 hit(s):


In [28]:
from qdrant_client.http.models import Filter, FieldCondition, Range

def test_range_filter(field: str, gte=None, lte=None, limit=10):
    r = Range(gte=gte, lte=lte)
    qf = Filter(must=[ FieldCondition(key=field, range=r) ])
    hits = client.scroll(
        collection_name=COLLECTION_NAME,
        scroll_filter=qf,
        with_payload=True,
        limit=limit,
    )[0]
    print(f"Filter {field} in [{gte},{lte}] → {len(hits)} hits")
    for pt in hits:
        print(" ", pt.payload)

# e.g. companies founded between 2000 and 2005
test_range_filter("year_founded", gte=2000, lte=2005)


Filter year_founded in [2000,2005] → 0 hits
