In [12]:
# %%
import sys, os
SRC_PATH = os.path.abspath(os.path.join(os.getcwd(), "..","..","src"))
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)
print(SRC_PATH)

/home/prashant-agrawal/Netflix_Project/src


In [13]:
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PayloadSchemaType
import pandas as pd
import re
import os
import json
from utils.qdrant_client_loader import get_qdrant_collection_name
from utils.path_config import get_base_dir, get_data_path, get_qdrant_store_path, get_schema_path


# 📁 Paths
BASE_DIR = get_base_dir()
print(f"Base Directory: {BASE_DIR}")

DATA_PATH = get_data_path()
print(f"CSV Path: {DATA_PATH}")

qdrant_store_path = get_qdrant_store_path()
print(f"Qdrant Local Path: {qdrant_store_path}")

COLLECTION_NAME = get_qdrant_collection_name()
print(f"Qdrant Collection Name: {COLLECTION_NAME}")

SCHEMA_OUTPUT_PATH = get_schema_path()
print(f"Schema Output Path: {SCHEMA_OUTPUT_PATH}")


Base Directory: /home/prashant-agrawal/Netflix_Project/src
CSV Path: /home/prashant-agrawal/Netflix_Project/src/Data/Enriched_Indian_Startup_Dataset.csv
Qdrant Local Path: /home/prashant-agrawal/Netflix_Project/src/database/qdrant_store_local_db/collection
Qdrant Collection Name: indian_startups
Schema Output Path: /home/prashant-agrawal/Netflix_Project/src/schema/payload_schema.json


In [8]:
from langchain_community.vectorstores import Qdrant
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance
import pandas as pd
import json

def load_documents_from_df(df: pd.DataFrame) -> list:
    documents = []
    for _, row in df.iterrows():
        metadata = row.dropna().to_dict()
        content = "\n".join(f"{k}: {v}" for k, v in metadata.items())
        documents.append(Document(page_content=content, metadata=metadata))
    return documents

def run_ingestion_pipeline(csv_path: str, collection_name: str):
    df = pd.read_csv(csv_path)
    df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]

    client = QdrantClient(url="http://localhost:6333")

    # Clean slate
    if client.collection_exists(collection_name):
        client.delete_collection(collection_name=collection_name)

    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
    )

    documents = load_documents_from_df(df)
    Qdrant.from_documents(
        documents=documents,
        embedding=OpenAIEmbeddings(),
        url="http://localhost:6333",
        collection_name=collection_name
    )
    print(f"✅ Ingested {len(documents)} documents with metadata into Qdrant.")


In [14]:
# ingest_and_query_qdrant.py

def infer_payload_schema_from_df(df: pd.DataFrame) -> dict:
    inferred_schema = {}
    for col in df.columns:
        dtype = df[col].dropna().infer_objects().dtype
        if pd.api.types.is_integer_dtype(dtype):
            inferred_schema[col] = PayloadSchemaType.INTEGER
        elif pd.api.types.is_float_dtype(dtype):
            inferred_schema[col] = PayloadSchemaType.FLOAT
        else:
            inferred_schema[col] = PayloadSchemaType.KEYWORD
    return inferred_schema


def generate_payload_json(schema: dict, output_path: str):
    raw_json = {k: v.value for k, v in schema.items()}
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(raw_json, f, indent=2)
    print(f"📄 Saved payload schema to {output_path}")


def load_documents_from_df(df: pd.DataFrame) -> list:
    documents = []
    for _, row in df.iterrows():
        metadata = row.dropna().to_dict()
        content = "\n".join(f"{k}: {v}" for k, v in metadata.items())
        documents.append(Document(page_content=content, metadata=metadata))
    return documents


def recreate_qdrant_collection(client: QdrantClient, collection_name: str, schema: dict):
    if client.collection_exists(collection_name):
        client.delete_collection(collection_name=collection_name)
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
        payload_schema=schema
    )
    print(f"✅ Created collection `{collection_name}` with schema of {len(schema)} fields")


def run_ingestion_pipeline(csv_path: str, collection_name: str, schema_path: str):
    df = pd.read_csv(csv_path)
    df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]

    # Step 1: Infer & save schema
    schema = infer_payload_schema_from_df(df)
    generate_payload_json(schema, schema_path)

    # Step 2: Create collection
    client = QdrantClient(url="http://localhost:6333")
    recreate_qdrant_collection(client, collection_name, schema)

    # Step 3: Upload documents
    documents = load_documents_from_df(df)
    Qdrant.from_documents(
        documents=documents,
        embedding=OpenAIEmbeddings(),
        url="http://localhost:6333",
        collection_name=collection_name
    )
    print(f"🚀 Uploaded {len(documents)} documents to Qdrant.")


def run_sample_query(query: str, filters: dict, collection_name: str):
    # Step 4: Perform query
    client = QdrantClient(url="http://localhost:6333")
    qdrant_store = Qdrant(
        client=client,
        collection_name=collection_name,
        embedding=OpenAIEmbeddings(),
    )

    # Convert dictionary filters into Qdrant Filter format
    filter_conditions = [FieldCondition(key=k, match=MatchValue(value=v)) for k, v in filters.items()]
    qdrant_filter = Filter(must=filter_conditions)

    results = qdrant_store.similarity_search_with_score(query=query, k=5, filter=qdrant_filter)

    print("\n📌 Query Results:\n")
    for doc, score in results:
        print(f"🧠 Score: {score:.4f}")
        print(doc.page_content)
        print("-" * 80)



In [15]:

# Optional CLI usage
if __name__ == "__main__":
    run_ingestion_pipeline(DATA_PATH, COLLECTION_NAME, SCHEMA_OUTPUT_PATH)

    run_sample_query(
        query="Top funded fintech startups",
        filters={
            "headquarters_city": "Bengaluru",
            "industry_sector": "Fintech"
        },
        collection_name=COLLECTION_NAME
    )


📄 Saved payload schema to /home/prashant-agrawal/Netflix_Project/src/schema/payload_schema.json


AssertionError: Unknown arguments: ['payload_schema']

In [16]:
import qdrant_client
print(qdrant_client.__version__)

AttributeError: module 'qdrant_client' has no attribute '__version__'