In [3]:
%%capture
# Install or upgrade the third-party packages this notebook imports into the kernel's
# environment; %%capture keeps pip's log out of the saved cell output.
# NOTE(review): -U with no version pins installs whatever is newest at run time, so a
# later re-run may pick up breaking releases — consider pinning exact versions.
%pip install -U langchain langchain-community faiss-cpu openai langchain-openai pandas

In [4]:
# -- Build full-row vector store for Indian Startup Dataset --
# This script creates a vector store from the Indian Startup Dataset, where each document represents a full row of data.
# The vector store is saved locally for later use in retrieval tasks.


import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
import os

# Project locations, all derived from the parent of the current working directory
# (so the notebook must be launched from a folder one level below the project root).
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# CSV file that holds the startup dataset.
DATA_PATH = os.path.join(BASE_DIR, "Data", "Enriched_Indian_Startup_Dataset.csv")
# Folder for the FAISS index; its name reflects the "one document per full row" strategy.
INDEX_PATH = os.path.join(
    BASE_DIR, "database", "vector_store", "faiss_full_row_index"
)

# Echo the resolved locations, one per line, so a wrong working directory is spotted early.
print(
    f"Base Directory: {BASE_DIR}",
    f"Data Path: {DATA_PATH}",
    f"Index Path: {INDEX_PATH}",
    sep="\n",
)


Base Directory: /home/prashant-agrawal/projects/netflix_talk2data/src
Data Path: /home/prashant-agrawal/projects/netflix_talk2data/src/Data/Enriched_Indian_Startup_Dataset.csv
Index Path: /home/prashant-agrawal/projects/netflix_talk2data/src/database/vector_store/faiss_full_row_index


In [5]:

def build_fullrow_vectorstore(data_path=None, index_path=None):
    """Build a FAISS vector store with one document per CSV row and save it to disk.

    Each row becomes a ``Document`` whose text is the row's non-empty fields
    rendered as ``"<column>: <value>"`` lines, and whose metadata holds the
    row's index label (``row_id``) plus every column value.

    Args:
        data_path: CSV file to read. Defaults to the module-level ``DATA_PATH``,
            looked up at call time.
        index_path: Folder the index is saved to. Defaults to the module-level
            ``INDEX_PATH``, looked up at call time.

    Returns:
        The FAISS vector store that was built and saved.

    Raises:
        ValueError: If the CSV contains no data rows (there is nothing to embed).
    """
    # Resolve defaults here rather than in the signature so that re-running the
    # configuration cell with new paths is honoured by later calls.
    csv_path = DATA_PATH if data_path is None else data_path
    target_dir = INDEX_PATH if index_path is None else index_path

    df = pd.read_csv(csv_path)
    if df.empty:
        # Fail early with a clear message instead of an opaque error from the
        # vector store when it is handed an empty document list.
        raise ValueError(f"Dataset at {csv_path} has no rows; nothing to index.")

    docs = []
    for idx, row in df.iterrows():
        # Join all populated columns into one text block; missing or
        # whitespace-only cells are left out so they do not dilute the embedding.
        content = "\n".join(
            f"{col}: {row[col]}" for col in df.columns
            if pd.notna(row[col]) and str(row[col]).strip()
        )

        # Keep every column as a metadata key, but store missing cells as None
        # rather than NaN so metadata agrees with the text's notion of "missing"
        # and stays comparable/serialisable.
        metadata = {
            col: (None if pd.isna(value) else value)
            for col, value in row.to_dict().items()
        }

        # row_id is listed first, so a CSV column literally named "row_id"
        # would take precedence over the index label (same as before).
        docs.append(
            Document(page_content=content, metadata={"row_id": idx, **metadata})
        )

    # Constructed with no arguments, so credentials come from the library's
    # default configuration (typically the OPENAI_API_KEY environment variable).
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(docs, embeddings)
    vectorstore.save_local(target_dir)

    print(f"‚úÖ Full-row vector store created with {len(docs)} documents at: {target_dir}")
    return vectorstore


# Build the index when this cell (or an exported script) is executed directly.
if __name__ == "__main__":
    build_fullrow_vectorstore()


‚úÖ Full-row vector store created with 500 documents at: /home/prashant-agrawal/projects/netflix_talk2data/src/database/vector_store/faiss_full_row_index


In [10]:
# Import from the same packages the build cell uses; the legacy
# `langchain.vectorstores` / `langchain.docstore` paths are deprecated shims.
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document


# Paths
# NOTE(review): these three constants are not used anywhere in this cell and point
# at a different (field-chunk) index; the index actually loaded below is INDEX_PATH,
# the full-row index defined in the configuration cell. Kept in case other cells use them.
INDEX_DIR = "src/Data/faiss_field_chunk_index"
FAISS_INDEX_PATH = f"{INDEX_DIR}/index.faiss"
PKL_INDEX_PATH = f"{INDEX_DIR}/index.pkl"

embeddings = OpenAIEmbeddings()

# Load the existing vector store from disk. This relies on INDEX_PATH from the
# configuration cell, so that cell must have been run first.
# SECURITY: allow_dangerous_deserialization=True lets the loader unpickle index.pkl,
# which can execute arbitrary code — only load index folders this notebook created.
vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)

# Sample query.
# NOTE(review): similarity search ranks rows by embedding closeness only; it does not
# enforce the sector / city / funding constraints or the "5 companies" limit in the text.
query = "tell me only the 5 companies name in the fintech sector that are based in Delhi and have raised more than 10 million USD in funding"

# Run the similarity search and keep the 10 nearest rows with their scores.
results = vectorstore.similarity_search_with_score(query, k=10)


# Print each hit with its 1-based rank.
# NOTE(review): for a default FAISS index LangChain documents this score as an L2
# distance, so lower should mean closer — confirm for this index.
for rank, (doc, score) in enumerate(results, start=1):
    print(f"\nüîπ Result #{rank}")
    print(f"üìÑ Content:\n{doc.page_content}")
    print(f"üìè Score: {score:.4f}")


üîπ Result #1
üìÑ Content:
Company Name: Freshworks
Legal Entity Type: Public Ltd
State: Delhi
Headquarters City: Ahmedabad
Year Founded: 2019
Company Website: https://velezllc.in
Logo URL: https://logo.clearbit.com/velezllc.in
Company Description (Short): Digitized upward-trending concept
Company Description (Long): Light wind small might. Recognize from voice call issue. Must audience commercial fill stock.
Think face lose view present. Trouble director question single impact series.
Six already job blood bar conference. But various candidate though upon today. Change land decide almost language respond.
Industry Sector: Healthtech
Total Funding Raised (INR): ‚Çπ277 Cr
Number of Funding Rounds: 2
Latest Funding Round Type: Pre-seed
Latest Funding Date: 2022-03-03
Lead Investors: House-Kline
Revenue Estimate (Annual): ‚Çπ35 Cr
Valuation Estimate (if available): ‚Çπ127 Cr
Number of Employees (Current): 269
Number of Employees (Estimate Range): 348-547
Key People: CEO: Brandy Mcguire