In [1]:
%%capture
# Install/upgrade the LangChain, FAISS and OpenAI packages imported by the cells below.
# %%capture (first line of the cell) suppresses pip's output in the notebook.
# NOTE(review): -U pulls the latest releases with no version pins, so re-runs may
# pick up different library versions — consider pinning once known-good versions are identified.
%pip install -U langchain langchain-community faiss-cpu openai langchain-openai

In [2]:
# -- Build full-row vector store for Indian Startup Dataset --
# This script creates a vector store from the Indian Startup Dataset, where each document represents a full row of data.
# The vector store is saved locally for later use in retrieval tasks.


import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
import os

# Project locations, all derived from the parent of the current working directory
# (the notebook is expected to be launched from a sub-folder of the project root).
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Source CSV that gets embedded
DATA_PATH = os.path.join(BASE_DIR, "Data", "Enriched_Indian_Startup_Dataset.csv")
# Folder where the full-row FAISS index is stored (name reflects the indexing strategy)
INDEX_PATH = os.path.join(BASE_DIR, "database", "vector_store", "faiss_full_row_index")

# Echo the resolved locations so a wrong working directory is spotted immediately
print(
    f"Base Directory: {BASE_DIR}",
    f"Data Path: {DATA_PATH}",
    f"Index Path: {INDEX_PATH}",
    sep="\n",
)


Base Directory: /home/prashant-agrawal/Netflix_Project/src
Data Path: /home/prashant-agrawal/Netflix_Project/src/Data/Enriched_Indian_Startup_Dataset.csv
Index Path: /home/prashant-agrawal/Netflix_Project/src/database/vector_store/faiss_full_row_index


In [3]:

def build_fullrow_vectorstore(data_path=None, index_path=None):
    """Build a FAISS vector store with one document per CSV row and save it locally.

    Every row is turned into a ``Document`` whose text is the row's non-empty
    fields rendered as ``"<column>: <value>"`` lines (one per field). The whole
    row, plus a ``row_id`` entry holding the DataFrame index label, is attached
    as metadata.

    Args:
        data_path: CSV file to read. Defaults to the module-level ``DATA_PATH``.
        index_path: Location passed to ``FAISS.save_local``. Defaults to the
            module-level ``INDEX_PATH``.

    Returns:
        The FAISS vector store that was built and saved.

    Raises:
        ValueError: If the CSV contains no row with any text to embed.
    """
    data_path = DATA_PATH if data_path is None else data_path
    index_path = INDEX_PATH if index_path is None else index_path

    df = pd.read_csv(data_path)
    docs = []
    skipped = 0

    for row_id, row in df.iterrows():
        # Join all populated columns into one document; NaN and blank cells are left out
        content = "\n".join(
            f"{col}: {value}"
            for col, value in row.items()
            if pd.notna(value) and str(value).strip()
        )

        # A row with no populated field has nothing searchable; embedding it would
        # only add an empty document to the index.
        if not content:
            skipped += 1
            continue

        docs.append(
            Document(
                page_content=content,
                metadata={"row_id": row_id, **row.to_dict()},
            )
        )

    # Fail with a clear message instead of letting FAISS choke on an empty document list
    if not docs:
        raise ValueError(f"No non-empty rows found in {data_path}; nothing to index.")

    # OpenAIEmbeddings() is created with default settings, as before
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(docs, embeddings)
    vectorstore.save_local(index_path)

    if skipped:
        print(f"⚠️ Skipped {skipped} row(s) with no content")
    print(f"✅ Full-row vector store created with {len(docs)} documents at: {index_path}")

    return vectorstore

# Build and persist the index when this cell is executed. Inside a notebook
# kernel __name__ is "__main__", so the guard passes and the build runs
# (the confirmation line printed below the cell comes from this call).
if __name__ == "__main__":
    build_fullrow_vectorstore()


✅ Full-row vector store created with 500 documents at: /home/prashant-agrawal/Netflix_Project/src/database/vector_store/faiss_full_row_index


In [4]:
# Retrieval smoke test: reload the saved full-row index and run one sample query.
# Imports use the same `langchain_community` / `langchain_core` paths as the build
# cell above; the legacy `langchain.vectorstores` / `langchain.docstore` paths are
# deprecated re-exports of the same classes.
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document


# Paths
# NOTE(review): these three constants point at a *different* index
# ("faiss_field_chunk_index") and are not used anywhere in this cell — the load
# below reads INDEX_PATH from the path-setup cell. Kept in case later cells rely
# on them; remove if they are stale.
INDEX_DIR = "src/Data/faiss_field_chunk_index"
FAISS_INDEX_PATH = f"{INDEX_DIR}/index.faiss"
PKL_INDEX_PATH = f"{INDEX_DIR}/index.pkl"

# Must be the same embedding model that was used to build the index
embeddings = OpenAIEmbeddings()

# 🔁 Load existing vector store
# allow_dangerous_deserialization=True unpickles the stored docstore. That is
# acceptable here only because INDEX_PATH is the index this notebook wrote itself;
# never enable it for an index obtained from an untrusted source.
vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)

# 🎯 Sample query
query = "tell me the all the details of 5  related to funding in Bengaluru-based SaaS companies with over ₹1000 Cr funding"

# 🔍 Run similarity search (top 10 matches, each paired with its score)
results = vectorstore.similarity_search_with_score(query, k=10)

# 🧪 Output test results
# NOTE(review): with the default FAISS setup the score is presumably a distance
# (lower = closer match) — confirm against the LangChain FAISS documentation.
for i, (doc, score) in enumerate(results):
    print(f"\n🔹 Result #{i+1}")
    print(f"📄 Content:\n{doc.page_content}")
    print(f"📏 Score: {score:.4f}")


🔹 Result #1
📄 Content:
Company Name: Infosys
Legal Entity Type: Pvt Ltd
State: Rajasthan
Headquarters City: Delhi
Year Founded: 2019
Company Website: https://allen,morganandhayes.in
Logo URL: https://logo.clearbit.com/allen,morganandhayes.in
Company Description (Short): Expanded multi-state hierarchy
Company Description (Long): To single early side. Detail three plan in.
Away keep price data true everybody standard large. Consumer data level ago degree most term evidence.
Time couple mention street. Rule court ten but.
Amount cut change sing fill. Similar trouble street sea go.
Industry Sector: Edtech
Total Funding Raised (INR): ₹103 Cr
Number of Funding Rounds: 1
Latest Funding Round Type: Series B
Latest Funding Date: 2022-07-04
Lead Investors: Long, Moore and Torres
Revenue Estimate (Annual): ₹250 Cr
Valuation Estimate (if available): ₹305 Cr
Number of Employees (Current): 384
Number of Employees (Estimate Range): 99-942
Key People: CEO: Andrew Delacruz, CTO: Denise Reyes
Founders: