In [5]:
from dotenv import load_dotenv

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.docstore.document import Document

import pandas as pd
import pickle

In [2]:
# Read the previously cleaned reports table from disk into a DataFrame.
# NOTE(review): the file is expected to have been written by an earlier
# cleaning step — confirm it exists before running this cell.
cleaned_reports_csv = '../data/cleaned_reports.csv'
df_clean = pd.read_csv(cleaned_reports_csv)

In [6]:
# Prepare data for chunking: one entry per row of the 'findings' column,
# kept in row order so that position i identifies the originating report.
texts = df_clean['findings'].tolist()

# Splitter configured for chunks of size 500 with an overlap of 50 between
# neighbouring chunks (sizes are measured by the splitter's length function).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)

chunks = []
skipped_rows = []  # positions of rows whose findings are not usable text
for i, text in enumerate(texts):
    # Empty CSV cells come back from pandas as float NaN rather than str.
    # The splitter only handles strings, so skip such rows instead of letting
    # the whole cell fail part-way through with an opaque TypeError.
    if not isinstance(text, str):
        skipped_rows.append(i)
        continue
    # The metadata keeps the original row position, so "report_{i}" still
    # maps back to the same row of df_clean even when some rows are skipped.
    split_docs = text_splitter.create_documents([text], metadatas=[{"source": f"report_{i}"}])
    chunks.extend(split_docs)

if skipped_rows:
    print(f"Skipped {len(skipped_rows)} reports with missing findings: rows {skipped_rows}")
print(f"Total chunks: {len(chunks)}")

Total chunks: 3381


In [9]:
# Read the .env file so the API key it holds is available in the environment
load_dotenv()

# Embedding client used to vectorise every chunk
embedding_model = OpenAIEmbeddings()

# Embed all chunks and build the FAISS vector store from them
vectorstore = FAISS.from_documents(documents=chunks, embedding=embedding_model)

# Persist the vector store to a local folder for later reuse
vectorstore.save_local(folder_path="../results/faiss_vectorstore")