In [4]:
import getpass
import os
from dotenv import load_dotenv

# Load environment variables (presumably from a local .env file, per
# python-dotenv's default behaviour - confirm). Later cells read SUPABASE_URL
# and SUPABASE_KEY from os.environ, so this must run before them.
# The bare call is the cell's last expression, so its return value is what the
# cell displays.
load_dotenv()

True

In [6]:
import glob
from supabase.client import Client, create_client
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.tools.tavily_search import TavilySearchResults
from langgraph.graph import END, StateGraph, START
from typing import List, Dict, Any
from typing_extensions import TypedDict

In [7]:
# initiate supabase db
# Credentials come from the environment. os.environ.get returns None for an
# unset variable, so check explicitly and fail early with a message that names
# what is missing instead of handing None to create_client.
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_KEY")

_missing_supabase_vars = [
    name
    for name, value in (("SUPABASE_URL", supabase_url), ("SUPABASE_KEY", supabase_key))
    if not value  # treats an empty string the same as an unset variable
]
if _missing_supabase_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_supabase_vars)
        + ". Set them in the environment or in the .env file before running this cell."
    )

supabase: Client = create_client(supabase_url, supabase_key)

In [21]:
import os

def load_and_split_pdfs(pdf_directory: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[Document]:
    """
    Load every PDF found directly inside a directory and split it into chunks.

    Files are selected by a case-insensitive ``.pdf`` extension and processed in
    sorted filename order, so repeated runs return the chunks in the same order.
    Processing is best-effort per file: a PDF that fails to load or split is
    reported on stdout and skipped, and the remaining files are still processed.

    Args:
        pdf_directory: Directory to scan (non-recursively) for PDF files.
        chunk_size: ``chunk_size`` handed to the text splitter; lengths are
            measured with ``len``.
        chunk_overlap: ``chunk_overlap`` handed to the text splitter.

    Returns:
        A flat list with the chunks of all successfully processed PDFs. Before
        splitting, each loaded document's metadata is extended with ``source``,
        ``document_type``, ``chunk_size`` and ``chunk_overlap`` (presumably
        carried over to the chunks by the splitter - confirm).

    Raises:
        OSError: If ``pdf_directory`` cannot be listed (propagated from
            ``os.listdir``), e.g. when it does not exist.
    """
    all_docs = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        length_function=len
    )

    # os.listdir returns entries in arbitrary order, so sort for reproducible
    # output; compare the extension case-insensitively so "REPORT.PDF" is not
    # silently ignored.
    pdf_files = sorted(
        name for name in os.listdir(pdf_directory) if name.lower().endswith('.pdf')
    )

    if not pdf_files:
        # Make the "nothing to do" case visible instead of returning [] silently.
        print(f"No PDF files found in {pdf_directory}")

    for pdf in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf)
        try:
            print(f"Processing PDF: {pdf_path}")  # Inform about the PDF being processed
            loader = PyPDFLoader(pdf_path)
            docs = loader.load()

            # Enhance metadata with provenance and the splitting parameters used
            for doc in docs:
                doc.metadata.update({
                    "source": pdf_path,
                    "document_type": "pdf",
                    "chunk_size": chunk_size,
                    "chunk_overlap": chunk_overlap
                })

            split_docs = text_splitter.split_documents(docs)
            all_docs.extend(split_docs)
            print(f"Successfully processed {len(split_docs)} chunks from {pdf_path}")

        except Exception as e:
            # Deliberate best-effort: one unreadable PDF must not abort the batch.
            print(f"Error processing {pdf_path}: {str(e)}")  # Inform about any errors

    return all_docs

In [22]:
# Example call to the load_and_split_pdfs function
# The PDF directory can be overridden with the PDF_DIRECTORY environment
# variable (for example in the .env file) so the notebook is not tied to one
# machine's absolute path; the original local path remains the fallback.
pdf_directory = (
    os.environ.get("PDF_DIRECTORY")
    or "/Users/shivanshmahajan/Desktop/Cyber/the_content"
)
chunk_size = 500  # You can adjust the chunk size as needed
chunk_overlap = 50  # You can adjust the chunk overlap as needed

# Call the function
all_documents = load_and_split_pdfs(pdf_directory, chunk_size, chunk_overlap)

# Optionally, print the number of documents loaded
# (each entry is one split chunk, not one source PDF)
print(f"Total documents loaded: {len(all_documents)}")

Processing PDF: /Users/shivanshmahajan/Desktop/Cyber/the_content/Red Team - How to Succeed by Thinking Like the Enemy by Micah Zenko.pdf
Successfully processed 1781 chunks from /Users/shivanshmahajan/Desktop/Cyber/the_content/Red Team - How to Succeed by Thinking Like the Enemy by Micah Zenko.pdf
Processing PDF: /Users/shivanshmahajan/Desktop/Cyber/the_content/dokumen.pub_open-source-intelligence-techniques-resources-for-searching-and-analyzing-online-information-8nbsped-9798578577086.pdf
Successfully processed 3799 chunks from /Users/shivanshmahajan/Desktop/Cyber/the_content/dokumen.pub_open-source-intelligence-techniques-resources-for-searching-and-analyzing-online-information-8nbsped-9798578577086.pdf
Processing PDF: /Users/shivanshmahajan/Desktop/Cyber/the_content/defensive-security-handbook-best-practices-for-securing-infrastructure-1nbsped-1491960388-9781491960387_compress.pdf
Successfully processed 1295 chunks from /Users/shivanshmahajan/Desktop/Cyber/the_content/defensive-secur

In [20]:
# Initiate the embeddings model used to vectorise the document chunks; it is
# passed to the vector store in the cell below.
# NOTE(review): presumably needs an OpenAI API key in the environment
# (e.g. loaded from .env by the first cell) - confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")


In [None]:
from langchain_community.vectorstores import SupabaseVectorStore
# Build the Supabase-backed vector store from all_documents using the
# `embeddings` model and the `supabase` client created earlier, targeting the
# "documents" table and the "match_documents" query.
# NOTE(review): assumes that table and query function already exist in the
# database - confirm.
# NOTE(review): chunk_size here is presumably the store's insert batch size,
# unrelated to the text-splitting chunk_size used when loading PDFs - confirm.
vector_store = SupabaseVectorStore.from_documents(
    all_documents,
    embeddings,
    client=supabase,
    table_name="documents",
    query_name="match_documents",
    chunk_size=1000,
)