In [1]:
import getpass
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

True

In [2]:
import glob
from supabase.client import Client, create_client
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.tools.tavily_search import TavilySearchResults
from langgraph.graph import END, StateGraph, START
from typing import List, Dict, Any
from typing_extensions import TypedDict

In [3]:
# initiate supabase db
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

In [6]:
import os
from typing import List
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import tempfile

def extract_text_with_ocr(pdf_path: str) -> List[Document]:
    """
    Extract text from PDF using OCR when regular extraction fails.
    """
    documents = []
    try:
        # Convert PDF pages to images
        images = convert_from_path(pdf_path)
        
        for i, image in enumerate(images):
            # Perform OCR on each image
            text = pytesseract.image_to_string(image, lang='eng')
            
            # Create a Document for each page
            doc = Document(
                page_content=text,
                metadata={
                    "source": pdf_path,
                    "page": i + 1,
                    "extraction_method": "ocr"
                }
            )
            documents.append(doc)
            
    except Exception as e:
        print(f"OCR extraction failed for {pdf_path}: {str(e)}")
        return []
        
    return documents

def load_and_split_pdfs(pdf_directory: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[Document]:
    """
    Load and split PDFs from a directory with enhanced metadata, error handling, and OCR fallback.
    """
    all_docs = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        length_function=len
    )
    
    pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith('.pdf')]
    
    for pdf in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf)
        try:
            print(f"Processing PDF: {pdf_path}")
            # Try regular PDF extraction first
            loader = PyPDFLoader(pdf_path)
            docs = loader.load()
            
            if not docs or not any(doc.page_content.strip() for doc in docs):
                print(f"Regular extraction yielded no text for {pdf_path}, attempting OCR...")
                docs = extract_text_with_ocr(pdf_path)
            
            # Enhance metadata
            for doc in docs:
                doc.metadata.update({
                    "source": pdf_path,
                    "document_type": "pdf",
                    "chunk_size": chunk_size,
                    "chunk_overlap": chunk_overlap
                })
            
            split_docs = text_splitter.split_documents(docs)
            all_docs.extend(split_docs)
            print(f"Successfully processed {len(split_docs)} chunks from {pdf_path}")
            
        except Exception as e:
            print(f"Initial extraction failed for {pdf_path}, attempting OCR: {str(e)}")
            try:
                # Fallback to OCR
                docs = extract_text_with_ocr(pdf_path)
                if docs:
                    # Enhance metadata for OCR documents
                    for doc in docs:
                        doc.metadata.update({
                            "source": pdf_path,
                            "document_type": "pdf",
                            "chunk_size": chunk_size,
                            "chunk_overlap": chunk_overlap,
                            "extraction_method": "ocr"
                        })
                    
                    split_docs = text_splitter.split_documents(docs)
                    all_docs.extend(split_docs)
                    print(f"Successfully processed {len(split_docs)} chunks from {pdf_path} using OCR")
                else:
                    print(f"Both regular extraction and OCR failed for {pdf_path}")
            except Exception as ocr_error:
                print(f"OCR processing failed for {pdf_path}: {str(ocr_error)}")
    
    return all_docs

In [8]:
# Example call to the load_and_split_pdfs function
pdf_directory = "/Users/shivanshmahajan/Desktop/Cyber/the_content"  # Replace with your actual PDF directory path
chunk_size = 500  # You can adjust the chunk size as needed
chunk_overlap = 50  # You can adjust the chunk overlap as needed

# Call the function
all_documents = load_and_split_pdfs(pdf_directory, chunk_size, chunk_overlap)

# Optionally, print the number of documents loaded
print(f"Total documents loaded: {len(all_documents)}")

Processing PDF: /Users/shivanshmahajan/Desktop/Cyber/the_content/Red Team - How to Succeed by Thinking Like the Enemy by Micah Zenko.pdf
Successfully processed 1781 chunks from /Users/shivanshmahajan/Desktop/Cyber/the_content/Red Team - How to Succeed by Thinking Like the Enemy by Micah Zenko.pdf
Processing PDF: /Users/shivanshmahajan/Desktop/Cyber/the_content/defensive-security-handbook-best-practices-for-securing-infrastructure-1nbsped-1491960388-9781491960387_compress.pdf
Successfully processed 1295 chunks from /Users/shivanshmahajan/Desktop/Cyber/the_content/defensive-security-handbook-best-practices-for-securing-infrastructure-1nbsped-1491960388-9781491960387_compress.pdf
Processing PDF: /Users/shivanshmahajan/Desktop/Cyber/the_content/Incident Response and Computer Forensics 2nd ed. - C. Prosise, K. Mandia (2003) WW.pdf
Successfully processed 2566 chunks from /Users/shivanshmahajan/Desktop/Cyber/the_content/Incident Response and Computer Forensics 2nd ed. - C. Prosise, K. Mandia 

In [21]:
all_documents

[Document(metadata={'producer': 'calibre (2.41.0) [http://calibre-ebook.com]', 'creator': 'calibre (2.41.0) [http://calibre-ebook.com]', 'creationdate': '2020-05-02T14:09:58+00:00', 'author': 'Micah Zenko', 'moddate': '2020-05-02T16:09:59+02:00', 'title': 'Red Team: How to Succeed By Thinking Like the Enemy', 'source': '/Users/shivanshmahajan/Desktop/Cyber/the_content/Red Team - How to Succeed by Thinking Like the Enemy by Micah Zenko.pdf', 'total_pages': 378, 'page': 1, 'page_label': '2', 'document_type': 'pdf', 'chunk_size': 500, 'chunk_overlap': 50}, page_content='RED\nTEAM'),
 Document(metadata={'producer': 'calibre (2.41.0) [http://calibre-ebook.com]', 'creator': 'calibre (2.41.0) [http://calibre-ebook.com]', 'creationdate': '2020-05-02T14:09:58+00:00', 'author': 'Micah Zenko', 'moddate': '2020-05-02T16:09:59+02:00', 'title': 'Red Team: How to Succeed By Thinking Like the Enemy', 'source': '/Users/shivanshmahajan/Desktop/Cyber/the_content/Red Team - How to Succeed by Thinking Like

In [22]:
import json

# Assuming all_documents is a list of Document objects
# Convert Document objects to a serializable format
documents_to_save = [
    {
        "content": doc.page_content,
        "metadata": doc.metadata
    }
    for doc in all_documents
]

# Save to a JSON file
with open('all_documents.json', 'w') as json_file:
    json.dump(documents_to_save, json_file, indent=4)

print("Documents saved to all_documents.json")

Documents saved to all_documents.json


In [16]:
# initiate embeddings model
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")


In [23]:
from langchain_community.vectorstores import SupabaseVectorStore
vector_store = SupabaseVectorStore.from_documents(
    all_documents,
    embeddings,
    client=supabase,
    table_name="documents",
    chunk_size=1000
)

KeyboardInterrupt: 

In [19]:
data_to_insert = [
    {
        "content": doc.page_content,
        "embedding": embeddings.embed_documents([doc.page_content])[0],  # Correct method to generate embedding
        "metadata": doc.metadata
    }
    for doc in all_documents
]

# Insert data into Supabase
for data in data_to_insert:
    response = supabase.table("documents").insert(data).execute()
    if response.status_code != 201:
        print(f"Error inserting document: {response.data}")
    else:
        print(f"Inserted document with content: {data['content'][:30]}...")  # Print first 30 characters

KeyboardInterrupt: 