### Load document into index

In [16]:
import sys
import os
from dotenv import load_dotenv

# Load environment variables from the project's .env file.
load_dotenv("../src/utils/.env")

# Make the modules under ../src importable. The membership check keeps
# sys.path from accumulating duplicate entries each time this cell is re-run.
SRC_DIR = os.path.abspath("../src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from pipeline.loader import load_pdf_chunks
from pipeline.indexer import index_documents

# PDF to ingest, relative to the notebook's working directory.
pdf_path = "../sample_data/Certificado_Senen Fernandez_UPM.pdf"

# Load the PDF through the project's loader, then hand the result to the
# indexer, targeting the "knowledge_base" collection persisted at ../src/db.
chunks = load_pdf_chunks(pdf_path)
index_documents(
    chunks,
    persist_directory="../src/db",
    collection_name="knowledge_base",
)

✅ Indexed 1 documents into 'knowledge_base'


### Load image document into index

In [17]:
# 📦 Install required packages (only run once)
!pip install openai pillow python-dotenv

# 🧠 Import libraries
import os
import sys
import re
import base64
import openai
from PIL import Image
from typing import List
from dotenv import load_dotenv
from langchain.schema import Document

# 🌍 Load environment variables (expects OPENAI_API_KEY in .env)
load_dotenv("../src/utils/.env")
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Surface a missing key here rather than as an opaque error on the first
    # captioning request; deliberately a warning, not an exception.
    print("⚠️ OPENAI_API_KEY is not set; image captioning requests are likely to fail.")

# 📂 Load helper functions from your pipeline
# Guard the append so re-running this cell does not keep growing sys.path.
SRC_DIR = os.path.abspath("../src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)
from pipeline.indexer import index_documents

# 🧠 Generate a caption with an OpenAI vision-capable chat model
def caption_image_with_gpt4v(image_path: str) -> str:
    """Return a short description of the image stored at ``image_path``.

    The file is read, base64-encoded and sent inline as a ``data:`` URL to
    ``openai.chat.completions.create`` together with a one-line instruction
    asking for a one- or two-sentence description.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        The text of the first choice with surrounding whitespace stripped,
        or an empty string when that choice carries no text content.

    Raises:
        OSError: If the file cannot be opened or read.
        Exception: Anything raised by the OpenAI client is propagated as-is.
    """
    import mimetypes  # local import: only this function needs it

    # Declare the real media type in the data URL. Callers also pass .png
    # files, so a fixed "image/jpeg" label would mislabel them.
    mime_type, _ = mimetypes.guess_type(image_path)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"  # previous hard-coded value, kept as fallback

    with open(image_path, "rb") as img_file:
        img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

    response = openai.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that describes images accurately."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image in one or two sentences."},
                    {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{img_base64}"}}
                ]
            }
        ],
        max_tokens=100
    )

    # A missing content field becomes "" so callers can treat it as an empty
    # caption instead of hitting an AttributeError on None.
    content = response.choices[0].message.content
    return content.strip() if content else ""

# 🧾 Extract page number from filename
def extract_page_number(filename: str) -> int:
    """Parse the page index embedded in a filename.

    Looks for the first occurrence of the lowercase token ``page`` followed
    immediately by one or more digits (e.g. ``image_page1_1.jpeg`` -> 1).

    Args:
        filename: File name (or any string) to search.

    Returns:
        The digits after the first ``page`` token as an int, or -1 when the
        pattern does not occur.
    """
    found = re.search(r'page(\d+)', filename)
    if found is None:
        return -1
    return int(found.group(1))

# 🗂️ Process images in a folder (or single image)
def process_images(folder_path: str, specific_image: str = None) -> List[Document]:
    """Caption images and wrap each caption in a ``Document``.

    Args:
        folder_path: Directory that contains the image files.
        specific_image: Optional file name inside ``folder_path``. When given,
            only that image is processed; otherwise every ``.jpg``, ``.jpeg``
            or ``.png`` file in the folder is processed, in sorted order.

    Returns:
        One ``Document`` per image that produced a non-empty caption. The
        caption is the page content; the metadata holds ``page_number``,
        ``image_file``, ``source``, ``type`` ("image") and ``path``. Images
        whose captioning raised or returned only whitespace are skipped after
        printing a warning.
    """
    marker = "_extracted_images"

    if specific_image:
        image_files = [os.path.join(folder_path, specific_image)]
    else:
        # os.listdir returns entries in arbitrary order; sort so repeated runs
        # process (and index) the images in the same sequence.
        image_files = sorted(
            os.path.join(folder_path, f)
            for f in os.listdir(folder_path)
            if f.lower().endswith((".jpg", ".jpeg", ".png"))
        )

    # In this notebook's usage the marker is part of the *folder* name
    # (e.g. "pdf_with_images_extracted_images"), not of the image file name,
    # so the folder is what identifies the originating PDF.
    # NOTE(review): assumes the PDF is named "<folder prefix>.pdf" — confirm
    # against whatever extracts the images.
    folder_name = os.path.basename(os.path.normpath(folder_path))

    documents = []
    for image_path in image_files:
        filename = os.path.basename(image_path)
        page = extract_page_number(filename)
        print(f"📄 Processing: {filename} (page {page})")

        # Best effort: one failing image must not abort the whole batch.
        try:
            caption = caption_image_with_gpt4v(image_path)
        except Exception as e:
            print(f"⚠️ Error processing {filename}: {e}")
            continue

        if not caption.strip():
            print(f"⚠️ Empty caption for {filename}. Skipping.")
            continue
        print(caption)

        # Resolve the PDF name: prefer a marker in the file name (previous
        # behaviour), then the folder name, and finally the bare file stem.
        stem = os.path.splitext(filename)[0]
        if marker in stem:
            source_stem = stem.split(marker)[0]
        elif marker in folder_name:
            source_stem = folder_name.split(marker)[0]
        else:
            source_stem = stem

        doc = Document(
            page_content=caption,
            metadata={
                "page_number": page,
                "image_file": filename,
                "source": source_stem + ".pdf",  # match PDF source
                "type": "image",
                "path": image_path  # helpful if showing image
            }
        )
        documents.append(doc)

    return documents

# ✅ Index image documents into ChromaDB
def index_image_documents(
    folder_path: str,
    specific_image: str = None,
    collection_name: str = "knowledge_base",
    persist_directory: str = "../src/db",
):
    """Caption the images in ``folder_path`` and index the resulting documents.

    Args:
        folder_path: Directory that contains the image files.
        specific_image: Optional single file name inside ``folder_path``; when
            omitted, the whole folder is processed.
        collection_name: Name of the target collection passed to
            ``index_documents``.
        persist_directory: Storage location passed to ``index_documents``.
            Defaults to the path that was previously hard-coded, so existing
            calls behave as before.

    Returns:
        None. Progress and the final count are printed.
    """
    image_docs = process_images(folder_path, specific_image)

    # Defensive filter: never hand empty page content to the indexer.
    valid_docs = [doc for doc in image_docs if doc.page_content and doc.page_content.strip()]
    if not valid_docs:
        print("⚠️ No valid documents to index.")
        return

    index_documents(valid_docs, persist_directory=persist_directory, collection_name=collection_name)
    print(f"✅ Indexed {len(valid_docs)} image documents into '{collection_name}' collection.")




In [18]:

# 🔍 Example usage
# Ingest every image in the folder:
#   index_image_documents("../sample_data/pdf_with_images_extracted_images")

# Ingest one specific image from the folder:
index_image_documents(
    "../sample_data/pdf_with_images_extracted_images",
    specific_image="image_page1_1.jpeg",
)


📄 Processing: image_page1_1.jpeg (page 1)
The image shows a soldier in camouflage uniform and helmet preparing to launch a small gray drone with highlighted internal components, standing in an open field with tall grass and a cloudy sky in the background.
✅ Indexed 1 documents into 'knowledge_base'
✅ Indexed 1 image documents into 'knowledge_base' collection.
