In [4]:
import os
import numpy as np
from PIL import Image
import faiss
from sentence_transformers import SentenceTransformer
from transformers import CLIPProcessor, CLIPModel
import torch
import fitz  # PyMuPDF for PDF text extraction
from tqdm import tqdm  # Progress bar

# === Paths ===
BASE_DIR = r"C:\22ad053\Navigate Labs\Extracted"
TEXT_FOLDER = os.path.join(BASE_DIR, "extracted_texts")
IMAGE_FOLDER = os.path.join(BASE_DIR, "extracted_images")
DOC_INDEX_PATH = os.path.join(BASE_DIR, "documentss_index.faiss")
DOC_META_PATH = os.path.join(BASE_DIR, "metadatass.npy")
IMG_INDEX_PATH = os.path.join(BASE_DIR, "image_indexss.faiss")
IMG_META_PATH = os.path.join(BASE_DIR, "image_metadatass.npy")
IMAGE_TEXT_MAPPING_PATH = os.path.join(BASE_DIR, "image_text_mapss.npy")

os.makedirs(BASE_DIR, exist_ok=True)

# === Load Embedding Models ===
text_model = SentenceTransformer("all-MiniLM-L6-v2")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# === Helper Functions ===
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join(page.get_text() for page in doc)
    return text

def get_text_embeddings(texts):
    return text_model.encode(texts, convert_to_numpy=True)

def get_clip_text_embedding(text):
    inputs = clip_processor(text=[text], return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = clip_model.get_text_features(**inputs)
    return outputs.cpu().numpy()

def get_image_embeddings(image_paths):
    embeddings = []
    for path in tqdm(image_paths, desc="🔢 Embedding images"):
        image = Image.open(path).convert("RGB")
        inputs = clip_processor(images=image, return_tensors="pt")
        with torch.no_grad():
            outputs = clip_model.get_image_features(**inputs)
        embeddings.append(outputs[0].cpu().numpy())
    return np.array(embeddings)

# === Create Text Index ===
texts = []
doc_file_names = []

for f in tqdm(os.listdir(TEXT_FOLDER), desc="📄 Processing text files"):
    path = os.path.join(TEXT_FOLDER, f)
    try:
        if f.endswith(".txt"):
            with open(path, encoding="utf-8") as file:
                texts.append(file.read())
                doc_file_names.append(f)
        elif f.endswith(".pdf"):
            texts.append(extract_text_from_pdf(path))
            doc_file_names.append(f)
    except:
        continue

if texts:
    print("📦 Embedding and indexing documents...")
    text_embeddings = get_text_embeddings(texts)
    doc_index = faiss.IndexFlatL2(text_embeddings.shape[1])
    doc_index.add(text_embeddings)
    faiss.write_index(doc_index, DOC_INDEX_PATH)
    np.save(DOC_META_PATH, doc_file_names)

# === Create Image Index ===
image_paths = []
for folder in tqdm(os.listdir(IMAGE_FOLDER), desc="🖼️ Scanning image folders"):
    folder_path = os.path.join(IMAGE_FOLDER, folder)
    if os.path.isdir(folder_path):
        for file in os.listdir(folder_path):
            if file.lower().endswith((".jpg", ".png", ".jpeg")):
                image_paths.append(os.path.join(folder_path, file))

if image_paths:
    image_embeddings = get_image_embeddings(image_paths)
    image_index = faiss.IndexFlatL2(image_embeddings.shape[1])
    image_index.add(image_embeddings)
    faiss.write_index(image_index, IMG_INDEX_PATH)
    np.save(IMG_META_PATH, image_paths)

# === Create Image-to-Text Mapping ===
img_text_map = {}
if image_paths and texts:
    print("🔗 Creating image-to-text mappings...")
    for img_path, img_embed in tqdm(zip(image_paths, image_embeddings), total=len(image_paths), desc="🔍 Matching images to paragraphs"):
        max_sim = -1
        best_para = ""
        for doc_text in texts:
            for para in doc_text.split("\n\n"):
                para = para.strip()
                if not para:
                    continue
                para_embed = get_clip_text_embedding(para)[0]
                sim = np.dot(img_embed, para_embed) / (np.linalg.norm(img_embed) * np.linalg.norm(para_embed))
                if sim > max_sim:
                    max_sim = sim
                    best_para = para
        img_key = os.path.join(os.path.basename(os.path.dirname(img_path)), os.path.basename(img_path))
        img_text_map[img_key] = best_para
    np.save(IMAGE_TEXT_MAPPING_PATH, img_text_map)

print("\n✅ All indices created and saved locally.")


📄 Processing text files: 100%|██████████| 12/12 [00:00<00:00, 3701.95it/s]


📦 Embedding and indexing documents...


🖼️ Scanning image folders: 100%|██████████| 12/12 [00:00<00:00, 3908.95it/s]
🔢 Embedding images: 100%|██████████| 438/438 [00:46<00:00,  9.45it/s]


🔗 Creating image-to-text mappings...


🔍 Matching images to paragraphs: 100%|██████████| 438/438 [3:44:51<00:00, 30.80s/it]   


✅ All indices created and saved locally.



