In [None]:
import os
import glob
import tempfile
import ollama
import shutil
import subprocess
import pytesseract
import faiss
import numpy as np
import pickle
from tqdm import tqdm
from pdf2image import convert_from_path
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from concurrent.futures import ThreadPoolExecutor

class DocumentProcessor:
    """OCR PDFs and images and package the recognised text as document dicts.

    Every document produced here is a dict with the keys ``"id"``,
    ``"text"`` and ``"metadata"``.
    """

    def __init__(self, poppler_path=None, dpi=200):
        """Store the PDF rendering settings.

        Args:
            poppler_path: Forwarded to ``convert_from_path`` as ``poppler_path``.
            dpi: Forwarded to ``convert_from_path`` as ``dpi``.
        """
        self.poppler_path = poppler_path
        self.dpi = dpi

    def extract_text_from_pdf(self, pdf_path):
        """Render a PDF to images and OCR each one.

        Args:
            pdf_path: Path of the PDF file to render.

        Returns:
            A list with one OCR string per rendered image, in the order
            ``convert_from_path`` returned the images.
        """
        images = convert_from_path(pdf_path, dpi=self.dpi, poppler_path=self.poppler_path)
        with ThreadPoolExecutor() as executor:
            # executor.map yields results in input order, so texts[i]
            # belongs to images[i].
            texts = list(executor.map(pytesseract.image_to_string, images))
        return texts

    def extract_text_from_image(self, image_path):
        """OCR a single image file and return the recognised text.

        Args:
            image_path: Path of the image file to read.
        """
        # Image.open leaves the file open until the image is closed; the
        # context manager releases the handle once OCR has finished.
        with Image.open(image_path) as image:
            return pytesseract.image_to_string(image)

    def clean_text(self, text):
        """Split ``text`` on blank lines and drop very short fragments.

        Returns:
            The stripped paragraphs whose stripped length exceeds 5 characters.
        """
        return [p.strip() for p in text.split('\n\n') if len(p.strip()) > 5]

    def create_documents_pdf(self, text_pages, pdf_path):
        """Build one document per cleaned paragraph of every page.

        Args:
            text_pages: Iterable of page texts, e.g. the result of
                ``extract_text_from_pdf``.
            pdf_path: Path of the source PDF; only its base name is used,
                as part of each document id.

        Returns:
            A list of document dicts with ids of the form
            ``pdf_<basename>_<page index>_<paragraph index>`` (both indices
            are zero-based) and ``{"source": "pdf"}`` as metadata.
        """
        documents = []
        for page_num, page_text in enumerate(text_pages):
            for para_num, para in enumerate(self.clean_text(page_text)):
                doc_id = f"pdf_{os.path.basename(pdf_path)}_{page_num}_{para_num}"
                documents.append({
                    "id": doc_id,
                    "text": para,
                    "metadata": {"source": "pdf"}
                })
        return documents

    def create_documents_image(self, text, image_path):
        """Build a single document holding all the text found in an image.

        Unlike ``clean_text`` no minimum length is applied: every non-blank
        paragraph is kept, and the paragraphs are joined with single spaces.

        Args:
            text: OCR output for the image.
            image_path: Path of the source image; only its base name is
                used, as part of the document id.

        Returns:
            A one-element list containing the document dict. Its ``"text"``
            is an empty string when ``text`` has no non-blank paragraph.
        """
        cleaned_paragraph = ' '.join([p.strip() for p in text.split('\n\n') if p.strip()])
        doc_id = f"image_{os.path.basename(image_path)}"
        return [{
            "id": doc_id,
            "text": cleaned_paragraph,
            "metadata": {"source": "figure"}
        }]


class KeywordGenerator:
    """Ask an Ollama-served model for a short keyword list describing a text."""

    def __init__(self, model_name="mistral:7b-instruct-v0.2-q5_K_M"):
        """Remember which model ``generate_keywords`` should query."""
        self.model_name = model_name

    def generate_keywords(self, text):
        """Return the keywords the model proposes for ``text``.

        The model is asked for a comma-separated answer. The reply is split
        on commas, each piece is stripped, and pieces of two characters or
        fewer are discarded.
        """
        instruction = "Summarize the following paragraph into 3 keywords separated by commas: "
        reply = ollama.generate(
            model=self.model_name,
            prompt=instruction + text,
            options={"temperature": 0.1},
        )
        keywords = []
        for piece in reply["response"].split(","):
            candidate = piece.strip()
            if len(candidate) > 2:
                keywords.append(candidate)
        return keywords


class EmbeddingManager:
    """TF-IDF vectorizer plus a FAISS L2 index, persisted under ``db_path``.

    Row ``i`` of the index is the embedding of ``self.documents[i]``, so a
    position returned by ``search`` can be used directly as an index into
    ``self.documents``.

    Three files are kept inside ``db_path``: ``docs.pkl``, ``index.faiss``
    and ``vectorizer.pkl``.
    """

    def __init__(self, db_path="rag_db"):
        """Create the vectorizer and load any state stored in ``db_path``.

        Args:
            db_path: Directory holding the persisted documents, index and
                vectorizer. It is created if it does not exist.
        """
        self.vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
        self.index = None
        # Not assigned anywhere else in this class; kept so the attribute
        # still exists for code that may read it.
        self.embeddings = None
        self.db_path = db_path
        self.documents = []
        self._load_database()

    def _load_database(self):
        """Load documents, index and vectorizer, repairing inconsistent state.

        When documents exist but the vectorizer file is missing, the index
        file is missing, or the index holds a different number of rows than
        there are documents, the vectorizer is refitted and the index is
        rebuilt from the stored documents, then everything is saved again.
        """
        vectorizer_path = os.path.join(self.db_path, "vectorizer.pkl")
        index_path = os.path.join(self.db_path, "index.faiss")
        docs_path = os.path.join(self.db_path, "docs.pkl")

        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file, so db_path must only ever point at a trusted directory.
        if os.path.exists(docs_path):
            with open(docs_path, "rb") as f:
                self.documents = pickle.load(f)

        if os.path.exists(index_path):
            self.index = faiss.read_index(index_path)

        vectorizer_loaded = os.path.exists(vectorizer_path)
        if vectorizer_loaded:
            with open(vectorizer_path, "rb") as f:
                self.vectorizer = pickle.load(f)

        os.makedirs(self.db_path, exist_ok=True)

        if not self.documents:
            # Without documents no index row can be mapped back to a text,
            # so a leftover index is ignored.
            self.index = None
            if not vectorizer_loaded:
                with open(vectorizer_path, "wb") as f:
                    pickle.dump(self.vectorizer, f)
            return

        index_matches = self.index is not None and self.index.ntotal == len(self.documents)
        if not (vectorizer_loaded and index_matches):
            # A refitted vectorizer produces a different feature space, so the
            # old index cannot be reused: rebuild both from the documents.
            self._rebuild_index(self.documents)
            self._save_database()

    def _save_database(self):
        """Write the documents, the index (if any) and the vectorizer to disk."""
        docs_path = os.path.join(self.db_path, "docs.pkl")
        index_path = os.path.join(self.db_path, "index.faiss")
        vectorizer_path = os.path.join(self.db_path, "vectorizer.pkl")

        # The directory may have been removed since the manager was created.
        os.makedirs(self.db_path, exist_ok=True)

        with open(docs_path, "wb") as f:
            pickle.dump(self.documents, f)

        if self.index is not None:
            faiss.write_index(self.index, index_path)

        with open(vectorizer_path, "wb") as f:
            pickle.dump(self.vectorizer, f)

    def _rebuild_index(self, documents):
        """Refit the vectorizer on ``documents`` and index one row per document."""
        texts = [doc["text"] for doc in documents]
        matrix = self.vectorizer.fit_transform(texts).toarray().astype("float32")
        fresh_index = faiss.IndexFlatL2(matrix.shape[1])
        fresh_index.add(matrix)
        # Assigned only after everything succeeded, so a failure above leaves
        # the previous index in place.
        self.index = fresh_index

    def create_embeddings(self, documents):
        """Fit the vectorizer and build a new index that includes ``documents``.

        The vectorizer is fitted on the already stored documents together
        with the new ones, which keeps index rows aligned with positions in
        ``self.documents``. An empty ``documents`` is a no-op.

        Args:
            documents: Iterable of document dicts with a ``"text"`` key.

        Raises:
            Whatever ``fit_transform`` raises for unusable input (for
            example when no term survives stop-word removal).
        """
        documents = list(documents)
        if not documents:
            return
        self._rebuild_index([*self.documents, *documents])
        # Extend in place: other objects may hold a reference to this list.
        self.documents.extend(documents)
        self._save_database()

    def add_documents(self, new_documents):
        """Embed ``new_documents`` with the fitted vectorizer and append them.

        Falls back to ``create_embeddings`` when no index exists yet. An
        empty ``new_documents`` is a no-op.

        Args:
            new_documents: Iterable of document dicts with a ``"text"`` key.
        """
        new_documents = list(new_documents)
        if not new_documents:
            return
        if self.index is None:
            self.create_embeddings(new_documents)
            return
        new_texts = [doc["text"] for doc in new_documents]
        new_embeddings = self.vectorizer.transform(new_texts).toarray()
        self.index.add(new_embeddings.astype("float32"))
        self.documents.extend(new_documents)
        self._save_database()

    def search(self, query, top_k=1):
        """Return the index positions of the entries closest to ``query``.

        Args:
            query: Text to embed and look up.
            top_k: Maximum number of positions to return.

        Returns:
            A one-dimensional array of positions. It is empty when nothing
            has been indexed or ``top_k`` is not positive, and never longer
            than the number of indexed entries.
        """
        if self.index is None or self.index.ntotal == 0:
            return np.empty(0, dtype=np.int64)
        # FAISS pads with -1 when asked for more neighbours than it holds;
        # clamping avoids handing those placeholders to callers.
        k = min(top_k, self.index.ntotal)
        if k <= 0:
            return np.empty(0, dtype=np.int64)
        query_embedding = self.vectorizer.transform([query]).toarray().astype("float32")
        _, indices = self.index.search(query_embedding, k)
        return indices[0]


class RAGPipeline:
    """Ingest PDFs and images, index their text and answer questions from it."""

    def __init__(self, poppler_path=None):
        """Wire up the processor, keyword generator and embedding store.

        Args:
            poppler_path: Directory containing the poppler binaries, passed
                to ``DocumentProcessor``. When ``None``, ``pdfimages`` is
                looked up on ``PATH`` for embedded-image extraction.
        """
        self.processor = DocumentProcessor(poppler_path=poppler_path)
        self.keyword_gen = KeywordGenerator()
        self.embedding_manager = EmbeddingManager()
        # Same list object as the manager's store, so documents added
        # through the manager are visible here as well.
        self.documents = self.embedding_manager.documents

    def process_pdf(self, pdf_path):
        """Index the page text of a PDF, then the text of its embedded images."""
        text_pages = self.processor.extract_text_from_pdf(pdf_path)
        pdf_docs = self.processor.create_documents_pdf(text_pages, pdf_path)
        self._add_documents(pdf_docs)
        self._process_embedded_images(pdf_path)

    def _process_embedded_images(self, pdf_path):
        """Extract images from a PDF with ``pdfimages`` and index each one.

        This step is best effort: a missing or failing ``pdfimages`` and
        errors on individual images are reported with ``print`` and do not
        abort the surrounding PDF processing.
        """
        poppler_dir = self.processor.poppler_path
        # os.path.join rejects None, so fall back to the bare executable name
        # and let the operating system resolve it through PATH.
        pdfimages_path = os.path.join(poppler_dir, "pdfimages") if poppler_dir else "pdfimages"

        temp_dir = tempfile.mkdtemp()
        try:
            output_prefix = os.path.join(temp_dir, "image")
            args = [pdfimages_path, "-all", pdf_path, output_prefix]
            try:
                result = subprocess.run(args, capture_output=True, text=True)
            except OSError as e:
                # Raised when the executable cannot be started at all.
                print(f"Could not run pdfimages for {pdf_path}: {e}")
                return
            if result.returncode != 0:
                print(f"pdfimages failed for {pdf_path}: {result.stderr.strip()}")

            # Sorted so the images are processed in a reproducible order.
            image_files = sorted(glob.glob(os.path.join(temp_dir, "image-*")))
            for img_path in image_files:
                try:
                    self.process_image(img_path)
                except Exception as e:
                    print(f"Error processing embedded image {img_path}: {str(e)}")
        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

    def process_image(self, image_path):
        """OCR one image file and index the resulting document."""
        text = self.processor.extract_text_from_image(image_path)
        image_docs = self.processor.create_documents_image(text, image_path)
        self._add_documents(image_docs)

    def _add_documents(self, new_docs):
        """Attach keywords to ``new_docs`` and store them in the index.

        Documents whose text is empty or whitespace-only are skipped: they
        carry nothing to retrieve. Nothing happens when no document remains.
        """
        docs = [doc for doc in new_docs if doc["text"].strip()]
        if not docs:
            return
        for doc in tqdm(docs, desc="Processing documents"):
            doc["metadata"]["keywords"] = self.keyword_gen.generate_keywords(doc["text"])
        # Explicit None check: the truthiness of an index object says nothing
        # about whether an index has been created.
        if self.embedding_manager.index is None:
            self.embedding_manager.create_embeddings(docs)
        else:
            self.embedding_manager.add_documents(docs)

    def process_directory(self, directory_path):
        """Process every ``*.pdf``, ``*.png`` and ``*.jpg`` directly inside a directory."""
        pdf_files = glob.glob(os.path.join(directory_path, "*.pdf"))
        image_files = glob.glob(os.path.join(directory_path, "*.png")) + glob.glob(os.path.join(directory_path, "*.jpg"))
        for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
            self.process_pdf(pdf_path)
        for img_path in tqdm(image_files, desc="Processing images"):
            self.process_image(img_path)

    def search(self, query, top_k=1):
        """Return up to ``top_k`` stored documents closest to ``query``.

        Returns:
            A list of dicts with the keys ``"document"`` (the text) and
            ``"metadata"``. The list is empty when nothing has been indexed.
        """
        if self.embedding_manager.index is None:
            return []
        indices = self.embedding_manager.search(query, top_k)
        # Positions outside the document list (such as -1) are dropped.
        return [{
            "document": self.documents[idx]["text"],
            "metadata": self.documents[idx]["metadata"]
        } for idx in indices if 0 <= idx < len(self.documents)]

    def generate_answer(self, query, use_context=True):
        """Answer ``query`` with the chat model, supplying retrieved context.

        Args:
            query: The user's question.
            use_context: Selects the instruction wording. With ``True`` the
                model is told to use only the retrieved context; with
                ``False`` it is told to combine its own knowledge with it.
                The retrieved context is included in the prompt either way.

        Returns:
            The content of the model's reply message.
        """
        context_docs = self.search(query)
        context = " ".join([doc["document"] for doc in context_docs])
        prompt = f"Answer using {'ONLY the following context' if use_context else 'your knowledge and this additional context'}:\n{context}\n\nQuestion: {query}"
        response = ollama.chat(
            # Reuse the configured model name rather than repeating the literal.
            model=self.keyword_gen.model_name,
            messages=[{"role": "user", "content": prompt}],
            stream=False
        )
        return response["message"]["content"]


if __name__ == "__main__":
    # Adjust these paths accordingly.
    # Raw strings keep backslashes literally, so each path separator is
    # written once; doubling it would put two separators into the path.
    poppler_path = r"C:\Users\Dr.Wael Abouelwafa\Downloads\poppler-24.08.0\Library\bin"
    rag = RAGPipeline(poppler_path=poppler_path)

    # Process every supported file in the data directory.
    rag.process_directory(r"C:\Users\Dr.Wael Abouelwafa\Desktop\graduation project 2025\all data in one file")
    
 