In [None]:
# Requirements:
!pip install langchain unstructured[all-docs] pydantic lxml openai chromadb tiktoken pytesseract langchain_google_genai
!pip install langchain-huggingface transformers torch
!pip install -U langchain-community
!pip install pytesseract

!apt-get install -y poppler-utils
!apt-get install -y tesseract-ocr

!pip install google-generativeai

!pip install langchain-google-community[drive]
!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib


**AuthenTications**

In [2]:
import os
import io
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from unstructured.partition.auto import partition
import traceback # Import traceback for more detailed error info (optional)




# --------- SETTINGS ---------
SERVICE_ACCOUNT_FILE = "/content/steady-citron-457407-q7-5c09ed11e0d4.json"



# --------- AUTH ---------
try:
    creds = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE)
    drive_service = build('drive', 'v3', credentials=creds)
    print("Google Drive authentication successful.")
except Exception as e:
    print(f"Error during Google Drive authentication: {e}")
    print("Please check your service account file path and ensure it's valid.")
    # In a real application, you might want to exit or handle this differently
    # For this script, we'll let it continue but no files will be loaded.
    drive_service = None

Error during Google Drive authentication: [Errno 2] No such file or directory: '/content/steady-citron-457407-q7-5c09ed11e0d4.json'
Please check your service account file path and ensure it's valid.


**Load and Partition the Googe Drive Documentss**

In [3]:
def load_and_partition_drive_documents(drive_id, download_dir, figures_base_dir):
    """
    Loads and partitions documents from a Google Drive folder or file.

    Args:
        drive_id (str): The ID of the Google Drive folder or file.
        download_dir (str): Directory to temporarily download files.
        figures_base_dir (str): Base directory to save extracted figures/images.

    Returns:
        List[Dict]: A list of dictionaries, each containing filename and extracted elements.
    """
    processed_docs = []

    try:
        # Retrieve metadata to determine if the ID is a folder or file
        file_metadata = drive_service.files().get(fileId=drive_id, fields="id, name, mimeType").execute()
        mime_type = file_metadata.get("mimeType")
        name = file_metadata.get("name")

        if mime_type == "application/vnd.google-apps.folder":
            # It's a folder; fetch all files within
            files = get_files_from_folder(drive_id)
        else:
            # It's a single file
            files = [file_metadata]

    except Exception as e:
        print(f"Error retrieving metadata for ID {drive_id}: {e}")
        return []

    if not files:
        print("No files to process.")
        return []

    for file in files:
        file_id = file["id"]
        name = file["name"]

        local_path = os.path.join(download_dir, name)
        output_figures_path = os.path.join(figures_base_dir, os.path.splitext(name)[0])
        os.makedirs(output_figures_path, exist_ok=True)

        print(f"\nAttempting to process: {name}")

        try:
            request = drive_service.files().get_media(fileId=file_id)

            # Download file to temporary directory
            with io.FileIO(local_path, "wb") as fh:
                downloader = MediaIoBaseDownload(fh, request)
                done = False
                while not done:
                    status = downloader.next_chunk()
                    _, done = status

            print(f"Downloaded: {name} to {local_path}")

            # Check if the file is a ZIP archive
            if name.lower().endswith('.zip'):
                # Extract ZIP file
                with zipfile.ZipFile(local_path, 'r') as zip_ref:
                    zip_ref.extractall(download_dir)
                os.remove(local_path)  # Remove ZIP file after extraction

                # Process each extracted file
                for root, _, filenames in os.walk(download_dir):
                    for filename in filenames:
                        extracted_path = os.path.join(root, filename)
                        try:
                            elements = partition(
                                filename=extracted_path,
                                strategy="hi_res",
                                languages=["eng"],
                                extract_image_block_to_payload=True,
                                extract_image_block_types=["Image", "Table"],
                                infer_table_structure=True,
                                image_output_dir_path=output_figures_path
                            )
                            processed_docs.append({
                                "filename": filename,
                                "elements": elements,
                                "extracted_figures_dir": output_figures_path
                            })
                            print(f"✅ Successfully loaded and partitioned: {filename} ({len(elements)} elements found)")
                        except Exception as e:
                            print(f"❌ Failed to partition {filename}: {e}")
                        finally:
                            os.remove(extracted_path)  # Clean up extracted file
            else:
                # Process non-ZIP file
                elements = partition(
                    filename=local_path,
                    strategy="hi_res",
                    languages=["eng"],
                    extract_image_block_to_payload=True,
                    extract_image_block_types=["Image", "Table"],
                    infer_table_structure=True,
                    image_output_dir_path=output_figures_path
                )
                processed_docs.append({
                    "filename": name,
                    "elements": elements,
                    "extracted_figures_dir": output_figures_path
                })
                print(f"✅ Successfully loaded and partitioned: {name} ({len(elements)} elements found)")

        except Exception as e:
            print(f"❌ Failed to process {name}: {e}")
        finally:
            if os.path.exists(local_path):
                os.remove(local_path)

    return processed_docs


**Save the Images in directory**

In [4]:
import os
import base64
from io import BytesIO
from PIL import Image as PILImage
from unstructured.documents.elements import Image as UnstructuredImage

def save_unstructured_images(documents, output_dir="saved_images"):
    """
    Decode and save each UnstructuredImage element's Base64 payload
    into the specified output directory as individual image files.
    """
    # 1. Create output directory (and any parents) if missing
    os.makedirs(output_dir, exist_ok=True)

    image_count = 0
    for doc in documents:
        for element in doc.get("elements", []):
            if isinstance(element, UnstructuredImage):
                # 2. Extract Base64 payload from metadata
                data = element.to_dict().get("metadata", {}).get("image_base64", "")
                if not data:
                    continue
                # 3. Strip off data URI header if present
                if "," in data:
                    data = data.split(",", 1)[1]
                # 4. Decode Base64 to bytes
                raw_bytes = base64.b64decode(data)
                # 5. Wrap in BytesIO buffer
                buffer = BytesIO(raw_bytes)
                buffer.seek(0)
                # 6. Load as PIL Image
                pil_img = PILImage.open(buffer)
                pil_img.load()  # optional: ensure full read

                # 7. Choose a filename and save
                filename = f"image_{image_count}.png"
                filepath = os.path.join(output_dir, filename)
                pil_img.save(filepath)

                print(f"Saved image #{image_count} to {filepath}")
                image_count += 1

# Example usage:



**Generate Captions of Images and Tables**

In [5]:
import base64
from io import BytesIO
from typing import Any, Dict, List, Tuple

import torch
from PIL import Image as PILImage
from transformers import BlipProcessor, BlipForConditionalGeneration
from unstructured.documents.elements import Image as UnstructuredImage, Table as UnstructuredTable

def generate_captions_from_memory(
    documents: List[Dict[str, Any]],
    model_name: str = "Salesforce/blip-image-captioning-base",
    device: str = None,
) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Iterate all Image and Table elements in `documents`, decode their in‑payload Base64,
    and generate captions via BLIP (in memory, no disk I/O).

    Args:
        documents: list of dicts, each with key "elements": List[Element]
        model_name: HF model to load
        device: 'cuda' or 'cpu' (auto‑chosen if None)

    Returns:
        (image_captions, table_captions):
            image_captions: Dict[element_id, caption]
            table_captions: Dict[element_id, caption]
    """
    # Device setup
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    # Load BLIP processor & model once
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)

    image_captions: Dict[str, str] = {}
    table_captions: Dict[str, str] = {}
    img_count = 0
    tbl_count = 0

    for doc in documents:
        for el in doc.get("elements", []):
            # Only process images or tables
            if isinstance(el, UnstructuredImage) or isinstance(el, UnstructuredTable):
                # 1) Extract Base64 from metadata
                md = el.to_dict().get("metadata", {})
                b64 = md.get("image_base64", "")
                if not b64:
                    continue
                if "," in b64:
                    b64 = b64.split(",", 1)[1]

                # 2) Decode & load PIL image
                raw = base64.b64decode(b64)
                buf = BytesIO(raw)
                buf.seek(0)
                img = PILImage.open(buf).convert("RGB")

                # 3) Generate caption
                inputs = processor(images=img, return_tensors="pt").to(device)
                out = model.generate(**inputs)
                cap = processor.decode(out[0], skip_special_tokens=True)

                # 4) Determine key
                elem_id = md.get("element_id") or getattr(el, "element_id", None) or el.id
                if not elem_id:
                    # fallback to type+count
                    if isinstance(el, UnstructuredImage):
                        elem_id = f"image_{img_count}"
                        img_count += 1
                    else:
                        elem_id = f"table_{tbl_count}"
                        tbl_count += 1

                # 5) Store in respective dict
                if isinstance(el, UnstructuredImage):
                    image_captions[elem_id] = cap
                else:
                    table_captions[elem_id] = cap

    return image_captions, table_captions

# -----------------------
# Example usage:

# docs = partition(..., extract_image_block_to_payload=True, extract_image_block_types=["Image","Table"])

# print("Images:", img_caps)
# print("Tables:", tbl_caps)


In [6]:
#documents = load_and_partition_drive_documents(FOLDER_ID, DOWNLOAD_TMP_DIR, EXTRACTED_FIGURES_BASE_DIR)

**restructure/Seperates the elements**

In [7]:
from typing import Any, Dict, List, Tuple
from unstructured.documents.elements import (
    Element,
    Image as UnstructuredImage,
    Table as UnstructuredTable,
    Text,
    Title,
    ListItem,
    NarrativeText,
)

# Define which classes count as “text”
TEXT_TYPES = (Text, Title, ListItem, NarrativeText)

def restructure_all_elements_flat(
    documents: List[Dict[str, Any]]
) -> Tuple[List[UnstructuredImage], List[UnstructuredTable], List[Element]]:
    """
    Flattens a list of parsed documents into three lists:
      1) all UnstructuredImage elements
      2) all UnstructuredTable elements
      3) all text-based elements (Text, Title, ListItem, NarrativeText)

    Args:
        documents: List of dicts, each with keys:
            - "filename": str
            - "elements": List[Element]

    Returns:
        Tuple of three lists: (all_images, all_tables, all_texts)
    """
    all_images: List[UnstructuredImage] = []
    all_tables: List[UnstructuredTable] = []
    all_texts: List[Element] = []

    for doc in documents:
        for el in doc.get("elements", []):
            if isinstance(el, UnstructuredImage):
                all_images.append(el)
            elif isinstance(el, UnstructuredTable):
                all_tables.append(el)
            elif isinstance(el, TEXT_TYPES):
                all_texts.append(el)
            # else: ignore other element types

    return all_images, all_tables, all_texts

# --------------------
# Example usage:

# Suppose you already have:
# documents = partition(..., extract_image_block_to_payload=True, extract_image_block_types=["Image","Table"])

# You can now pass these into your captioning or LangChain conversion steps.


**Save the Images locally**

In [8]:
#!pip install langchain-text-splitters


**Convert unstructured elements to langChain Documentss**

In [9]:
from typing import List, Dict, Any
from langchain.schema import Document
from unstructured.documents.elements import (
    Text,
    Title,
    ListItem,
    NarrativeText,
    Image as UnstructuredImage,
    Table as UnstructuredTable,
)

TEXT_TYPES = (Text, Title, ListItem, NarrativeText)

def convert_elements_to_langchain_docs(
    texts: List[Any],
    images: List[UnstructuredImage],
    tables: List[UnstructuredTable],
    image_captions: Dict[str, str],
    table_captions: Dict[str, str],
) -> List[Document]:
    """
    Build a unified list of LangChain Documents from text elements,
    image elements + captions, and table elements + captions.
    """
    docs: List[Document] = []

    # 1) Text elements → Documents
    for el in texts:
        docs.append(Document(
            page_content=el.text,
            metadata={
                "type": "text",
                "element_id": el.id,
                "source": getattr(el.metadata, "filename", None),
            },
        ))

    # 2) Image elements → caption Docs
    for el in images:
        el_dict = el.to_dict()
        elem_id = el_dict.get("id") or el.id
        caption = image_captions.get(elem_id, "")
        # skip if no caption
        if not caption:
            continue
        base64_str = el_dict.get("metadata", {}).get("image_base64")
        docs.append(Document(
            page_content=f"[IMAGE:{elem_id}] {caption}",
            metadata={
                "type": "image_caption",
                "element_id": elem_id,
                "image_base64": base64_str,
            },
        ))

    # 3) Table elements → caption Docs
    for el in tables:
        el_dict = el.to_dict()
        elem_id = el_dict.get("id") or el.id
        caption = table_captions.get(elem_id, "")
        if not caption:
            continue
        base64_str = el_dict.get("metadata", {}).get("image_base64")
        docs.append(Document(
            page_content=f"[TABLE:{elem_id}] {caption}",
            metadata={
                "type": "table_caption",
                "element_id": elem_id,
                "image_base64": base64_str,
            },
        ))

    return docs

# ----------------------
# EXAMPLE PIPELINE USAGE:

# 1) Flatten raw elements:
# all_images, all_tables, all_texts = restructure_all_elements_flat(documents)

# 2) Generate captions:
# img_caps, tbl_caps = generate_captions_from_memory(documents)

# 3) Convert to LangChain Documents:


# 4) Ready for dynamic_chunk_documents(lc_docs) or embedding!


**Chunking / Text splitting**

In [10]:
from typing import List
from langchain.schema import Document
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    MarkdownTextSplitter,
    HTMLHeaderTextSplitter,
)
from langchain_text_splitters import RecursiveJsonSplitter


def get_splitter_for_type(doc_type: str):
    """
    Return the optimal TextSplitter for the given document type.
    """
    if doc_type == "markdown":
        # Splits at Markdown headings and paragraphs for coherent sections
        return MarkdownTextSplitter(chunk_size=512, chunk_overlap=64)  # :contentReference[oaicite:6]{index=6}
    if doc_type == "html":
        # Splits by HTML header tags, preserving section context
        return HTMLHeaderTextSplitter(
            headers_to_split_on=["h1", "h2", "h3"],
            chunk_size=800,
            chunk_overlap=100,
        )  # :contentReference[oaicite:7]{index=7}
    if doc_type == "json":
        # Recursively splits nested JSON objects into character‑bounded chunks
        return RecursiveJsonSplitter(max_chunk_size=500, min_chunk_size=100)  # :contentReference[oaicite:8]{index=8}
    if doc_type == "code":
        # Language‑aware splitting using syntax separators for code
        return RecursiveCharacterTextSplitter.from_language(
            "python", chunk_size=1000, chunk_overlap=200
        )  # :contentReference[oaicite:9]{index=9}
    # Fallback for plain text: fixed‑size character chunks with overlap
    return CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)  # :contentReference[oaicite:10]{index=10}

def dynamic_chunk_documents(
    lc_docs: List[Document]
) -> List[Document]:
    """
    Apply document‑specific chunking to a list of LangChain Documents.

    Args:
        lc_docs: List of Documents, each with metadata["doc_type"] in
                 {"markdown", "html", "json", "code", "text"}.

    Returns:
        Flat list of smaller Document chunks.
    """
    all_chunks: List[Document] = []
    for doc in lc_docs:
        # Determine which splitter to use based on doc_type
        doc_type = doc.metadata.get("doc_type", "text").lower()
        splitter = get_splitter_for_type(doc_type)
        # Split the document into chunks
        chunks = splitter.split_documents([doc])  # :contentReference[oaicite:11]{index=11}
        # Annotate each chunk with its parent type for traceability
        for chunk in chunks:
            chunk.metadata["parent_doc_type"] = doc_type
            all_chunks.append(chunk)
    return all_chunks

# --------------------------
# Example usage:
#
# Assume `lc_docs` is your list of Documents already
# created from text, image captions, etc., each having
# metadata["doc_type"] ∈ {"markdown","html","json","code","text"}.
#

# print(f"Total chunks generated: {len(chunked_docs)}")
# for c in chunked_docs[:3]:
#     print(c.metadata["parent_doc_type"], "→", c.page_content[:100])


**generate Embeddings and store in DB**

In [11]:
from langchain_community.vectorstores import Chroma

def ingest_chroma(chunked_docs,embedding_model):
# Define the directory to persist the Chroma database
    persist_directory = "./chroma_db"

    # Create the Chroma vector store from documents
    vector_store = Chroma.from_documents(
        documents=chunked_docs,
        embedding=embedding_model,
        persist_directory=persist_directory
    )
    return vector_store
# Persist the vector store to disk




**Extract id from link**

In [12]:
import re
from urllib.parse import urlparse, parse_qs

def extract_drive_id(url: str) -> str | None:
    """
    Return the file/folder ID portion of any common Google-Drive link.

    Works with links like:
      • https://drive.google.com/file/d/<ID>/view?usp=sharing
      • https://drive.google.com/uc?id=<ID>&export=download
      • https://drive.google.com/open?id=<ID>
      • https://drive.google.com/drive/folders/<ID>?usp=drive_link

    Returns
    -------
    str | None
        The 33-character Drive ID if found, otherwise None.
    """
    # pattern 1:  .../d/<id>/...
    m = re.search(r"/d/([a-zA-Z0-9_-]{10,})", url)
    if m:
        return m.group(1)

    # pattern 2:  .../folders/<id>
    m = re.search(r"/folders?/([a-zA-Z0-9_-]{10,})", url)
    if m:
        return m.group(1)

    # pattern 3:  id=<id> in the query string
    qs_vals = parse_qs(urlparse(url).query).get("id")
    if qs_vals:
        return qs_vals[0]

    # nothing matched
    return None


In [13]:
from langchain_huggingface import HuggingFaceEmbeddings

# Specify the model name; you can choose any model from Hugging Face's model hub
model_name = "sentence-transformers/all-mpnet-base-v2"

# Optional: Define model and encoding parameters
model_kwargs = {'device': 'cpu'}  # or 'cuda' if using GPU
encode_kwargs = {'normalize_embeddings': True}

# Initialize the embedding model
embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

**Complete Pipeline**

---



In [16]:
DOWNLOAD_TMP_DIR = "/content/drive_downloads"
EXTRACTED_FIGURES_BASE_DIR = os.path.join(os.getcwd(), "figures") # Base directory for saving figures

# Ensure download and figures base directories exist
os.makedirs(DOWNLOAD_TMP_DIR, exist_ok=True)
os.makedirs(EXTRACTED_FIGURES_BASE_DIR, exist_ok=True)



FOLDER_ID = "1LyiyENXg85q-uLQjlddNEeRsi6bgKdTq" # misc documents

link = "https://drive.google.com/file/d/1YtSduxc-jYJAimnslOgtNUWTBX_fa2By/view?usp=drive_link"
link = "https://drive.google.com/file/d/12lb9IUzpbrT4mmkIbA3KxRy6oK9VyvMw/view?usp=sharing"
DRIVE_ID = extract_drive_id(link)

#documents = load_and_partition_drive_documents(FOLDER_ID, DOWNLOAD_TMP_DIR, EXTRACTED_FIGURES_BASE_DIR)
documents = load_and_partition_drive_documents(DRIVE_ID, DOWNLOAD_TMP_DIR, EXTRACTED_FIGURES_BASE_DIR)
print(f"\n🎉 Done processing files. Successfully partitioned {len(documents)} documents.")

print(f"Extracted figures and tables are saved in subdirectories within: {EXTRACTED_FIGURES_BASE_DIR}")



save_unstructured_images(documents, output_dir="my_extracted_images")
img_caps, tbl_caps = generate_captions_from_memory(documents)
all_images, all_tables, all_texts = restructure_all_elements_flat(documents)

print(f"Found {len(all_images)} images, {len(all_tables)} tables, {len(all_texts)} text elements.")

lc_docs = convert_elements_to_langchain_docs(
     texts=all_texts,
     images=all_images,
     tables=all_tables,
     image_captions=img_caps,
     table_captions=tbl_caps,
 )


chunked_docs = dynamic_chunk_documents(lc_docs)
vector_store = ingest_chroma(chunked_docs,embedding_model)
vector_store.persist()


Error retrieving metadata for ID 12lb9IUzpbrT4mmkIbA3KxRy6oK9VyvMw: 'NoneType' object has no attribute 'files'

🎉 Done processing files. Successfully partitioned 0 documents.
Extracted figures and tables are saved in subdirectories within: /content/figures
Found 0 images, 0 tables, 0 text elements.


ValueError: Expected Embeddings to be non-empty list or numpy array, got [] in upsert.

In [None]:
#chunked_docs

In [18]:
import os
import base64
import zipfile
import tempfile
import shutil
from io import BytesIO
from pathlib import Path
from PIL import Image as PILImage
from unstructured.partition.auto import partition
from unstructured.documents.elements import Image as UnstructuredImage

def save_unstructured_images_from_directory(
    source_path: str,
    output_dir: str = "saved_images",
    strategy: str = "hi_res",
    languages: list[str] = ["eng"]
) -> list[dict]:
    """
    Walk a directory (or single file, or ZIP), partition each document,
    extract UnstructuredImage elements, decode & save them, and return
    a summary of processed docs.

    Args:
        source_path: Path to a directory, file, or ZIP archive.
        output_dir:  Base directory in which to save extracted images.
        strategy:    Unstructured partition strategy.
        languages:   List of language codes for OCR partitioning.

    Returns:
        A list of dicts, each with:
          {
            "source": <filename>,
            "elements": <List[Element]>,
            "saved_images": <List[path to saved image files]>
          }
    """
    processed = []
    source = Path(source_path)

    # Gather input files
    to_process = []
    if source.is_dir():
        # walk directory for files
        for path in source.rglob("*"):
            if path.is_file():
                to_process.append(path)
    elif zipfile.is_zipfile(source):
        # extract ZIP to temp dir
        tmpdir = Path(tempfile.mkdtemp(prefix="unzipped_"))
        with zipfile.ZipFile(source, "r") as zf:
            zf.extractall(tmpdir)
        for path in tmpdir.rglob("*"):
            if path.is_file():
                to_process.append(path)
    else:
        # single file
        to_process.append(source)

    # ensure base output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # process each file
    for file_path in to_process:
        name       = file_path.name
        stem       = file_path.stem
        elements   = partition(
            filename=str(file_path),
            strategy=strategy,
            languages=languages,
            extract_image_block_to_payload=True,
            extract_image_block_types=["Image"],
            infer_table_structure=False,
            image_output_dir_path=None
        )

        processed.append({
            "source": str(file_path),
            "elements": elements

        })

    # clean up tempdir if created
    if 'tmpdir' in locals():
        shutil.rmtree(tmpdir, ignore_errors=True)

    return processed


In [17]:
!mkdir my_docs_folder

In [19]:
documents = save_unstructured_images_from_directory(
    source_path="my_docs_folder",
    output_dir="extracted_images"
)
print(f"\n🎉 Done processing files. Successfully partitioned {len(documents)} documents.")




save_unstructured_images(documents, output_dir="my_extracted_images")
img_caps, tbl_caps = generate_captions_from_memory(documents)
all_images, all_tables, all_texts = restructure_all_elements_flat(documents)

print(f"Found {len(all_images)} images, {len(all_tables)} tables, {len(all_texts)} text elements.")

lc_docs = convert_elements_to_langchain_docs(
     texts=all_texts,
     images=all_images,
     tables=all_tables,
     image_captions=img_caps,
     table_captions=tbl_caps,
 )


chunked_docs = dynamic_chunk_documents(lc_docs)
vector_store = ingest_chroma(chunked_docs,embedding_model)
vector_store.persist()


yolox_l0.05.onnx:   0%|          | 0.00/217M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
#embeddings

In [None]:
# Define your query
query = "What is segmeentation "

# Perform the similarity search
similar_docs = vector_store.similarity_search(query, k=5)

# Display the results
for doc in similar_docs:
    print(doc.page_content)


What is Image Segmentation?
1. Gestalt Principles:
Image segmentation refers to dividing an image into multiple parts or regions, each representing meaningful components like objects or boundaries.
Summary of Steps:
3. Perform watershed segmentation.


In [None]:
# now test for the gdrive gENNERIC FODLER / FILE /ZIP FILE

**loads the Documents of GDrive Folder**

In [None]:
def load_and_partition_drive_documents(drive_id, download_dir, figures_base_dir):
    """
    Loads and partitions documents from a Google Drive folder or file.

    Args:
        drive_id (str): The ID of the Google Drive folder or file.
        download_dir (str): Directory to temporarily download files.
        figures_base_dir (str): Base directory to save extracted figures/images.

    Returns:
        List[Dict]: A list of dictionaries, each containing filename and extracted elements.
    """
    processed_docs = []

    try:
        # Retrieve metadata to determine if the ID is a folder or file
        file_metadata = drive_service.files().get(fileId=drive_id, fields="id, name, mimeType").execute()
        mime_type = file_metadata.get("mimeType")
        name = file_metadata.get("name")

        if mime_type == "application/vnd.google-apps.folder":
            # It's a folder; fetch all files within
            files = get_files_from_folder(drive_id)
        else:
            # It's a single file
            files = [file_metadata]

    except Exception as e:
        print(f"Error retrieving metadata for ID {drive_id}: {e}")
        return []

    if not files:
        print("No files to process.")
        return []

    for file in files:
        file_id = file["id"]
        name = file["name"]

        local_path = os.path.join(download_dir, name)
        output_figures_path = os.path.join(figures_base_dir, os.path.splitext(name)[0])
        os.makedirs(output_figures_path, exist_ok=True)

        print(f"\nAttempting to process: {name}")

        try:
            request = drive_service.files().get_media(fileId=file_id)

            # Download file to temporary directory
            with io.FileIO(local_path, "wb") as fh:
                downloader = MediaIoBaseDownload(fh, request)
                done = False
                while not done:
                    status = downloader.next_chunk()
                    _, done = status

            print(f"Downloaded: {name} to {local_path}")

            # Check if the file is a ZIP archive
            if name.lower().endswith('.zip'):
                # Extract ZIP file
                with zipfile.ZipFile(local_path, 'r') as zip_ref:
                    zip_ref.extractall(download_dir)
                os.remove(local_path)  # Remove ZIP file after extraction

                # Process each extracted file
                for root, _, filenames in os.walk(download_dir):
                    for filename in filenames:
                        extracted_path = os.path.join(root, filename)
                        try:
                            elements = partition(
                                filename=extracted_path,
                                strategy="hi_res",
                                languages=["eng"],
                                extract_image_block_to_payload=True,
                                extract_image_block_types=["Image", "Table"],
                                infer_table_structure=True,
                                image_output_dir_path=output_figures_path
                            )
                            processed_docs.append({
                                "filename": filename,
                                "elements": elements,
                                "extracted_figures_dir": output_figures_path
                            })
                            print(f"✅ Successfully loaded and partitioned: {filename} ({len(elements)} elements found)")
                        except Exception as e:
                            print(f"❌ Failed to partition {filename}: {e}")
                        finally:
                            os.remove(extracted_path)  # Clean up extracted file
            else:
                # Process non-ZIP file
                elements = partition(
                    filename=local_path,
                    strategy="hi_res",
                    languages=["eng"],
                    extract_image_block_to_payload=True,
                    extract_image_block_types=["Image", "Table"],
                    infer_table_structure=True,
                    image_output_dir_path=output_figures_path
                )
                processed_docs.append({
                    "filename": name,
                    "elements": elements,
                    "extracted_figures_dir": output_figures_path
                })
                print(f"✅ Successfully loaded and partitioned: {name} ({len(elements)} elements found)")

        except Exception as e:
            print(f"❌ Failed to process {name}: {e}")
        finally:
            if os.path.exists(local_path):
                os.remove(local_path)

    return processed_docs


**Download the GDrive File**

**Function to Generate Captions**

In [None]:
!pip install --force-reinstall pymupdf[all]
!pip install langchain-groq
#!pip install groq
#!pip install  fitz # we dont need to install the fitz, we just need to install the pymupdf

In [None]:
import os
import fitz # PyMuPDF for PDF processing
import base64
from typing import List, Tuple

# Import LangChain components
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage # Used to structure the user message with multimodal content

# Set your Groq API key using an environment variable
# It's highly recommended to set this outside your script for security:
# export GROQ_API_KEY='your_groq_api_key_here'
# If not set externally, you can uncomment the line below and replace with your key,
# but be cautious about committing your key to version control.
# os.environ["GROQ_API_KEY"] = "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" # Replace with your actual key

# --- Ensure API key is set ---
os.environ["GROQ_API_KEY"]= "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"
if not os.environ.get("GROQ_API_KEY"):
    print("Error: GROQ_API_KEY environment variable not set.")
    print("Please set your Groq API key before running the script.")
    # In a real application, you might want to exit or handle this differently
    exit() # Exit the script if the key is not set

# --- Step 1: Extract images from PDF using PyMuPDF (fitz) ---
def extract_images_from_pdf(pdf_path: str) -> List[Tuple[int, bytes]]:
    """
    Extracts images from each page of a PDF.

    Args:
        pdf_path: The path to the PDF file.

    Returns:
        A list of tuples, where each tuple contains the page number (1-based)
        and the image bytes in PNG format.
    """
    images = []
    try:
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                # Get a high-resolution pixmap (image representation) of the page
                pix = page.get_pixmap()
                # Convert the pixmap to bytes in PNG format
                img_bytes = pix.tobytes("png")
                images.append((page_num, img_bytes))
    except FileNotFoundError:
        print(f"Error: PDF file not found at {pdf_path}")
        exit()
    except Exception as e:
        print(f"Error extracting images from PDF: {e}")
        exit()
    return images

# --- Step 2: Encode image bytes to Base64 ---
def encode_image_to_base64(image_bytes: bytes) -> str:
    """
    Encodes image bytes to a Base64 string.

    Args:
        image_bytes: The raw bytes of the image.

    Returns:
        The Base64 encoded string.
    """
    return base64.b64encode(image_bytes).decode('utf-8')

# --- Step 3: Process each image with LangChain Groq API ---
def process_images_with_langchain_groq(images: List[Tuple[int, bytes]]) -> str:
    """
    Processes each extracted image using LangChain's ChatGroq for text extraction.

    Args:
        images: A list of tuples containing page number and image bytes.

    Returns:
        A single string containing the combined extracted text from all pages.
    """
    # Initialize the ChatGroq model
    # We initialize it here to reuse the same instance for all pages
    try:
        llm = ChatGroq(
            model="meta-llama/llama-4-scout-17b-16e-instruct", # Specify the model
            temperature=0.0,                                   # Lower temperature for extraction
            max_tokens=4096                                  # Ensure enough tokens for page text
        )
    except Exception as e:
        print(f"Error initializing ChatGroq model: {e}")
        print("Please check your GROQ_API_KEY and ensure the model name is correct and accessible.")
        return "" # Return empty string or handle error as needed

    combined_text = ""

    for page_num, img_bytes in images:
        print(f"Processing page {page_num}...")
        encoded_image = encode_image_to_base64(img_bytes)

        # Prepare the multimodal content for the user message using HumanMessage
        # The image URL is in the format "data:image/png;base64,<base64_string>"
        multimodal_content = [
            {
                "type": "text",
                "text": f"Extract all text from this page {page_num}. Provide the raw text content only, without any added commentary or formatting."
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{encoded_image}"
                }
            }
        ]

        # Create a HumanMessage
        message = HumanMessage(content=multimodal_content)

        # Invoke the ChatGroq model
        try:
            response = llm.invoke([message])

            # The response object contains the LLM's output text
            extracted_text = response.content

            print(f"Page {page_num} output:\n{extracted_text}")

            # Append the extracted text for this page to the combined text
            combined_text += f"\nPage {page_num} output:\n{extracted_text}" # Include page marker for clarity

        except Exception as e:
            print(f"\nAn error occurred processing page {page_num}: {e}")
            print("Skipping this page.")
            # Continue to the next page or handle error as appropriate

    return combined_text

# --- Main execution ---
# Replace with the actual path to your PDF file
pdf_path = "/content/DLD Mids Paper DSF23.pdf" # Assuming this path is correct in your environment

# Extract images from the PDF
images = extract_images_from_pdf(pdf_path)

if images:
    # Process images with LangChain Groq
    response = process_images_with_langchain_groq(images)

    # The 'response' variable now holds the combined text from all pages
    print("\n--- Combined Extracted Text ---")
    print(response)
else:
    print("No images extracted from the PDF.")



In [None]:
import os
import json
from typing import List, Optional, Union

# Import necessary Pydantic components directly from pydantic
from pydantic import BaseModel, Field

# Import LangChain components
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_groq import ChatGroq

# --- 1. Define Simplified Structured Data Models using Pydantic v2 ---

class ExtractedQuestion(BaseModel):
    """Represents a single question and its answer from the exam."""
    question_no: Union[str, int] = Field(description="The unique identifier for the question as it appears in the text (e.g., '1', 'Problem 7', 'Part (a)'). If a number is repeated for a different problem, create a unique descriptive identifier.")
    question_statement: str = Field(description="The complete, verbatim text of the question. Include any introductory phrases or instructions directly before the student's answer.")
    complete_answer: str = Field(description="The complete, verbatim content representing ONLY the student's answer. Exclude all extraneous text.")

class ExamExtraction(BaseModel):
    """Represents the extracted list of questions and answers."""
    extracted_questions: List[ExtractedQuestion] = Field(description="A list of all extracted questions and their answers.")

# --- 2. Prepare the Exam Text ---
# Paste the combined text from the previous interaction here.
# Ensure the full text is included.
exam_text = response


# --- 3. Initialize the Language Model and Output Parser ---
# Set your Groq API key as an environment variable
# Make sure to replace "YOUR_GROQ_API_KEY" with your actual key
# It's highly recommended to set this outside the script as an environment variable
# for security, e.g., using `export GROQ_API_KEY='your_key_here'` in your terminal
if not os.environ.get("GROQ_API_KEY"):
    # If not set externally, try setting it here (replace with your key)
    # os.environ["GROQ_API_KEY"] = "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" # Replace with your actual key

    # Add a check and exit if the key is still not set
    if not os.environ.get("GROQ_API_KEY"):
         print("Error: GROQ_API_KEY environment variable not set.")
         print("Please set your Groq API key before running the script.")
         exit()


# Initialize the Groq model
try:
    llm = ChatGroq(
        model="meta-llama/llama-4-scout-17b-16e-instruct", # Using the model you specified
        temperature=0.0, # Keep temperature very low for minimal creativity
        max_tokens=4096 # Ensure enough tokens for processing and output
    )
except Exception as e:
    print(f"Error initializing Groq model: {e}")
    print("Please check your GROQ_API_KEY and ensure the model name is correct and accessible.")
    exit()


# Initialize the Pydantic output parser with the top-level model (ExamExtraction)
parser = PydanticOutputParser(pydantic_object=ExamExtraction)

# --- 4. Craft the Prompt Template ---
# The prompt tells the LLM what to do and how to format the output
prompt_template = PromptTemplate.from_template(
    """You are an extremely strict text extraction bot. Your ONLY goal is to extract specific pieces of text from an exam document.
Analyze the following exam content meticulously. Identify each distinct question or problem statement.
For each question, extract its exact identifier, the exact text of the question, and the exact block of text that constitutes the student's answer.

You ABSOLUTELY MUST NOT add any extra text, commentary, summarization, or interpretation to the extracted content.
You MUST only include text that was present in the original document within the designated question or answer fields.

Format the extracted information as a single JSON object. The JSON structure MUST strictly
adhere to the following format instructions derived from the Pydantic schema:
{format_instructions}

Guidelines for Extraction:
- Identify each distinct question or problem presented for the student. Look for explicit numbers (e.g., 1., 2., 3.), keywords like "Question" or "Problem", or clear problem statements followed by a solution.
- Treat distinct parts of a larger problem (like "Part (a)" and "Part (b)") as separate entries if they have their own identifier and associated answer text block.
- For each identified question/problem, extract its unique identifier (number, title, or part) into 'question_no'. If a simple number is repeated for a different problem (e.g., '3' appears for two different problems), create a unique descriptive identifier (e.g., "Question 3 (Excess-3)", "Problem 7 (Multiples of 3 Circuit)", "Problem Statement (Nuclear Reactor)").
- Extract the *complete, verbatim text* of the question or problem statement into 'question_statement'. Capture *all* text belonging to the prompt itself, starting from where the question/problem is introduced.
- **EXTREMELY IMPORTANT:** Extract the *complete, verbatim block of text* from the document that represents the student's answer or solution for this question/problem into 'complete_answer'.
    - **ONLY INCLUDE TEXT FROM THE STUDENT'S ANSWER SECTION.**
    - **DO NOT INCLUDE:**
        - Any text describing the image content (e.g., "The image presents...", "The text extracted per page is:").
        - Any text introducing solution steps if those introductions are not part of the student's original formatting (e.g., ignore "## Step 1:", "Step 1:", unless the student used that format in their work, but capture the content *under* it). Focus on the content, not auto-generated headings.
        - Any concluding remarks or phrases added by a processing system (e.g., "The final answer is:", "Conclusion:") unless the content immediately following is clearly the student's work.
        - Any summarization or rephrasing by you (the LLM).
        - Any text that is clearly part of a *different* question or unrelated section.
    - Capture the entire contiguous block of content that constitutes the student's answer associated with the identified question.

- Ensure the output is *only* the JSON object. Do not include any conversational text, explanations, markdown comments, or notes whatsoever outside the JSON block.

Exam Content:
---
{text}
---

JSON Output:
"""
)

# --- 5. Create the Processing Chain ---
# Bind the parser's format instructions to the prompt template
prompt = prompt_template.partial(
    format_instructions=parser.get_format_instructions()
)

# Create the chain: Prompt -> LLM -> Parser
chain = prompt | llm | parser

# --- 6. Process the Exam Text and Output JSON ---
print("Invoking LLM chain to extract data...")
try:
    # Invoke the chain with the exam text
    structured_output: ExamExtraction = chain.invoke({"text": exam_text})
    print("Extraction complete.")

    # Convert the Pydantic object to a JSON string using the v2 method
    json_output = structured_output.model_dump_json(indent=4)

    # Print the final JSON output
    print("\n--- Extracted JSON Data ---")
    print(json_output)

except Exception as e:
    print(f"\nAn error occurred during the extraction process: {e}")
    print("Possible reasons: LLM failed to produce valid JSON, API key issue, or prompt ambiguity.")
    print("Please review the error message, the exam text, and the prompt/Pydantic model.")
    # Optional: Print the raw LLM output if available for debugging.
    # This can help understand why parsing failed if the LLM produced invalid JSON.
    # try:
    #     print("\nRaw LLM output (for debugging):")
    #     # Accessing LLM response might vary slightly based on LangChain version/type
    #     # This is an attempt, you might need to adjust based on the exact error object
    #     if hasattr(e, '__cause__') and hasattr(e.__cause__, 'response'):
    #          print(e.__cause__.response.content)
    #     elif hasattr(e, 'response'): # Some base exception might have it
    #          print(e.response.content)
    #     else:
    #          print("Raw LLM output not easily accessible from error object.")
    # except Exception as debug_e:
    #     print(f"Could not print raw LLM output: {debug_e}")

In [None]:
import json
try:
    python_data = json.loads(json_output)

    # Now 'python_data' is a standard Python dictionary or list
    # You can access its elements like any other Python data structure

    print("\n--- Using the Python Dictionary/List ---")
    print(f"Type of python_data: {type(python_data)}") # Should be <class 'dict'>

    # Example of accessing the list of questions
    if isinstance(python_data, dict) and 'extracted_questions' in python_data:
        questions_list = python_data['extracted_questions']
        print(f"Number of extracted questions: {len(questions_list)}")

        if questions_list:
            first_question = questions_list[0]
            print(f"\nDetails of the first extracted question:")
            print(f"Question No: {first_question.get('question_no')}")
            print(f"Statement (first 50 chars): {first_question.get('question_statement', '')[:50]}...")
            print(f"Answer (first 50 chars): {first_question.get('complete_answer', '')[:50]}...")

except json.JSONDecodeError as e:
    print(f"Error decoding JSON string into Python object: {e}")
    print("The string might not be valid JSON.")
except Exception as e:
    print(f"An unexpected error occurred while processing the Python data: {e}")



In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI  # :contentReference[oaicite:0]{index=0}
from langchain.retrievers.multi_query import MultiQueryRetriever  # :contentReference[oaicite:1]{index=1}


# #Retriever 

In [None]:
from typing import List

from langchain_core.output_parsers import BaseOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field


# Output parser will split the LLM result into a list of queries
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Output parser for a list of lines."""

    def parse(self, text: str) -> List[str]:
        lines = text.strip().split("\n")
        return list(filter(None, lines))  # Remove empty lines


output_parser = LineListOutputParser()

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative questions separated by newlines.
    Original question: {question}""",
)
llm = ChatGoogleGenerativeAI(model='gemini-1.5-pro', google_api_key=GOOGLE_API_KEY) # best method

# Chain
llm_chain = QUERY_PROMPT | llm | output_parser

# Other inputs
question = "What are the approaches to Task Decomposition?"

In [None]:
base_retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"fetch_k": 20, "k": 5}
)# Run


retriever = MultiQueryRetriever(
    retriever=base_retriever, llm_chain=llm_chain, parser_key="lines",

)  # "lines" is the key (attribute name) of the parsed output

# Results
#unique_docs = retriever.invoke("What does the course say about regression?")
#len(unique_docs)

12

In [None]:
# for doc in unique_docs:
#   print(doc.page_content)

Applications
2. Subjectivity:
3. Multiple Segmentations:
6. Visualize Results:
1. Compute gradient (elevation map).
1. Gestalt Principles:
4. Satellite Imagery: Land use and vegetation analysis.
1. Medical Imaging: Identifying tumors or organs.
Grouping is vital to understanding images.
Example: Fully Convolutional Networks (FCNs).
5. Visualize the segmentation results.
1. Ground Truth Comparison:


In [None]:
import os
import json
from typing import List, Optional, Union

# Import necessary Pydantic components directly from pydantic
from pydantic import BaseModel, Field, ValidationError

# Import LangChain components for LLMs, Prompts, Parsers, and Retrieval
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser, BaseOutputParser
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI # For the MultiQueryRetriever LLM
from langchain.retrievers import MultiQueryRetriever # Import the retriever class
from langchain_core.documents import Document # To type hint retrieved documents
from langchain_core.runnables import Runnable # For chaining components

# --- Assume the combined_text and extracted_exam_data_dict are available from previous steps ---
# In a real script, you would run the PDF extraction and JSON loading first.
# For this code block to be runnable, we'll use a placeholder for the extracted data.
# REPLACE THIS PLACEHOLDER with the actual Python dictionary you got from json.loads()
# after running the previous PDF extraction and JSON parsing code.

# Example placeholder structure (replace with your actual data):
extracted_exam_data_dict = {
    "extracted_questions": [
        {
            "question_no": "1",
            "question_statement": "Convert 2408.4 from hexadecimal to decimal number system.",
            "complete_answer": "The hexadecimal number is $(2408.4)_{16}$. To convert it to decimal, we break it down as follows: $(2408.4)_{16} = (2 \\times 16^3) + (4 \\times 16^2) + (0 \\times 16^1) + (11 \\times 16^0) + (4 \\times 16^{-1}) = (2 \\times 4096) + (4 \\times 256) + (0) + (11) + \\left(\\frac{4}{16}\\right) = 8192 + 1024 + 11 + 0.25 = (9227.25)_{10}"
        },
        {
            "question_no": "2",
            "question_statement": "Convert 1024.25 from decimal to binary number system.",
            "complete_answer": "The decimal number is $(1024.25)_{10}$. To convert it to binary, we break it down as follows: For the integer part (1024): $1024 \\div 2 = 512$ (remainder 0), $512 \\div 2 = 256$ (remainder 0), $256 \\div 2 = 128$ (remainder 0), $128 \\div 2 = 64$ (remainder 0), $64 \\div 2 = 32$ (remainder 0), $32 \\div 2 = 16$ (remainder 0), $16 \\div 2 = 8$ (remainder 0), $8 \\div 2 = 4$ (remainder 0), $4 \\div 2 = 2$ (remainder 0), $2 \\div 2 = 1$ (remainder 0), $1 \\div 2 = 0$ (remainder 1). The binary representation of 1024 is $10000000000_2$. For the fractional part (0.25): $0.25 \\times 2 = 0.5$ (integer part 0), $0.5 \\times 2 = 1.0$ (integer part 1). The binary representation of 0.25 is $0.01_2$. Therefore, $(1024.25)_{10} = (10000000000.01)_2$."
        },
        # ... potentially more questions from your actual extraction
         {
            "question_no": "4",
            "question_statement": "Write down the function in Product of Maxterm form and express it in Π notation. f(x,y,z) = x'y'z' + xyz + x'y z'.",
            "complete_answer": "## Step 1: Understanding the given function The given function is f(x,y,z) = x'y'z' + xyz + x'y z'. To express it in Product of Maxterm form, we first need to find its complement. ## Step 2: Finding the complement of the function The complement of f(x,y,z) is F' = $\\bar{f} = \\bar{x}'y'z' \\cdot \\bar{xyz} \\cdot \\bar{x'y z'}$ ## Step 3: Expressing in POS Form $\\bar{F} = \\bar{x}'y'z' + \\bar{x}y z + x'y'z + xy\\bar{z} + x\\bar{y}z$ ## Step 4: Finding the Product of Maxterm form The Product of Maxterm form is given by $F = (x+j+z) \\cdot (x+\\bar{y}+z) \\cdot (\\bar{x}+y+z) \\cdot (\\bar{x}+y+\\bar{z}) \\cdot (\\bar{x}+\\bar{y}+z)$"
        },
        {
            "question_no": "7",
            "question_statement": "Write down the expression of F for the given circuit and give the simplified answer.",
            "complete_answer": "The expression can be derived as follows: The top gate: $\\overline{AB} = \\overline{A} + \\overline{B}$. The bottom gate: $\\overline{\\overline{A}B} = A + \\overline{B}$. The output $F$ is the AND of these two: $F = (\\overline{A} + \\overline{B})(A + \\overline{B})$. Applying the distributive law: $F = \\overline{A}A + \\overline{A}\\overline{B} + \\overline{B}A + \\overline{B}\\overline{B}$. Since $\\overline{A}A = 0$ and $\\overline{B}\\overline{B} = \\overline{B}$: $F = \\overline{A}\\overline{B} + \\overline{B}A + \\overline{B}$. Further simplification yields: $F = \\overline{B}(\\overline{A} + A + 1) = \\overline{B}$"
        }
    ]
}


# --- 2. Define Pydantic Models for the Evaluation Report ---

class QuestionEvaluation(BaseModel):
    """Represents the evaluation metrics for a single question."""
    question_no: Union[str, int] = Field(description="The unique identifier for the question being evaluated.")
    similarity_percentage: int = Field(description="The estimated similarity percentage of the student's answer compared to a correct answer (0-100).")
    feedback: str = Field(description="Feedback explaining the similarity percentage, highlighting correct aspects, missing parts, or errors. This should be a concise explanation.")

class ExamEvaluationReport(BaseModel):
    """Represents the complete evaluation report for an exam."""
    evaluations: List[QuestionEvaluation] = Field(description="A list of evaluations for each extracted question.")


# --- 3. Setup the Vector Store and Retriever ---

# IMPORTANT: Replace with your actual vector store initialization and Google API Key
# This is a placeholder. You need to have initialized your vector store (e.g., Chroma, FAISS)
# and populated it with your document chunks.
# Example:
# from langchain_community.vectorstores import Chroma
# from langchain_community.embeddings import OpenAIEmbeddings # Or your chosen embedding model
# vector_store = Chroma(persist_directory="./your_vector_db", embedding_function=OpenAIEmbeddings())

# Assume vector_store is initialized and contains your document chunks
# and GOOGLE_API_KEY is set as an environment variable or defined here.
# os.environ["GOOGLE_API_KEY"] = "YOUR_GOOGLE_API_KEY" # Replace if not using environment variable

# --- Ensure Google API key is set for the retriever's LLM ---

if not os.environ.get("GOOGLE_API_KEY"):
    print("Error: GOOGLE_API_KEY environment variable not set for the retriever's LLM.")
    print("Please set your Google API key before running the script.")
    # In a real application, you might want to exit or handle this differently
    # exit() # Uncomment to exit if key is not set


# Placeholder for your vector store object
# You MUST replace this with your actual initialized vector store
vector_store = None # Replace with your vector store object, e.g., Chroma(...)
if vector_store is None:
    print("Error: vector_store is not initialized.")
    print("Please replace the 'None' placeholder with your actual vector store object.")
    # exit() # Uncomment to exit if vector_store is not initialized
    # For demonstration purposes, we will skip the retrieval step if vector_store is None
    # In a real application, you would not do this.


# Define the LLM for MultiQueryRetriever (using Google Generative AI as you provided)
try:
    retriever_llm = ChatGoogleGenerativeAI(model='gemini-1.5-pro') # Uses GOOGLE_API_KEY env var
except Exception as e:
    print(f"Error initializing Google Generative AI model for retriever: {e}")
    print("Please check your GOOGLE_API_KEY and ensure the model name is correct and accessible.")
    retriever_llm = None # Set to None if initialization fails


# Output parser will split the LLM result into a list of queries (as provided by user)
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Output parser for a list of lines."""

    def parse(self, text: str) -> List[str]:
        lines = text.strip().split("\n")
        return list(filter(None, lines))  # Remove empty lines

output_parser = LineListOutputParser()

# Prompt for MultiQueryRetriever (as provided by user)
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

# Chain for MultiQueryRetriever's query generation
if retriever_llm:
    llm_chain_for_retriever = QUERY_PROMPT | retriever_llm | output_parser
else:
    llm_chain_for_retriever = None # Cannot create chain if LLM failed


# Configure the base retriever from the vector store (as provided by user)
if vector_store:
    base_retriever = vector_store.as_retriever(
        search_type="mmr", # or "similarity"
        search_kwargs={"fetch_k": 20, "k": 5} # fetch_k: number of docs to fetch, k: number of docs to return
    )

    # Create the MultiQueryRetriever (as provided by user)
    if llm_chain_for_retriever:
        retriever = MultiQueryRetriever(
            retriever=base_retriever,
            llm_chain=llm_chain_for_retriever,
            parser_key="lines", # "lines" is the key (attribute name) of the parsed output
        )
        retriever_available = True
    else:
        retriever = None
        retriever_available = False
        print("MultiQueryRetriever chain could not be created due to LLM initialization failure.")
else:
    retriever = None
    retriever_available = False
    print("MultiQueryRetriever is not available because vector_store is not initialized.")


# --- 4. Define Pydantic Models for the Evaluation Report (Already done in Step 2) ---
# --- 5. Initialize the Evaluation Language Model and Output Parser (Already done in previous code) ---
# Using deepseek-coder via ChatGroq as before
# evaluation_llm = ChatGroq(...)
# evaluation_parser = PydanticOutputParser(pydantic_object=QuestionEvaluation)


# --- 6. Craft the Prompt Template for Evaluation (Updated to include context) ---
evaluation_prompt_template = PromptTemplate.from_template(
    """You are an expert exam evaluator. Your task is to analyze a question, a student's provided answer, and relevant context from course materials.
Evaluate how well the student's answer addresses and correctly answers the question, using the provided context as a reference for correctness and completeness.

Based on your understanding of the question, the student's answer, and the context, estimate a similarity percentage (0-100)
where 100% means the answer is completely correct and addresses all parts of the question according to the context, and 0% means it's completely incorrect or irrelevant.

Provide concise feedback explaining the percentage (in short). Highlight what was correct, what was missing, or what was incorrect based on the context, leading to the given percentage.

Output your evaluation for this single question as a JSON object. The JSON structure MUST strictly adhere to the following format instructions:
{format_instructions}

Question Number: {question_no}
Question Statement: {question_statement}
Student's Answer: {complete_answer}

Relevant Context from Course Materials:
---
{context}
---

Output ONLY the JSON object. Do NOT include any text, notes, or markdown before or after the JSON.
The output must start with '{{' and end with '}}'.

JSON Output:
"""
)

# --- 7. Create the Evaluation Chain for a Single Question (Updated prompt) ---
# Bind the parser's format instructions to the evaluation prompt template
evaluation_prompt = evaluation_prompt_template.partial(
    format_instructions=evaluation_parser.get_format_instructions()
)

# Create the chain for evaluating a single question: Prompt -> LLM -> Parser
single_question_evaluation_chain = evaluation_prompt | evaluation_llm | evaluation_parser


# --- 8. Process Each Extracted Question for Evaluation with Retrieval ---

evaluation_results: List[QuestionEvaluation] = []

# Ensure extracted_exam_data_dict is a dictionary and contains the 'extracted_questions' key
if isinstance(extracted_exam_data_dict, dict) and 'extracted_questions' in extracted_exam_data_dict:
    extracted_questions_list = extracted_exam_data_dict.get('extracted_questions', []) # Use .get for safety

    print(f"\nStarting evaluation of {len(extracted_questions_list)} questions with context retrieval...")

    for question_data in extracted_questions_list:
        q_no = question_data.get('question_no', 'Unknown')
        q_statement = question_data.get('question_statement', '')
        q_answer = question_data.get('complete_answer', '')

        if not q_statement or not q_answer:
            print(f"Skipping evaluation for Question {q_no} due to missing statement or answer.")
            continue

        print(f"Evaluating Question {q_no}...")

        # --- Retrieve context for the current question ---
        context = "No relevant context found." # Default context if retrieval fails or is not available
        if retriever_available:
            try:
                print(f"  Retrieving context for Question {q_no}...")
                retrieved_docs: List[Document] = retriever.invoke(q_statement)
                print(f"  Retrieved {len(retrieved_docs)} documents.")

                # Combine the content of retrieved documents
                context = "\n\n---\n\n".join([doc.page_content for doc in retrieved_docs])
                if not context.strip():
                     context = "No relevant context found." # Ensure context is not just whitespace if docs were empty

            except Exception as e:
                print(f"  An error occurred during context retrieval for Question {q_no}: {e}")
                print("  Proceeding with evaluation without context for this question.")
                context = "Error retrieving context." # Indicate retrieval failed


        # --- Perform evaluation using the retrieved context ---
        try:
            # Invoke the single question evaluation chain with all required variables
            evaluation: QuestionEvaluation = single_question_evaluation_chain.invoke({
                "question_no": q_no,
                "question_statement": q_statement,
                "complete_answer": q_answer,
                "context": context # Pass the retrieved context
            })
            evaluation_results.append(evaluation)
            print(f"  Evaluation complete for Question {q_no}.")

        except ValidationError as e:
             print(f"  Validation Error for Question {q_no}: {e}")
             print("  LLM output did not match the Pydantic schema for evaluation.")
             # Optional: Try to print the raw LLM output here for debugging
             # Note: Accessing raw LLM output after a parsing error can be tricky
             # Depending on LangChain version and error structure.
             # print(f"  Raw LLM output might be: {e.raw_output if hasattr(e, 'raw_output') else 'N/A'}")

        except Exception as e:
            print(f"  An unexpected error occurred during evaluation for Question {q_no}: {e}")
            print("  Skipping evaluation for this question.")


    # --- 9. Generate the Final Evaluation Report ---
    final_report = ExamEvaluationReport(evaluations=evaluation_results)

    # --- 10. Print the Final Report JSON ---
    print("\n--- Final Exam Evaluation Report ---")
    print(final_report.model_dump_json(indent=4))

else:
    print("\nError: Could not find 'extracted_questions' list in the provided data dictionary.")
    print("Please ensure the 'extracted_exam_data_dict' variable contains the correct structure from the previous step.")

