In [19]:
# Python dependencies. `%pip` (rather than `!pip`) installs into the environment
# of the kernel that is actually running this notebook.
%pip install -q pdf2image Pillow langchain-groq langchain-core PyMuPDF

# System package providing the Poppler command-line tools (used by pdf2image).
# `-y` answers apt's confirmation prompt: a notebook cell has no interactive
# stdin, so without it the install stops at "Do you want to continue? [Y/n]".
!sudo apt-get update -qq && sudo apt-get install -y poppler-utils



In [31]:



import os
import fitz # PyMuPDF for PDF processing
import base64
from typing import List, Tuple

# Import LangChain components
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage # Used to structure the user message with multimodal content

# --- Ensure API key is set ---
# The Groq API key is a secret and must never be written into the notebook.
# NOTE(review): a real-looking key was previously hard-coded in this cell; treat it
# as compromised and revoke/rotate it in the Groq console.
#
# Preferred: set the key outside the notebook before starting the kernel, e.g.
#   export GROQ_API_KEY='your_groq_api_key_here'
# Fallback: ask for it interactively so it never ends up in a saved cell or in
# version control.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass  # local import: only needed for the interactive fallback

    try:
        entered_key = getpass("Enter your Groq API key: ").strip()
    except (EOFError, NotImplementedError):
        # No interactive input available (e.g. headless execution).
        entered_key = ""
    if entered_key:
        os.environ["GROQ_API_KEY"] = entered_key

if not os.environ.get("GROQ_API_KEY"):
    print("Error: GROQ_API_KEY environment variable not set.")
    print("Please set your Groq API key before running the script.")
    # Raise instead of calling exit(): inside a notebook kernel exit() only asks
    # the kernel to shut down and lets the remaining statements of the cell run,
    # whereas SystemExit stops execution right here (and also works in a script).
    raise SystemExit(1)

# --- Step 1: Extract images from PDF using PyMuPDF (fitz) ---
def extract_images_from_pdf(pdf_path: str, dpi: int = 0) -> List[Tuple[int, bytes]]:
    """
    Renders every page of a PDF to a PNG image.

    Args:
        pdf_path: The path to the PDF file.
        dpi: Optional rendering resolution passed to PyMuPDF. The default (0)
            keeps PyMuPDF's own default resolution, exactly as before; pass
            e.g. 200 when small print needs to be legible for text extraction.

    Returns:
        A list of tuples, where each tuple contains the page number (1-based)
        and the image bytes in PNG format. An empty list is returned (after
        printing the reason) when the file is missing or cannot be rendered,
        so callers can simply test the result for truthiness.
    """
    # Check for the file up front so a missing path is reported clearly no
    # matter which exception type the PDF library would raise for it.
    if not os.path.isfile(pdf_path):
        print(f"Error: PDF file not found at {pdf_path}")
        return []

    images: List[Tuple[int, bytes]] = []
    try:
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                # Rasterise the page; only pass `dpi` when the caller asked for
                # a specific resolution so the default call is unchanged.
                pix = page.get_pixmap(dpi=dpi) if dpi else page.get_pixmap()
                # Serialise the pixmap as PNG bytes.
                images.append((page_num, pix.tobytes("png")))
    except Exception as e:
        # Report and return "nothing extracted" instead of calling exit():
        # a helper should not try to terminate the notebook kernel, and the
        # caller already handles an empty result.
        print(f"Error extracting images from PDF: {e}")
        return []
    return images

# --- Step 2: Encode image bytes to Base64 ---
def encode_image_to_base64(image_bytes: bytes) -> str:
    """
    Turn raw image bytes into Base64 text.

    Args:
        image_bytes: The raw bytes of the image.

    Returns:
        The Base64 representation of those bytes as a ``str``.
    """
    # b64encode yields ASCII-only bytes; convert them to a text string.
    b64_payload = base64.b64encode(image_bytes)
    return str(b64_payload, "utf-8")

# --- Step 3: Process each image with LangChain Groq API ---
def process_images_with_langchain_groq(images: List[Tuple[int, bytes]]) -> str:
    """
    Sends each page image to a Groq-hosted chat model and gathers the extracted text.

    Args:
        images: (page number, PNG bytes) tuples, one per page.

    Returns:
        One string holding the text of every successfully processed page, each
        preceded by a "Page N output:" marker. Pages whose request fails are
        reported and skipped. An empty string is returned when the model client
        cannot be created.
    """
    # A single client instance is shared by all pages.
    try:
        llm = ChatGroq(
            model="meta-llama/llama-4-scout-17b-16e-instruct",
            temperature=0.0,  # deterministic output suits extraction
            max_tokens=4096,  # upper bound on the text returned per page
        )
    except Exception as init_error:
        print(f"Error initializing ChatGroq model: {init_error}")
        print("Please check your GROQ_API_KEY and ensure the model name is correct and accessible.")
        return ""

    # The instruction text is identical for every page, so build it once.
    extraction_instructions = """
        You are an expert at extracting exam content from an image of a page.
        1. For each MULTIPLE-CHOICE QUESTION (MCQ):
          - Extract the question number and full text.
          - List **all** options exactly as they appear (A., B., C., D., or however many there are).
          - If an option is clearly circled or marked, append " (circled)" to that option.
          - Then, immediately after the options, add a line:
            Selected answer: <option letter>
          - If no option is marked, do **not** add a “Selected answer” line and do **not** guess.
          - Support any number of options (2, 3, 4, 5, etc.)—just list them all.

        2. For each SUBJECTIVE question:
          - Extract the entire question text verbatim, including tables or diagrams transcribed into plain text.

        3. For ALL OTHER text (instructions, headers, footers, etc.):
          - Extract exactly as it appears, preserving numbering and layout.

        Deliver the result as plain text, in the order you see it, with minimal formatting—just enough to keep questions and options clear.
        """

    page_sections = []
    for page_num, img_bytes in images:
        print(f"Processing page {page_num}...")

        # The image travels inline as a data URL: "data:image/png;base64,<base64_string>".
        data_url = "data:image/png;base64," + encode_image_to_base64(img_bytes)
        message = HumanMessage(
            content=[
                {"type": "image_url", "image_url": {"url": data_url}},
                {"type": "text", "text": extraction_instructions},
            ]
        )

        try:
            extracted_text = llm.invoke([message]).content
            print(f"Page {page_num} output:\n{extracted_text}")
            # Keep a page marker in front of each page's text for clarity.
            page_sections.append(f"\nPage {page_num} output:\n{extracted_text}")
        except Exception as page_error:
            # Best effort: report the failure and carry on with the next page.
            print(f"\nAn error occurred processing page {page_num}: {page_error}")
            print("Skipping this page.")

    return "".join(page_sections)

# --- Main execution ---
# Replace with the actual path to your PDF file
pdf_path = "/content/se_quiz.pdf" # Assuming this path is correct in your environment

# Render every page of the PDF to a PNG image
images = extract_images_from_pdf(pdf_path)

# Start from empty text so `exam_text` is always defined, even when nothing
# could be extracted. The structuring cells further down read `exam_text`.
exam_text = ""

if images:
    # Run each page image through the Groq model and collect the text
    exam_text = process_images_with_langchain_groq(images)

    print("\n--- Combined Extracted Text ---")
    print(exam_text)
else:
    print("No images extracted from the PDF.")

# `response` is the name the next cell prints; keep it as an alias of the same text.
response = exam_text



Processing page 1...
Page 1 output:
BSIT Fall 20  
Software Engineering  
Quiz 3  

1. Which one of the following is not an element of "Entity Relationship Diagram"?  
A. Entity  
B. Cardinality  
C. Data Store  
D. Modality  

Selected answer: C.

2. Which one of the following is not an element of "Data Flow Diagram"?  
A. Data Flow  
B. External Entity  
C. Data Store  
D. Relationship  

Selected answer: D.

3. Which one of the following is not an element of "State Transition Diagram"?  
A. State  
B. State Transition  
C. Entity  
D. Action  

Selected answer: C.

4. Which one of the following is not a type of "External Entity"?  
A. Users  
B. External System  
C. Data stores of the system under consideration  
D. None of the above  

Selected answer: C.

5. Which element of "Data Flow Diagram" contains unidirectional flow of data?  
A. Data Flow  
B. Data Store  

Selected answer: A.

6. The documentation of "State Transition Diagram" is known as:  
A. Process Specification  
B. 

In [32]:
# Re-print the combined page text held in `response` (assigned by an earlier cell).
print(response)


Page 1 output:
BSIT Fall 20  
Software Engineering  
Quiz 3  

1. Which one of the following is not an element of "Entity Relationship Diagram"?  
A. Entity  
B. Cardinality  
C. Data Store  
D. Modality  

Selected answer: C.

2. Which one of the following is not an element of "Data Flow Diagram"?  
A. Data Flow  
B. External Entity  
C. Data Store  
D. Relationship  

Selected answer: D.

3. Which one of the following is not an element of "State Transition Diagram"?  
A. State  
B. State Transition  
C. Entity  
D. Action  

Selected answer: C.

4. Which one of the following is not a type of "External Entity"?  
A. Users  
B. External System  
C. Data stores of the system under consideration  
D. None of the above  

Selected answer: C.

5. Which element of "Data Flow Diagram" contains unidirectional flow of data?  
A. Data Flow  
B. Data Store  

Selected answer: A.

6. The documentation of "State Transition Diagram" is known as:  
A. Process Specification  
B. Mini Specification  

**Produce structured output in JSON format**

**Generate structured JSON of the questions**

In [33]:
import os
import json
from typing import List, Dict, Optional, Union

# Pydantic v2
from pydantic import BaseModel, Field

# LangChain & Groq imports
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_groq import ChatGroq

# --- 1. Define your new structured models ---

class MCQ(BaseModel):
    # One multiple-choice question as it should appear in the structured output.
    # (Documented with comments rather than a class docstring so the generated
    # JSON schema stays exactly as it was.)

    # Required fields: no default is given, so a value must be supplied.
    question_no: Union[str, int] = Field(
        description="MCQ identifier (e.g. '4', 'Q5')",
    )
    statement: str = Field(
        description="The MCQ stem text",
    )
    options: Dict[str, str] = Field(
        description="Mapping from option letter to option text",
    )

    # Optional fields: fall back to None when the value is absent.
    student_selected_answer: Optional[str] = Field(
        default=None,
        description="The letter of the option the student marked (if any)",
    )
    marks: Optional[int] = Field(
        default=None,
        description="Marks awarded (if available)",
    )

class SubjectiveQuestion(BaseModel):
    # One free-text (subjective) question together with the student's answer.
    # (Documented with comments rather than a class docstring so the generated
    # JSON schema stays exactly as it was.)

    # The identifier may be missing, hence Optional with a None default.
    question_no: Optional[Union[str, int]] = Field(
        default=None,
        description="Subjective question identifier (may be omitted if not explicitly numbered)",
    )

    # Required fields: no default is given, so a value must be supplied.
    question_statement: str = Field(
        description="Full text of the question",
    )
    complete_answer: str = Field(
        description="Student’s answer verbatim",
    )

    marks: Optional[int] = Field(
        default=None,
        description="Marks awarded for this question",
    )

class ExamStructuredOutput(BaseModel):
    # Top-level container the output parser validates against: two lists, each
    # defaulting to a fresh empty list when the key is absent.
    # (Documented with comments rather than a class docstring so the generated
    # JSON schema stays exactly as it was.)
    mcqs: List[MCQ] = Field(
        description="List of all MCQs extracted (empty if none found)",
        default_factory=list,
    )
    subjective: List[SubjectiveQuestion] = Field(
        description="List of all subjective questions extracted (empty if none found)",
        default_factory=list,
    )

# --- 2. Initialize LLM & Parser ---

# Refuse to continue without credentials.
if not os.environ.get("GROQ_API_KEY"):
    print("Error: set GROQ_API_KEY in your environment")
    # Raise instead of calling exit(): inside a notebook kernel exit() only asks
    # the kernel to shut down and lets the rest of the cell keep running, so the
    # model would still be constructed below without a key.
    raise SystemExit(1)

# Chat model used to turn the raw page text into structured JSON.
llm = ChatGroq(
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    temperature=0.0,
    max_tokens=4096,
)

# Parser that validates the model's reply against ExamStructuredOutput and also
# supplies the format instructions embedded in the prompt.
parser = PydanticOutputParser(pydantic_object=ExamStructuredOutput)

# --- 3. Craft the Prompt Template ---

# Raw template text. `{format_instructions}` is filled in once below;
# `{text}` is supplied each time the chain is invoked.
EXAM_STRUCTURING_TEMPLATE = """
You are a precise exam‐content extractor.  Given the raw extracted text of an exam,
identify and structure **all** multiple‐choice questions and **all** subjective questions into JSON.

1️⃣ **MCQs**
For each MCQ found, output an object with:
- question_no: the question number or label (e.g. 4, Q5).
- statement: the full question text.
- options: a JSON map from each option letter (a, b, c, …) to its option text.
- student_selected_answer: the letter the student marked (only if clearly marked).
- marks: integer marks if present (otherwise null).

2️⃣ **Subjective Questions**
For each subjective question, output an object with:
- question_no: the question number or label.
- question_statement: the full question text.
- complete_answer: the student’s answer text verbatim.
- marks: integer marks if present (otherwise null).

**IMPORTANT:**
- Do not add or remove any field.
- Do not guess missing answers or marks.
- Output a single JSON object matching this exact schema:
{format_instructions}

Here is the exam content to structure:
---
{text}
---
"""

prompt_template = PromptTemplate.from_template(EXAM_STRUCTURING_TEMPLATE)

# Bind the parser's schema description now, leaving only `text` open.
prompt = prompt_template.partial(
    format_instructions=parser.get_format_instructions(),
)

# --- 4. Build and Invoke the Chain ---

# Pipe the filled-in prompt into the model and the model's reply into the parser.
chain = prompt | llm | parser

print("Extracting structured JSON...")
try:
    # `exam_text` is not assigned in this cell; it must already exist in the kernel.
    # Any failure in this block is reported by the handler below.
    result: ExamStructuredOutput = chain.invoke(dict(text=exam_text))
    print("\n--- JSON Output ---")
    print(result.model_dump_json(indent=2))
except Exception as exc:
    print("Extraction failed:", exc)


Extracting structured JSON...

--- JSON Output ---
{
  "mcqs": [
    {
      "question_no": "1",
      "statement": "Which one of the following is not an element of \"Entity Relationship Diagram\"?",
      "options": {
        "A": "Entity",
        "B": "Cardinality",
        "C": "Data Store",
        "D": "Modality"
      },
      "student_selected_answer": "C",
      "marks": null
    },
    {
      "question_no": "2",
      "statement": "Which one of the following is not an element of \"Data Flow Diagram\"?",
      "options": {
        "A": "Data Flow",
        "B": "External Entity",
        "C": "Data Store",
        "D": "Relationship"
      },
      "student_selected_answer": "D",
      "marks": null
    },
    {
      "question_no": "3",
      "statement": "Which one of the following is not an element of \"State Transition Diagram\"?",
      "options": {
        "A": "State",
        "B": "State Transition",
        "C": "Entity",
        "D": "Action"
      },
      "student

**Default marks: subjective questions (2), objective questions (1)**

In [34]:
import os
import json
from typing import List, Dict, Optional, Union

# Pydantic v2
from pydantic import BaseModel, Field

# LangChain & Groq imports
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_groq import ChatGroq

# --- 1. Define your new structured models with default marks ---

class MCQ(BaseModel):
    # One multiple-choice question; `marks` falls back to 1 when not supplied.
    # (Documented with comments rather than a class docstring so the generated
    # JSON schema stays exactly as it was.)

    # Required fields: no default is given, so a value must be supplied.
    question_no: Union[str, int] = Field(
        description="MCQ identifier (e.g. '4', 'Q5')",
    )
    statement: str = Field(
        description="The MCQ stem text",
    )
    options: Dict[str, str] = Field(
        description="Mapping from option letter to option text",
    )

    # Fields with defaults.
    student_selected_answer: Optional[str] = Field(
        default=None,
        description="The letter of the option the student marked (if any)",
    )
    marks: int = Field(
        default=1,
        description="Marks awarded (default is 1 if not otherwise specified)",
    )

class SubjectiveQuestion(BaseModel):
    # One free-text (subjective) question with the student's answer; `marks`
    # falls back to 2 when not supplied.
    # (Documented with comments rather than a class docstring so the generated
    # JSON schema stays exactly as it was.)

    # The identifier may be missing, hence Optional with a None default.
    question_no: Optional[Union[str, int]] = Field(
        default=None,
        description="Subjective question identifier (may be omitted if not explicitly numbered)",
    )

    # Required fields: no default is given, so a value must be supplied.
    question_statement: str = Field(
        description="Full text of the question",
    )
    complete_answer: str = Field(
        description="Student’s answer verbatim",
    )

    marks: int = Field(
        default=2,
        description="Marks awarded (default is 2 if not otherwise specified)",
    )

class ExamStructuredOutput(BaseModel):
    # Top-level container the output parser validates against: two lists, each
    # defaulting to a fresh empty list when the key is absent.
    # (Documented with comments rather than a class docstring so the generated
    # JSON schema stays exactly as it was.)
    mcqs: List[MCQ] = Field(
        description="List of all MCQs extracted (empty if none found)",
        default_factory=list,
    )
    subjective: List[SubjectiveQuestion] = Field(
        description="List of all subjective questions extracted (empty if none found)",
        default_factory=list,
    )

# --- 2. Initialize LLM & Parser ---

# Refuse to continue without credentials.
if not os.environ.get("GROQ_API_KEY"):
    print("Error: set GROQ_API_KEY in your environment")
    # Raise instead of calling exit(): inside a notebook kernel exit() only asks
    # the kernel to shut down and lets the rest of the cell keep running, so the
    # model would still be constructed below without a key.
    raise SystemExit(1)

# Chat model used to turn the raw page text into structured JSON.
llm = ChatGroq(
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    temperature=0.0,
    max_tokens=4096,
)

# Parser that validates the model's reply against ExamStructuredOutput and also
# supplies the format instructions embedded in the prompt.
parser = PydanticOutputParser(pydantic_object=ExamStructuredOutput)

# --- 3. Craft the Prompt Template ---

# Prompt for the structuring step. `{format_instructions}` is bound once below;
# `{text}` is supplied on every invocation.
#
# The final rules used to say "Do not guess missing answers or marks", which
# contradicted the per-field instruction to fall back to default marks. Because
# `marks` is a non-nullable int in both models, a reply containing
# `"marks": null` fails validation, so the rule now states the default policy
# explicitly.
prompt_template = PromptTemplate.from_template(
    """
You are a precise exam‐content extractor.  Given the raw extracted text of an exam,
identify and structure **all** multiple‐choice questions and **all** subjective questions into JSON.

1️⃣ **MCQs**
For each MCQ found, output an object with:
- question_no: the question number or label (e.g. 4, Q5).
- statement: the full question text.
- options: a JSON map from each option letter (a, b, c, …) to its option text.
- student_selected_answer: the letter the student marked (only if clearly marked).
- marks: integer marks (default is 1 if not specified).

2️⃣ **Subjective Questions**
For each subjective question, output an object with:
- question_no: the question number or label.
- question_statement: the full question text.
- complete_answer: the student’s answer text verbatim.
- marks: integer marks (default is 2 if not specified).

**IMPORTANT:**
- Do not add or remove any field.
- Do not guess missing answers. When marks are not stated, use the default given above; never output null for marks.
- Output a single JSON object matching this exact schema:
{format_instructions}

Here is the exam content to structure:
---
{text}
---
"""
)

# Bind the parser's schema description now, leaving only `text` open.
prompt = prompt_template.partial(format_instructions=parser.get_format_instructions())

# --- 4. Build and Invoke the Chain ---

# Pipe the filled-in prompt into the model and the model's reply into the parser.
chain = prompt | llm | parser

print("Extracting structured JSON...")
try:
    # `exam_text` is not assigned in this cell; it must already exist in the kernel.
    # Any failure in this block is reported by the handler below.
    result: ExamStructuredOutput = chain.invoke(dict(text=exam_text))
    print("\n--- JSON Output ---")
    print(result.model_dump_json(indent=2))
except Exception as exc:
    print("Extraction failed:", exc)


Extracting structured JSON...

--- JSON Output ---
{
  "mcqs": [
    {
      "question_no": "1",
      "statement": "Which one of the following is not an element of \"Entity Relationship Diagram\"?",
      "options": {
        "A": "Entity",
        "B": "Cardinality",
        "C": "Data Store",
        "D": "Modality"
      },
      "student_selected_answer": "C",
      "marks": 1
    },
    {
      "question_no": "2",
      "statement": "Which one of the following is not an element of \"Data Flow Diagram\"?",
      "options": {
        "A": "Data Flow",
        "B": "External Entity",
        "C": "Data Store",
        "D": "Relationship"
      },
      "student_selected_answer": "D",
      "marks": 1
    },
    {
      "question_no": "3",
      "statement": "Which one of the following is not an element of \"State Transition Diagram\"?",
      "options": {
        "A": "State",
        "B": "State Transition",
        "C": "Entity",
        "D": "Action"
      },
      "student_selec