In [None]:
!pip install groq

In [None]:
#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

In [None]:
!pip install --force-reinstall pymupdf[all]
!pip install langchain-groq
#!pip install groq
#!pip install  fitz # we dont need to install the fitz, we just need to install the pymupdf

In [None]:
import os
import fitz # PyMuPDF for PDF processing
import base64
from typing import List, Tuple

# Import LangChain components
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage # Used to structure the user message with multimodal content

# Set your Groq API key using an environment variable
# It's highly recommended to set this outside your script for security:
# export GROQ_API_KEY='your_groq_api_key_here'
# If not set externally, you can uncomment the line below and replace with your key,
# but be cautious about committing your key to version control.
# os.environ["GROQ_API_KEY"] = "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" # Replace with your actual key

# --- Ensure API key is set ---
os.environ["GROQ_API_KEY"]= "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"
if not os.environ.get("GROQ_API_KEY"):
    print("Error: GROQ_API_KEY environment variable not set.")
    print("Please set your Groq API key before running the script.")
    # In a real application, you might want to exit or handle this differently
    exit() # Exit the script if the key is not set

# --- Step 1: Extract images from PDF using PyMuPDF (fitz) ---
def extract_images_from_pdf(pdf_path: str) -> List[Tuple[int, bytes]]:
    """
    Extracts images from each page of a PDF.

    Args:
        pdf_path: The path to the PDF file.

    Returns:
        A list of tuples, where each tuple contains the page number (1-based)
        and the image bytes in PNG format.
    """
    images = []
    try:
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                # Get a high-resolution pixmap (image representation) of the page
                pix = page.get_pixmap()
                # Convert the pixmap to bytes in PNG format
                img_bytes = pix.tobytes("png")
                images.append((page_num, img_bytes))
    except FileNotFoundError:
        print(f"Error: PDF file not found at {pdf_path}")
        exit()
    except Exception as e:
        print(f"Error extracting images from PDF: {e}")
        exit()
    return images

# --- Step 2: Encode image bytes to Base64 ---
def encode_image_to_base64(image_bytes: bytes) -> str:
    """
    Encodes image bytes to a Base64 string.

    Args:
        image_bytes: The raw bytes of the image.

    Returns:
        The Base64 encoded string.
    """
    return base64.b64encode(image_bytes).decode('utf-8')

# --- Step 3: Process each image with LangChain Groq API ---
def process_images_with_langchain_groq(images: List[Tuple[int, bytes]]) -> str:
    """
    Processes each extracted image using LangChain's ChatGroq for text extraction.

    Args:
        images: A list of tuples containing page number and image bytes.

    Returns:
        A single string containing the combined extracted text from all pages.
    """
    # Initialize the ChatGroq model
    # We initialize it here to reuse the same instance for all pages
    try:
        llm = ChatGroq(
            model="meta-llama/llama-4-scout-17b-16e-instruct", # Specify the model
            temperature=0.0,                                   # Lower temperature for extraction
            max_tokens=4096                                  # Ensure enough tokens for page text
        )
    except Exception as e:
        print(f"Error initializing ChatGroq model: {e}")
        print("Please check your GROQ_API_KEY and ensure the model name is correct and accessible.")
        return "" # Return empty string or handle error as needed

    combined_text = ""

    for page_num, img_bytes in images:
        print(f"Processing page {page_num}...")
        encoded_image = encode_image_to_base64(img_bytes)

        # Prepare the multimodal content for the user message using HumanMessage
        # The image URL is in the format "data:image/png;base64,<base64_string>"
        multimodal_content = [
            {
                "type": "text",
                "text": f"Extract all text from this page {page_num}. Provide the raw text content only, without any added commentary or formatting."
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{encoded_image}"
                }
            }
        ]

        # Create a HumanMessage
        message = HumanMessage(content=multimodal_content)

        # Invoke the ChatGroq model
        try:
            response = llm.invoke([message])

            # The response object contains the LLM's output text
            extracted_text = response.content

            print(f"Page {page_num} output:\n{extracted_text}")

            # Append the extracted text for this page to the combined text
            combined_text += f"\nPage {page_num} output:\n{extracted_text}" # Include page marker for clarity

        except Exception as e:
            print(f"\nAn error occurred processing page {page_num}: {e}")
            print("Skipping this page.")
            # Continue to the next page or handle error as appropriate

    return combined_text

# --- Main execution ---
# Replace with the actual path to your PDF file
pdf_path = "DLD Mids Paper DSF23.pdf" # Assuming this path is correct in your environment

# Extract images from the PDF
images = extract_images_from_pdf(pdf_path)

if images:
    # Process images with LangChain Groq
    response = process_images_with_langchain_groq(images)

    # The 'response' variable now holds the combined text from all pages
    print("\n--- Combined Extracted Text ---")
    print(response)
else:
    print("No images extracted from the PDF.")



Processing page 1...
Page 1 output:
FACULTY OF COMPUTING AND INFORMATION TECHNOLOGY
University of the Punjab

                                                                                      Sheet No.: 83
                                                                                      Invigilator Sign:
                                                                                      Date: 04-18-2024

Digital Logic Design
Mid-Term
BSDS & BSCS

Student ID:                    BSDSF23M023                  Student Name:                    Muhammad Zohaib
Session:                       F23                           Student Signature:          Zohaib

Instructor: Tariq Butt
Maximum Time: 90 Minutes                                           Maximum Marks: 50
Instructions:
• Read questions carefully before attempting
• Attempt all questions on the answer sheet
• Paper has 10 pages, including a cover sheet
• If there is any ambiguity in the paper, the benefit will be given to the s

In [None]:
import os
import json
from typing import List, Optional, Union

# Import necessary Pydantic components directly from pydantic
from pydantic import BaseModel, Field

# Import LangChain components
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_groq import ChatGroq

# --- 1. Define Simplified Structured Data Models using Pydantic v2 ---

class ExtractedQuestion(BaseModel):
    """Represents a single question and its answer from the exam."""
    question_no: Union[str, int] = Field(description="The unique identifier for the question as it appears in the text (e.g., '1', 'Problem 7', 'Part (a)'). If a number is repeated for a different problem, create a unique descriptive identifier.")
    question_statement: str = Field(description="The complete, verbatim text of the question. Include any introductory phrases or instructions directly before the student's answer.")
    complete_answer: str = Field(description="The complete, verbatim content representing ONLY the student's answer. Exclude all extraneous text.")

class ExamExtraction(BaseModel):
    """Represents the extracted list of questions and answers."""
    extracted_questions: List[ExtractedQuestion] = Field(description="A list of all extracted questions and their answers.")

# --- 2. Prepare the Exam Text ---
# Paste the combined text from the previous interaction here.
# Ensure the full text is included.
exam_text = response


# --- 3. Initialize the Language Model and Output Parser ---
# Set your Groq API key as an environment variable
# Make sure to replace "YOUR_GROQ_API_KEY" with your actual key
# It's highly recommended to set this outside the script as an environment variable
# for security, e.g., using `export GROQ_API_KEY='your_key_here'` in your terminal
if not os.environ.get("GROQ_API_KEY"):
    # If not set externally, try setting it here (replace with your key)
    # os.environ["GROQ_API_KEY"] = "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" # Replace with your actual key

    # Add a check and exit if the key is still not set
    if not os.environ.get("GROQ_API_KEY"):
         print("Error: GROQ_API_KEY environment variable not set.")
         print("Please set your Groq API key before running the script.")
         exit()


# Initialize the Groq model
try:
    llm = ChatGroq(
        model="meta-llama/llama-4-scout-17b-16e-instruct", # Using the model you specified
        temperature=0.0, # Keep temperature very low for minimal creativity
        max_tokens=4096 # Ensure enough tokens for processing and output
    )
except Exception as e:
    print(f"Error initializing Groq model: {e}")
    print("Please check your GROQ_API_KEY and ensure the model name is correct and accessible.")
    exit()


# Initialize the Pydantic output parser with the top-level model (ExamExtraction)
parser = PydanticOutputParser(pydantic_object=ExamExtraction)

# --- 4. Craft the Prompt Template ---
# The prompt tells the LLM what to do and how to format the output
prompt_template = PromptTemplate.from_template(
    """You are an extremely strict text extraction bot. Your ONLY goal is to extract specific pieces of text from an exam document.
Analyze the following exam content meticulously. Identify each distinct question or problem statement.
For each question, extract its exact identifier, the exact text of the question, and the exact block of text that constitutes the student's answer.

You ABSOLUTELY MUST NOT add any extra text, commentary, summarization, or interpretation to the extracted content.
You MUST only include text that was present in the original document within the designated question or answer fields.

Format the extracted information as a single JSON object. The JSON structure MUST strictly
adhere to the following format instructions derived from the Pydantic schema:
{format_instructions}

Guidelines for Extraction:
- Identify each distinct question or problem presented for the student. Look for explicit numbers (e.g., 1., 2., 3.), keywords like "Question" or "Problem", or clear problem statements followed by a solution.
- Treat distinct parts of a larger problem (like "Part (a)" and "Part (b)") as separate entries if they have their own identifier and associated answer text block.
- For each identified question/problem, extract its unique identifier (number, title, or part) into 'question_no'. If a simple number is repeated for a different problem (e.g., '3' appears for two different problems), create a unique descriptive identifier (e.g., "Question 3 (Excess-3)", "Problem 7 (Multiples of 3 Circuit)", "Problem Statement (Nuclear Reactor)").
- Extract the *complete, verbatim text* of the question or problem statement into 'question_statement'. Capture *all* text belonging to the prompt itself, starting from where the question/problem is introduced.
- **EXTREMELY IMPORTANT:** Extract the *complete, verbatim block of text* from the document that represents the student's answer or solution for this question/problem into 'complete_answer'.
    - **ONLY INCLUDE TEXT FROM THE STUDENT'S ANSWER SECTION.**
    - **DO NOT INCLUDE:**
        - Any text describing the image content (e.g., "The image presents...", "The text extracted per page is:").
        - Any text introducing solution steps if those introductions are not part of the student's original formatting (e.g., ignore "## Step 1:", "Step 1:", unless the student used that format in their work, but capture the content *under* it). Focus on the content, not auto-generated headings.
        - Any concluding remarks or phrases added by a processing system (e.g., "The final answer is:", "Conclusion:") unless the content immediately following is clearly the student's work.
        - Any summarization or rephrasing by you (the LLM).
        - Any text that is clearly part of a *different* question or unrelated section.
    - Capture the entire contiguous block of content that constitutes the student's answer associated with the identified question.

- Ensure the output is *only* the JSON object. Do not include any conversational text, explanations, markdown comments, or notes whatsoever outside the JSON block.

Exam Content:
---
{text}
---

JSON Output:
"""
)

# --- 5. Create the Processing Chain ---
# Bind the parser's format instructions to the prompt template
prompt = prompt_template.partial(
    format_instructions=parser.get_format_instructions()
)

# Create the chain: Prompt -> LLM -> Parser
chain = prompt | llm | parser

# --- 6. Process the Exam Text and Output JSON ---
print("Invoking LLM chain to extract data...")
try:
    # Invoke the chain with the exam text
    structured_output: ExamExtraction = chain.invoke({"text": exam_text})
    print("Extraction complete.")

    # Convert the Pydantic object to a JSON string using the v2 method
    json_output = structured_output.model_dump_json(indent=4)

    # Print the final JSON output
    print("\n--- Extracted JSON Data ---")
    print(json_output)

except Exception as e:
    print(f"\nAn error occurred during the extraction process: {e}")
    print("Possible reasons: LLM failed to produce valid JSON, API key issue, or prompt ambiguity.")
    print("Please review the error message, the exam text, and the prompt/Pydantic model.")
    # Optional: Print the raw LLM output if available for debugging.
    # This can help understand why parsing failed if the LLM produced invalid JSON.
    # try:
    #     print("\nRaw LLM output (for debugging):")
    #     # Accessing LLM response might vary slightly based on LangChain version/type
    #     # This is an attempt, you might need to adjust based on the exact error object
    #     if hasattr(e, '__cause__') and hasattr(e.__cause__, 'response'):
    #          print(e.__cause__.response.content)
    #     elif hasattr(e, 'response'): # Some base exception might have it
    #          print(e.response.content)
    #     else:
    #          print("Raw LLM output not easily accessible from error object.")
    # except Exception as debug_e:
    #     print(f"Could not print raw LLM output: {debug_e}")

Invoking LLM chain to extract data...

An error occurred during the extraction process: Invalid json output: ```
{
  "extracted_questions": [
    {
      "question_no": "1",
      "question_statement": "Convert2408.4 from hexadecimal to decimal number system.\n(2408.4)₁₆ → (A)₁₀\n\n(2408.4)₁₆ = (2 ×16³) + (4 ×16²) + (0 ×16¹) + (8 ×16⁰) + (4 ×16⁻¹)\n\n= (2 ×4096) + (4 ×256) + (0) + (8) + (4/16)\n\n=8192 +1024 +8 +0.25\n\n= (9227.25)₁₀",
      "complete_answer": "(2408.4)₁₆ = (2 ×16³) + (4 ×16²) + (0 ×16¹) + (8 ×16⁰) + (4 ×16⁻¹)\n\n= (2 ×4096) + (4 ×256) + (0) + (8) + (4/16)\n\n=8192 +1024 +8 +0.25\n\n= (9227.25)₁₀"
    },
    {
      "question_no": "2",
      "question_statement": "Convert1024.25 from decimal to binary number system.\n(1024.25)₁₀ → (A)₂\n\n(1024.25)₁₀ = (10000000000.01)₂\n\n1024512256128643216 8 4 2 1\n1 0 0 0 0 0 0 0 0 0 \n\n0.25 ×2 =0.5\n0.5 ×2 =1.0\n0.0 ×2 =0.0",
      "complete_answer": "(1024.25)₁₀ = (10000000000.01)₂\n\n1024 512 256 128 64 32 16 8 4 2 1\n1 0 0 0 0

In [39]:
import json
try:
    python_data = json.loads(json_output)

    # Now 'python_data' is a standard Python dictionary or list
    # You can access its elements like any other Python data structure

    print("\n--- Using the Python Dictionary/List ---")
    print(f"Type of python_data: {type(python_data)}") # Should be <class 'dict'>

    # Example of accessing the list of questions
    if isinstance(python_data, dict) and 'extracted_questions' in python_data:
        questions_list = python_data['extracted_questions']
        print(f"Number of extracted questions: {len(questions_list)}")

        if questions_list:
            first_question = questions_list[0]
            print(f"\nDetails of the first extracted question:")
            print(f"Question No: {first_question.get('question_no')}")
            print(f"Statement (first 50 chars): {first_question.get('question_statement', '')[:50]}...")
            print(f"Answer (first 50 chars): {first_question.get('complete_answer', '')[:50]}...")

except json.JSONDecodeError as e:
    print(f"Error decoding JSON string into Python object: {e}")
    print("The string might not be valid JSON.")
except Exception as e:
    print(f"An unexpected error occurred while processing the Python data: {e}")




--- Using the Python Dictionary/List ---
Type of python_data: <class 'dict'>
Number of extracted questions: 13

Details of the first extracted question:
Question No: 1
Statement (first 50 chars): Convert 2408.4 from hexadecimal to decimal number ...
Answer (first 50 chars): $(2408.4)_{16} = (2	imes16^3) + (4	imes16^2) + (0	...


**Without Schema**

In [1]:
print(type(python_data))
print(python_data)

In [None]:
#import langchain
#print(langchain.__version__)
#print(response)