In [25]:
!pip install pdfplumber pymupdf pillow transformers pytesseract -q

In [26]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import fitz # PyMuPDF
import pdfplumber
import os
import json
from PIL import Image
import re
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Root folder for every artefact this notebook writes.
OUTPUT_DIR = "output"
# Extracted page images and JSON dumps are kept in separate subfolders.
IMAGES_DIR, JSON_DIR = (os.path.join(OUTPUT_DIR, sub) for sub in ("images", "json"))

# Create the folders up front so later cells can write without checking.
os.makedirs(JSON_DIR, exist_ok=True)
os.makedirs(IMAGES_DIR, exist_ok=True)

# Sample paper that the pipeline reads.
pdf_path = "/content/IMO class 1 Maths Olympiad Sample Paper 1 for the year 2024-25.pdf"


In [27]:

def extract_pdf_content(pdf_path_input):
    """Extract per-page text and embedded images from a PDF.

    Text is read with pdfplumber; embedded images are pulled with PyMuPDF
    (``fitz``) and written under ``IMAGES_DIR`` as
    ``page<N>_image<M>.<ext>``.

    Args:
        pdf_path_input: Path of the PDF file to read.

    Returns:
        A list with one dict per page. Each dict has ``page_number``
        (1-based), ``text`` (``""`` when pdfplumber yields nothing) and
        ``images`` (paths of the image files saved for that page).
        An empty list is returned when the PDF cannot be opened or when
        page processing fails.
    """
    text_data = []
    try:
        doc = fitz.open(pdf_path_input)
    except Exception as e:
        print(f"Error opening PDF with fitz: {e}")
        return []

    try:
        with pdfplumber.open(pdf_path_input) as pdf:
            for i, page in enumerate(pdf.pages):
                page_content = {
                    "page_number": i + 1,
                    # Extract once; the result is reused instead of parsing
                    # the page a second time just to test for emptiness.
                    "text": page.extract_text() or "",
                    "images": []
                }

                fitz_page = doc.load_page(i)
                image_list = fitz_page.get_images(full=True)

                for img_index, img in enumerate(image_list):
                    xref = img[0]
                    # A single unreadable image must not discard the text and
                    # images already collected, so failures are handled per image.
                    try:
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        image_ext = base_image["ext"]
                    except Exception as e:
                        print(f"Error extracting image {img_index + 1} on page {i + 1}: {e}")
                        continue

                    # Fall back to .png for missing or unexpected extensions.
                    if not image_ext or image_ext.lower() not in ['png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff']:
                        image_ext = 'png'
                    image_filename = os.path.join(IMAGES_DIR, f"page{i+1}_image{img_index+1}.{image_ext}")

                    try:
                        with open(image_filename, "wb") as img_file:
                            img_file.write(image_bytes)
                        page_content["images"].append(image_filename)
                    except Exception as e:
                        print(f"Error saving image {image_filename}: {e}")

                text_data.append(page_content)
    except Exception as e:
        print(f"Error processing PDF with pdfplumber: {e}")
        return []
    finally:
        # Always release the PyMuPDF document, on success and on failure.
        doc.close()
    return text_data


In [28]:

# OCR is optional: default to None and only rebind the name when the import
# succeeds, so downstream code can test `pytesseract is None`.
pytesseract = None
try:
    import pytesseract
except ImportError:
    print("Please install pytesseract and Tesseract OCR engine.")

def ocr_image(image_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text returned by ``pytesseract.image_to_string`` with surrounding
        whitespace stripped. Returns ``""`` when pytesseract is unavailable
        or when opening/recognising the image fails (the error is printed).
    """
    if pytesseract is None:
        return ""
    try:
        # The context manager closes the underlying file handle even when
        # OCR raises; otherwise one handle would leak per processed image.
        with Image.open(image_path) as img:
            text = pytesseract.image_to_string(img)
        return text.strip()
    except Exception as e:
        print(f"Error performing OCR on {image_path}: {e}")
        return ""


In [29]:

# Checkpoint name shared by the seq2seq model and its tokenizer.
MODEL_NAME = "t5-small"
# The two loads are independent; both are kept at module level so the
# generation helpers can reuse them without reloading.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def generate_question_from_text(context_text, max_length=64, num_return_sequences=1):
    """Generate question strings from a piece of context using beam search.

    The context is prefixed with ``"generate question: "``, tokenised
    (truncated to 512 tokens) and passed to the module-level ``model``.

    Args:
        context_text: Source text to generate from. ``None``, empty or
            whitespace-only input produces no questions.
        max_length: Maximum length passed to ``model.generate``.
        num_return_sequences: Number of sequences to return.

    Returns:
        A list of decoded strings, one per returned sequence; ``[]`` when
        there is no usable context.
    """
    if not context_text or not context_text.strip():
        return []
    input_text = f"generate question: {context_text}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        # Beam search cannot return more sequences than it has beams, so
        # widen the beam beyond the default of 4 when more are requested.
        num_beams=max(4, num_return_sequences),
        early_stopping=True,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2
    )
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]


In [30]:

def generate_questions_from_pdf_content(pdf_content):
    """Generate questions for every page of extracted PDF content.

    For each page, one batch of questions is generated from the page text
    (when it has more than 50 non-padding characters) and one batch from the
    OCR text of each image (when that text has more than 10 characters).

    Args:
        pdf_content: Iterable of page dicts with the keys ``text``,
            ``images`` (image file paths) and ``page_number``.

    Returns:
        A list of question dicts numbered consecutively from 1, each with
        ``question_number``, ``source_page``, ``type`` (``"Text-based"`` or
        ``"OCR-based"``), ``question_text``, ``context_used`` (a prefix of
        the source text followed by ``"..."``) and ``image_references``.
    """
    all_generated_questions = []
    q_num_counter = 1

    for page_data in pdf_content:
        page_text = page_data["text"]
        page_images = page_data["images"]
        page_number = page_data["page_number"]

        # OCR is the expensive step, so run it exactly once per image and
        # reuse the result below.
        ocr_results = [(img_path, ocr_image(img_path)) for img_path in page_images]

        if len(page_text.strip()) > 50:
            questions = generate_question_from_text(page_text, num_return_sequences=1)
            for q_text in questions:
                all_generated_questions.append({
                    "question_number": q_num_counter,
                    "source_page": page_number,
                    "type": "Text-based",
                    "question_text": q_text,
                    "context_used": page_text[:200] + "...",
                    "image_references": page_images
                })
                q_num_counter += 1

        for img_path, ocr_text in ocr_results:
            if ocr_text and len(ocr_text.strip()) > 10:
                questions = generate_question_from_text(ocr_text, num_return_sequences=1)
                for q_text in questions:
                    all_generated_questions.append({
                        "question_number": q_num_counter,
                        "source_page": page_number,
                        "type": "OCR-based",
                        "question_text": q_text,
                        "context_used": ocr_text[:100] + "...",
                        "image_references": [img_path]
                    })
                    q_num_counter += 1

    return all_generated_questions


In [None]:

# End-to-end run: extract the PDF, persist the raw extraction, generate
# questions, persist them, then print a readable summary.
print("--- Running Question Generator Pipeline ---")
sample_paper_content = extract_pdf_content(pdf_path)

if sample_paper_content:
    # Keep the raw extraction on disk so it can be inspected independently.
    content_json_path = os.path.join(JSON_DIR, "extracted_sample_paper_content.json")
    with open(content_json_path, "w", encoding="utf-8") as content_file:
        json.dump(sample_paper_content, content_file, indent=2, ensure_ascii=False)

    generated_qs = generate_questions_from_pdf_content(sample_paper_content)
    questions_json_path = os.path.join(JSON_DIR, "ai_generated_questions.json")
    with open(questions_json_path, "w", encoding="utf-8") as questions_file:
        json.dump(generated_qs, questions_file, indent=2, ensure_ascii=False)

    print(f"Generated {len(generated_qs)} questions")
    separator = '-' * 40
    for question in generated_qs:
        print(f"Q{question['question_number']} ({question['type']}): {question['question_text']}")
        print(f"  Page {question['source_page']} | Context: {question['context_used']}")
        image_refs = question.get('image_references')
        if image_refs:
            # Show bare file names only; the full paths are in the JSON dump.
            print(f"  Images: {list(map(os.path.basename, image_refs))}")
        print(separator)
else:
    print("No content found. Please check the PDF path.")


--- Running Question Generator Pipeline ---
Generated 33 questions
Q1 (Text-based): False
  Page 1 | Context: CLASS 1 SAMPLE PAPER 1
SECTION-01 LOGICAL REASONING
1. Find the next figures in the figure pattern given below.
[A]
[B]
[C]
[D]
Ans. [D]
2. Complete the number pattern.
[A] [B]
1...
  Images: ['page1_image1.jpeg', 'page1_image2.png', 'page1_image3.png', 'page1_image4.png', 'page1_image5.png', 'page1_image6.png', 'page1_image7.png', 'page1_image8.png', 'page1_image9.png', 'page1_image10.png']
----------------------------------------
Q2 (OCR-based): E-mail
  Page 1 | Context: Vedanti,

LIVE ONLINE TUTORING.

 

www.vedantu.com...
  Images: ['page1_image1.jpeg']
----------------------------------------
Q3 (OCR-based): CSI AD a) 2 cs C
  Page 1 | Context: “AY {YY

(SI
ep
CSI
AD

a) 2
cs

C
GQ
C
eee,...
  Images: ['page1_image8.png']
----------------------------------------
Q4 (Text-based): [A] 1 [B] 2 [C] 3 [D] 4 2 2
  Page 2 | Context: [C] [D]
Ans [C]
3. In the questions given bel