In [1]:
!pip install pypdf2 transformers

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf2
Successfully installed pypdf2-3.0.1


In [3]:
import os
import json
from datetime import datetime
from PyPDF2 import PdfReader
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Pages are read in document order and joined with a newline, so the last
    word of one page does not fuse with the first word of the next. A page
    whose ``extract_text()`` result is empty or ``None`` is skipped rather
    than breaking the concatenation.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``PdfReader``.

    Returns:
        str: The combined page text, or ``""`` when no page yields any text.
    """
    reader = PdfReader(pdf_path)
    page_texts = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:  # ignore pages that produced no text at all
            page_texts.append(page_text)
    # A single join avoids rebuilding an ever-growing string on every page.
    return "\n".join(page_texts)

# Function to extract main topics using T5
def extract_topics_with_t5(text, model, tokenizer, num_topics=5):
    """Ask a T5 model for the main topics of ``text`` and return them as a list.

    Only the first 512 characters of ``text`` are placed in the prompt, and
    the encoded prompt is truncated to 512 tokens.

    The decoded reply is turned into a list as follows:

    * it is split on newlines first;
    * if that leaves a single line, the line is split on commas and
      semicolons, because the whole answer may come back on one line;
    * duplicates (compared case-insensitively) are dropped, keeping the
      first occurrence;
    * at most ``num_topics`` entries are returned, so the result honours the
      count requested in the prompt.

    Args:
        text: Source text to analyse. Blank or ``None`` input returns ``[]``
            without calling the model.
        model: Object exposing ``generate(inputs, ...)``.
        tokenizer: Object exposing ``encode(...)`` and ``decode(...)``.
        num_topics: Number of topics requested and the upper bound on the
            length of the returned list.

    Returns:
        list[str]: Stripped, de-duplicated topic strings (possibly empty).
    """
    import re  # local import keeps this block self-contained

    if not text or not text.strip():
        # Nothing to analyse; prompting with an empty context is pointless.
        return []

    prompt = f"Extract {num_topics} main topics from the text:\n{text[:512]}"
    inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs, max_length=150, num_beams=4, early_stopping=True)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    candidates = [line.strip() for line in decoded.split("\n") if line.strip()]
    if len(candidates) == 1:
        # One-line reply: fall back to the usual inline list separators.
        candidates = [part.strip() for part in re.split(r"[;,]", candidates[0]) if part.strip()]

    topics = []
    seen = set()
    for candidate in candidates:
        key = candidate.lower()
        if key not in seen:
            seen.add(key)
            topics.append(candidate)

    # max(..., 0) stops a negative num_topics from slicing from the end.
    return topics[:max(num_topics, 0)]

# Function to generate MCQs for a topic
def generate_mcqs_with_t5(topic, context, model, tokenizer, num_questions=5):
    """Produce raw multiple-choice question text for one topic.

    The prompt names the topic, embeds the first 512 characters of
    ``context`` and asks for four lettered options plus the correct answer.
    The prompt is encoded (truncated to 512 tokens), passed to
    ``model.generate`` with 4-beam search and a 512-token output limit, and
    the first generated sequence is decoded.

    Args:
        topic: Topic the questions should cover.
        context: Source text; only its first 512 characters are used.
        model: Object exposing ``generate(inputs, ...)``.
        tokenizer: Object exposing ``encode(...)`` and ``decode(...)``.
        num_questions: Number of questions requested in the prompt.

    Returns:
        The decoded model output, unparsed.
    """
    context_excerpt = context[:512]

    request_part = f"Generate {num_questions} multiple-choice questions based on the topic '{topic}' "
    context_part = f"from the following context:\n{context_excerpt}.\n"
    format_part = "Provide each question with 4 options (A, B, C, D), and specify the correct answer. "
    full_prompt = request_part + context_part + format_part

    token_ids = tokenizer.encode(full_prompt, return_tensors="pt", max_length=512, truncation=True)
    generated = model.generate(token_ids, max_length=512, num_beams=4, early_stopping=True)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Main function
def _parse_mcq_block(block_text):
    """Split one blank-line-delimited MCQ chunk into question, options and answer.

    Blank lines are discarded before positions are assigned, so a chunk that
    starts with a stray newline no longer yields an empty question with
    shifted options. Returns ``None`` when the chunk has no content.
    """
    lines = [line.strip() for line in block_text.split("\n") if line.strip()]
    if not lines:
        return None
    return {
        "question": lines[0],
        "options": lines[1:5],
        "correct_answer": lines[5] if len(lines) > 5 else None,
    }


def process_book_with_t5(pdf_path, model_name="t5-base",
                         book_title="Project Management Professional Guide",
                         output_dir="output"):
    """Extract topics from a PDF and generate MCQs for each, saved as JSON.

    Steps:
      1. Read the PDF text. This happens before the model is loaded so an
         unreadable path fails before any model weights are fetched.
      2. Load the T5 model and tokenizer named by ``model_name``.
      3. Extract the main topics and write ``topics.json``.
      4. Generate MCQ text per topic, parse it into question records and
         write ``questions.json``.

    The generated text is split on blank lines into chunks. Within a chunk,
    the first non-blank line is the question, the next four are the options
    and the sixth (if present) is the correct answer. Question ids have the
    form ``Q<topic index>_<question index>``; the question index counts only
    the chunks that actually produced a record.

    Args:
        pdf_path: Path of the PDF to process.
        model_name: Name passed to ``from_pretrained`` for both the model and
            the tokenizer; also recorded as ``tool_used``.
        book_title: Title stored in both JSON files. Defaults to the value
            that used to be hard-coded.
        output_dir: Directory for the JSON files (created if missing).
            Defaults to the previously hard-coded ``"output"``.

    Returns:
        None. Results are written to ``<output_dir>/topics.json`` and
        ``<output_dir>/questions.json`` and a confirmation line is printed.
    """
    # Extract text from PDF first: cheap, and fails fast on a bad path.
    book_text = extract_text_from_pdf(pdf_path)

    # Load T5 model and tokenizer
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=True)

    # Extract main topics
    main_topics = extract_topics_with_t5(book_text, model, tokenizer)

    os.makedirs(output_dir, exist_ok=True)

    # Save topics to JSON
    topics_data = {
        "book_title": book_title,
        "total_topics": len(main_topics),
        "extraction_timestamp": datetime.now().isoformat(),
        "main_topics": main_topics,
    }
    with open(os.path.join(output_dir, "topics.json"), "w", encoding="utf-8") as f:
        json.dump(topics_data, f, indent=4)

    # Generate MCQs
    questions = []
    for topic_idx, topic in enumerate(main_topics, start=1):
        raw_mcqs = generate_mcqs_with_t5(topic, book_text, model, tokenizer)
        question_idx = 0
        for chunk in raw_mcqs.split("\n\n"):
            parsed = _parse_mcq_block(chunk)
            if parsed is None:  # Skip empty entries
                continue
            question_idx += 1
            questions.append({
                "id": f"Q{topic_idx}_{question_idx}",
                "topic": topic,
                "type": "MCQ",
                "question": parsed["question"],
                "options": parsed["options"],
                "correct_answer": parsed["correct_answer"],
                "page_number": None,  # Page numbers can be integrated with additional logic
            })

    # Save questions to JSON
    questions_data = {
        "metadata": {
            "generated_at": datetime.now().isoformat(),
            "total_questions": len(questions),
            "book_title": book_title,
            "tool_used": model_name,
        },
        "questions": questions,
    }
    with open(os.path.join(output_dir, "questions.json"), "w", encoding="utf-8") as f:
        json.dump(questions_data, f, indent=4)

    print("Topics and questions generated successfully!")

# Run the full pipeline on the source PDF.
# NOTE(review): the /content prefix looks like a Colab workspace path —
# adjust it when running elsewhere.
pdf_path = "/content/Project Management.pdf"
process_book_with_t5(pdf_path=pdf_path)


Topics and questions generated successfully!


In [6]:
!pip install faiss-gpu sentence-transformers

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
