# In [None]:
import os
import json
import faiss
from datetime import datetime
from PyPDF2 import PdfReader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer
import numpy as np

# Set global parameters
# These module-level constants are the arguments handed to rag_pipeline() at
# the bottom of the file.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Sentence embedding model
GENERATION_MODEL = "t5-base"  # T5 model for question generation (public model)
VECTOR_STORE = "faiss_index"  # Path the FAISS index is written to / read from
OUTPUT_DIR = "output"  # Directory that receives the generated JSON file
# Runs at import time; exist_ok=True keeps re-runs from failing when the
# directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the extracted text of every page of the PDF at *pdf_path*.

    The result is a list with one entry per page, in page order, holding
    whatever ``page.extract_text()`` produced for that page.
    """
    return [page.extract_text() for page in PdfReader(pdf_path).pages]

# Function to create a vector store using FAISS
def create_vector_store(text_chunks, embedding_model):
    """Embed *text_chunks* and add the vectors to a flat L2 FAISS index.

    Args:
        text_chunks: Non-empty list of strings to embed.
        embedding_model: Model name or path passed to ``SentenceTransformer``.

    Returns:
        A ``(index, embeddings)`` tuple: the populated ``faiss.IndexFlatL2``
        and the 2-D float32 array of vectors that was added to it.

    Raises:
        ValueError: If *text_chunks* is empty (there is nothing to index and
            the embedding dimension cannot be determined).
    """
    if not text_chunks:
        raise ValueError("create_vector_store() needs at least one text chunk")

    model = SentenceTransformer(embedding_model)
    # FAISS expects a C-contiguous float32 matrix; enforce that explicitly
    # instead of relying on the encoder's default output dtype/layout.
    embeddings = np.array(
        model.encode(text_chunks, convert_to_tensor=False),
        dtype=np.float32,
        order="C",
    )
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings

# Function to retrieve relevant context from FAISS
def retrieve_context(query, text_chunks, index, embedding_model, top_k=3):
    """Return the chunks of *text_chunks* nearest to *query* in *index*.

    Args:
        query: Text to search for.
        text_chunks: List of strings; position ``i`` must correspond to
            vector ``i`` stored in *index*.
        index: FAISS index exposing ``search(vectors, k)``.
        embedding_model: Model name or path passed to ``SentenceTransformer``.
        top_k: Maximum number of neighbours to request.

    Returns:
        A ``(context, distances)`` tuple. ``context`` is the list of matching
        chunks and ``distances`` the array of their FAISS distances, aligned
        element by element. Both may hold fewer than *top_k* entries.
    """
    model = SentenceTransformer(embedding_model)
    query_embedding = np.array(
        model.encode([query], convert_to_tensor=False),
        dtype=np.float32,
        order="C",
    )
    distances, indices = index.search(query_embedding, top_k)

    hit_ids = indices[0]
    # FAISS pads the result with label -1 when fewer than top_k vectors are
    # available; a plain list lookup would turn -1 into "the last chunk".
    # Ids beyond the end of text_chunks (index built from different data)
    # are dropped for the same reason.
    usable = (hit_ids >= 0) & (hit_ids < len(text_chunks))
    context = [text_chunks[i] for i in hit_ids[usable]]
    return context, distances[0][usable]

# Function to fine-tune the model (stub for fine-tuning logic)
def fine_tune_model(dataset_path, model_name, epochs=3):
    """Placeholder for fine-tuning; currently performs no work.

    A real implementation would need a question-answer dataset and a
    training loop. All arguments are accepted and ignored.

    Returns:
        None.
    """
    return None

# Function to generate MCQs using the model
def generate_mcqs_with_context(topic, context, model, tokenizer, num_questions=15):
    """Ask the generation model for multiple-choice questions on *topic*.

    Args:
        topic: Subject the questions should cover.
        context: Retrieved passage text the questions should be based on.
        model: Object exposing ``generate(input_ids, ...)``.
        tokenizer: Object exposing ``encode(text, ...)`` and
            ``decode(ids, ...)``.
        num_questions: Number of questions requested in the prompt.

    Returns:
        The decoded text of the first generated sequence.
    """
    # The encoded prompt is capped at 512 tokens with truncation enabled
    # (Hugging Face tokenizers cut from the right by default), so all
    # instructions are placed BEFORE the potentially long context. With the
    # formatting requirements at the end they were lost on long contexts.
    prompt = (
        f"Generate {num_questions} multiple-choice questions based on the topic '{topic}'. "
        "Provide each question with 4 options (A, B, C, D), specify the correct answer, "
        "and give a detailed explanation. "
        f"Use the following context:\n{context}"
    )
    inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs, max_length=1024, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Function to generate open-ended questions
def generate_open_ended_questions_with_context(topic, context, model, tokenizer, num_questions=5):
    """Ask the generation model for open-ended questions on *topic*.

    Args:
        topic: Subject the questions should cover.
        context: Retrieved passage text the questions should be based on.
        model: Object exposing ``generate(input_ids, ...)``.
        tokenizer: Object exposing ``encode(text, ...)`` and
            ``decode(ids, ...)``.
        num_questions: Number of questions requested in the prompt.

    Returns:
        The decoded text of the first generated sequence.
    """
    # The encoded prompt is capped at 512 tokens with truncation enabled
    # (Hugging Face tokenizers cut from the right by default), so the
    # instructions are placed BEFORE the potentially long context. With the
    # answer/key-point requirements at the end they were lost on long contexts.
    prompt = (
        f"Generate {num_questions} open-ended questions based on the topic '{topic}'. "
        "Provide each question with a model answer and key points. "
        f"Use the following context:\n{context}"
    )
    inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs, max_length=1024, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Helper: turn one blank-line-delimited block of generated text into an MCQ record.
def _build_mcq_record(block, question_id, topic, confidence):
    lines = block.split("\n")
    return {
        "id": question_id,
        "topic": topic,
        "type": "MCQ",
        "question": lines[0].strip(),
        "options": [line.strip() for line in lines[1:5]],
        "correct_answer": lines[5].strip() if len(lines) > 5 else None,
        "explanation": lines[6].strip() if len(lines) > 6 else None,
        "source": {
            "page_number": None,
            "confidence_score": confidence,
        },
    }


# Helper: turn one blank-line-delimited block of generated text into an open-ended record.
def _build_open_ended_record(block, question_id, topic, context_text, confidence):
    lines = block.split("\n")
    return {
        "id": question_id,
        "topic": topic,
        "type": "open_ended",
        # str.split always yields at least one element, so lines[0] is safe.
        "question": lines[0].strip(),
        "model_answer": lines[1].strip() if len(lines) > 1 else None,
        "key_points": lines[2].strip().split(", ") if len(lines) > 2 else [],
        "source": {
            "page_number": None,
            "context": context_text,
            "confidence_score": confidence,
        },
    }


# Main RAG pipeline
def rag_pipeline(pdf_path, embedding_model, generation_model, vector_store_path):
    """Generate questions from a PDF and write them to a JSON file.

    Steps: extract per-page text, build (or reuse) a FAISS index over the
    non-empty pages, load the T5 tokenizer/model, and for each predefined
    topic retrieve context and generate MCQ and open-ended questions. The
    result is written to ``questions_advanced.json`` inside ``OUTPUT_DIR``.

    Args:
        pdf_path: Path of the PDF to read.
        embedding_model: Sentence-embedding model name used for indexing and
            retrieval.
        generation_model: Name passed to ``T5Tokenizer.from_pretrained`` and
            ``T5ForConditionalGeneration.from_pretrained``.
        vector_store_path: File the FAISS index is read from / written to.

    Returns:
        None. The questions are written to disk and a status line is printed.

    Raises:
        ValueError: If no page of the PDF yields any text.
    """
    book_title = "Project Management Professional Guide"

    # Step 1: Extract text from PDF (skip pages with no or whitespace-only text)
    pages = extract_text_from_pdf(pdf_path)
    text_chunks = [page.strip() for page in pages if page and page.strip()]
    if not text_chunks:
        raise ValueError(f"No extractable text found in PDF: {pdf_path}")

    # Step 2: Create FAISS vector store, reusing the one on disk only if it
    # plausibly belongs to these chunks. A vector-count mismatch means the
    # file was built from different data, and its ids would point at the
    # wrong (or non-existent) chunks.
    # NOTE(review): a stale index with the same number of vectors is not
    # detected by this check.
    index = None
    if os.path.exists(vector_store_path):
        index = faiss.read_index(vector_store_path)
        if index.ntotal != len(text_chunks):
            index = None
    if index is None:
        index, _ = create_vector_store(text_chunks, embedding_model)
        faiss.write_index(index, vector_store_path)

    # Step 3: Load generation model and tokenizer
    tokenizer = T5Tokenizer.from_pretrained(generation_model)
    model = T5ForConditionalGeneration.from_pretrained(generation_model)

    # Step 4: Extract topics (For this example, we are using predefined topics)
    main_topics = [
        "Project Initiation",
        "Project Planning",
        "Project Execution",
        "Project Monitoring and Control",
        "Project Closure",
    ]

    # Step 5: Generate MCQs and Open-Ended Questions for each topic
    questions = {
        "mcq": [],
        "open_ended": []
    }

    for idx, topic in enumerate(main_topics, start=1):
        # Retrieve context from FAISS
        context, scores = retrieve_context(topic, text_chunks, index, embedding_model)
        context_text = " ".join(context)
        # "confidence_score" is the first distance returned by retrieval,
        # stored as a plain float so it is JSON-serialisable.
        confidence = float(scores[0]) if len(scores) > 0 else None

        # Generate MCQs using the context
        mcqs = generate_mcqs_with_context(topic, context_text, model, tokenizer)
        for i, mcq in enumerate(mcqs.split("\n\n"), start=1):
            if mcq.strip():
                questions["mcq"].append(
                    _build_mcq_record(mcq, f"MCQ-{idx}_{i}", topic, confidence)
                )

        # Generate Open-Ended Questions using the context
        open_ended = generate_open_ended_questions_with_context(topic, context_text, model, tokenizer)
        for i, oe in enumerate(open_ended.split("\n\n"), start=1):
            if oe.strip():
                questions["open_ended"].append(
                    _build_open_ended_record(oe, f"OE-{idx}_{i}", topic, context_text, confidence)
                )

    # Save metadata and questions to JSON
    questions_data = {
        "metadata": {
            "generated_at": datetime.now().isoformat(),
            "total_questions": len(questions["mcq"]) + len(questions["open_ended"]),
            "book_title": book_title,
            # NOTE(review): these fine-tuning fields are static values; this
            # function itself performs no fine-tuning.
            "model_info": {
                "base_model": generation_model,
                "fine_tuning_method": "PEFT",
                "training_epochs": 3
            }
        },
        "questions": questions
    }
    output_path = os.path.join(OUTPUT_DIR, "questions_advanced.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(questions_data, f, indent=4)

    print("Advanced question generation completed successfully! Output saved to 'questions_advanced.json'.")

# Run the RAG pipeline
pdf_path = "/content/Project Management.pdf"  # Replace with your PDF path
# Guarded so that importing this module (e.g. to reuse its helpers) does not
# trigger the whole pipeline; running it as a script or notebook cell still does.
if __name__ == "__main__":
    rag_pipeline(pdf_path, EMBEDDING_MODEL, GENERATION_MODEL, VECTOR_STORE)

