# In [None]:  (Jupyter notebook cell marker left over from export)
import os
import json
import faiss
from datetime import datetime
from PyPDF2 import PdfReader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer
import numpy as np

# Global parameters for the RAG pipeline (passed to rag_pipeline at the bottom of the file).
# Model name handed to SentenceTransformer(...) to embed text chunks and queries.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Checkpoint name loaded via T5Tokenizer / T5ForConditionalGeneration.from_pretrained.
GENERATION_MODEL = "t5-base"
# File path the FAISS index is written to and read back from.
VECTOR_STORE = "faiss_index"
# Directory that receives the generated questions JSON file.
OUTPUT_DIR = "output"
# Create the output directory up front; exist_ok=True makes this a no-op if it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the extracted text of each page of the PDF at *pdf_path*.

    The result is a list with one entry per page, in page order, holding
    whatever ``page.extract_text()`` produced for that page.
    """
    return [pdf_page.extract_text() for pdf_page in PdfReader(pdf_path).pages]

# Function to create a vector store using FAISS
def create_vector_store(text_chunks, embedding_model):
    """Embed *text_chunks* and store the vectors in a flat L2 FAISS index.

    Args:
        text_chunks: Strings to embed; one vector is produced per string.
        embedding_model: Name of a sentence-transformers model, or an
            already-loaded model object exposing ``encode``.

    Returns:
        A ``(index, embeddings)`` tuple: the populated ``faiss.IndexFlatL2``
        and the 2-D float32 NumPy array of vectors that was added to it.

    Raises:
        ValueError: If *text_chunks* is empty (there is nothing to index and
            the embedding dimension cannot be determined).
    """
    text_chunks = list(text_chunks)
    if not text_chunks:
        raise ValueError("text_chunks is empty; nothing to index")

    if isinstance(embedding_model, str):
        model = SentenceTransformer(embedding_model)
    else:
        # Caller supplied a loaded encoder; reuse it instead of reloading.
        model = embedding_model

    # FAISS only accepts C-contiguous float32 matrices, so normalise the
    # encoder output explicitly rather than relying on its default dtype.
    embeddings = np.ascontiguousarray(
        model.encode(text_chunks, convert_to_tensor=False), dtype=np.float32
    )
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings

# Loaded embedding models keyed by model name, so repeated queries reuse one instance.
_EMBEDDING_MODELS = {}


def _get_embedding_model(embedding_model):
    """Return an encoder for a model name (loaded once) or a pre-loaded model object."""
    if not isinstance(embedding_model, str):
        return embedding_model
    if embedding_model not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[embedding_model] = SentenceTransformer(embedding_model)
    return _EMBEDDING_MODELS[embedding_model]


# Function to retrieve relevant context from FAISS
def retrieve_context(query, text_chunks, index, embedding_model, top_k=3):
    """Return the text chunks nearest to *query* and their FAISS distances.

    Args:
        query: Query string to embed and search for.
        text_chunks: Chunks in the same order they were added to *index*.
        index: FAISS index (anything exposing ``search(vectors, k)``).
        embedding_model: Name of a sentence-transformers model, or an
            already-loaded model object exposing ``encode``.
        top_k: Maximum number of neighbours to request.

    Returns:
        A ``(context, distances)`` tuple. ``context`` is a list of the matched
        chunks, nearest first; ``distances`` is a 1-D array with the matching
        distance for each returned chunk. Both may hold fewer than *top_k*
        entries.
    """
    model = _get_embedding_model(embedding_model)
    query_embedding = np.asarray(
        model.encode([query], convert_to_tensor=False), dtype=np.float32
    )
    distances, indices = index.search(query_embedding, top_k)

    labels = np.asarray(indices[0])
    # FAISS pads missing neighbours with label -1; indexing a list with -1
    # would silently return the last chunk. Also drop labels beyond
    # text_chunks, which happen when the index is out of sync with the chunks.
    valid = (labels >= 0) & (labels < len(text_chunks))
    context = [text_chunks[int(i)] for i in labels[valid]]
    return context, np.asarray(distances[0])[valid]

# Function to extract main topics using T5
def extract_topics_with_t5(text, model, tokenizer, num_topics=5):
    """Ask the generation model for the main topics of *text*.

    Only the first 512 characters of *text* are placed in the prompt. The
    decoded model output is split on newlines; each non-blank line, with
    surrounding whitespace removed, becomes one entry of the returned list.
    """
    excerpt = text[:512]
    prompt = f"Extract {num_topics} main topics from the text:\n{excerpt}"
    input_ids = tokenizer.encode(
        prompt, return_tensors="pt", max_length=512, truncation=True
    )
    generated = model.generate(
        input_ids, max_length=150, num_beams=4, early_stopping=True
    )
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

    stripped_lines = (line.strip() for line in decoded.split("\n"))
    return [line for line in stripped_lines if line]

# Function to generate MCQs using T5
def generate_mcqs_with_context(topic, context, model, tokenizer, num_questions=10):
    """Generate multiple-choice questions about *topic* from *context*.

    Builds a single instruction prompt, encodes it (truncated to 512 tokens),
    runs beam-search generation, and returns the decoded output as one string.
    """
    prompt_parts = [
        f"Generate {num_questions} multiple-choice questions based on the topic '{topic}' ",
        f"using the following context:\n{context}\n",
        "Provide each question with 4 options (A, B, C, D), specify the correct answer, and give a detailed explanation.",
    ]
    prompt = "".join(prompt_parts)

    input_ids = tokenizer.encode(
        prompt, return_tensors="pt", max_length=512, truncation=True
    )
    generated = model.generate(
        input_ids, max_length=1024, num_beams=4, early_stopping=True
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

def _build_question(mcq_text, question_id, topic, context_text, distance):
    """Turn one generated MCQ text block into the output question record."""
    # Strip first so a leading blank line is not mistaken for the question.
    mcq_lines = mcq_text.strip().split("\n")
    return {
        "id": question_id,
        "topic": topic,
        "type": "MCQ",
        "question": mcq_lines[0].strip(),
        "options": [line.strip() for line in mcq_lines[1:5]],
        "correct_answer": mcq_lines[5].strip() if len(mcq_lines) > 5 else None,
        "explanation": mcq_lines[6].strip() if len(mcq_lines) > 6 else None,
        "source": {
            "page_number": None,  # Page numbers can be integrated with additional logic
            "context": context_text,
            # NOTE: this is the FAISS distance of the nearest chunk as returned
            # by retrieve_context, stored under the existing output key.
            "confidence_score": distance,
        },
    }


# Main RAG pipeline
def rag_pipeline(pdf_path, embedding_model, generation_model, vector_store_path,
                 book_title="Project Management Professional Guide"):
    """Generate MCQs from a PDF and write them to ``questions_rag.json``.

    Steps: extract per-page text, build or load a FAISS index over the pages,
    extract topics with T5, retrieve context per topic, generate MCQs, and
    dump the questions plus metadata as JSON into ``OUTPUT_DIR``.

    Args:
        pdf_path: Path of the PDF to read.
        embedding_model: Embedding model identifier passed to the vector-store
            and retrieval helpers.
        generation_model: T5 checkpoint name for topic and MCQ generation.
        vector_store_path: File path used to cache the FAISS index.
        book_title: Title recorded in the output metadata.

    Raises:
        ValueError: If no text could be extracted from the PDF.
    """
    # Step 1: Extract text from PDF (one chunk per non-empty page)
    pages = extract_text_from_pdf(pdf_path)
    text_chunks = [page.strip() for page in pages if page and page.strip()]
    if not text_chunks:
        raise ValueError(f"No extractable text found in {pdf_path!r}")

    # Step 2: Create or load the FAISS vector store
    index = None
    if os.path.exists(vector_store_path):
        index = faiss.read_index(vector_store_path)
        if index.ntotal != len(text_chunks):
            # Cached index does not match these chunks (e.g. built from another
            # PDF); its labels would point at the wrong text, so rebuild it.
            index = None
    if index is None:
        index, _ = create_vector_store(text_chunks, embedding_model)
        faiss.write_index(index, vector_store_path)

    # Step 3: Load generation model and tokenizer
    tokenizer = T5Tokenizer.from_pretrained(generation_model, legacy=True)
    model = T5ForConditionalGeneration.from_pretrained(generation_model)

    # Step 4: Extract topics
    combined_text = " ".join(text_chunks)
    main_topics = extract_topics_with_t5(combined_text, model, tokenizer)

    # Step 5: Generate MCQs for each topic
    questions = []
    for idx, topic in enumerate(main_topics, start=1):
        # Retrieve context from FAISS
        context, scores = retrieve_context(topic, text_chunks, index, embedding_model)
        context_text = " ".join(context)  # joined once, reused for every question

        # Safely retrieve the first score from the FAISS result
        confidence_score = float(scores[0]) if len(scores) > 0 else None

        # Generate MCQs using the context; blocks are separated by blank lines
        mcqs = generate_mcqs_with_context(topic, context_text, model, tokenizer)
        for i, mcq in enumerate(mcqs.split("\n\n"), start=1):
            if not mcq.strip():
                continue
            questions.append(
                _build_question(mcq, f"Q{idx}_{i}", topic, context_text, confidence_score)
            )

    # Save metadata and questions to JSON
    questions_data = {
        "metadata": {
            "generated_at": datetime.now().isoformat(),
            "total_questions": len(questions),
            "book_title": book_title,
            "generation_method": "RAG Pipeline",
            "embedding_model": embedding_model,
            "vector_store": "FAISS",
        },
        "questions": questions,
    }
    output_path = os.path.join(OUTPUT_DIR, "questions_rag.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(questions_data, f, indent=4)

    print("RAG pipeline completed successfully! Output saved to 'questions_rag.json'.")

# Run the RAG pipeline only when executed directly (script or notebook cell),
# so importing this module does not start PDF parsing and model downloads.
if __name__ == "__main__":
    pdf_path = "/content/Project Management.pdf"  # Replace with your PDF path
    rag_pipeline(pdf_path, EMBEDDING_MODEL, GENERATION_MODEL, VECTOR_STORE)