In [3]:
import os
import pdfplumber
import docx
from langchain_groq import ChatGroq
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import sys

# --- 1. CONFIGURATION ---
# IMPORTANT: Store your API key as an environment variable for security
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

user_input = "Lab Manual - CNTL - TAR.pdf" 
no_of_questions = 2
output_folder = "generated"
os.makedirs(output_folder, exist_ok=True)

# --- 2. LLM and PROMPT SETUP ---
# if not GROQ_API_KEY:
#     raise ValueError("GROQ_API_KEY environment variable not set. Please configure it.")

GROQ_API_KEY="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"

llm = ChatGroq(
    api_key=GROQ_API_KEY,
    model="llama-3.3-70b-versatile",
    temperature=0.0
)

# This prompt is now updated to accept a 'difficulty' level
mcq_chunk_prompt = PromptTemplate(
    input_variables=["context", "num_questions", "chunk_id", "difficulty"],
    template="""
You are an AI assistant helping the user generate multiple-choice questions (MCQs).

Based only on the text provided below, generate {num_questions} multiple-choice questions.
The questions must be at a {difficulty} level.

Text from Source Chunk #{chunk_id}:
{context}

Each question must include:
- A clear question
- Four answer options labeled A, B, C, and D
- The correct answer clearly indicated at the end

The generated MCQs MUST be formatted as follows, with the source chunk number at the start:
## MCQ from Source Chunk #{chunk_id}
Question: [question]
A) [option A]
B) [option B]
C) [option C]
D) [option D]
Correct Answer: [correct option]
"""
)

mcq_chunk_chain = LLMChain(llm=llm, prompt=mcq_chunk_prompt)

# --- 3. HELPER FUNCTIONS ---
def extract_text(file_path):
    """
    Extracts text from a file. For PDFs, it tries digital extraction first and falls back to OCR if no text is found.
    """
    ext = file_path.rsplit('.', 1)[-1].lower()
    text = ""

    if ext == "pdf":
        try:
            # First, try digital text extraction with pdfplumber
            with pdfplumber.open(file_path) as pdf:
                text = ''.join([p.extract_text() or '' for p in pdf.pages])
            
            # If no text was found, it's likely an image-based PDF, so use OCR
            if not text.strip():
                print("No digital text found. Attempting OCR...")
                try:
                    images = convert_from_path(file_path)
                    for i, image in enumerate(images):
                        text += pytesseract.image_to_string(image)
                except Exception as e:
                    # Provide a helpful error message if Poppler is not installed
                    if 'poppler' in str(e).lower():
                        print("Error: Poppler is not installed or not in your system's PATH.")
                        print("Please install it from https://poppler.freedesktop.org/ and make sure the 'bin' folder is added to your PATH.")
                    else:
                        print(f"Error during OCR processing: {e}")
                    sys.exit(1)

        except Exception as e:
            print(f"Error during PDF processing: {e}")
            return ""
    elif ext == "docx":
        try:
            doc = docx.Document(file_path)
            text = ' '.join([para.text for para in doc.paragraphs])
        except Exception as e:
            print(f"Error during DOCX processing: {e}")
            return ""
    elif ext == "txt":
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
        except Exception as e:
            print(f"Error during TXT processing: {e}")
            return ""
    else:
        print("Unsupported file type.")
        return ""
    
    return text.strip()

def chunk_text(text):
    """Splits the text into chunks based on paragraphs (double newlines)."""
    chunks = text.split('\n\n')
    return [chunk.strip() for chunk in chunks if chunk.strip()]

def save_txt(mcqs, filename):
    """Saves the generated MCQs to a text file."""
    path = os.path.join(output_folder, filename)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(mcqs)
    print(f"Saved text to {path}")

# The save_pdf function has been removed as per your request

# --- 4. MAIN EXECUTION LOGIC ---
def main():
    try:
        # Step 1: Get difficulty input from the user
        difficulty = input("Enter the desired difficulty level (easy, medium, or hard): ").lower()
        if difficulty not in ['easy', 'medium', 'hard']:
            print("Invalid difficulty level. Please choose 'easy', 'medium', or 'hard'.")
            return
        
        text = extract_text(user_input)
        if not text:
            print("No text extracted from the file.")
            return

        text_chunks = chunk_text(text)
        
        print(f"Generating {no_of_questions} MCQs from each text chunk at '{difficulty}' difficulty...")
        
        combined_output_with_solutions = []

        for i, chunk in enumerate(text_chunks):
            if i >= 3: 
                print("Stopping after 5 chunks for demonstration purposes.")
                break

            print(f"Processing chunk {i+1}...")
            
            mcqs = mcq_chunk_chain.run({
                "context": chunk, 
                "num_questions": no_of_questions,
                "chunk_id": i + 1,
                "difficulty": difficulty
            }).strip()
            
            combined_string = f"--- Original Source Text from Chunk #{i+1} ---\n{chunk}\n\n{mcqs}"
            combined_output_with_solutions.append(combined_string)
            
        final_mcq_string = "\n\n".join(combined_output_with_solutions)
        
        base_name = os.path.basename(user_input).rsplit('.', 1)[0]
        txt_filename = f"generated_mcqs_{base_name}_{difficulty}.txt"
        
        save_txt(final_mcq_string, txt_filename)

        print("\nMCQ Generation Complete!")

    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    main()


Enter the desired difficulty level (easy, medium, or hard):  easy


Generating 2 MCQs from each text chunk at 'easy' difficulty...
Processing chunk 1...
Saved text to generated\generated_mcqs_Lab Manual - CNTL - TAR_easy.txt

MCQ Generation Complete!
