In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
import time
import fitz  # PyMuPDF for PDF extraction

# ✅ Load DeepSeek Model
# One checkpoint identifier is shared by the tokenizer and the model so both
# are always loaded from the same repository.
model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Weights are requested in float16; device placement is left to the library
# through device_map="auto".
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# ✅ Function to Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string containing the ``"text"`` extraction of each page, in page
        order, with nothing inserted between pages.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text("text") for page in doc)

# ✅ Function to Generate Questions with Time Limit & Better Control
def generate_questions_deepseek(
    chapter_title,
    chapter_content,
    *,
    max_chars=700,
    max_new_tokens=250,
    timeout_s=30,
):
    """Generate review questions for one chapter with the DeepSeek model.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        chapter_title: Title inserted into the prompt and into log messages.
        chapter_content: Chapter text; only the first ``max_chars`` characters
            are placed in the prompt.
        max_chars: Number of leading characters of ``chapter_content`` to keep.
        max_new_tokens: Upper bound on the number of generated tokens.
        timeout_s: Time budget in seconds for the generation call, or ``None``
            for no limit. It is forwarded to ``generate`` as ``max_time``, so
            decoding stops shortly after the budget is used up instead of
            running to completion first.

    Returns:
        A list of the non-blank lines produced by the model (prompt excluded).
        On failure a one-element list is returned instead:
        ``["Timeout: No questions generated."]`` when the time budget was
        exceeded, or ``["Error occurred during generation."]`` when
        generation raised.
    """
    # Limit chapter content to keep the prompt (and the run time) small.
    truncated_content = chapter_content[:max_chars]

    prompt = f"""
    Based on the following textbook chapter, generate 5 review questions.

    Chapter Title: {chapter_title}
    Chapter Content:
    {truncated_content}

    Questions:
    """

    # Send the inputs to wherever the model actually lives rather than
    # assuming "cuda": the model is loaded with device_map="auto".
    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=512
    ).to(model.device)
    # Needed later to separate the echoed prompt from the newly generated text.
    prompt_length = inputs["input_ids"].shape[1]

    try:
        start_time = time.monotonic()
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # Enforces the budget *during* decoding; a check made only after
            # generate() returns cannot prevent a long-running call.
            max_time=None if timeout_s is None else float(timeout_s),
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
        elapsed = time.monotonic() - start_time
    except Exception as e:
        # Best-effort boundary: report the failure and let the caller continue
        # with the next chapter.
        print(f"⚠️ Error while generating questions for {chapter_title}: {e}")
        return ["Error occurred during generation."]

    # Output cut short by the time budget is treated as a timeout, as before.
    if timeout_s is not None and elapsed > timeout_s:
        print(f"⚠️ Timeout: Skipping {chapter_title} due to long execution time.")
        return ["Timeout: No questions generated."]

    # generate() returns prompt tokens followed by new tokens; decode only the
    # new ones so the prompt text is not returned as "questions".
    generated_text = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )

    return [line.strip() for line in generated_text.splitlines() if line.strip()]

# ✅ Function to Generate Summaries with Time Limit & Better Control
def generate_summary_deepseek(
    chapter_title,
    chapter_content,
    *,
    max_chars=900,
    max_new_tokens=150,
    timeout_s=20,
):
    """Generate a short summary of one chapter with the DeepSeek model.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        chapter_title: Title inserted into the prompt and into log messages.
        chapter_content: Chapter text; only the first ``max_chars`` characters
            are placed in the prompt.
        max_chars: Number of leading characters of ``chapter_content`` to keep.
        max_new_tokens: Upper bound on the number of generated tokens.
        timeout_s: Time budget in seconds for the generation call, or ``None``
            for no limit. It is forwarded to ``generate`` as ``max_time``, so
            decoding stops shortly after the budget is used up instead of
            running to completion first.

    Returns:
        The generated summary text (prompt excluded, surrounding whitespace
        stripped). On failure a fixed message is returned instead:
        ``"Timeout: No summary generated."`` when the time budget was
        exceeded, or ``"Error occurred during summary generation."`` when
        generation raised.
    """
    # Limit chapter content to keep the prompt (and the run time) small.
    truncated_content = chapter_content[:max_chars]

    prompt = f"""
    Summarize the following textbook chapter in 3-5 sentences:

    Chapter Title: {chapter_title}
    Chapter Content:
    {truncated_content}

    Summary:
    """

    # Send the inputs to wherever the model actually lives rather than
    # assuming "cuda": the model is loaded with device_map="auto".
    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=512
    ).to(model.device)
    # Needed later to separate the echoed prompt from the newly generated text.
    prompt_length = inputs["input_ids"].shape[1]

    try:
        start_time = time.monotonic()
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # Enforces the budget *during* decoding; a check made only after
            # generate() returns cannot prevent a long-running call.
            max_time=None if timeout_s is None else float(timeout_s),
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
        elapsed = time.monotonic() - start_time
    except Exception as e:
        # Best-effort boundary: report the failure and let the caller continue
        # with the next chapter.
        print(f"⚠️ Error while generating summary for {chapter_title}: {e}")
        return "Error occurred during summary generation."

    # Output cut short by the time budget is treated as a timeout, as before.
    if timeout_s is not None and elapsed > timeout_s:
        print(f"⚠️ Timeout: Skipping {chapter_title} due to long execution time.")
        return "Timeout: No summary generated."

    # generate() returns prompt tokens followed by new tokens; decode only the
    # new ones so the summary does not start with a copy of the prompt.
    generated_text = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )

    return generated_text.strip()

# ✅ Load and Extract Text from PDF
pdf_path = "Workplace_Software_and_Skills_-_WEB_IlfJtcP.pdf"
textbook_text = extract_text_from_pdf(pdf_path)

# ✅ Split the extracted text into chapters.
# Heuristic: every occurrence of the literal word "Chapter" starts a new chunk;
# the rest of that line becomes the title and everything after it the content.
# NOTE(review): a bare split on "Chapter" also fires on table-of-contents
# entries and in-text mentions — adjust to the PDF's real heading pattern if
# the resulting titles look wrong.
chapters = textbook_text.split("Chapter")
chapter_titles = []
chapter_contents = []

# chapters[0] is whatever precedes the first "Chapter", so it is skipped.
for chapter in chapters[1:]:
    heading, _, body = chapter.partition("\n")
    chapter_titles.append("Chapter " + heading.strip())
    chapter_contents.append(body.strip())

# Results are keyed by title, so only the last chunk for a repeated title ever
# survives. Collapse repeats up front (last occurrence wins, first-seen order
# kept) so the model is not run for chunks whose output would be overwritten.
unique_chapters = dict(zip(chapter_titles, chapter_contents))

# ✅ Generate Questions and Summaries for All Chapters
review_questions = {}
chapter_summaries = {}

for chapter_title, chapter_content in unique_chapters.items():
    review_questions[chapter_title] = generate_questions_deepseek(
        chapter_title, chapter_content
    )
    chapter_summaries[chapter_title] = generate_summary_deepseek(
        chapter_title, chapter_content
    )

# ✅ Save Generated Questions and Summaries as JSON
questions_json_path = "deepseek_review_questions.json"
summaries_json_path = "deepseek_chapter_summaries.json"

# The files are opened as UTF-8, so write non-ASCII characters as-is instead
# of \uXXXX escapes; the parsed JSON content is the same either way.
with open(questions_json_path, "w", encoding="utf-8") as json_file:
    json.dump(review_questions, json_file, indent=4, ensure_ascii=False)

with open(summaries_json_path, "w", encoding="utf-8") as json_file:
    json.dump(chapter_summaries, json_file, indent=4, ensure_ascii=False)

print(f"✅ Review questions generated and saved to {questions_json_path}")
print(f"✅ Chapter summaries generated and saved to {summaries_json_path}")
