In [13]:
import pandas as pd
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Load summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_content(content):
    """Summarizes long chapter content to fit within the model's limit."""
    try:
        # Ensure content is a string
        content = str(content)
        if not content.strip():
            return ""
            
        token_limit = 1024
        tokenized_length = len(content.split())
        
        if tokenized_length > token_limit:
            summary = summarizer(content, max_length=300, min_length=100, do_sample=False)[0]['summary_text']
            return summary
        return content
    except Exception as e:
        print(f"Error in summarization: {e}")
        return content

class QuizGenerator:
    def __init__(self, model_name="gpt2"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        
        # Properly set up the tokenizer
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model.config.pad_token_id = self.model.config.eos_token_id
        
    def generate_question_from_content(self, content):
        """Generates a multiple-choice question based on chapter content."""
        try:
            if not content or not str(content).strip():
                return "No valid content to generate a question."
            
            # Summarize content
            summarized_content = summarize_content(content)
            if not summarized_content:
                return "Failed to process content."
            
            # Create prompt
            prompt = (
                "Generate a multiple-choice question with 4 options based on this content. "
                "Include the correct answer marked with [CORRECT]. The question should be "
                f"educational and clear:\n\n{summarized_content}\n\nQuestion:"
            )
            
            # Encode with proper handling
            encoded = self.tokenizer.encode_plus(
                prompt,
                add_special_tokens=True,
                return_tensors="pt",
                padding='max_length',
                max_length=512,  # Reduced max_length to avoid position embedding issues
                truncation=True,
                return_attention_mask=True
            )
            
            # Generate
            with torch.no_grad():
                outputs = self.model.generate(
                    input_ids=encoded['input_ids'],
                    attention_mask=encoded['attention_mask'],
                    max_new_tokens=150,
                    pad_token_id=self.tokenizer.pad_token_id,
                    num_return_sequences=1,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                    no_repeat_ngram_size=2,
                    early_stopping=True
                )
            
            # Decode and clean up
            question = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            question = question.replace(prompt, "").strip()
            
            return question if question else "Failed to generate question."
            
        except Exception as e:
            print(f"Error in question generation: {e}")
            return "Error generating question."

def load_dataset(filepath: str) -> pd.DataFrame:
    """Loads dataset from an Excel file."""
    try:
        df = pd.read_excel(filepath)
        # Ensure the required column exists
        if "Chapter_content" not in df.columns:
            print("Warning: 'Chapter_content' column not found in dataset")
            return pd.DataFrame()
        return df
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return pd.DataFrame()

def create_quiz_from_data(data: pd.DataFrame, quiz_generator: QuizGenerator):
    """Creates quiz questions from the dataset."""
    questions = []
    
    try:
        # Process each row
        for idx, row in data.iterrows():
            print(f"Generating question {idx + 1}/{len(data)}...")
            
            # Get content safely
            content = str(row.get("Chapter_content", "")).strip()
            if not content:
                continue
                
            # Generate question
            question = quiz_generator.generate_question_from_content(content)
            if question and question != "Failed to generate question.":
                questions.append(question)
            
            # Limit number of questions if needed
            if len(questions) >= 5:  # Adjust this number as needed
                break
                
    except Exception as e:
        print(f"Error in quiz creation: {e}")
    
    return questions

def main():
    # Set device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    
    # Load dataset
    dataset_path = "dataset_syllabus.xlsx"
    data = load_dataset(dataset_path)
    
    if data.empty:
        print("Dataset is empty or could not be loaded.")
        return
    
    print(f"Loaded dataset with {len(data)} rows")
    
    # Initialize quiz generator
    quiz_generator = QuizGenerator()
    
    # Generate questions
    print("Generating questions...")
    questions = create_quiz_from_data(data, quiz_generator)
    
    # Print results
    print("\nGenerated Questions:")
    for idx, question in enumerate(questions, start=1):
        print(f"\nQuestion {idx}:")
        print(question)
        print("-" * 50)

if __name__ == "__main__":
    main()

Device set to use cpu


Using device: cpu
Loaded dataset with 277 rows
Generating questions...
Generating question 1/277...




Generating question 2/277...
Generating question 3/277...
Generating question 4/277...
Generating question 5/277...

Generated Questions:

Question 1:
Generate a multiple-choice question with 4 options based on this content. Include the correct answer marked with [CORRECT]. The question should be educational and clear:

Chapter 1: The Living World
What is Living?
Living organisms are highly organized structures that exhibit growth, reproduction, metabolism, and response to stimuli.
The key characteristics of living beings include:
Growth – Increase in size or number of cells.
Reproduction – Ability to produce offspring.
Metabolism – Sum of all chemical reactions in an organism.
Cellular Organization – All living things are made of cells.
Response to Stimuli (Consciousness) – Ability to react to changes in the environment.
Characteristics of Living Organisms
1. Growth
Definition: Growth refers to the increase in the mass and size of an organism.
In Living Organisms: Occurs by cell divis